From f42231f9193952b45cd9ba4642adcca392a7ce14 Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Fri, 22 Sep 2023 10:19:16 -0400 Subject: [PATCH 001/118] v23.12 Updates [skip ci] --- .github/workflows/build.yaml | 16 +++++------ .github/workflows/pr.yaml | 28 +++++++++---------- .github/workflows/test.yaml | 16 +++++------ README.md | 2 +- ci/build_docs.sh | 2 +- ci/check_style.sh | 2 +- ci/test_wheel_dask_cudf.sh | 2 +- .../all_cuda-118_arch-x86_64.yaml | 8 +++--- .../all_cuda-120_arch-x86_64.yaml | 8 +++--- cpp/CMakeLists.txt | 2 +- cpp/doxygen/Doxyfile | 4 +-- cpp/examples/basic/CMakeLists.txt | 2 +- cpp/examples/strings/CMakeLists.txt | 2 +- cpp/libcudf_kafka/CMakeLists.txt | 2 +- dependencies.yaml | 16 +++++------ docs/cudf/source/conf.py | 4 +-- docs/dask_cudf/source/conf.py | 4 +-- fetch_rapids.cmake | 2 +- java/ci/README.md | 4 +-- java/pom.xml | 2 +- java/src/main/native/CMakeLists.txt | 2 +- python/cudf/CMakeLists.txt | 2 +- python/cudf/cudf/__init__.py | 2 +- python/cudf/pyproject.toml | 6 ++-- python/cudf_kafka/pyproject.toml | 4 +-- python/custreamz/pyproject.toml | 6 ++-- python/dask_cudf/dask_cudf/__init__.py | 2 +- python/dask_cudf/pyproject.toml | 6 ++-- 28 files changed, 79 insertions(+), 79 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 0e120d34bb1..ab028eb89cc 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -89,7 +89,7 @@ jobs: wheel-build-dask-cudf: needs: 
wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: ${{ inputs.build_type || 'branch' }} @@ -100,7 +100,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 054ea7968c8..214f9c90b41 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -26,34 +26,34 @@ jobs: - wheel-build-dask-cudf - wheel-tests-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.12 checks: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.12 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12 with: build_type: pull-request conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 with: build_type: pull-request conda-python-cudf-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 with: build_type: pull-request test_script: "ci/test_python_cudf.sh" @@ -61,14 +61,14 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 with: build_type: pull-request test_script: "ci/test_python_other.sh" conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -78,7 +78,7 @@ jobs: conda-notebook-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: 
build_type: pull-request node_type: "gpu-v100-latest-1" @@ -88,7 +88,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -98,21 +98,21 @@ jobs: wheel-build-cudf: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 with: build_type: pull-request script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: build_type: pull-request script: ci/test_wheel_cudf.sh wheel-build-dask-cudf: needs: wheel-tests-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: pull-request @@ -120,7 +120,7 @@ jobs: wheel-tests-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: pull-request diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 030f2e41db4..9ca32bcfe03 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +24,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -36,7 +36,7 @@ jobs: run_script: "ci/test_cpp_memcheck.sh" conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -55,7 +55,7 @@ jobs: test_script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 + uses: 
rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -67,7 +67,7 @@ jobs: run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -88,7 +88,7 @@ jobs: script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: nightly diff --git a/README.md b/README.md index 64c980d0cb3..5f2ce014dba 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ cuDF can be installed with conda (via [miniconda](https://conda.io/miniconda.htm ```bash conda install -c rapidsai -c conda-forge -c nvidia \ - cudf=23.10 python=3.10 cuda-version=11.8 + cudf=23.12 python=3.10 cuda-version=11.8 ``` We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 1ed047a500b..11e7a96b751 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -25,7 +25,7 @@ rapids-mamba-retry install \ --channel "${PYTHON_CHANNEL}" \ libcudf cudf dask-cudf -export RAPIDS_VERSION_NUMBER="23.10" +export RAPIDS_VERSION_NUMBER="23.12" export RAPIDS_DOCS_DIR="$(mktemp -d)" rapids-logger "Build CPP docs" diff --git a/ci/check_style.sh b/ci/check_style.sh index e96ad8bf1db..a01cf4dcc6b 100755 --- a/ci/check_style.sh +++ b/ci/check_style.sh @@ -14,7 +14,7 @@ rapids-dependency-file-generator \ rapids-mamba-retry env create --force -f env.yaml -n checks conda activate checks -FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.10/cmake-format-rapids-cmake.json +FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.12/cmake-format-rapids-cmake.json export RAPIDS_CMAKE_FORMAT_FILE=/tmp/rapids_cmake_ci/cmake-formats-rapids-cmake.json mkdir -p $(dirname ${RAPIDS_CMAKE_FORMAT_FILE}) wget -O ${RAPIDS_CMAKE_FORMAT_FILE} ${FORMAT_FILE_URL} diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index d6e7f4bf65e..050aa4561c7 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -11,7 +11,7 @@ RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from python -m pip install --no-deps ./local-cudf-dep/cudf*.whl # Always install latest dask for testing -python -m pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.10 +python -m pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.12 # echo to expand wildcard before adding `[extra]` 
requires for pip python -m pip install $(echo ./dist/dask_cudf*.whl)[test] diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index d4abc28cf13..151d250d7e9 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -25,7 +25,7 @@ dependencies: - cxx-compiler - cython>=3.0.0 - dask-core>=2023.7.1 -- dask-cuda==23.10.* +- dask-cuda==23.12.* - dask>=2023.7.1 - distributed>=2023.7.1 - dlpack>=0.5,<0.6.0a0 @@ -44,9 +44,9 @@ dependencies: - libcufile=1.4.0.31 - libcurand-dev=10.3.0.86 - libcurand=10.3.0.86 -- libkvikio==23.10.* +- libkvikio==23.12.* - librdkafka>=1.9.0,<1.10.0a0 -- librmm==23.10.* +- librmm==23.12.* - make - mimesis>=4.1.0 - moto>=4.0.8 @@ -80,7 +80,7 @@ dependencies: - python-snappy>=0.6.0 - python>=3.9,<3.11 - pytorch<1.12.0 -- rmm==23.10.* +- rmm==23.12.* - s3fs>=2022.3.0 - scikit-build>=0.13.1 - scipy diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 9a98e400e6d..ad52b8f8b97 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -26,7 +26,7 @@ dependencies: - cxx-compiler - cython>=3.0.0 - dask-core>=2023.7.1 -- dask-cuda==23.10.* +- dask-cuda==23.12.* - dask>=2023.7.1 - distributed>=2023.7.1 - dlpack>=0.5,<0.6.0a0 @@ -43,9 +43,9 @@ dependencies: - libarrow==12.0.1.* - libcufile-dev - libcurand-dev -- libkvikio==23.10.* +- libkvikio==23.12.* - librdkafka>=1.9.0,<1.10.0a0 -- librmm==23.10.* +- librmm==23.12.* - make - mimesis>=4.1.0 - moto>=4.0.8 @@ -77,7 +77,7 @@ dependencies: - python-snappy>=0.6.0 - python>=3.9,<3.11 - pytorch<1.12.0 -- rmm==23.10.* +- rmm==23.12.* - s3fs>=2022.3.0 - scikit-build>=0.13.1 - scipy diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a84f7bd5224..38713d99622 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -25,7 +25,7 @@ rapids_cuda_init_architectures(CUDF) project( CUDF - VERSION 23.10.00 + VERSION 23.12.00 LANGUAGES C CXX CUDA ) if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.5) diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index b072d252881..adefaaa1479 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = libcudf # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = 23.10.00 +PROJECT_NUMBER = 23.12.00 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a @@ -2226,7 +2226,7 @@ SKIP_FUNCTION_MACROS = YES # the path). If a tag file is not located in the directory in which doxygen is # run, you must also specify the path to the tagfile here. -TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/23.10 +TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/23.12 # When a file name is specified after GENERATE_TAGFILE, doxygen will create a # tag file that is based on the input files it reads. 
See section "Linking to diff --git a/cpp/examples/basic/CMakeLists.txt b/cpp/examples/basic/CMakeLists.txt index 1c1952c4616..9ff716f41e4 100644 --- a/cpp/examples/basic/CMakeLists.txt +++ b/cpp/examples/basic/CMakeLists.txt @@ -16,7 +16,7 @@ file( ) include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) -set(CUDF_TAG branch-23.10) +set(CUDF_TAG branch-23.12) CPMFindPackage( NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf GIT_TAG ${CUDF_TAG} diff --git a/cpp/examples/strings/CMakeLists.txt b/cpp/examples/strings/CMakeLists.txt index 31a6b12a4bc..4b500d3a92e 100644 --- a/cpp/examples/strings/CMakeLists.txt +++ b/cpp/examples/strings/CMakeLists.txt @@ -16,7 +16,7 @@ file( ) include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) -set(CUDF_TAG branch-23.10) +set(CUDF_TAG branch-23.12) CPMFindPackage( NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf GIT_TAG ${CUDF_TAG} diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt index 33bd04fffb3..1a15a3ec2cd 100644 --- a/cpp/libcudf_kafka/CMakeLists.txt +++ b/cpp/libcudf_kafka/CMakeLists.txt @@ -22,7 +22,7 @@ include(rapids-find) project( CUDA_KAFKA - VERSION 23.10.00 + VERSION 23.12.00 LANGUAGES CXX ) diff --git a/dependencies.yaml b/dependencies.yaml index 376e43094a7..66417b214ff 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -214,8 +214,8 @@ dependencies: common: - output_types: [conda, requirements] packages: - - librmm==23.10.* - - libkvikio==23.10.* + - librmm==23.12.* + - libkvikio==23.12.* - output_types: conda packages: - fmt>=9.1.0,<10 @@ -266,7 +266,7 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - scikit-build>=0.13.1 - - rmm==23.10.* + - rmm==23.12.* - output_types: conda packages: - &protobuf protobuf>=4.21,<5 @@ -385,7 +385,7 @@ dependencies: common: - output_types: [conda] packages: - - dask-cuda==23.10.* + - dask-cuda==23.12.* - *doxygen - make - myst-nb @@ -437,7 +437,7 @@ dependencies: - &numba numba>=0.57,<0.58 - nvtx>=0.2.1 - packaging - - rmm==23.10.* + - rmm==23.12.* - typing_extensions>=4.0.0 - *protobuf - output_types: conda @@ -498,7 +498,7 @@ dependencies: - dask-core>=2023.7.1 # dask-core in conda is the actual package & dask is the meta package - output_types: pyproject packages: - - &cudf cudf==23.10.* + - &cudf cudf==23.12.* - *cupy_pip run_cudf_kafka: common: @@ -517,7 +517,7 @@ dependencies: packages: - confluent-kafka>=1.9.0,<1.10.0a0 - *cudf - - cudf_kafka==23.10.* + - cudf_kafka==23.12.* test_cpp: common: - output_types: conda @@ -599,5 +599,5 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - dask-cuda==23.10.* + - dask-cuda==23.12.* - *numba diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 03b1bb7039b..acb2a5d17f3 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -79,9 +79,9 @@ # built documents. # # The short X.Y version. -version = '23.10' +version = '23.12' # The full version, including alpha/beta/rc tags. -release = '23.10.00' +release = '23.12.00' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
diff --git a/docs/dask_cudf/source/conf.py b/docs/dask_cudf/source/conf.py index ad629b5e949..6861a9b90f6 100644 --- a/docs/dask_cudf/source/conf.py +++ b/docs/dask_cudf/source/conf.py @@ -11,8 +11,8 @@ project = "dask-cudf" copyright = "2018-2023, NVIDIA Corporation" author = "NVIDIA Corporation" -version = '23.10' -release = '23.10.00' +version = '23.12' +release = '23.12.00' language = "en" diff --git a/fetch_rapids.cmake b/fetch_rapids.cmake index 4a68c7dbc60..e79d9d86fce 100644 --- a/fetch_rapids.cmake +++ b/fetch_rapids.cmake @@ -12,7 +12,7 @@ # the License. # ============================================================================= if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUDF_RAPIDS.cmake) - file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.10/RAPIDS.cmake + file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.12/RAPIDS.cmake ${CMAKE_CURRENT_BINARY_DIR}/CUDF_RAPIDS.cmake ) endif() diff --git a/java/ci/README.md b/java/ci/README.md index e9599b33bf1..12a2bb2dc51 100644 --- a/java/ci/README.md +++ b/java/ci/README.md @@ -34,7 +34,7 @@ nvidia-docker run -it cudf-build:11.8.0-devel-centos7 bash You can download the cuDF repo in the docker container or you can mount it into the container. Here I choose to download again in the container. ```bash -git clone --recursive https://github.com/rapidsai/cudf.git -b branch-23.10 +git clone --recursive https://github.com/rapidsai/cudf.git -b branch-23.12 ``` ### Build cuDF jar with devtoolset @@ -47,4 +47,4 @@ scl enable devtoolset-11 "java/ci/build-in-docker.sh" ### The output -You can find the cuDF jar in java/target/ like cudf-23.10.0-SNAPSHOT-cuda11.jar. +You can find the cuDF jar in java/target/ like cudf-23.12.0-SNAPSHOT-cuda11.jar. 
diff --git a/java/pom.xml b/java/pom.xml index afcc0e15a2c..cc880312d34 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -21,7 +21,7 @@ ai.rapids cudf - 23.10.0-SNAPSHOT + 23.12.0-SNAPSHOT cudfjni diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 128989fe77f..0dcfee2cffe 100644 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -28,7 +28,7 @@ rapids_cuda_init_architectures(CUDF_JNI) project( CUDF_JNI - VERSION 23.10.00 + VERSION 23.12.00 LANGUAGES C CXX CUDA ) diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index 6f3e428d291..a8b91c27095 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -14,7 +14,7 @@ cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) -set(cudf_version 23.10.00) +set(cudf_version 23.12.00) include(../../fetch_rapids.cmake) include(rapids-cuda) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index e5c78fca893..8d25d478676 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -99,7 +99,7 @@ rmm.register_reinitialize_hook(clear_cache) -__version__ = "23.10.00" +__version__ = "23.12.00" __all__ = [ "BaseIndex", diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 085d78afc7c..39a8dca0267 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -9,7 +9,7 @@ requires = [ "numpy>=1.21,<1.25", "protoc-wheel", "pyarrow==12.0.1.*", - "rmm==23.10.*", + "rmm==23.12.*", "scikit-build>=0.13.1", "setuptools", "wheel", @@ -17,7 +17,7 @@ requires = [ [project] name = "cudf" -version = "23.10.00" +version = "23.12.00" description = "cuDF - GPU Dataframe" readme = { file = "README.md", content-type = "text/markdown" } authors = [ @@ -39,7 +39,7 @@ dependencies = [ "protobuf>=4.21,<5", "ptxcompiler", "pyarrow==12.*", - "rmm==23.10.*", + "rmm==23.12.*", "typing_extensions>=4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 386cdc32ab1..78a7a83ac3a 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -12,7 +12,7 @@ requires = [ [project] name = "cudf_kafka" -version = "23.10.00" +version = "23.12.00" description = "cuDF Kafka Datasource" readme = { file = "README.md", content-type = "text/markdown" } authors = [ @@ -21,7 +21,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cudf==23.10.*", + "cudf==23.12.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
[project.optional-dependencies] diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index 47ade91b5eb..e6328ed045d 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -9,7 +9,7 @@ requires = [ [project] name = "custreamz" -version = "23.10.00" +version = "23.12.00" description = "cuStreamz - GPU Accelerated Streaming" readme = { file = "README.md", content-type = "text/markdown" } authors = [ @@ -19,8 +19,8 @@ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ "confluent-kafka>=1.9.0,<1.10.0a0", - "cudf==23.10.*", - "cudf_kafka==23.10.*", + "cudf==23.12.*", + "cudf_kafka==23.12.*", "streamz", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ diff --git a/python/dask_cudf/dask_cudf/__init__.py b/python/dask_cudf/dask_cudf/__init__.py index 6952c3d5882..7c81f5da481 100644 --- a/python/dask_cudf/dask_cudf/__init__.py +++ b/python/dask_cudf/dask_cudf/__init__.py @@ -14,7 +14,7 @@ except ImportError: pass -__version__ = "23.10.00" +__version__ = "23.12.00" __all__ = [ "DataFrame", diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 922da366422..08441c6b5f7 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -9,7 +9,7 @@ requires = [ [project] name = "dask_cudf" -version = "23.10.00" +version = "23.12.00" description = "Utilities for Dask and cuDF interactions" readme = { file = "README.md", content-type = "text/markdown" } authors = [ @@ -18,7 +18,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cudf==23.10.*", + "cudf==23.12.*", "cupy-cuda11x>=12.0.0", "dask>=2023.7.1", "distributed>=2023.7.1", @@ -39,7 +39,7 @@ dynamic = ["entry-points"] [project.optional-dependencies] test = [ - "dask-cuda==23.10.*", + "dask-cuda==23.12.*", "numba>=0.57,<0.58", "pytest", "pytest-cov", From cdc03a73db880e294f8c4916d942a4568a64d5db Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 27 Sep 2023 14:15:33 +0100 Subject: [PATCH 002/118] Marginally reduce memory footprint of joins (#14197) If we drop the gather maps as soon as we are done with them, we have a little more headroom for joins that are close to hitting the device memory limit. 
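The sentence above is the whole trick: release each gather map the moment its gather completes, so the device allocator can reuse that memory for the other side (the diff below does exactly this with Python `del`). As a hedged sketch of the same early-release pattern one level down, at the RMM level (the function name and sizes here are hypothetical, not the cudf join internals):

```cpp
// Free the first gather map before allocating the second, so peak device
// memory holds one map at a time instead of two. Names and sizes are made up.
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <cstddef>
#include <cstdint>

void gather_both_sides(rmm::cuda_stream_view stream)
{
  auto constexpr map_size = std::size_t{1} << 26;  // hypothetical row count

  auto left_map = rmm::device_uvector<int32_t>(map_size, stream);
  // ... gather the left-side result using left_map ...
  left_map.resize(0, stream);
  left_map.shrink_to_fit(stream);  // return the allocation now, not at scope
                                   // exit (the C++ analogue of `del left_rows`)

  auto right_map = rmm::device_uvector<int32_t>(map_size, stream);
  // ... gather the right-side result using right_map ...
}
```

In the Python layer the same effect falls out of reference counting: once `del` drops the last reference to a gather-map column, its device buffer is returned to RMM before the next gather allocates.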
Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/14197 --- python/cudf/cudf/core/join/join.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 6a6e37180ca..b94f8f583f4 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -203,6 +203,7 @@ def perform_merge(self) -> cudf.DataFrame: if left_rows is not None else cudf.DataFrame._from_data({}) ) + del left_rows right_result = ( self.rhs._gather( GatherMap.from_column_unchecked( @@ -213,7 +214,7 @@ def perform_merge(self) -> cudf.DataFrame: if right_rows is not None else cudf.DataFrame._from_data({}) ) - + del right_rows result = cudf.DataFrame._from_data( *self._merge_results(left_result, right_result) ) From 23d24d43fac8615166c38231f13fe8751a8aec42 Mon Sep 17 00:00:00 2001 From: Martin Marenz Date: Thu, 28 Sep 2023 19:08:55 +0200 Subject: [PATCH 003/118] Add `bytes_per_second` to distinct_count of stream_compaction nvbench. (#14172) This patch relates to #13735. Benchmark: [benchmark_distinct_count.txt](https://github.com/rapidsai/cudf/files/12700496/benchmark_distinct_count.txt) Authors: - Martin Marenz (https://github.com/Blonck) - Mark Harris (https://github.com/harrism) Approvers: - David Wendt (https://github.com/davidwendt) - Karthikeyan (https://github.com/karthikeyann) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/14172 --- cpp/benchmarks/stream_compaction/distinct_count.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cpp/benchmarks/stream_compaction/distinct_count.cpp b/cpp/benchmarks/stream_compaction/distinct_count.cpp index 2b2c901b90f..3e324013d4e 100644 --- a/cpp/benchmarks/stream_compaction/distinct_count.cpp +++ b/cpp/benchmarks/stream_compaction/distinct_count.cpp @@ -40,6 +40,14 @@ static void bench_distinct_count(nvbench::state& state, nvbench::type_list auto const& data_column = data_table->get_column(0); auto const input_table = cudf::table_view{{data_column, data_column, data_column}}; + // Collect memory statistics for input and output. + state.add_global_memory_reads(input_table.num_rows() * input_table.num_columns()); + state.add_global_memory_writes(1); + if (null_probability > 0) { + state.add_global_memory_reads( + input_table.num_columns() * cudf::bitmask_allocation_size_bytes(input_table.num_rows())); + } + auto mem_stats_logger = cudf::memory_stats_logger(); // init stats logger state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { From 59b09fd097e39bd15646eac1156889692974dc5f Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 29 Sep 2023 11:10:25 -0500 Subject: [PATCH 004/118] cuDF: Build CUDA 12.0 ARM conda packages. (#14112) This PR builds conda packages using CUDA 12 on ARM. This work is targeting 23.12 and depends on https://github.com/rapidsai/rmm/pull/1330. Closes #14128. 
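A note on the `bytes_per_second` change in PATCH 003 above: nvbench derives its throughput columns from the memory traffic a benchmark declares, so the `add_global_memory_reads`/`add_global_memory_writes` calls added to the distinct_count benchmark are all that is needed. A self-contained sketch of that pattern (the copy benchmark, axis name, and sizes are placeholders; only the nvbench calls mirror the change):

```cpp
// Hypothetical nvbench benchmark: declaring the bytes moved per iteration
// lets nvbench report bytes-per-second alongside the measured time.
#include <nvbench/nvbench.cuh>

#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>

static void bench_copy(nvbench::state& state)
{
  auto const num_bytes = state.get_int64("NumBytes");
  thrust::device_vector<nvbench::int8_t> src(num_bytes);
  thrust::device_vector<nvbench::int8_t> dst(num_bytes);

  // One read and one write per byte copied; nvbench divides these declared
  // totals by the measured kernel time to produce the throughput columns.
  state.add_global_memory_reads<nvbench::int8_t>(num_bytes);
  state.add_global_memory_writes<nvbench::int8_t>(num_bytes);

  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
    thrust::copy(thrust::device.on(launch.get_stream()),
                 src.begin(),
                 src.end(),
                 dst.begin());
  });
}
NVBENCH_BENCH(bench_copy).add_int64_axis("NumBytes", {64 << 20});
```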
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/14112 --- .github/workflows/build.yaml | 16 ++++++++-------- .github/workflows/pr.yaml | 28 ++++++++++++++-------------- .github/workflows/test.yaml | 16 ++++++++-------- dependencies.yaml | 20 ++------------------ 4 files changed, 32 insertions(+), 48 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index ab028eb89cc..dc2c81d1c77 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@cuda-120-arm with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@cuda-120-arm with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@cuda-120-arm with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@cuda-120-arm with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@cuda-120-arm with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -89,7 +89,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@cuda-120-arm with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: ${{ inputs.build_type || 'branch' }} @@ -100,7 +100,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@cuda-120-arm with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml 
b/.github/workflows/pr.yaml index 214f9c90b41..047b80f2e5c 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -26,34 +26,34 @@ jobs: - wheel-build-dask-cudf - wheel-tests-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@cuda-120-arm checks: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@cuda-120-arm with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@cuda-120-arm with: build_type: pull-request conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-120-arm with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@cuda-120-arm with: build_type: pull-request conda-python-cudf-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120-arm with: build_type: pull-request test_script: "ci/test_python_cudf.sh" @@ -61,14 +61,14 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120-arm with: build_type: pull-request test_script: "ci/test_python_other.sh" conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -78,7 +78,7 @@ jobs: conda-notebook-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -88,7 +88,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -98,21 +98,21 @@ jobs: wheel-build-cudf: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@cuda-120-arm with: build_type: pull-request script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: needs: wheel-build-cudf secrets: inherit - uses: 
rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@cuda-120-arm with: build_type: pull-request script: ci/test_wheel_cudf.sh wheel-build-dask-cudf: needs: wheel-tests-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@cuda-120-arm with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: pull-request @@ -120,7 +120,7 @@ jobs: wheel-tests-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@cuda-120-arm with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: pull-request diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 9ca32bcfe03..e58227c30dc 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-120-arm with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +24,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm with: build_type: nightly branch: ${{ inputs.branch }} @@ -36,7 +36,7 @@ jobs: run_script: "ci/test_cpp_memcheck.sh" conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120-arm with: build_type: nightly branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120-arm with: build_type: nightly branch: ${{ inputs.branch }} @@ -55,7 +55,7 @@ jobs: test_script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm with: build_type: nightly branch: ${{ inputs.branch }} @@ -67,7 +67,7 @@ jobs: run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm with: build_type: nightly branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@cuda-120-arm with: build_type: nightly branch: 
${{ inputs.branch }} @@ -88,7 +88,7 @@ jobs: script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@cuda-120-arm with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: nightly diff --git a/dependencies.yaml index c8ee66bd99f..c19e8765be3 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -227,25 +227,9 @@ dependencies: # in sync with the version pinned in get_arrow.cmake. - libarrow==12.0.1.* - librdkafka>=1.9.0,<1.10.0a0 + # Align nvcomp version with rapids-cmake + - nvcomp==2.6.1 - spdlog>=1.11.0,<1.12 - specific: - - output_types: conda - matrices: - - matrix: - arch: x86_64 - packages: - # Align nvcomp version with rapids-cmake - # TODO: not yet available for aarch64 CUDA 12 - - &nvcomp nvcomp==2.6.1 - - matrix: - arch: aarch64 - cuda: "11.8" - packages: - - *nvcomp - # TODO: Fallback matrix for aarch64 CUDA 12. After migrating to nvcomp 3, - # all CUDA/arch combinations should be supported by existing packages. - - matrix: - packages: build_wheels: common: - output_types: pyproject From 29556a2514f4d274164a27a80539410da7e132d6 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 3 Oct 2023 14:44:28 -0700 Subject: [PATCH 005/118] Remove the use of volatile in ORC (#14175) `volatile` should not be required in our code, unless there are compiler or synchronization issues. This PR removes the use in ORC reader and writer. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14175 --- cpp/src/io/orc/stats_enc.cu | 4 +- cpp/src/io/orc/stripe_data.cu | 82 +++++++++++++++-------------------- cpp/src/io/orc/stripe_enc.cu | 14 +++--- cpp/src/io/orc/stripe_init.cu | 2 +- 4 files changed, 46 insertions(+), 56 deletions(-) diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index 95f1db5bfd1..479a2dfada3 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -76,8 +76,8 @@ __global__ void __launch_bounds__(block_size, 1) { using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage temp_storage; - volatile uint32_t stats_size = 0; - auto t = threadIdx.x; + uint32_t stats_size = 0; + auto t = threadIdx.x; __syncthreads(); for (thread_index_type start = 0; start < statistics_count; start += block_size) { uint32_t stats_len = 0, stats_pos; diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 3edcd3d83b2..0b249bbdafe 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -142,9 +142,7 @@ struct orcdec_state_s { * @param[in] base Pointer to raw byte stream data * @param[in] len Stream length in bytes */ -static __device__ void bytestream_init(volatile orc_bytestream_s* bs, - uint8_t const* base, - uint32_t len) +static __device__ void bytestream_init(orc_bytestream_s* bs, uint8_t const* base, uint32_t len) { uint32_t pos = (len > 0) ? 
static_cast(7 & reinterpret_cast(base)) : 0; bs->base = base - pos; @@ -160,8 +158,7 @@ static __device__ void bytestream_init(volatile orc_bytestream_s* bs, * @param[in] bs Byte stream input * @param[in] bytes_consumed Number of bytes that were consumed */ -static __device__ void bytestream_flush_bytes(volatile orc_bytestream_s* bs, - uint32_t bytes_consumed) +static __device__ void bytestream_flush_bytes(orc_bytestream_s* bs, uint32_t bytes_consumed) { uint32_t pos = bs->pos; uint32_t len = bs->len; @@ -197,7 +194,7 @@ static __device__ void bytestream_fill(orc_bytestream_s* bs, int t) * @param[in] pos Position in byte stream * @return byte */ -inline __device__ uint8_t bytestream_readbyte(volatile orc_bytestream_s* bs, int pos) +inline __device__ uint8_t bytestream_readbyte(orc_bytestream_s* bs, int pos) { return bs->buf.u8[pos & (bytestream_buffer_size - 1)]; } @@ -209,7 +206,7 @@ inline __device__ uint8_t bytestream_readbyte(volatile orc_bytestream_s* bs, int * @param[in] pos Position in byte stream * @result bits */ -inline __device__ uint32_t bytestream_readu32(volatile orc_bytestream_s* bs, int pos) +inline __device__ uint32_t bytestream_readu32(orc_bytestream_s* bs, int pos) { uint32_t a = bs->buf.u32[(pos & (bytestream_buffer_size - 1)) >> 2]; uint32_t b = bs->buf.u32[((pos + 4) & (bytestream_buffer_size - 1)) >> 2]; @@ -224,7 +221,7 @@ inline __device__ uint32_t bytestream_readu32(volatile orc_bytestream_s* bs, int * @param[in] numbits number of bits * @return bits */ -inline __device__ uint64_t bytestream_readu64(volatile orc_bytestream_s* bs, int pos) +inline __device__ uint64_t bytestream_readu64(orc_bytestream_s* bs, int pos) { uint32_t a = bs->buf.u32[(pos & (bytestream_buffer_size - 1)) >> 2]; uint32_t b = bs->buf.u32[((pos + 4) & (bytestream_buffer_size - 1)) >> 2]; @@ -245,9 +242,7 @@ inline __device__ uint64_t bytestream_readu64(volatile orc_bytestream_s* bs, int * @param[in] numbits number of bits * @return decoded value */ -inline __device__ uint32_t bytestream_readbits(volatile orc_bytestream_s* bs, - int bitpos, - uint32_t numbits) +inline __device__ uint32_t bytestream_readbits(orc_bytestream_s* bs, int bitpos, uint32_t numbits) { int idx = bitpos >> 5; uint32_t a = __byte_perm(bs->buf.u32[(idx + 0) & bytestream_buffer_mask], 0, 0x0123); @@ -263,9 +258,7 @@ inline __device__ uint32_t bytestream_readbits(volatile orc_bytestream_s* bs, * @param[in] numbits number of bits * @return decoded value */ -inline __device__ uint64_t bytestream_readbits64(volatile orc_bytestream_s* bs, - int bitpos, - uint32_t numbits) +inline __device__ uint64_t bytestream_readbits64(orc_bytestream_s* bs, int bitpos, uint32_t numbits) { int idx = bitpos >> 5; uint32_t a = __byte_perm(bs->buf.u32[(idx + 0) & bytestream_buffer_mask], 0, 0x0123); @@ -288,7 +281,7 @@ inline __device__ uint64_t bytestream_readbits64(volatile orc_bytestream_s* bs, * @param[in] numbits number of bits * @param[out] result decoded value */ -inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, +inline __device__ void bytestream_readbe(orc_bytestream_s* bs, int bitpos, uint32_t numbits, uint32_t& result) @@ -304,7 +297,7 @@ inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, * @param[in] numbits number of bits * @param[out] result decoded value */ -inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, +inline __device__ void bytestream_readbe(orc_bytestream_s* bs, int bitpos, uint32_t numbits, int32_t& result) @@ -321,7 +314,7 @@ inline __device__ void 
bytestream_readbe(volatile orc_bytestream_s* bs, * @param[in] numbits number of bits * @param[out] result decoded value */ -inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, +inline __device__ void bytestream_readbe(orc_bytestream_s* bs, int bitpos, uint32_t numbits, uint64_t& result) @@ -337,7 +330,7 @@ inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, * @param[in] numbits number of bits * @param[out] result decoded value */ -inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, +inline __device__ void bytestream_readbe(orc_bytestream_s* bs, int bitpos, uint32_t numbits, int64_t& result) @@ -354,7 +347,7 @@ inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, * @return length of varint in bytes */ template -inline __device__ uint32_t varint_length(volatile orc_bytestream_s* bs, int pos) +inline __device__ uint32_t varint_length(orc_bytestream_s* bs, int pos) { if (bytestream_readbyte(bs, pos) > 0x7f) { uint32_t next32 = bytestream_readu32(bs, pos + 1); @@ -392,7 +385,7 @@ inline __device__ uint32_t varint_length(volatile orc_bytestream_s* bs, int pos) * @return new position in byte stream buffer */ template -inline __device__ int decode_base128_varint(volatile orc_bytestream_s* bs, int pos, T& result) +inline __device__ int decode_base128_varint(orc_bytestream_s* bs, int pos, T& result) { uint32_t v = bytestream_readbyte(bs, pos++); if (v > 0x7f) { @@ -446,7 +439,7 @@ inline __device__ int decode_base128_varint(volatile orc_bytestream_s* bs, int p /** * @brief Decodes a signed int128 encoded as base-128 varint (used for decimals) */ -inline __device__ __int128_t decode_varint128(volatile orc_bytestream_s* bs, int pos) +inline __device__ __int128_t decode_varint128(orc_bytestream_s* bs, int pos) { auto byte = bytestream_readbyte(bs, pos++); __int128_t const sign_mask = -(int32_t)(byte & 1); @@ -463,7 +456,7 @@ inline __device__ __int128_t decode_varint128(volatile orc_bytestream_s* bs, int /** * @brief Decodes an unsigned 32-bit varint */ -inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, uint32_t& result) +inline __device__ int decode_varint(orc_bytestream_s* bs, int pos, uint32_t& result) { uint32_t u; pos = decode_base128_varint(bs, pos, u); @@ -474,7 +467,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, uint /** * @brief Decodes an unsigned 64-bit varint */ -inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, uint64_t& result) +inline __device__ int decode_varint(orc_bytestream_s* bs, int pos, uint64_t& result) { uint64_t u; pos = decode_base128_varint(bs, pos, u); @@ -485,7 +478,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, uint /** * @brief Signed version of 32-bit decode_varint */ -inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, int32_t& result) +inline __device__ int decode_varint(orc_bytestream_s* bs, int pos, int32_t& result) { uint32_t u; pos = decode_base128_varint(bs, pos, u); @@ -496,7 +489,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, int3 /** * @brief Signed version of 64-bit decode_varint */ -inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, int64_t& result) +inline __device__ int decode_varint(orc_bytestream_s* bs, int pos, int64_t& result) { uint64_t u; pos = decode_base128_varint(bs, pos, u); @@ -514,7 +507,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, 
int6 * @return number of values decoded */ template -inline __device__ void lengths_to_positions(volatile T* vals, uint32_t numvals, unsigned int t) +inline __device__ void lengths_to_positions(T* vals, uint32_t numvals, unsigned int t) { for (uint32_t n = 1; n < numvals; n <<= 1) { __syncthreads(); @@ -534,8 +527,8 @@ inline __device__ void lengths_to_positions(volatile T* vals, uint32_t numvals, * @return number of values decoded */ template -static __device__ uint32_t Integer_RLEv1( - orc_bytestream_s* bs, volatile orc_rlev1_state_s* rle, volatile T* vals, uint32_t maxvals, int t) +static __device__ uint32_t +Integer_RLEv1(orc_bytestream_s* bs, orc_rlev1_state_s* rle, T* vals, uint32_t maxvals, int t) { uint32_t numvals, numruns; if (t == 0) { @@ -642,8 +635,8 @@ static const __device__ __constant__ uint8_t ClosestFixedBitsMap[65] = { */ template static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, - volatile orc_rlev2_state_s* rle, - volatile T* vals, + orc_rlev2_state_s* rle, + T* vals, uint32_t maxvals, int t, bool has_buffered_values = false) @@ -883,7 +876,7 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, * * @return 32-bit value */ -inline __device__ uint32_t rle8_read_bool32(volatile uint32_t* vals, uint32_t bitpos) +inline __device__ uint32_t rle8_read_bool32(uint32_t* vals, uint32_t bitpos) { uint32_t a = vals[(bitpos >> 5) + 0]; uint32_t b = vals[(bitpos >> 5) + 1]; @@ -903,11 +896,8 @@ inline __device__ uint32_t rle8_read_bool32(volatile uint32_t* vals, uint32_t bi * * @return number of values decoded */ -static __device__ uint32_t Byte_RLE(orc_bytestream_s* bs, - volatile orc_byterle_state_s* rle, - volatile uint8_t* vals, - uint32_t maxvals, - int t) +static __device__ uint32_t +Byte_RLE(orc_bytestream_s* bs, orc_byterle_state_s* rle, uint8_t* vals, uint32_t maxvals, int t) { uint32_t numvals, numruns; int r, tr; @@ -1006,8 +996,8 @@ static const __device__ __constant__ int64_t kPow5i[28] = {1, * @return number of values decoded */ static __device__ int Decode_Decimals(orc_bytestream_s* bs, - volatile orc_byterle_state_s* scratch, - volatile orcdec_state_s::values& vals, + orc_byterle_state_s* scratch, + orcdec_state_s::values& vals, int val_scale, int numvals, type_id dtype_id, @@ -1241,8 +1231,8 @@ __global__ void __launch_bounds__(block_size) } __syncthreads(); while (s->top.dict.dict_len > 0) { - uint32_t numvals = min(s->top.dict.dict_len, blockDim.x), len; - volatile uint32_t* vals = s->vals.u32; + uint32_t numvals = min(s->top.dict.dict_len, blockDim.x), len; + uint32_t* vals = s->vals.u32; bytestream_fill(&s->bs, t); __syncthreads(); if (is_rlev1(s->chunk.encoding_kind)) { @@ -1310,12 +1300,12 @@ static __device__ void DecodeRowPositions(orcdec_state_s* s, min((row_decoder_buffer_size - s->u.rowdec.nz_count) * 2, blockDim.x)); if (s->chunk.valid_map_base != nullptr) { // We have a present stream - uint32_t rmax = s->top.data.end_row - min((uint32_t)first_row, s->top.data.end_row); - auto r = (uint32_t)(s->top.data.cur_row + s->top.data.nrows + t - first_row); - uint32_t valid = (t < nrows && r < rmax) - ? (((uint8_t const*)s->chunk.valid_map_base)[r >> 3] >> (r & 7)) & 1 - : 0; - volatile auto* row_ofs_plus1 = (volatile uint16_t*)&s->u.rowdec.row[s->u.rowdec.nz_count]; + uint32_t rmax = s->top.data.end_row - min((uint32_t)first_row, s->top.data.end_row); + auto r = (uint32_t)(s->top.data.cur_row + s->top.data.nrows + t - first_row); + uint32_t valid = (t < nrows && r < rmax) + ? 
(((uint8_t const*)s->chunk.valid_map_base)[r >> 3] >> (r & 7)) & 1 + : 0; + auto* row_ofs_plus1 = (uint16_t*)&s->u.rowdec.row[s->u.rowdec.nz_count]; uint32_t nz_pos, row_plus1, nz_count = s->u.rowdec.nz_count, last_row; if (t < nrows) { row_ofs_plus1[t] = valid; } lengths_to_positions(row_ofs_plus1, nrows, t); diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 73c41e2bbcd..4841fb1141a 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -53,7 +53,7 @@ constexpr bool zero_pll_war = true; struct byterle_enc_state_s { uint32_t literal_run; uint32_t repeat_run; - volatile uint32_t rpt_map[(512 / 32) + 1]; + uint32_t rpt_map[(512 / 32) + 1]; }; struct intrle_enc_state_s { @@ -63,7 +63,7 @@ struct intrle_enc_state_s { uint32_t literal_w; uint32_t hdr_bytes; uint32_t pl_bytes; - volatile uint32_t delta_map[(512 / 32) + 1]; + uint32_t delta_map[(512 / 32) + 1]; }; struct strdata_enc_state_s { @@ -366,7 +366,7 @@ static __device__ uint32_t IntegerRLE( using block_reduce = cub::BlockReduce; uint8_t* dst = s->stream.data_ptrs[cid] + s->strm_pos[cid]; uint32_t out_cnt = 0; - __shared__ volatile uint64_t block_vmin; + __shared__ uint64_t block_vmin; while (numvals > 0) { T v0 = (t < numvals) ? inbuf[(inpos + t) & inmask] : 0; @@ -615,7 +615,7 @@ static __device__ void StoreStringData(uint8_t* dst, * @param[in] t thread id */ template -inline __device__ void lengths_to_positions(volatile T* vals, uint32_t numvals, unsigned int t) +inline __device__ void lengths_to_positions(T* vals, uint32_t numvals, unsigned int t) { for (uint32_t n = 1; n < numvals; n <<= 1) { __syncthreads(); @@ -1143,7 +1143,7 @@ __global__ void __launch_bounds__(256) uint32_t comp_block_align) { __shared__ __align__(16) StripeStream ss; - __shared__ uint8_t* volatile uncomp_base_g; + __shared__ uint8_t* uncomp_base_g; auto const padded_block_header_size = util::round_up_unsafe(block_header_size, comp_block_align); auto const padded_comp_block_size = util::round_up_unsafe(max_comp_blk_size, comp_block_align); @@ -1196,8 +1196,8 @@ __global__ void __launch_bounds__(1024) uint32_t max_comp_blk_size) { __shared__ __align__(16) StripeStream ss; - __shared__ uint8_t const* volatile comp_src_g; - __shared__ uint32_t volatile comp_len_g; + __shared__ uint8_t const* comp_src_g; + __shared__ uint32_t comp_len_g; auto const stripe_id = blockIdx.x; auto const stream_id = blockIdx.y; diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index 8eeca504121..b31a4a081d1 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -499,7 +499,7 @@ __global__ void __launch_bounds__(128, 8) gpuParseRowGroupIndex(RowGroup* row_gr : row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x].start_row; for (int j = t4; j < rowgroup_size4; j += 4) { ((uint32_t*)&row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x])[j] = - ((volatile uint32_t*)&s->rowgroups[i])[j]; + ((uint32_t*)&s->rowgroups[i])[j]; } row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x].num_rows = num_rows; // Updating in case of struct From d87e181daa67d8fb1a029fc2c09e2f561d1e7234 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 4 Oct 2023 13:25:56 -0700 Subject: [PATCH 006/118] Expose streams in binaryop APIs (#14187) Contributes to #925 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Nghia Truong (https://github.com/ttnghia) - Karthikeyan (https://github.com/karthikeyann) URL: 
https://github.com/rapidsai/cudf/pull/14187 --- cpp/include/cudf/binaryop.hpp | 8 ++ cpp/src/binaryop/binaryop.cpp | 12 ++- cpp/src/binaryop/compiled/binary_ops.cu | 6 +- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/binaryop_test.cpp | 126 ++++++++++++++++++++++++ 5 files changed, 147 insertions(+), 6 deletions(-) create mode 100644 cpp/tests/streams/binaryop_test.cpp diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp index 77d6a4d1e89..9df4b4eb00f 100644 --- a/cpp/include/cudf/binaryop.hpp +++ b/cpp/include/cudf/binaryop.hpp @@ -102,6 +102,7 @@ enum class binary_operator : int32_t { * @param rhs The right operand column * @param op The binary operator * @param output_type The desired data type of the output column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Output column of `output_type` type containing the result of * the binary operation @@ -115,6 +116,7 @@ std::unique_ptr binary_operation( column_view const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -131,6 +133,7 @@ std::unique_ptr binary_operation( * @param rhs The right operand scalar * @param op The binary operator * @param output_type The desired data type of the output column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Output column of `output_type` type containing the result of * the binary operation @@ -144,6 +147,7 @@ std::unique_ptr binary_operation( scalar const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -158,6 +162,7 @@ std::unique_ptr binary_operation( * @param rhs The right operand column * @param op The binary operator * @param output_type The desired data type of the output column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Output column of `output_type` type containing the result of * the binary operation @@ -172,6 +177,7 @@ std::unique_ptr binary_operation( column_view const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -189,6 +195,7 @@ std::unique_ptr binary_operation( * @param output_type The desired data type of the output column. 
It is assumed * that output_type is compatible with the output data type * of the function in the PTX code + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Output column of `output_type` type containing the result of * the binary operation @@ -201,6 +208,7 @@ std::unique_ptr binary_operation( column_view const& rhs, std::string const& ptx, data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index ef07de8c461..6b413ab2be4 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -405,38 +405,42 @@ std::unique_ptr binary_operation(scalar const& lhs, column_view const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, op, output_type, cudf::get_default_stream(), mr); + return detail::binary_operation(lhs, rhs, op, output_type, stream, mr); } std::unique_ptr binary_operation(column_view const& lhs, scalar const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, op, output_type, cudf::get_default_stream(), mr); + return detail::binary_operation(lhs, rhs, op, output_type, stream, mr); } std::unique_ptr binary_operation(column_view const& lhs, column_view const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, op, output_type, cudf::get_default_stream(), mr); + return detail::binary_operation(lhs, rhs, op, output_type, stream, mr); } std::unique_ptr binary_operation(column_view const& lhs, column_view const& rhs, std::string const& ptx, data_type output_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, ptx, output_type, cudf::get_default_stream(), mr); + return detail::binary_operation(lhs, rhs, ptx, output_type, stream, mr); } } // namespace cudf diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu index 1f7f342632a..85ab5c6d6cb 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cu +++ b/cpp/src/binaryop/compiled/binary_ops.cu @@ -47,14 +47,16 @@ namespace { struct scalar_as_column_view { using return_type = typename std::pair>; template ())> - return_type operator()(scalar const& s, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) + return_type operator()(scalar const& s, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource*) { auto& h_scalar_type_view = static_cast&>(const_cast(s)); auto col_v = column_view(s.type(), 1, h_scalar_type_view.data(), reinterpret_cast(s.validity_data()), - !s.is_valid()); + !s.is_valid(stream)); return std::pair{col_v, std::unique_ptr(nullptr)}; } template ())> diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 04939f3cd6d..ac13c121530 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -622,6 +622,7 @@ ConfigureTest( STREAM_IDENTIFICATION_TEST identify_stream_usage/test_default_stream_identification.cu ) +ConfigureTest(STREAM_BINARYOP_TEST 
streams/binaryop_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_COPYING_TEST streams/copying_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing)
diff --git a/cpp/tests/streams/binaryop_test.cpp b/cpp/tests/streams/binaryop_test.cpp
new file mode 100644
index 00000000000..2520aed0458
--- /dev/null
+++ b/cpp/tests/streams/binaryop_test.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <tests/binaryop/util/runtime_support.h>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+
+#include <cudf/binaryop.hpp>
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/types.hpp>
+
+class BinaryopTest : public cudf::test::BaseFixture {};
+
+TEST_F(BinaryopTest, ColumnColumn)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> lhs{10, 20, 30, 40, 50};
+  cudf::test::fixed_width_column_wrapper<int32_t> rhs{15, 25, 35, 45, 55};
+
+  cudf::binary_operation(lhs,
+                         rhs,
+                         cudf::binary_operator::ADD,
+                         cudf::data_type(cudf::type_to_id<int32_t>()),
+                         cudf::test::get_default_stream());
+}
+
+TEST_F(BinaryopTest, ColumnScalar)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> lhs{10, 20, 30, 40, 50};
+  cudf::numeric_scalar<int32_t> rhs{23, true, cudf::test::get_default_stream()};
+
+  cudf::binary_operation(lhs,
+                         rhs,
+                         cudf::binary_operator::ADD,
+                         cudf::data_type(cudf::type_to_id<int32_t>()),
+                         cudf::test::get_default_stream());
+}
+
+TEST_F(BinaryopTest, ScalarColumn)
+{
+  cudf::numeric_scalar<int32_t> lhs{42, true, cudf::test::get_default_stream()};
+  cudf::test::fixed_width_column_wrapper<int32_t> rhs{15, 25, 35, 45, 55};
+
+  cudf::binary_operation(lhs,
+                         rhs,
+                         cudf::binary_operator::ADD,
+                         cudf::data_type(cudf::type_to_id<int32_t>()),
+                         cudf::test::get_default_stream());
+}
+
+class BinaryopPTXTest : public BinaryopTest {
+ protected:
+  void SetUp() override
+  {
+    if (!can_do_runtime_jit()) { GTEST_SKIP() << "Skipping tests that require 11.5 runtime"; }
+  }
+};
+
+TEST_F(BinaryopPTXTest, ColumnColumnPTX)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> lhs{10, 20, 30, 40, 50};
+  cudf::test::fixed_width_column_wrapper<int64_t> rhs{15, 25, 35, 45, 55};
+
+  // c = a*a*a + b*b
+  char const* ptx =
+    R"***(
+//
+// Generated by NVIDIA NVVM Compiler
+//
+// Compiler Build ID: CL-24817639
+// Cuda compilation tools, release 10.0, V10.0.130
+// Based on LLVM 3.4svn
+//
+
+.version 6.3
+.target sm_70
+.address_size 64
+
+	// .globl	_ZN8__main__7add$241Eix
+.common .global .align 8 .u64 _ZN08NumbaEnv8__main__7add$241Eix;
+.common .global .align 8 .u64 _ZN08NumbaEnv5numba7targets7numbers14int_power_impl12$3clocals$3e13int_power$242Exx;
+
+.visible .func  (.param .b32 func_retval0) _ZN8__main__7add$241Eix(
+	.param .b64 _ZN8__main__7add$241Eix_param_0,
+	.param .b32 _ZN8__main__7add$241Eix_param_1,
+	.param .b64 _ZN8__main__7add$241Eix_param_2
+)
+{
+	.reg .b32 	%r<3>;
+	.reg .b64 	%rd<8>;
+
+
+	ld.param.u64 	%rd1, [_ZN8__main__7add$241Eix_param_0];
+	ld.param.u32 	%r1, [_ZN8__main__7add$241Eix_param_1];
+	ld.param.u64 	%rd2, [_ZN8__main__7add$241Eix_param_2];
+	cvt.s64.s32 	%rd3, %r1;
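+	// (annotation) %r1 holds the 32-bit operand a and %rd2 the 64-bit operand b;
+	// the instructions below compute a*a*a + b*b and store the result through
+	// the output pointer loaded into %rd1.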
+	mul.wide.s32 	%rd4, %r1, %r1;
+	mul.lo.s64 	%rd5, %rd4, %rd3;
+	mul.lo.s64 	%rd6, %rd2, %rd2;
+	add.s64 	%rd7, %rd6, %rd5;
+	st.u64 	[%rd1], %rd7;
+	mov.u32 	%r2, 0;
+	st.param.b32 	[func_retval0+0], %r2;
+	ret;
+}
+
+)***";
+
+  cudf::binary_operation(
+    lhs, rhs, ptx, cudf::data_type(cudf::type_to_id<int64_t>()), cudf::test::get_default_stream());
+  cudf::binary_operation(lhs, rhs, ptx, cudf::data_type(cudf::type_to_id<int64_t>()));
+}

From b120f7e73e882b4eaa6b5a2cb91aeed20bf1198d Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Wed, 4 Oct 2023 14:23:24 -0700
Subject: [PATCH 007/118] Improve `contains_column` by invoking
 `contains_table` (#14238)

Part of https://github.com/rapidsai/cudf/issues/12261

This PR simplifies the `contains_column` implementation by invoking
`contains_table` and gets rid of the use of the cudf `unordered_multiset`.
It also removes the `unordered_multiset` header file from libcudf.

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/14238
---
 cpp/src/hash/unordered_multiset.cuh | 159 ----------------------------
 cpp/src/search/contains_column.cu   |  67 +-----------
 2 files changed, 1 insertion(+), 225 deletions(-)
 delete mode 100644 cpp/src/hash/unordered_multiset.cuh

diff --git a/cpp/src/hash/unordered_multiset.cuh b/cpp/src/hash/unordered_multiset.cuh
deleted file mode 100644
index 183042fc0f4..00000000000
--- a/cpp/src/hash/unordered_multiset.cuh
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include
-#include
-#include
-#include
-#include
-
-#include
-#include
-#include
-
-#include
-#include
-#include
-#include
-
-#include
-
-namespace cudf {
-namespace detail {
-/*
- * Device view of the unordered multiset
- */
-template <typename Element,
-          typename Hasher   = default_hash<Element>,
-          typename Equality = equal_to<Element>>
-class unordered_multiset_device_view {
- public:
-  unordered_multiset_device_view(size_type hash_size,
-                                 size_type const* hash_begin,
-                                 Element const* hash_data)
-    : hash_size{hash_size}, hash_begin{hash_begin}, hash_data{hash_data}, hasher(), equals()
-  {
-  }
-
-  bool __device__ contains(Element e) const
-  {
-    size_type loc = hasher(e) % (2 * hash_size);
-
-    for (size_type i = hash_begin[loc]; i < hash_begin[loc + 1]; ++i) {
-      if (equals(hash_data[i], e)) return true;
-    }
-
-    return false;
-  }
-
- private:
-  Hasher hasher;
-  Equality equals;
-  size_type hash_size;
-  size_type const* hash_begin;
-  Element const* hash_data;
-};
-
-/*
- * Fixed size set on a device.
- */ -template , - typename Equality = equal_to> -class unordered_multiset { - public: - /** - * @brief Factory to construct a new unordered_multiset - */ - static unordered_multiset create(column_view const& col, rmm::cuda_stream_view stream) - { - auto d_column = column_device_view::create(col, stream); - auto d_col = *d_column; - - auto hash_bins_start = cudf::detail::make_zeroed_device_uvector_async( - 2 * d_col.size() + 1, stream, rmm::mr::get_current_device_resource()); - auto hash_bins_end = cudf::detail::make_zeroed_device_uvector_async( - 2 * d_col.size() + 1, stream, rmm::mr::get_current_device_resource()); - auto hash_data = rmm::device_uvector(d_col.size(), stream); - - Hasher hasher; - size_type* d_hash_bins_start = hash_bins_start.data(); - size_type* d_hash_bins_end = hash_bins_end.data(); - Element* d_hash_data = hash_data.data(); - - thrust::for_each( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(col.size()), - [d_hash_bins_start, d_col, hasher] __device__(size_t idx) { - if (!d_col.is_null(idx)) { - Element e = d_col.element(idx); - size_type tmp = hasher(e) % (2 * d_col.size()); - cuda::atomic_ref ref{*(d_hash_bins_start + tmp)}; - ref.fetch_add(1, cuda::std::memory_order_relaxed); - } - }); - - thrust::exclusive_scan(rmm::exec_policy(stream), - hash_bins_start.begin(), - hash_bins_start.end(), - hash_bins_end.begin()); - - thrust::copy(rmm::exec_policy(stream), - hash_bins_end.begin(), - hash_bins_end.end(), - hash_bins_start.begin()); - - thrust::for_each( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(col.size()), - [d_hash_bins_end, d_hash_data, d_col, hasher] __device__(size_t idx) { - if (!d_col.is_null(idx)) { - Element e = d_col.element(idx); - size_type tmp = hasher(e) % (2 * d_col.size()); - cuda::atomic_ref ref{*(d_hash_bins_end + tmp)}; - size_type offset = ref.fetch_add(1, cuda::std::memory_order_relaxed); - d_hash_data[offset] = e; - } - }); - - return unordered_multiset(d_col.size(), std::move(hash_bins_start), std::move(hash_data)); - } - - unordered_multiset_device_view to_device() const - { - return unordered_multiset_device_view( - size, hash_bins.data(), hash_data.data()); - } - - private: - unordered_multiset(size_type size, - rmm::device_uvector&& hash_bins, - rmm::device_uvector&& hash_data) - : size{size}, hash_bins{std::move(hash_bins)}, hash_data{std::move(hash_data)} - { - } - - size_type size; - rmm::device_uvector hash_bins; - rmm::device_uvector hash_data; -}; - -} // namespace detail -} // namespace cudf diff --git a/cpp/src/search/contains_column.cu b/cpp/src/search/contains_column.cu index 4363bd212fe..85971647434 100644 --- a/cpp/src/search/contains_column.cu +++ b/cpp/src/search/contains_column.cu @@ -14,23 +14,14 @@ * limitations under the License. */ -#include - -#include #include #include #include #include #include #include -#include #include -#include - -#include -#include -#include namespace cudf { namespace detail { @@ -38,61 +29,7 @@ namespace detail { namespace { struct contains_column_dispatch { - template - struct contains_fn { - bool __device__ operator()(size_type const idx) const - { - if (needles_have_nulls && needles.is_null_nocheck(idx)) { - // Exit early. The value doesn't matter, and will be masked as a null element. 
- return true; - } - - return haystack.contains(needles.template element(idx)); - } - - Haystack const haystack; - column_device_view const needles; - bool const needles_have_nulls; - }; - - template ())> - std::unique_ptr operator()(column_view const& haystack, - column_view const& needles, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const - { - auto result = make_numeric_column(data_type{type_to_id()}, - needles.size(), - copy_bitmask(needles, stream, mr), - needles.null_count(), - stream, - mr); - if (needles.is_empty()) { return result; } - - auto const out_begin = result->mutable_view().template begin(); - if (haystack.is_empty()) { - thrust::uninitialized_fill( - rmm::exec_policy(stream), out_begin, out_begin + needles.size(), false); - return result; - } - - auto const haystack_set = cudf::detail::unordered_multiset::create(haystack, stream); - auto const haystack_set_dv = haystack_set.to_device(); - auto const needles_cdv_ptr = column_device_view::create(needles, stream); - - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(needles.size()), - out_begin, - contains_fn{ - haystack_set_dv, *needles_cdv_ptr, needles.has_nulls()}); - - result->set_null_count(needles.null_count()); - - return result; - } - - template ())> + template std::unique_ptr operator()(column_view const& haystack, column_view const& needles, rmm::cuda_stream_view stream, @@ -144,8 +81,6 @@ std::unique_ptr contains(column_view const& haystack, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(haystack.type() == needles.type(), "DTYPE mismatch"); - return cudf::type_dispatcher( haystack.type(), contains_column_dispatch{}, haystack, needles, stream, mr); } From 5d311ea76ddc8bdbb357b6afdf64dfce6ece39a7 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 5 Oct 2023 13:21:31 -0400 Subject: [PATCH 008/118] Fix strings replace for adjacent, identical multi-byte UTF-8 character targets (#14235) Fixes bug that can occur when replacing all occurrences in a string using a multi-byte UTF-8 target when the target matches sequentially in the same string -- some characters were missed. Specialized gtest is also added. 
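
As a rough host-side illustration of the failure mode (a sketch for this
write-up, not libcudf code): `cudf::string_view::find` takes and returns
*character* positions, so resuming the scan at
`position + d_target.size_bytes()` over-advances whenever the target is
multi-byte and its matches are adjacent.

```cpp
// Stand-in model: a u32string gives one element per character, mimicking
// the character-position semantics of cudf::string_view::find.
#include <cstdio>
#include <string>

int main()
{
  std::u32string const str = U"ééééééé";  // seven adjacent one-character matches
  char32_t const target    = U'é';
  int const target_length  = 1;  // characters, like d_target.length()
  int const target_bytes   = 2;  // UTF-8 bytes, like d_target.size_bytes()

  auto count_matches = [&](int advance) {
    int n = 0;
    for (auto pos = str.find(target); pos != std::u32string::npos;
         pos = str.find(target, pos + advance)) {
      ++n;
    }
    return n;
  };

  std::printf("advance by size_bytes(): %d\n", count_matches(target_bytes));   // 4 matches
  std::printf("advance by length():     %d\n", count_matches(target_length));  // 7 matches
  return 0;
}
```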
Found while working on #13891

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/14235
---
 cpp/src/strings/replace/replace.cu  |  2 +-
 cpp/tests/strings/replace_tests.cpp | 22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu
index a622d1a742d..acc1502f4d6 100644
--- a/cpp/src/strings/replace/replace.cu
+++ b/cpp/src/strings/replace/replace.cu
@@ -97,7 +97,7 @@ struct replace_row_parallel_fn {
       } else {
         bytes += d_repl.size_bytes() - d_target.size_bytes();
       }
-      position = d_str.find(d_target, position + d_target.size_bytes());
+      position = d_str.find(d_target, position + d_target.length());
       --max_n;
     }
     if (out_ptr)  // copy whats left (or right depending on your point of view)
diff --git a/cpp/tests/strings/replace_tests.cpp b/cpp/tests/strings/replace_tests.cpp
index f143983aded..f04bb832f09 100644
--- a/cpp/tests/strings/replace_tests.cpp
+++ b/cpp/tests/strings/replace_tests.cpp
@@ -246,6 +246,28 @@ TEST_F(StringsReplaceTest, ReplaceEndOfString)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 }
 
+TEST_F(StringsReplaceTest, ReplaceAdjacentMultiByteTarget)
+{
+  auto input        = cudf::test::strings_column_wrapper({"ééééééé", "eéeéeée", "eeeeeee"});
+  auto strings_view = cudf::strings_column_view(input);
+  // replace all occurrences of 'é' with 'e'
+  cudf::test::strings_column_wrapper expected({"eeeeeee", "eeeeeee", "eeeeeee"});
+
+  auto stream = cudf::get_default_stream();
+  auto mr     = rmm::mr::get_current_device_resource();
+
+  auto target  = cudf::string_scalar("é", true, stream);
+  auto repl    = cudf::string_scalar("e", true, stream);
+  auto results = cudf::strings::replace(strings_view, target, repl);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+  results = cudf::strings::detail::replace<cudf::strings::detail::replace_algorithm::CHAR_PARALLEL>(
+    strings_view, target, repl, -1, stream, mr);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+  results = cudf::strings::detail::replace<cudf::strings::detail::replace_algorithm::ROW_PARALLEL>(
+    strings_view, target, repl, -1, stream, mr);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+}
+
 TEST_F(StringsReplaceTest, ReplaceSlice)
 {
   std::vector<char const*> h_strings{"Héllo", "thesé", nullptr, "ARE THE", "tést strings", ""};

From 04e2cd6ff4d525390d4a416651cefa16e11c2a50 Mon Sep 17 00:00:00 2001
From: Robert Maynard
Date: Fri, 6 Oct 2023 09:33:16 -0400
Subject: [PATCH 009/118] cudf::detail::pinned_allocator doesn't throw from
 `deallocate` (#14251)

Fixes #14165

The deallocate function is called by the `pinned_host_vector`. Throwing
from destructors is bad since they can't be caught, and generally get
converted into runtime sig aborts.
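
A minimal sketch of the resulting pattern (illustrative only, not the cudf
implementation; `fake_free_host` is a hypothetical stand-in for
`cudaFreeHost`): since deallocate is reached from destructors, it must
record failures without throwing.

```cpp
#include <cassert>
#include <cstdlib>

enum class host_status { success, failure };

// Hypothetical stand-in for cudaFreeHost; always succeeds in this sketch.
host_status fake_free_host(void* p) noexcept
{
  std::free(p);  // assumes p came from std::malloc in this sketch
  return host_status::success;
}

// Never throws: stash the result, silence the unused-variable warning in
// release builds, and surface failures only through assert in debug builds.
void deallocate(void* p) noexcept
{
  auto const dealloc_worked = fake_free_host(p);
  (void)dealloc_worked;
  assert(dealloc_worked == host_status::success);
}

int main()
{
  deallocate(std::malloc(16));
  return 0;
}
```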
Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - David Wendt (https://github.com/davidwendt) - Divye Gala (https://github.com/divyegala) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/14251 --- cpp/include/cudf/detail/utilities/pinned_host_vector.hpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp b/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp index 9e2b85ea129..eee974c8399 100644 --- a/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp +++ b/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp @@ -169,7 +169,12 @@ class pinned_allocator { * It is the responsibility of the caller to destroy * the objects stored at \p p. */ - __host__ inline void deallocate(pointer p, size_type /*cnt*/) { CUDF_CUDA_TRY(cudaFreeHost(p)); } + __host__ inline void deallocate(pointer p, size_type /*cnt*/) + { + auto dealloc_worked = cudaFreeHost(p); + (void)dealloc_worked; + assert(dealloc_worked == cudaSuccess); + } /** * @brief This method returns the maximum size of the \c cnt parameter From fc3694730334971c6c7bd916bf36b71302cfcd42 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Fri, 6 Oct 2023 14:03:32 -0400 Subject: [PATCH 010/118] Fixing parquet list of struct interpretation (#13715) This change alters how we interpret non-annotated data in a parquet file. Most modern parquet writers would produce something like: ``` message spark_schema { required int32 id; optional group phoneNumbers (LIST) { repeated group phone { required int64 number; optional binary kind (STRING); } } } ``` But the list annotation isn't required. If it didn't exist, we would incorrectly interpret this schema as a struct of struct and not a list of struct. This change alters the code to look at the child and see if it is repeated. If it is, this indicates a list. closes #13664 Authors: - Mike Wilson (https://github.com/hyperbolic2346) - Vukasin Milovanovic (https://github.com/vuule) - Mark Harris (https://github.com/harrism) Approvers: - Mark Harris (https://github.com/harrism) - Nghia Truong (https://github.com/ttnghia) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/13715 --- cpp/src/io/parquet/page_decode.cuh | 2 +- cpp/src/io/parquet/parquet.hpp | 2 +- cpp/src/io/parquet/reader_impl_helpers.cpp | 86 ++++++++++++++++++++-- cpp/src/io/parquet/reader_impl_helpers.hpp | 1 + cpp/tests/io/parquet_test.cpp | 78 ++++++++++++++++++++ 5 files changed, 162 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index cdc29197eb3..d70cabdd35f 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -753,7 +753,7 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value // for nested schemas, it's more complicated. This warp will visit 32 incoming values, // however not all of them will necessarily represent a value at this nesting level. so // the validity bit for thread t might actually represent output value t-6. the correct - // position for thread t's bit is cur_value_count. for cuda 11 we could use + // position for thread t's bit is thread_value_count. for cuda 11 we could use // __reduce_or_sync(), but until then we have to do a warp reduce. 
WarpReduceOr32(is_valid << thread_value_count); diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index c2affc774c2..1df49262e87 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -206,7 +206,7 @@ struct SchemaElement { { return type == UNDEFINED_TYPE && // this assumption might be a little weak. - ((repetition_type != REPEATED) || (repetition_type == REPEATED && num_children == 2)); + ((repetition_type != REPEATED) || (repetition_type == REPEATED && num_children > 1)); } }; diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index fcaa610fbb7..9778cfc47d2 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -175,6 +175,81 @@ type_id to_type_id(SchemaElement const& schema, return type_id::EMPTY; } +void metadata::sanitize_schema() +{ + // Parquet isn't very strict about incoming metadata. Lots of things can and should be inferred. + // There are also a lot of rules that simply aren't followed and are expected to be worked around. + // This step sanitizes the metadata to something that isn't ambiguous. + // + // Take, for example, the following schema: + // + // required group field_id=-1 user { + // required int32 field_id=-1 id; + // optional group field_id=-1 phoneNumbers { + // repeated group field_id=-1 phone { + // required int64 field_id=-1 number; + // optional binary field_id=-1 kind (String); + // } + // } + // } + // + // This real-world example has no annotations telling us what is a list or a struct. On the + // surface this looks like a column of id's and a column of list>, but this + // actually should be interpreted as a struct>>. The phoneNumbers field + // has to be a struct because it is a group with no repeated tag and we have no annotation. The + // repeated group is actually BOTH a struct due to the multiple children and a list due to + // repeated. + // + // This code attempts to make this less messy for the code that follows. 
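+  // Concretely, the repeated group "phone" in the example above is rewritten
+  // below as if the writer had produced:
+  //
+  //   optional group phone (LIST) {
+  //     required group struct_node {
+  //       required int64 number;
+  //       optional binary kind (String);
+  //     }
+  //   }
+  //
+  // i.e. an optional LIST whose single injected "struct_node" child inherits
+  // phone's original children, with repetition/definition levels shifted to
+  // match.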
+ + std::function process = [&](size_t schema_idx) -> void { + if (schema_idx < 0) { return; } + auto& schema_elem = schema[schema_idx]; + if (schema_idx != 0 && schema_elem.type == UNDEFINED_TYPE) { + auto const parent_type = schema[schema_elem.parent_idx].converted_type; + if (schema_elem.repetition_type == REPEATED && schema_elem.num_children > 1 && + parent_type != LIST && parent_type != MAP) { + // This is a list of structs, so we need to mark this as a list, but also + // add a struct child and move this element's children to the struct + schema_elem.converted_type = LIST; + schema_elem.repetition_type = OPTIONAL; + auto const struct_node_idx = schema.size(); + + SchemaElement struct_elem; + struct_elem.name = "struct_node"; + struct_elem.repetition_type = REQUIRED; + struct_elem.num_children = schema_elem.num_children; + struct_elem.type = UNDEFINED_TYPE; + struct_elem.converted_type = UNKNOWN; + + // swap children + struct_elem.children_idx = std::move(schema_elem.children_idx); + schema_elem.children_idx = {struct_node_idx}; + schema_elem.num_children = 1; + + struct_elem.max_definition_level = schema_elem.max_definition_level; + struct_elem.max_repetition_level = schema_elem.max_repetition_level; + schema_elem.max_definition_level--; + schema_elem.max_repetition_level = schema[schema_elem.parent_idx].max_repetition_level; + + // change parent index on new node and on children + struct_elem.parent_idx = schema_idx; + for (auto& child_idx : struct_elem.children_idx) { + schema[child_idx].parent_idx = struct_node_idx; + } + // add our struct + schema.push_back(struct_elem); + } + } + + for (auto& child_idx : schema_elem.children_idx) { + process(child_idx); + } + }; + + process(0); +} + metadata::metadata(datasource* source) { constexpr auto header_len = sizeof(file_header_s); @@ -195,6 +270,7 @@ metadata::metadata(datasource* source) CompactProtocolReader cp(buffer->data(), ender->footer_len); CUDF_EXPECTS(cp.read(this), "Cannot parse metadata"); CUDF_EXPECTS(cp.InitSchema(this), "Cannot initialize schema"); + sanitize_schema(); } std::vector aggregate_reader_metadata::metadatas_from_sources( @@ -445,8 +521,10 @@ aggregate_reader_metadata::select_columns(std::optional child_col_name_info, schema_elem.children_idx[0], out_col_array, has_list_parent); } + auto const one_level_list = schema_elem.is_one_level_list(get_schema(schema_elem.parent_idx)); + // if we're at the root, this is a new output column - auto const col_type = schema_elem.is_one_level_list(get_schema(schema_elem.parent_idx)) + auto const col_type = one_level_list ? type_id::LIST : to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); auto const dtype = to_data_type(col_type, schema_elem); @@ -485,7 +563,7 @@ aggregate_reader_metadata::select_columns(std::optional input_column_info{schema_idx, schema_elem.name, schema_elem.max_repetition_level > 0}); // set up child output column for one-level encoding list - if (schema_elem.is_one_level_list(get_schema(schema_elem.parent_idx))) { + if (one_level_list) { // determine the element data type auto const element_type = to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); @@ -506,9 +584,7 @@ aggregate_reader_metadata::select_columns(std::optional std::copy(nesting.cbegin(), nesting.cend(), std::back_inserter(input_col.nesting)); // pop off the extra nesting element. 
- if (schema_elem.is_one_level_list(get_schema(schema_elem.parent_idx))) { - nesting.pop_back(); - } + if (one_level_list) { nesting.pop_back(); } path_is_valid = true; // If we're able to reach leaf then path is valid } diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 61e4f94df0f..9ee17f26a10 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -58,6 +58,7 @@ using namespace cudf::io::parquet; */ struct metadata : public FileMetaData { explicit metadata(datasource* source); + void sanitize_schema(); }; class aggregate_reader_metadata { diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 81e0e12eeb9..73c946a5feb 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -6732,4 +6732,82 @@ TEST_P(ParquetV2Test, CheckEncodings) } } +TEST_F(ParquetReaderTest, RepeatedNoAnnotations) +{ + constexpr unsigned char repeated_bytes[] = { + 0x50, 0x41, 0x52, 0x31, 0x15, 0x04, 0x15, 0x30, 0x15, 0x30, 0x4c, 0x15, 0x0c, 0x15, 0x00, 0x12, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x15, 0x00, 0x15, 0x0a, 0x15, 0x0a, + 0x2c, 0x15, 0x0c, 0x15, 0x10, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x03, 0x03, 0x88, 0xc6, 0x02, + 0x26, 0x80, 0x01, 0x1c, 0x15, 0x02, 0x19, 0x25, 0x00, 0x10, 0x19, 0x18, 0x02, 0x69, 0x64, 0x15, + 0x00, 0x16, 0x0c, 0x16, 0x78, 0x16, 0x78, 0x26, 0x54, 0x26, 0x08, 0x00, 0x00, 0x15, 0x04, 0x15, + 0x40, 0x15, 0x40, 0x4c, 0x15, 0x08, 0x15, 0x00, 0x12, 0x00, 0x00, 0xe3, 0x0c, 0x23, 0x4b, 0x01, + 0x00, 0x00, 0x00, 0xc7, 0x35, 0x3a, 0x42, 0x00, 0x00, 0x00, 0x00, 0x8e, 0x6b, 0x74, 0x84, 0x00, + 0x00, 0x00, 0x00, 0x55, 0xa1, 0xae, 0xc6, 0x00, 0x00, 0x00, 0x00, 0x15, 0x00, 0x15, 0x22, 0x15, + 0x22, 0x2c, 0x15, 0x10, 0x15, 0x10, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x03, 0xc0, 0x03, 0x00, 0x00, 0x00, 0x03, 0x90, 0xaa, 0x02, 0x03, 0x94, 0x03, 0x26, 0xda, 0x02, + 0x1c, 0x15, 0x04, 0x19, 0x25, 0x00, 0x10, 0x19, 0x38, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x4e, + 0x75, 0x6d, 0x62, 0x65, 0x72, 0x73, 0x05, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x06, 0x6e, 0x75, 0x6d, + 0x62, 0x65, 0x72, 0x15, 0x00, 0x16, 0x10, 0x16, 0xa0, 0x01, 0x16, 0xa0, 0x01, 0x26, 0x96, 0x02, + 0x26, 0xba, 0x01, 0x00, 0x00, 0x15, 0x04, 0x15, 0x24, 0x15, 0x24, 0x4c, 0x15, 0x04, 0x15, 0x00, + 0x12, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x68, 0x6f, 0x6d, 0x65, 0x06, 0x00, 0x00, 0x00, 0x6d, + 0x6f, 0x62, 0x69, 0x6c, 0x65, 0x15, 0x00, 0x15, 0x20, 0x15, 0x20, 0x2c, 0x15, 0x10, 0x15, 0x10, + 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0xc0, 0x03, 0x00, 0x00, 0x00, + 0x03, 0x90, 0xef, 0x01, 0x03, 0x04, 0x26, 0xcc, 0x04, 0x1c, 0x15, 0x0c, 0x19, 0x25, 0x00, 0x10, + 0x19, 0x38, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x4e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x73, 0x05, + 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x04, 0x6b, 0x69, 0x6e, 0x64, 0x15, 0x00, 0x16, 0x10, 0x16, 0x82, + 0x01, 0x16, 0x82, 0x01, 0x26, 0x8a, 0x04, 0x26, 0xca, 0x03, 0x00, 0x00, 0x15, 0x02, 0x19, 0x6c, + 0x48, 0x04, 0x75, 0x73, 0x65, 0x72, 0x15, 0x04, 0x00, 0x15, 0x02, 0x25, 0x00, 0x18, 0x02, 0x69, + 0x64, 0x00, 0x35, 0x02, 0x18, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x4e, 0x75, 0x6d, 0x62, 0x65, + 0x72, 0x73, 0x15, 0x02, 0x00, 0x35, 0x04, 0x18, 0x05, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x15, 0x04, + 0x00, 0x15, 0x04, 0x25, 0x00, 0x18, 0x06, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x00, 0x15, 0x0c, 
+ 0x25, 0x02, 0x18, 0x04, 0x6b, 0x69, 0x6e, 0x64, 0x25, 0x00, 0x00, 0x16, 0x00, 0x19, 0x1c, 0x19, + 0x3c, 0x26, 0x80, 0x01, 0x1c, 0x15, 0x02, 0x19, 0x25, 0x00, 0x10, 0x19, 0x18, 0x02, 0x69, 0x64, + 0x15, 0x00, 0x16, 0x0c, 0x16, 0x78, 0x16, 0x78, 0x26, 0x54, 0x26, 0x08, 0x00, 0x00, 0x26, 0xda, + 0x02, 0x1c, 0x15, 0x04, 0x19, 0x25, 0x00, 0x10, 0x19, 0x38, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, + 0x4e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x73, 0x05, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x06, 0x6e, 0x75, + 0x6d, 0x62, 0x65, 0x72, 0x15, 0x00, 0x16, 0x10, 0x16, 0xa0, 0x01, 0x16, 0xa0, 0x01, 0x26, 0x96, + 0x02, 0x26, 0xba, 0x01, 0x00, 0x00, 0x26, 0xcc, 0x04, 0x1c, 0x15, 0x0c, 0x19, 0x25, 0x00, 0x10, + 0x19, 0x38, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x4e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x73, 0x05, + 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x04, 0x6b, 0x69, 0x6e, 0x64, 0x15, 0x00, 0x16, 0x10, 0x16, 0x82, + 0x01, 0x16, 0x82, 0x01, 0x26, 0x8a, 0x04, 0x26, 0xca, 0x03, 0x00, 0x00, 0x16, 0x9a, 0x03, 0x16, + 0x0c, 0x00, 0x28, 0x49, 0x70, 0x61, 0x72, 0x71, 0x75, 0x65, 0x74, 0x2d, 0x72, 0x73, 0x20, 0x76, + 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x20, 0x30, 0x2e, 0x33, 0x2e, 0x30, 0x20, 0x28, 0x62, 0x75, + 0x69, 0x6c, 0x64, 0x20, 0x62, 0x34, 0x35, 0x63, 0x65, 0x37, 0x63, 0x62, 0x61, 0x32, 0x31, 0x39, + 0x39, 0x66, 0x32, 0x32, 0x64, 0x39, 0x33, 0x32, 0x36, 0x39, 0x63, 0x31, 0x35, 0x30, 0x64, 0x38, + 0x61, 0x38, 0x33, 0x39, 0x31, 0x36, 0x63, 0x36, 0x39, 0x62, 0x35, 0x65, 0x29, 0x00, 0x32, 0x01, + 0x00, 0x00, 0x50, 0x41, 0x52, 0x31}; + + auto read_opts = cudf::io::parquet_reader_options::builder( + cudf::io::source_info{reinterpret_cast(repeated_bytes), sizeof(repeated_bytes)}); + auto result = cudf::io::read_parquet(read_opts); + + EXPECT_EQ(result.tbl->view().column(0).size(), 6); + EXPECT_EQ(result.tbl->view().num_columns(), 2); + + column_wrapper col0{1, 2, 3, 4, 5, 6}; + column_wrapper child0{{5555555555l, 1111111111l, 1111111111l, 2222222222l, 3333333333l}}; + cudf::test::strings_column_wrapper child1{{"-", "home", "home", "-", "mobile"}, {0, 1, 1, 0, 1}}; + auto struct_col = cudf::test::structs_column_wrapper{{child0, child1}}; + + auto list_offsets_column = + cudf::test::fixed_width_column_wrapper{0, 0, 0, 0, 1, 2, 5}.release(); + auto num_list_rows = list_offsets_column->size() - 1; + + auto mask = cudf::create_null_mask(6, cudf::mask_state::ALL_VALID); + cudf::set_null_mask(static_cast(mask.data()), 0, 2, false); + + auto list_col = cudf::make_lists_column( + num_list_rows, std::move(list_offsets_column), struct_col.release(), 2, std::move(mask)); + + std::vector> struct_children; + struct_children.push_back(std::move(list_col)); + + auto outer_struct = + cudf::test::structs_column_wrapper{{std::move(struct_children)}, {0, 0, 1, 1, 1, 1}}; + table_view expected{{col0, outer_struct}}; + + CUDF_TEST_EXPECT_TABLES_EQUAL(result.tbl->view(), expected); +} + CUDF_TEST_PROGRAM_MAIN() From 96664ec7436033f59aa5b9740e6f54aec707e3cf Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 6 Oct 2023 15:09:11 -0700 Subject: [PATCH 011/118] Add pylibcudf.Scalar that interoperates with Arrow scalars (#14133) This PR adds a new Scalar object to pylibcudf that will function as the pylibcudf equivalent of cudf::scalar. Unlike columns, which are typically operated on in the form of views rather than owning types by libcudf, owning scalars are accepted by (const) ref by libcudf APIs and no corresponding view type exists. 
Therefore, pylibcudf.Scalar differs from pylibcudf.Column by actually owning an instance of the underlying libcudf type (cudf::scalar). Construction of pylibcudf Scalars is expected to be done from an Arrow scalar. This PR relies on #14124 and should not be merged until after that one. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/14133 --- python/cudf/cudf/_lib/CMakeLists.txt | 8 +- python/cudf/cudf/_lib/datetime.pyx | 6 +- python/cudf/cudf/_lib/interop.pyx | 95 +------------ python/cudf/cudf/_lib/nvtext/CMakeLists.txt | 8 ++ .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 25 +++- python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 5 +- python/cudf/cudf/_lib/pylibcudf/__init__.py | 5 +- python/cudf/cudf/_lib/pylibcudf/interop.pxd | 9 ++ python/cudf/cudf/_lib/pylibcudf/interop.pyx | 23 +++ python/cudf/cudf/_lib/pylibcudf/scalar.pxd | 32 +++++ python/cudf/cudf/_lib/pylibcudf/scalar.pyx | 133 ++++++++++++++++++ python/cudf/cudf/_lib/pylibcudf/table.pxd | 3 + python/cudf/cudf/_lib/pylibcudf/table.pyx | 33 ++++- python/cudf/cudf/_lib/scalar.pxd | 13 +- python/cudf/cudf/_lib/scalar.pyx | 88 ++++++++---- python/cudf/cudf/_lib/strings/CMakeLists.txt | 10 +- .../cudf/_lib/strings/convert/CMakeLists.txt | 10 +- .../cudf/_lib/strings/split/CMakeLists.txt | 10 +- 18 files changed, 378 insertions(+), 138 deletions(-) create mode 100644 python/cudf/cudf/_lib/pylibcudf/interop.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/interop.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/scalar.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/scalar.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 947659c290a..1b543b94589 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -107,8 +107,12 @@ if(${PYARROW_RESULT}) message(FATAL_ERROR "Error while trying to obtain pyarrow include directory:\n${PYARROW_ERROR}") endif() -set(targets_using_arrow_headers interop avro csv orc json parquet) -foreach(target IN LISTS targets_using_arrow_headers) +# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts +# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the +# requirement for arrow headers infects all of cudf. That in turn requires including numpy headers. +# These requirements will go away once all scalar-related Cython code is removed from cudf. +foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") endforeach() diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index 81949dbaa20..3d96f59c4d6 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
from cudf.core.buffer import acquire_spill_lock @@ -10,6 +10,7 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.filling cimport calendrical_month_sequence +from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.types cimport size_type from cudf._lib.scalar cimport DeviceScalar @@ -166,10 +167,11 @@ def date_range(DeviceScalar start, size_type n, offset): + offset.kwds.get("months", 0) ) + cdef const scalar* c_start = start.c_value.get() with nogil: c_result = move(calendrical_month_sequence( n, - start.c_value.get()[0], + c_start[0], months )) return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx index 639754fc54f..8fd2a409d90 100644 --- a/python/cudf/cudf/_lib/interop.pyx +++ b/python/cudf/cudf/_lib/interop.pyx @@ -4,14 +4,7 @@ from cpython cimport pycapsule from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector -from pyarrow.lib cimport ( - CScalar, - CTable, - pyarrow_unwrap_scalar, - pyarrow_unwrap_table, - pyarrow_wrap_scalar, - pyarrow_wrap_table, -) +from pyarrow.lib cimport CTable, pyarrow_unwrap_table, pyarrow_wrap_table from cudf._lib.cpp.interop cimport ( DLManagedTensor, @@ -21,22 +14,12 @@ from cudf._lib.cpp.interop cimport ( to_arrow as cpp_to_arrow, to_dlpack as cpp_to_dlpack, ) -from cudf._lib.cpp.scalar.scalar cimport fixed_point_scalar, scalar from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.cpp.types cimport type_id -from cudf._lib.cpp.wrappers.decimals cimport ( - decimal32, - decimal64, - decimal128, - scale_type, -) -from cudf._lib.scalar cimport DeviceScalar from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns from cudf.api.types import is_list_dtype, is_struct_dtype from cudf.core.buffer import acquire_spill_lock -from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype def from_dlpack(dlpack_capsule): @@ -199,79 +182,3 @@ def from_arrow(object input_table): c_result = move(cpp_from_arrow(cpp_arrow_table.get()[0])) return columns_from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def to_arrow_scalar(DeviceScalar source_scalar): - """Convert a scalar to a PyArrow scalar. - - Parameters - ---------- - source_scalar : the scalar to convert - - Returns - ------- - pyarrow.lib.Scalar - """ - cdef vector[column_metadata] cpp_metadata = gather_metadata( - [("", source_scalar.dtype)] - ) - cdef const scalar* source_scalar_ptr = source_scalar.get_raw_ptr() - - cdef shared_ptr[CScalar] cpp_arrow_scalar - with nogil: - cpp_arrow_scalar = cpp_to_arrow( - source_scalar_ptr[0], cpp_metadata[0] - ) - - return pyarrow_wrap_scalar(cpp_arrow_scalar) - - -@acquire_spill_lock() -def from_arrow_scalar(object input_scalar, output_dtype=None): - """Convert from PyArrow scalar to a cudf scalar. 
- - Parameters - ---------- - input_scalar : PyArrow scalar - output_dtype : output type to cast to, ignored except for decimals - - Returns - ------- - cudf._lib.DeviceScalar - """ - cdef shared_ptr[CScalar] cpp_arrow_scalar = ( - pyarrow_unwrap_scalar(input_scalar) - ) - cdef unique_ptr[scalar] c_result - - with nogil: - c_result = move(cpp_from_arrow(cpp_arrow_scalar.get()[0])) - - cdef type_id ctype = c_result.get().type().id() - if ctype == type_id.DECIMAL128: - if output_dtype is None: - # Decimals must be cast to the cudf dtype of the right width - raise ValueError( - "Decimal scalars must be constructed with a dtype" - ) - - if isinstance(output_dtype, Decimal32Dtype): - c_result.reset( - new fixed_point_scalar[decimal32]( - ( c_result.get()).value(), - scale_type(-input_scalar.type.scale), - c_result.get().is_valid() - ) - ) - elif isinstance(output_dtype, Decimal64Dtype): - c_result.reset( - new fixed_point_scalar[decimal64]( - ( c_result.get()).value(), - scale_type(-input_scalar.type.scale), - c_result.get().is_valid() - ) - ) - # Decimal128Dtype is a no-op, no conversion needed. - - return DeviceScalar.from_unique_ptr(move(c_result), output_dtype) diff --git a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt index 515b9c1d6e4..d4e2392ee04 100644 --- a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt +++ b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt @@ -22,3 +22,11 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX nvtext_ ASSOCIATED_TARGETS cudf ) +# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts +# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the +# requirement for arrow headers infects all of cudf. That in turn requires including numpy headers. +# These requirements will go away once all scalar-related Cython code is removed from cudf. +foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") + target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") +endforeach() diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 0ce42dc43ff..5185b2d4bb5 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -12,10 +12,33 @@ # the License. # ============================================================================= -set(cython_sources column.pyx copying.pyx gpumemoryview.pyx table.pyx types.pyx utils.pyx) +set(cython_sources column.pyx copying.pyx gpumemoryview.pyx interop.pyx scalar.pyx table.pyx + types.pyx utils.pyx +) set(linked_libraries cudf::cudf) rapids_cython_create_modules( CXX SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf ) + +find_package(Python 3.9 REQUIRED COMPONENTS Interpreter) + +execute_process( + COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_include())" + OUTPUT_VARIABLE PYARROW_INCLUDE_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") +endforeach() + +# TODO: Clean up this include when switching to scikit-build-core. 
See cudf/_lib/CMakeLists.txt for +# more info +find_package(NumPy REQUIRED) +foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") + # Switch to the line below when we switch back to FindPython.cmake in CMake 3.24. + # target_include_directories(${target} PRIVATE "${Python_NumPy_INCLUDE_DIRS}") +endforeach() diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index ba7822b0a54..7a35854392c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -1,9 +1,10 @@ # Copyright (c) 2023, NVIDIA CORPORATION. # TODO: Verify consistent usage of relative/absolute imports in pylibcudf. -from . cimport copying +from . cimport copying, interop from .column cimport Column from .gpumemoryview cimport gpumemoryview +from .scalar cimport Scalar from .table cimport Table # TODO: cimport type_id once # https://github.com/cython/cython/issues/5609 is resolved @@ -12,7 +13,9 @@ from .types cimport DataType __all__ = [ "Column", "DataType", + "Scalar", "Table", "copying", "gpumemoryview", + "interop", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 3edff9a53e8..72b74a57b87 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -1,16 +1,19 @@ # Copyright (c) 2023, NVIDIA CORPORATION. -from . import copying +from . import copying, interop from .column import Column from .gpumemoryview import gpumemoryview +from .scalar import Scalar from .table import Table from .types import DataType, TypeId __all__ = [ "Column", "DataType", + "Scalar", "Table", "TypeId", "copying", "gpumemoryview", + "interop", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pxd b/python/cudf/cudf/_lib/pylibcudf/interop.pxd new file mode 100644 index 00000000000..3a79e5425d4 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pxd @@ -0,0 +1,9 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from cudf._lib.cpp.interop cimport column_metadata + + +cdef class ColumnMetadata: + cdef public object name + cdef public object children_meta + cdef column_metadata to_libcudf(self) diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx new file mode 100644 index 00000000000..0cdca275027 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx @@ -0,0 +1,23 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from cudf._lib.cpp.interop cimport column_metadata + + +cdef class ColumnMetadata: + def __init__(self, name): + self.name = name + self.children_meta = [] + + cdef column_metadata to_libcudf(self): + """Convert to C++ column_metadata. + + Since this class is mutable and cheap, it is easier to create the C++ + object on the fly rather than have it directly backing the storage for + the Cython class. + """ + cdef column_metadata c_metadata + cdef ColumnMetadata child_meta + c_metadata.name = self.name.encode() + for child_meta in self.children_meta: + c_metadata.children_meta.push_back(child_meta.to_libcudf()) + return c_metadata diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd new file mode 100644 index 00000000000..09d853d832f --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd @@ -0,0 +1,32 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. 
+ +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from pyarrow cimport lib as pa + +from rmm._lib.memory_resource cimport DeviceMemoryResource + +from cudf._lib.cpp.scalar.scalar cimport scalar + +from .interop cimport ColumnMetadata +from .types cimport DataType + + +cdef class Scalar: + cdef unique_ptr[scalar] c_obj + cdef DataType _data_type + + # Holds a reference to the DeviceMemoryResource used for allocation. + # Ensures the MR does not get destroyed before this DeviceBuffer. `mr` is + # needed for deallocation + cdef DeviceMemoryResource mr + + cdef const scalar* get(self) except * + + cpdef DataType type(self) + cpdef bool is_valid(self) + + @staticmethod + cdef Scalar from_libcudf(unique_ptr[scalar] libcudf_scalar, dtype=*) + + cpdef pa.Scalar to_arrow(self, ColumnMetadata metadata) diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx new file mode 100644 index 00000000000..04f588bd3e6 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx @@ -0,0 +1,133 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from cython cimport no_gc_clear +from cython.operator cimport dereference +from libcpp.memory cimport shared_ptr, unique_ptr +from libcpp.utility cimport move +from pyarrow cimport lib as pa + +from rmm._lib.memory_resource cimport get_current_device_resource + +from cudf._lib.cpp.interop cimport ( + column_metadata, + from_arrow as cpp_from_arrow, + to_arrow as cpp_to_arrow, +) +from cudf._lib.cpp.scalar.scalar cimport fixed_point_scalar, scalar +from cudf._lib.cpp.wrappers.decimals cimport ( + decimal32, + decimal64, + decimal128, + scale_type, +) + +from .interop cimport ColumnMetadata +from .types cimport DataType, type_id + + +# The DeviceMemoryResource attribute could be released prematurely +# by the gc if the Scalar is in a reference cycle. Removing the tp_clear +# function with the no_gc_clear decoration prevents that. See +# https://github.com/rapidsai/rmm/pull/931 for details. +@no_gc_clear +cdef class Scalar: + """A scalar value in device memory.""" + # Unlike for columns, libcudf does not support scalar views. All APIs that + # accept scalar values accept references to the owning object rather than a + # special view type. As a result, pylibcudf.Scalar has a simpler structure + # than pylibcudf.Column because it can be a true wrapper around a libcudf + # column + + def __cinit__(self, *args, **kwargs): + self.mr = get_current_device_resource() + + def __init__(self, pa.Scalar value=None): + # TODO: This case is not something we really want to + # support, but it here for now to ease the transition of + # DeviceScalar. 
+ if value is not None: + raise ValueError("Scalar should be constructed with a factory") + + @staticmethod + def from_arrow(pa.Scalar value, DataType data_type=None): + # Allow passing a dtype, but only for the purpose of decimals for now + + cdef shared_ptr[pa.CScalar] cscalar = ( + pa.pyarrow_unwrap_scalar(value) + ) + cdef unique_ptr[scalar] c_result + + with nogil: + c_result = move(cpp_from_arrow(cscalar.get()[0])) + + cdef Scalar s = Scalar.from_libcudf(move(c_result)) + + if s.type().id() != type_id.DECIMAL128: + if data_type is not None: + raise ValueError( + "dtype may not be passed for non-decimal types" + ) + return s + + if data_type is None: + raise ValueError( + "Decimal scalars must be constructed with a dtype" + ) + + cdef type_id tid = data_type.id() + + if tid == type_id.DECIMAL32: + s.c_obj.reset( + new fixed_point_scalar[decimal32]( + ( s.c_obj.get()).value(), + scale_type(-value.type.scale), + s.c_obj.get().is_valid() + ) + ) + elif tid == type_id.DECIMAL64: + s.c_obj.reset( + new fixed_point_scalar[decimal64]( + ( s.c_obj.get()).value(), + scale_type(-value.type.scale), + s.c_obj.get().is_valid() + ) + ) + elif tid != type_id.DECIMAL128: + raise ValueError( + "Decimal scalars may only be cast to decimals" + ) + + return s + + cpdef pa.Scalar to_arrow(self, ColumnMetadata metadata): + cdef shared_ptr[pa.CScalar] c_result + cdef column_metadata c_metadata = metadata.to_libcudf() + + with nogil: + c_result = move(cpp_to_arrow(dereference(self.c_obj.get()), c_metadata)) + + return pa.pyarrow_wrap_scalar(c_result) + + cdef const scalar* get(self) except *: + return self.c_obj.get() + + cpdef DataType type(self): + """The type of data in the column.""" + return self._data_type + + cpdef bool is_valid(self): + """True if the scalar is valid, false if not""" + return self.get().is_valid() + + @staticmethod + cdef Scalar from_libcudf(unique_ptr[scalar] libcudf_scalar, dtype=None): + """Construct a Scalar object from a libcudf scalar. + + This method is for pylibcudf's functions to use to ingest outputs of + calling libcudf algorithms, and should generally not be needed by users + (even direct pylibcudf Cython users). + """ + cdef Scalar s = Scalar.__new__(Scalar) + s.c_obj.swap(libcudf_scalar) + s._data_type = DataType.from_libcudf(s.get().type()) + return s diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pxd b/python/cudf/cudf/_lib/pylibcudf/table.pxd index 95f197b13eb..a9e2874232a 100644 --- a/python/cudf/cudf/_lib/pylibcudf/table.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/table.pxd @@ -1,6 +1,7 @@ # Copyright (c) 2023, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr +from pyarrow cimport lib as pa from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view @@ -16,3 +17,5 @@ cdef class Table: cdef Table from_libcudf(unique_ptr[table] libcudf_tbl) cpdef list columns(self) + + cpdef pa.Table to_arrow(self, list metadata) diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pyx b/python/cudf/cudf/_lib/pylibcudf/table.pyx index 720f9815bd6..c41eb82e4a1 100644 --- a/python/cudf/cudf/_lib/pylibcudf/table.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/table.pyx @@ -1,15 +1,22 @@ # Copyright (c) 2023, NVIDIA CORPORATION. 
from cython.operator cimport dereference -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector +from pyarrow cimport lib as pa from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.interop cimport ( + column_metadata, + from_arrow as cpp_from_arrow, + to_arrow as cpp_to_arrow, +) from cudf._lib.cpp.table.table cimport table from .column cimport Column +from .interop cimport ColumnMetadata cdef class Table: @@ -60,3 +67,27 @@ cdef class Table: cpdef list columns(self): return self._columns + + @staticmethod + def from_arrow(pa.Table pyarrow_table): + cdef shared_ptr[pa.CTable] ctable = ( + pa.pyarrow_unwrap_table(pyarrow_table) + ) + cdef unique_ptr[table] c_result + + with nogil: + c_result = move(cpp_from_arrow(ctable.get()[0])) + + return Table.from_libcudf(move(c_result)) + + cpdef pa.Table to_arrow(self, list metadata): + cdef shared_ptr[pa.CTable] c_result + cdef vector[column_metadata] c_metadata + cdef ColumnMetadata meta + for meta in metadata: + c_metadata.push_back(meta.to_libcudf()) + + with nogil: + c_result = move(cpp_to_arrow(self.view(), c_metadata)) + + return pa.pyarrow_wrap_table(c_result) diff --git a/python/cudf/cudf/_lib/scalar.pxd b/python/cudf/cudf/_lib/scalar.pxd index 1deed60d67d..77733f59c3d 100644 --- a/python/cudf/cudf/_lib/scalar.pxd +++ b/python/cudf/cudf/_lib/scalar.pxd @@ -1,20 +1,19 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr from rmm._lib.memory_resource cimport DeviceMemoryResource +# TODO: Would like to remove this cimport, but it will require some more work +# to excise all C code in scalar.pyx that relies on using the C API of the +# pylibcudf Scalar underlying the DeviceScalar. +from cudf._lib cimport pylibcudf from cudf._lib.cpp.scalar.scalar cimport scalar cdef class DeviceScalar: - cdef unique_ptr[scalar] c_value - - # Holds a reference to the DeviceMemoryResource used for allocation. - # Ensures the MR does not get destroyed before this DeviceBuffer. `mr` is - # needed for deallocation - cdef DeviceMemoryResource mr + cdef pylibcudf.Scalar c_value cdef object _dtype diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 5ab286c5701..0b64c75f7b6 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -1,7 +1,5 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. 
-cimport cython - import copy import numpy as np @@ -13,17 +11,17 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -from rmm._lib.memory_resource cimport get_current_device_resource - import cudf +from cudf._lib import pylibcudf from cudf._lib.types import LIBCUDF_TO_SUPPORTED_NUMPY_TYPES -from cudf.core.dtypes import ListDtype, StructDtype +from cudf.core.dtypes import ( + ListDtype, + StructDtype, + is_list_dtype, + is_struct_dtype, +) from cudf.core.missing import NA, NaT -from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id - -from cudf._lib.interop import from_arrow_scalar, to_arrow_scalar - cimport cudf._lib.cpp.types as libcudf_types from cudf._lib.cpp.scalar.scalar cimport ( duration_scalar, @@ -44,6 +42,7 @@ from cudf._lib.cpp.wrappers.timestamps cimport ( timestamp_s, timestamp_us, ) +from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id def _replace_nested(obj, check, replacement): @@ -61,15 +60,44 @@ def _replace_nested(obj, check, replacement): _replace_nested(v, check, replacement) -# The DeviceMemoryResource attribute could be released prematurely -# by the gc if the DeviceScalar is in a reference cycle. Removing -# the tp_clear function with the no_gc_clear decoration prevents that. -# See https://github.com/rapidsai/rmm/pull/931 for details. -@cython.no_gc_clear +def gather_metadata(dtypes): + """Convert a dict of dtypes to a list of ColumnMetadata objects. + + The metadata is constructed recursively so that nested types are + represented as nested ColumnMetadata objects. + + Parameters + ---------- + dtypes : dict + A dict mapping column names to dtypes. + + Returns + ------- + List[ColumnMetadata] + A list of ColumnMetadata objects. + """ + out = [] + for name, dtype in dtypes.items(): + v = pylibcudf.interop.ColumnMetadata(name) + if is_struct_dtype(dtype): + v.children_meta = gather_metadata(dtype.fields) + elif is_list_dtype(dtype): + # Offsets column is unnamed and has no children + v.children_meta.append(pylibcudf.interop.ColumnMetadata("")) + v.children_meta.extend( + gather_metadata({"": dtype.element_type}) + ) + out.append(v) + return out + + cdef class DeviceScalar: + # TODO: I think this should be removable, except that currently the way + # that from_unique_ptr is implemented is probably dereferencing this in an + # invalid state. See what the best way to fix that is. def __cinit__(self, *args, **kwargs): - self.mr = get_current_device_resource() + self.c_value = pylibcudf.Scalar() def __init__(self, value, dtype): """ @@ -85,7 +113,7 @@ cdef class DeviceScalar: dtype : dtype A NumPy dtype. """ - self._dtype = dtype if dtype.kind != 'U' else cudf.dtype('object') + dtype = dtype if dtype.kind != 'U' else cudf.dtype('object') if cudf.utils.utils.is_na_like(value): value = None @@ -108,10 +136,17 @@ cdef class DeviceScalar: pa_scalar = pa.scalar(value, type=pa_type) - # Note: This factory-like behavior in __init__ will be removed when - # migrating to pylibcudf. 
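To make the recursion in `gather_metadata` above concrete, a small sketch of what it yields for a list column; the import path is the module this hunk patches, and readable `name`/`children_meta` attributes on `ColumnMetadata` are assumed from how the code above assigns them.

```python
# Illustrative only: mirrors the recursion implemented above.
import cudf
from cudf._lib.scalar import gather_metadata

[meta] = gather_metadata({"points": cudf.ListDtype("int64")})
# meta.name          -> "points"
# meta.children_meta -> [ColumnMetadata(""),  # unnamed offsets column
#                        ColumnMetadata("")]  # list element, also unnamed
```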
- cdef DeviceScalar obj = from_arrow_scalar(pa_scalar, self._dtype) - self.c_value.swap(obj.c_value) + data_type = None + if isinstance(dtype, cudf.core.dtypes.DecimalDtype): + tid = pylibcudf.TypeId.DECIMAL128 + if isinstance(dtype, cudf.core.dtypes.Decimal32Dtype): + tid = pylibcudf.TypeId.DECIMAL32 + elif isinstance(dtype, cudf.core.dtypes.Decimal64Dtype): + tid = pylibcudf.TypeId.DECIMAL64 + data_type = pylibcudf.DataType(tid, -dtype.scale) + + self.c_value = pylibcudf.Scalar.from_arrow(pa_scalar, data_type) + self._dtype = dtype def _to_host_scalar(self): is_datetime = self.dtype.kind == "M" @@ -119,7 +154,8 @@ cdef class DeviceScalar: null_type = NaT if is_datetime or is_timedelta else NA - ps = to_arrow_scalar(self) + metadata = gather_metadata({"": self.dtype})[0] + ps = self.c_value.to_arrow(metadata) if not ps.is_valid: return null_type @@ -158,13 +194,13 @@ cdef class DeviceScalar: return self._to_host_scalar() cdef const scalar* get_raw_ptr(self) except *: - return self.c_value.get() + return self.c_value.c_obj.get() cpdef bool is_valid(self): """ Returns if the Scalar is valid or not(i.e., ). """ - return self.get_raw_ptr()[0].is_valid() + return self.c_value.is_valid() def __repr__(self): if cudf.utils.utils.is_na_like(self.value): @@ -183,7 +219,7 @@ cdef class DeviceScalar: cdef DeviceScalar s = DeviceScalar.__new__(DeviceScalar) cdef libcudf_types.data_type cdtype - s.c_value = move(ptr) + s.c_value = pylibcudf.Scalar.from_libcudf(move(ptr)) cdtype = s.get_raw_ptr()[0].type() if dtype is not None: @@ -310,9 +346,9 @@ def _create_proxy_nat_scalar(dtype): if dtype.char in 'mM': nat = dtype.type('NaT').astype(dtype) if dtype.type == np.datetime64: - _set_datetime64_from_np_scalar(result.c_value, nat, dtype, True) + _set_datetime64_from_np_scalar(result.c_value.c_obj, nat, dtype, True) elif dtype.type == np.timedelta64: - _set_timedelta64_from_np_scalar(result.c_value, nat, dtype, True) + _set_timedelta64_from_np_scalar(result.c_value.c_obj, nat, dtype, True) return result else: raise TypeError('NAT only valid for datetime and timedelta') diff --git a/python/cudf/cudf/_lib/strings/CMakeLists.txt b/python/cudf/cudf/_lib/strings/CMakeLists.txt index a5e87a456cb..fc11f047ab4 100644 --- a/python/cudf/cudf/_lib/strings/CMakeLists.txt +++ b/python/cudf/cudf/_lib/strings/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -40,6 +40,14 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf ) +# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts +# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the +# requirement for arrow headers infects all of cudf. That requirement will go away once all +# scalar-related Cython code is removed from cudf. 
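Stepping back from the build-system note above to the `DeviceScalar` changes earlier in this hunk: a hedged end-to-end sketch of the new path, where construction flows through `pylibcudf.Scalar.from_arrow` (with the negated decimal scale shown above) and host conversion returns through `to_arrow`, surfacing invalid scalars as `NA`/`NaT`. The public `value` property wrapping `_to_host_scalar` is assumed from the unchanged parts of this file.

```python
# Sketch assuming the patched cudf build is importable.
import numpy as np

import cudf
from cudf._lib.scalar import DeviceScalar

ds = DeviceScalar(42, np.dtype("int32"))
assert ds.is_valid() and ds.value == 42

# Nulls round-trip to cudf's sentinels (NaT for datetime/timedelta dtypes).
assert DeviceScalar(None, np.dtype("float64")).value is cudf.NA
```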
+foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") + target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") +endforeach() add_subdirectory(convert) add_subdirectory(split) diff --git a/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt b/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt index 434f79d3b5f..f55bb1fb780 100644 --- a/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt +++ b/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -22,3 +22,11 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf ) +# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts +# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the +# requirement for arrow headers infects all of cudf. That requirement will go away once all +# scalar-related Cython code is removed from cudf. +foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") + target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") +endforeach() diff --git a/python/cudf/cudf/_lib/strings/split/CMakeLists.txt b/python/cudf/cudf/_lib/strings/split/CMakeLists.txt index 59a22c06e85..2f2063482af 100644 --- a/python/cudf/cudf/_lib/strings/split/CMakeLists.txt +++ b/python/cudf/cudf/_lib/strings/split/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -20,3 +20,11 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf ) +# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts +# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the +# requirement for arrow headers infects all of cudf. That requirement will go away once all +# scalar-related Cython code is removed from cudf. +foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") + target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") +endforeach() From e28017cc17d2feb050d2effd4ebafb84600fd607 Mon Sep 17 00:00:00 2001 From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> Date: Mon, 9 Oct 2023 10:05:12 -0500 Subject: [PATCH 012/118] Cleanup of namespaces in parquet code. (#14259) Cleans up several issues in the parquet code: - We were using the namespace `cudf::io::detail::parquet`, when `cudf::io::parquet::detail` makes more sense. 
- Converts the `cudf::io::parquet::gpu` namespace to also just use `cudf::io::parquet::detail` - Several detail-style headers and source files were using `cudf::io::parquet` when they should probably have been in the detail namespace. Authors: - https://github.com/nvdbaranec Approvers: - Bradley Dice (https://github.com/bdice) - Yunsong Wang (https://github.com/PointKernel) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/14259 --- cpp/include/cudf/io/detail/parquet.hpp | 8 +- cpp/include/cudf/io/parquet.hpp | 4 +- cpp/src/io/functions.cpp | 4 +- cpp/src/io/parquet/chunk_dict.cu | 19 +- .../io/parquet/compact_protocol_reader.cpp | 8 +- .../io/parquet/compact_protocol_reader.hpp | 9 +- .../io/parquet/compact_protocol_writer.cpp | 8 +- .../io/parquet/compact_protocol_writer.hpp | 8 +- cpp/src/io/parquet/decode_preprocess.cu | 10 +- cpp/src/io/parquet/delta_binary.cuh | 4 +- cpp/src/io/parquet/page_data.cu | 12 +- cpp/src/io/parquet/page_decode.cuh | 4 +- cpp/src/io/parquet/page_delta_decode.cu | 6 +- cpp/src/io/parquet/page_enc.cu | 22 +- cpp/src/io/parquet/page_hdr.cu | 14 +- cpp/src/io/parquet/page_string_decode.cu | 14 +- cpp/src/io/parquet/page_string_utils.cuh | 4 +- cpp/src/io/parquet/parquet.hpp | 9 +- cpp/src/io/parquet/parquet_common.hpp | 9 +- cpp/src/io/parquet/parquet_gpu.cuh | 4 +- cpp/src/io/parquet/parquet_gpu.hpp | 27 +- cpp/src/io/parquet/predicate_pushdown.cpp | 14 +- cpp/src/io/parquet/reader.cpp | 4 +- cpp/src/io/parquet/reader_impl.cpp | 36 +-- cpp/src/io/parquet/reader_impl.hpp | 12 +- cpp/src/io/parquet/reader_impl_helpers.cpp | 121 ++++---- cpp/src/io/parquet/reader_impl_helpers.hpp | 21 +- cpp/src/io/parquet/reader_impl_preprocess.cu | 259 +++++++++--------- cpp/src/io/parquet/rle_stream.cuh | 4 +- cpp/src/io/parquet/writer_impl.cu | 223 ++++++++------- cpp/src/io/parquet/writer_impl.hpp | 28 +- cpp/tests/io/parquet_test.cpp | 207 +++++++------- 32 files changed, 531 insertions(+), 605 deletions(-) diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index 074f690d2c7..0b8ee9676de 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -38,7 +38,7 @@ class parquet_reader_options; class parquet_writer_options; class chunked_parquet_writer_options; -namespace detail::parquet { +namespace parquet::detail { /** * @brief Class to read Parquet dataset data into columns. @@ -186,7 +186,7 @@ class writer { */ explicit writer(std::vector> sinks, parquet_writer_options const& options, - single_write_mode mode, + cudf::io::detail::single_write_mode mode, rmm::cuda_stream_view stream); /** @@ -201,7 +201,7 @@ class writer { */ explicit writer(std::vector> sinks, chunked_parquet_writer_options const& options, - single_write_mode mode, + cudf::io::detail::single_write_mode mode, rmm::cuda_stream_view stream); /** @@ -250,5 +250,5 @@ class writer { * metadata. 
*/ parquet_metadata read_parquet_metadata(host_span const> sources); -} // namespace detail::parquet +} // namespace parquet::detail } // namespace cudf::io diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index deaf23d405a..6283099e700 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -499,7 +499,7 @@ class chunked_parquet_reader { [[nodiscard]] table_with_metadata read_chunk() const; private: - std::unique_ptr reader; + std::unique_ptr reader; }; /** @} */ // end of group @@ -1750,7 +1750,7 @@ class parquet_chunked_writer { std::vector const& column_chunks_file_paths = {}); /// Unique pointer to impl writer class - std::unique_ptr writer; + std::unique_ptr writer; }; /** @} */ // end of group diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 392a7850886..726442d752e 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -470,8 +470,8 @@ void orc_chunked_writer::close() writer->close(); } -using namespace cudf::io::detail::parquet; -namespace detail_parquet = cudf::io::detail::parquet; +using namespace cudf::io::parquet::detail; +namespace detail_parquet = cudf::io::parquet::detail; table_with_metadata read_parquet(parquet_reader_options const& options, rmm::mr::device_memory_resource* mr) diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 9ff1869edde..53ff31ab0a7 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -24,10 +24,8 @@ #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { + namespace { constexpr int DEFAULT_BLOCK_SIZE = 256; } @@ -101,7 +99,7 @@ struct map_find_fn { template __global__ void __launch_bounds__(block_size) - populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan frags) + populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan frags) { auto col_idx = blockIdx.y; auto block_x = blockIdx.x; @@ -226,7 +224,7 @@ __global__ void __launch_bounds__(block_size) template __global__ void __launch_bounds__(block_size) - get_dictionary_indices_kernel(cudf::detail::device_2dspan frags) + get_dictionary_indices_kernel(cudf::detail::device_2dspan frags) { auto col_idx = blockIdx.y; auto block_x = blockIdx.x; @@ -276,7 +274,7 @@ void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_st <<>>(chunks); } -void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, +void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { dim3 const dim_grid(frags.size().second, frags.size().first); @@ -290,14 +288,11 @@ void collect_map_entries(device_span chunks, rmm::cuda_stream_vi collect_map_entries_kernel<<>>(chunks); } -void get_dictionary_indices(cudf::detail::device_2dspan frags, +void get_dictionary_indices(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { dim3 const dim_grid(frags.size().second, frags.size().first); get_dictionary_indices_kernel <<>>(frags); } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index 5c7b8ca3f8c..81d1be64a45 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -21,9 +21,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { /** * @brief 
Base class for parquet field functors. @@ -870,6 +868,4 @@ int CompactProtocolReader::WalkSchema( } } -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp index 619815db503..cbb4161b138 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.hpp +++ b/cpp/src/io/parquet/compact_protocol_reader.hpp @@ -25,9 +25,8 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { + /** * @brief Class for parsing Parquet's Thrift Compact Protocol encoded metadata * @@ -147,6 +146,4 @@ class CompactProtocolReader { friend class parquet_field_struct_blob; }; -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp index 60bc8984d81..9adc8767880 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.cpp +++ b/cpp/src/io/parquet/compact_protocol_writer.cpp @@ -16,9 +16,7 @@ #include "compact_protocol_writer.hpp" -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { /** * @brief Parquet CompactProtocolWriter class @@ -391,6 +389,4 @@ inline void CompactProtocolFieldWriter::set_current_field(int const& field) current_field_value = field; } -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp b/cpp/src/io/parquet/compact_protocol_writer.hpp index 26d66527aa5..4849a814b14 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.hpp +++ b/cpp/src/io/parquet/compact_protocol_writer.hpp @@ -25,9 +25,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { /** * @brief Class for parsing Parquet's Thrift Compact Protocol encoded metadata @@ -115,6 +113,4 @@ class CompactProtocolFieldWriter { inline void set_current_field(int const& field); }; -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu index 8de3702bc2e..544c93ee616 100644 --- a/cpp/src/io/parquet/decode_preprocess.cu +++ b/cpp/src/io/parquet/decode_preprocess.cu @@ -23,10 +23,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { namespace { @@ -411,7 +408,4 @@ void ComputePageSizes(cudf::detail::hostdevice_vector& pages, } } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/delta_binary.cuh b/cpp/src/io/parquet/delta_binary.cuh index 2382e4aafdf..a513e6674b4 100644 --- a/cpp/src/io/parquet/delta_binary.cuh +++ b/cpp/src/io/parquet/delta_binary.cuh @@ -18,7 +18,7 @@ #include "page_decode.cuh" -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { // DELTA_XXX encoding support // @@ -291,4 +291,4 @@ struct delta_binary_decoder { } }; -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 230834632dd..cce3659b902 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -23,10 +23,7 @@ 
#include #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { namespace { @@ -624,7 +621,7 @@ uint32_t GetAggregatedDecodeKernelMask(cudf::detail::hostdevice_vector } /** - * @copydoc cudf::io::parquet::gpu::DecodePageData + * @copydoc cudf::io::parquet::detail::DecodePageData */ void __host__ DecodePageData(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -648,7 +645,4 @@ void __host__ DecodePageData(cudf::detail::hostdevice_vector& pages, } } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index d70cabdd35f..7c866fd8b9e 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -24,7 +24,7 @@ #include #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { struct page_state_s { constexpr page_state_s() noexcept {} @@ -1384,4 +1384,4 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, return true; } -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu index 2b78dead205..d25684a59f3 100644 --- a/cpp/src/io/parquet/page_delta_decode.cu +++ b/cpp/src/io/parquet/page_delta_decode.cu @@ -23,7 +23,7 @@ #include #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { namespace { @@ -160,7 +160,7 @@ __global__ void __launch_bounds__(96) } // anonymous namespace /** - * @copydoc cudf::io::parquet::gpu::DecodeDeltaBinary + * @copydoc cudf::io::parquet::detail::DecodeDeltaBinary */ void __host__ DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -184,4 +184,4 @@ void __host__ DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages } } -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index fe0dbb85124..78873d5e8ca 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -41,10 +41,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { namespace { @@ -329,7 +326,7 @@ __global__ void __launch_bounds__(128) // blockDim {128,1,1} __global__ void __launch_bounds__(128) gpuInitPages(device_2dspan chunks, - device_span pages, + device_span pages, device_span page_sizes, device_span comp_page_sizes, device_span col_desc, @@ -998,7 +995,7 @@ __device__ auto julian_days_with_time(int64_t v) // blockDim(128, 1, 1) template __global__ void __launch_bounds__(128, 8) - gpuEncodePages(device_span pages, + gpuEncodePages(device_span pages, device_span> comp_in, device_span> comp_out, device_span comp_results, @@ -1988,7 +1985,7 @@ __global__ void __launch_bounds__(128) // blockDim(1024, 1, 1) __global__ void __launch_bounds__(1024) - gpuGatherPages(device_span chunks, device_span pages) + gpuGatherPages(device_span chunks, device_span pages) { __shared__ __align__(8) EncColumnChunk ck_g; __shared__ __align__(8) EncPage page_g; @@ -2265,7 +2262,7 @@ void InitFragmentStatistics(device_span groups, } void InitEncoderPages(device_2dspan chunks, - device_span pages, + device_span pages, device_span page_sizes, device_span comp_page_sizes, device_span col_desc, @@ 
-2294,7 +2291,7 @@ void InitEncoderPages(device_2dspan chunks, write_v2_headers); } -void EncodePages(device_span pages, +void EncodePages(device_span pages, bool write_v2_headers, device_span> comp_in, device_span> comp_out, @@ -2328,7 +2325,7 @@ void EncodePageHeaders(device_span pages, } void GatherPages(device_span chunks, - device_span pages, + device_span pages, rmm::cuda_stream_view stream) { gpuGatherPages<<>>(chunks, pages); @@ -2343,7 +2340,4 @@ void EncodeColumnIndexes(device_span chunks, chunks, column_stats, column_index_truncate_length); } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 6f8b2f50443..eae8e05e61e 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -20,10 +20,8 @@ #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { + // Minimal thrift implementation for parsing page headers // https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md @@ -161,8 +159,7 @@ __device__ void skip_struct_field(byte_stream_s* bs, int field_type) * @param chunk Column chunk the page belongs to * @return `kernel_mask_bits` value for the given page */ -__device__ uint32_t kernel_mask_for_page(gpu::PageInfo const& page, - gpu::ColumnChunkDesc const& chunk) +__device__ uint32_t kernel_mask_for_page(PageInfo const& page, ColumnChunkDesc const& chunk) { if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { return 0; } @@ -528,7 +525,4 @@ void __host__ BuildStringDictionaryIndex(ColumnChunkDesc* chunks, gpuBuildStringDictionaryIndex<<>>(chunks, num_chunks); } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index d79abe4a6d2..4d79770ec34 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -20,10 +20,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { namespace { @@ -757,7 +754,7 @@ __global__ void __launch_bounds__(decode_block_size) } // anonymous namespace /** - * @copydoc cudf::io::parquet::gpu::ComputePageStringSizes + * @copydoc cudf::io::parquet::detail::ComputePageStringSizes */ void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -778,7 +775,7 @@ void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, } /** - * @copydoc cudf::io::parquet::gpu::DecodeStringPageData + * @copydoc cudf::io::parquet::detail::DecodeStringPageData */ void __host__ DecodeStringPageData(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -802,7 +799,4 @@ void __host__ DecodeStringPageData(cudf::detail::hostdevice_vector& pa } } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_string_utils.cuh b/cpp/src/io/parquet/page_string_utils.cuh index 9395599b3ff..a81d0a64466 100644 --- a/cpp/src/io/parquet/page_string_utils.cuh +++ b/cpp/src/io/parquet/page_string_utils.cuh @@ -18,7 +18,7 @@ #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { // stole this from 
cudf/strings/detail/gather.cuh. modified to run on a single string on one warp. // copies from src to dst in 16B chunks per thread. @@ -107,4 +107,4 @@ __device__ void block_excl_sum(size_type* arr, size_type length, size_type initi } } -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index 1df49262e87..c5993d73dec 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -25,9 +25,8 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { + constexpr uint32_t parquet_magic = (('P' << 0) | ('A' << 8) | ('R' << 16) | ('1' << 24)); /** @@ -405,6 +404,4 @@ static inline int CountLeadingZeros32(uint32_t value) #endif } -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_common.hpp b/cpp/src/io/parquet/parquet_common.hpp index 5a1716bb547..50736197eb9 100644 --- a/cpp/src/io/parquet/parquet_common.hpp +++ b/cpp/src/io/parquet/parquet_common.hpp @@ -18,9 +18,8 @@ #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { + // Max decimal precisions according to the parquet spec: // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#decimal auto constexpr MAX_DECIMAL32_PRECISION = 9; @@ -156,6 +155,4 @@ enum FieldType { ST_FLD_STRUCT = 12, }; -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh index dc74bee1536..10e12ebb782 100644 --- a/cpp/src/io/parquet/parquet_gpu.cuh +++ b/cpp/src/io/parquet/parquet_gpu.cuh @@ -23,7 +23,7 @@ #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { auto constexpr KEY_SENTINEL = size_type{-1}; auto constexpr VALUE_SENTINEL = size_type{-1}; @@ -81,4 +81,4 @@ inline size_type __device__ row_to_value_idx(size_type idx, return idx; } -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 51c862b376b..767668cc65e 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -35,7 +35,7 @@ #include -namespace cudf::io::parquet { +namespace cudf::io::parquet::detail { using cudf::io::detail::string_index_pair; @@ -88,8 +88,6 @@ struct input_column_info { auto nesting_depth() const { return nesting.size(); } }; -namespace gpu { - /** * @brief Enums for the flags in the page header */ @@ -347,7 +345,7 @@ struct file_intermediate_data { // all chunks from the selected row groups. We may end up reading these chunks progressively // instead of all at once - std::vector chunks{}; + std::vector chunks{}; // skip_rows/num_rows values for the entire file. these need to be adjusted per-pass because we // may not be visiting every row group that contains these bounds @@ -372,16 +370,16 @@ struct pass_intermediate_data { // rowgroup, chunk and page information for the current pass. 
std::vector row_groups{}; - cudf::detail::hostdevice_vector chunks{}; - cudf::detail::hostdevice_vector pages_info{}; - cudf::detail::hostdevice_vector page_nesting_info{}; - cudf::detail::hostdevice_vector page_nesting_decode_info{}; + cudf::detail::hostdevice_vector chunks{}; + cudf::detail::hostdevice_vector pages_info{}; + cudf::detail::hostdevice_vector page_nesting_info{}; + cudf::detail::hostdevice_vector page_nesting_decode_info{}; rmm::device_uvector page_keys{0, rmm::cuda_stream_default}; rmm::device_uvector page_index{0, rmm::cuda_stream_default}; rmm::device_uvector str_dict_index{0, rmm::cuda_stream_default}; - std::vector output_chunk_read_info; + std::vector output_chunk_read_info; std::size_t current_output_chunk{0}; rmm::device_buffer level_decode_data{}; @@ -739,7 +737,7 @@ void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_st * @param frags Column fragments * @param stream CUDA stream to use */ -void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, +void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); /** @@ -762,7 +760,7 @@ void collect_map_entries(device_span chunks, rmm::cuda_stream_vi * @param frags Column fragments * @param stream CUDA stream to use */ -void get_dictionary_indices(cudf::detail::device_2dspan frags, +void get_dictionary_indices(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); /** @@ -781,7 +779,7 @@ void get_dictionary_indices(cudf::detail::device_2dspan * @param[in] stream CUDA stream to use */ void InitEncoderPages(cudf::detail::device_2dspan chunks, - device_span pages, + device_span pages, device_span page_sizes, device_span comp_page_sizes, device_span col_desc, @@ -847,7 +845,7 @@ void EncodePageHeaders(device_span pages, * @param[in] stream CUDA stream to use */ void GatherPages(device_span chunks, - device_span pages, + device_span pages, rmm::cuda_stream_view stream); /** @@ -863,5 +861,4 @@ void EncodeColumnIndexes(device_span chunks, int32_t column_index_truncate_length, rmm::cuda_stream_view stream); -} // namespace gpu -} // namespace cudf::io::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index 805d082c71e..9083be1c2dd 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -35,7 +35,7 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { namespace { /** @@ -62,13 +62,13 @@ struct stats_caster { // uses storage type as T template () or cudf::is_nested())> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) { CUDF_FAIL("unsupported type for stats casting"); } template ())> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) { CUDF_EXPECTS(type == BOOLEAN, "Invalid type and stats combination"); return targetType(*reinterpret_cast(stats_val)); @@ -78,7 +78,7 @@ struct stats_caster { template () and !cudf::is_boolean()) or cudf::is_fixed_point() or cudf::is_chrono())> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) { switch (type) { case INT32: return 
targetType(*reinterpret_cast(stats_val)); @@ -103,7 +103,7 @@ struct stats_caster { } template ())> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) { switch (type) { case FLOAT: return targetType(*reinterpret_cast(stats_val)); @@ -113,7 +113,7 @@ struct stats_caster { } template )> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) { switch (type) { case BYTE_ARRAY: [[fallthrough]]; @@ -527,4 +527,4 @@ named_to_reference_converter::visit_operands( return transformed_operands; } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader.cpp b/cpp/src/io/parquet/reader.cpp index 1e87447006d..17d7c07bc91 100644 --- a/cpp/src/io/parquet/reader.cpp +++ b/cpp/src/io/parquet/reader.cpp @@ -16,7 +16,7 @@ #include "reader_impl.hpp" -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { reader::reader() = default; @@ -59,4 +59,4 @@ bool chunked_reader::has_next() const { return _impl->has_next(); } table_with_metadata chunked_reader::read_chunk() const { return _impl->read_chunk(); } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index ea40f29a070..26ec83d5946 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -25,7 +25,7 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) { @@ -38,7 +38,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); size_t const sum_max_depths = std::accumulate( - chunks.begin(), chunks.end(), 0, [&](size_t cursum, gpu::ColumnChunkDesc const& chunk) { + chunks.begin(), chunks.end(), 0, [&](size_t cursum, ColumnChunkDesc const& chunk) { return cursum + _metadata->get_output_nesting_depth(chunk.src_col_schema); }); @@ -51,10 +51,10 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) // doing a gather operation later on. // TODO: This step is somewhat redundant if size info has already been calculated (nested schema, // chunked reader). 
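The `stats_caster` earlier in this patch materializes each row group's min/max statistics as columns so the filter expression can be evaluated without touching page data. A hedged Python-level illustration follows; the path is a placeholder, and the `filters` argument is assumed to follow the usual pyarrow convention on its way to this code.

```python
# Illustration only: row groups whose [min, max] range cannot satisfy the
# predicate are pruned before any page decoding happens.
import pyarrow as pa
import pyarrow.parquet as pq

import cudf

pq.write_table(
    pa.table({"x": list(range(100))}),
    "/tmp/pushdown.parquet",  # placeholder path
    row_group_size=10,
)
# With x > 90, only the last of the ten row groups should survive pruning.
df = cudf.read_parquet("/tmp/pushdown.parquet", filters=[("x", ">", 90)])
```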
- auto const has_strings = (kernel_mask & gpu::KERNEL_MASK_STRING) != 0; + auto const has_strings = (kernel_mask & KERNEL_MASK_STRING) != 0; std::vector col_sizes(_input_columns.size(), 0L); if (has_strings) { - gpu::ComputePageStringSizes( + ComputePageStringSizes( pages, chunks, skip_rows, num_rows, _pass_itm_data->level_type_size, _stream); col_sizes = calculate_page_string_offsets(); @@ -176,19 +176,19 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) if (has_strings) { auto& stream = streams[s_idx++]; chunk_nested_str_data.host_to_device_async(stream); - gpu::DecodeStringPageData( + DecodeStringPageData( pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), stream); } // launch delta binary decoder - if ((kernel_mask & gpu::KERNEL_MASK_DELTA_BINARY) != 0) { - gpu::DecodeDeltaBinary( + if ((kernel_mask & KERNEL_MASK_DELTA_BINARY) != 0) { + DecodeDeltaBinary( pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); } // launch the catch-all page decoder - if ((kernel_mask & gpu::KERNEL_MASK_GENERAL) != 0) { - gpu::DecodePageData( + if ((kernel_mask & KERNEL_MASK_GENERAL) != 0) { + DecodePageData( pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); } @@ -248,13 +248,13 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) // update null counts in the final column buffers for (size_t idx = 0; idx < pages.size(); idx++) { - gpu::PageInfo* pi = &pages[idx]; - if (pi->flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { continue; } - gpu::ColumnChunkDesc* col = &chunks[pi->chunk_idx]; + PageInfo* pi = &pages[idx]; + if (pi->flags & PAGEINFO_FLAGS_DICTIONARY) { continue; } + ColumnChunkDesc* col = &chunks[pi->chunk_idx]; input_column_info const& input_col = _input_columns[col->src_col_index]; - int index = pi->nesting_decode - page_nesting_decode.device_ptr(); - gpu::PageNestingDecodeInfo* pndi = &page_nesting_decode[index]; + int index = pi->nesting_decode - page_nesting_decode.device_ptr(); + PageNestingDecodeInfo* pndi = &page_nesting_decode[index]; auto* cols = &_output_buffers; for (size_t l_idx = 0; l_idx < input_col.nesting_depth(); l_idx++) { @@ -320,7 +320,7 @@ reader::impl::impl(std::size_t chunk_read_limit, // Save the states of the output buffers for reuse in `chunk_read()`. for (auto const& buff : _output_buffers) { - _output_buffers_template.emplace_back(inline_column_buffer::empty_like(buff)); + _output_buffers_template.emplace_back(cudf::io::detail::inline_column_buffer::empty_like(buff)); } } @@ -368,7 +368,7 @@ void reader::impl::prepare_data(int64_t skip_rows, // always create the pass struct, even if we end up with no passes. 
// this will also cause the previous pass information to be deleted - _pass_itm_data = std::make_unique(); + _pass_itm_data = std::make_unique(); if (_file_itm_data.global_num_rows > 0 && not _file_itm_data.row_groups.empty() && not _input_columns.empty() && _current_input_pass < num_passes) { @@ -521,7 +521,7 @@ table_with_metadata reader::impl::read_chunk() if (_chunk_count > 0) { _output_buffers.resize(0); for (auto const& buff : _output_buffers_template) { - _output_buffers.emplace_back(inline_column_buffer::empty_like(buff)); + _output_buffers.emplace_back(cudf::io::detail::inline_column_buffer::empty_like(buff)); } } @@ -571,4 +571,4 @@ parquet_metadata read_parquet_metadata(host_span con metadata.get_key_value_metadata()[0]}; } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 9445e4d1648..6003b931b04 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -35,7 +35,7 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { /** * @brief Implementation for Parquet reader @@ -261,10 +261,10 @@ class reader::impl { std::vector _input_columns; // Buffers for generating output columns - std::vector _output_buffers; + std::vector _output_buffers; // Buffers copied from `_output_buffers` after construction for reuse - std::vector _output_buffers_template; + std::vector _output_buffers_template; // _output_buffers associated schema indices std::vector _output_column_schemas; @@ -285,8 +285,8 @@ class reader::impl { // Within a pass, we produce one or more chunks of output, whose maximum total // byte size is controlled by _output_chunk_read_limit. - cudf::io::parquet::gpu::file_intermediate_data _file_itm_data; - std::unique_ptr _pass_itm_data; + file_intermediate_data _file_itm_data; + std::unique_ptr _pass_itm_data; // an array of offsets into _file_itm_data::global_chunks. Each pair of offsets represents // the start/end of the chunks to be loaded for a given pass. 
@@ -301,4 +301,4 @@ class reader::impl { bool _file_preprocessed{false}; }; -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 9778cfc47d2..171cf07da3e 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -21,34 +21,34 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { namespace { ConvertedType logical_type_to_converted_type(LogicalType const& logical) { if (logical.isset.STRING) { - return parquet::UTF8; + return UTF8; } else if (logical.isset.MAP) { - return parquet::MAP; + return MAP; } else if (logical.isset.LIST) { - return parquet::LIST; + return LIST; } else if (logical.isset.ENUM) { - return parquet::ENUM; + return ENUM; } else if (logical.isset.DECIMAL) { - return parquet::DECIMAL; // TODO set decimal values + return DECIMAL; // TODO set decimal values } else if (logical.isset.DATE) { - return parquet::DATE; + return DATE; } else if (logical.isset.TIME) { if (logical.TIME.unit.isset.MILLIS) - return parquet::TIME_MILLIS; + return TIME_MILLIS; else if (logical.TIME.unit.isset.MICROS) - return parquet::TIME_MICROS; + return TIME_MICROS; } else if (logical.isset.TIMESTAMP) { if (logical.TIMESTAMP.unit.isset.MILLIS) - return parquet::TIMESTAMP_MILLIS; + return TIMESTAMP_MILLIS; else if (logical.TIMESTAMP.unit.isset.MICROS) - return parquet::TIMESTAMP_MICROS; + return TIMESTAMP_MICROS; } else if (logical.isset.INTEGER) { switch (logical.INTEGER.bitWidth) { case 8: return logical.INTEGER.isSigned ? INT_8 : UINT_8; @@ -58,13 +58,13 @@ ConvertedType logical_type_to_converted_type(LogicalType const& logical) default: break; } } else if (logical.isset.UNKNOWN) { - return parquet::NA; + return NA; } else if (logical.isset.JSON) { - return parquet::JSON; + return JSON; } else if (logical.isset.BSON) { - return parquet::BSON; + return BSON; } - return parquet::UNKNOWN; + return UNKNOWN; } } // namespace @@ -76,39 +76,39 @@ type_id to_type_id(SchemaElement const& schema, bool strings_to_categorical, type_id timestamp_type_id) { - parquet::Type const physical = schema.type; - parquet::LogicalType const logical_type = schema.logical_type; - parquet::ConvertedType converted_type = schema.converted_type; - int32_t decimal_precision = schema.decimal_precision; + Type const physical = schema.type; + LogicalType const logical_type = schema.logical_type; + ConvertedType converted_type = schema.converted_type; + int32_t decimal_precision = schema.decimal_precision; // Logical type used for actual data interpretation; the legacy converted type // is superseded by 'logical' type whenever available. 
auto const inferred_converted_type = logical_type_to_converted_type(logical_type); - if (inferred_converted_type != parquet::UNKNOWN) { converted_type = inferred_converted_type; } - if (inferred_converted_type == parquet::DECIMAL) { + if (inferred_converted_type != UNKNOWN) { converted_type = inferred_converted_type; } + if (inferred_converted_type == DECIMAL) { decimal_precision = schema.logical_type.DECIMAL.precision; } switch (converted_type) { - case parquet::UINT_8: return type_id::UINT8; - case parquet::INT_8: return type_id::INT8; - case parquet::UINT_16: return type_id::UINT16; - case parquet::INT_16: return type_id::INT16; - case parquet::UINT_32: return type_id::UINT32; - case parquet::UINT_64: return type_id::UINT64; - case parquet::DATE: return type_id::TIMESTAMP_DAYS; - case parquet::TIME_MILLIS: return type_id::DURATION_MILLISECONDS; - case parquet::TIME_MICROS: return type_id::DURATION_MICROSECONDS; - case parquet::TIMESTAMP_MILLIS: + case UINT_8: return type_id::UINT8; + case INT_8: return type_id::INT8; + case UINT_16: return type_id::UINT16; + case INT_16: return type_id::INT16; + case UINT_32: return type_id::UINT32; + case UINT_64: return type_id::UINT64; + case DATE: return type_id::TIMESTAMP_DAYS; + case TIME_MILLIS: return type_id::DURATION_MILLISECONDS; + case TIME_MICROS: return type_id::DURATION_MICROSECONDS; + case TIMESTAMP_MILLIS: return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id : type_id::TIMESTAMP_MILLISECONDS; - case parquet::TIMESTAMP_MICROS: + case TIMESTAMP_MICROS: return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id : type_id::TIMESTAMP_MICROSECONDS; - case parquet::DECIMAL: - if (physical == parquet::INT32) { return type_id::DECIMAL32; } - if (physical == parquet::INT64) { return type_id::DECIMAL64; } - if (physical == parquet::FIXED_LEN_BYTE_ARRAY) { + case DECIMAL: + if (physical == INT32) { return type_id::DECIMAL32; } + if (physical == INT64) { return type_id::DECIMAL64; } + if (physical == FIXED_LEN_BYTE_ARRAY) { if (schema.type_length <= static_cast(sizeof(int32_t))) { return type_id::DECIMAL32; } @@ -119,7 +119,7 @@ type_id to_type_id(SchemaElement const& schema, return type_id::DECIMAL128; } } - if (physical == parquet::BYTE_ARRAY) { + if (physical == BYTE_ARRAY) { CUDF_EXPECTS(decimal_precision <= MAX_DECIMAL128_PRECISION, "Invalid decimal precision"); if (decimal_precision <= MAX_DECIMAL32_PRECISION) { return type_id::DECIMAL32; @@ -133,20 +133,20 @@ type_id to_type_id(SchemaElement const& schema, break; // maps are just List>. - case parquet::MAP: - case parquet::LIST: return type_id::LIST; - case parquet::NA: return type_id::STRING; + case MAP: + case LIST: return type_id::LIST; + case NA: return type_id::STRING; // return type_id::EMPTY; //TODO(kn): enable after Null/Empty column support default: break; } - if (inferred_converted_type == parquet::UNKNOWN and physical == parquet::INT64 and + if (inferred_converted_type == UNKNOWN and physical == INT64 and logical_type.TIMESTAMP.unit.isset.NANOS) { return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id : type_id::TIMESTAMP_NANOSECONDS; } - if (inferred_converted_type == parquet::UNKNOWN and physical == parquet::INT64 and + if (inferred_converted_type == UNKNOWN and physical == INT64 and logical_type.TIME.unit.isset.NANOS) { return type_id::DURATION_NANOSECONDS; } @@ -157,16 +157,16 @@ type_id to_type_id(SchemaElement const& schema, // Physical storage type supported by Parquet; controls the on-disk storage // format in combination with the encoding type. 
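Before the physical-type switch below, a hedged end-to-end illustration of this converted/logical-type mapping through the public reader; the expected dtypes are inferred from the mapping above rather than verified claims, and the path is a placeholder.

```python
# Illustration only; written with pyarrow, read back through this mapping.
import decimal

import pyarrow as pa
import pyarrow.parquet as pq

import cudf

pq.write_table(
    pa.table(
        {
            "d": pa.array([decimal.Decimal("1.00")],
                          type=pa.decimal128(9, 2)),  # precision <= 9
            "t": pa.array([0], type=pa.timestamp("ms")),
        }
    ),
    "/tmp/types.parquet",  # placeholder path
)
df = cudf.read_parquet("/tmp/types.parquet")
# Per the mapping above: "t" should surface as datetime64[ms], and a
# precision-<=9 decimal should surface as a 32-bit decimal type.
```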
switch (physical) { - case parquet::BOOLEAN: return type_id::BOOL8; - case parquet::INT32: return type_id::INT32; - case parquet::INT64: return type_id::INT64; - case parquet::FLOAT: return type_id::FLOAT32; - case parquet::DOUBLE: return type_id::FLOAT64; - case parquet::BYTE_ARRAY: - case parquet::FIXED_LEN_BYTE_ARRAY: + case BOOLEAN: return type_id::BOOL8; + case INT32: return type_id::INT32; + case INT64: return type_id::INT64; + case FLOAT: return type_id::FLOAT32; + case DOUBLE: return type_id::FLOAT64; + case BYTE_ARRAY: + case FIXED_LEN_BYTE_ARRAY: // Can be mapped to INT32 (32-bit hash) or STRING return strings_to_categorical ? type_id::INT32 : type_id::STRING; - case parquet::INT96: + case INT96: return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id : type_id::TIMESTAMP_NANOSECONDS; default: break; @@ -420,7 +420,7 @@ std::vector aggregate_reader_metadata::get_pandas_index_names() con return names; } -std::tuple> +std::tuple> aggregate_reader_metadata::select_row_groups( host_span const> row_group_indices, int64_t skip_rows_opt, @@ -438,7 +438,7 @@ aggregate_reader_metadata::select_row_groups( host_span const>(filtered_row_group_indices.value()); } } - std::vector selection; + std::vector selection; auto [rows_to_skip, rows_to_read] = [&]() { if (not row_group_indices.empty()) { return std::pair{}; } auto const from_opts = cudf::io::detail::skip_rows_num_rows_from_options( @@ -478,7 +478,7 @@ aggregate_reader_metadata::select_row_groups( } std::tuple, - std::vector, + std::vector, std::vector> aggregate_reader_metadata::select_columns(std::optional> const& use_names, bool include_index, @@ -496,17 +496,18 @@ aggregate_reader_metadata::select_columns(std::optional : -1; }; - std::vector output_columns; + std::vector output_columns; std::vector input_columns; std::vector nesting; // Return true if column path is valid. e.g. if the path is {"struct1", "child1"}, then it is // valid if "struct1.child1" exists in this file's schema. 
If "struct1" exists but "child1" is // not a child of "struct1" then the function will return false for "struct1" - std::function&, bool)> + std::function&, bool)> build_column = [&](column_name_info const* col_name_info, int schema_idx, - std::vector& out_col_array, + std::vector& out_col_array, bool has_list_parent) { if (schema_idx < 0) { return false; } auto const& schema_elem = get_schema(schema_idx); @@ -529,7 +530,8 @@ aggregate_reader_metadata::select_columns(std::optional : to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); auto const dtype = to_data_type(col_type, schema_elem); - inline_column_buffer output_col(dtype, schema_elem.repetition_type == OPTIONAL); + cudf::io::detail::inline_column_buffer output_col(dtype, + schema_elem.repetition_type == OPTIONAL); if (has_list_parent) { output_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; } // store the index of this element if inserted in out_col_array nesting.push_back(static_cast(out_col_array.size())); @@ -569,7 +571,8 @@ aggregate_reader_metadata::select_columns(std::optional to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); auto const element_dtype = to_data_type(element_type, schema_elem); - inline_column_buffer element_col(element_dtype, schema_elem.repetition_type == OPTIONAL); + cudf::io::detail::inline_column_buffer element_col( + element_dtype, schema_elem.repetition_type == OPTIONAL); if (has_list_parent || col_type == type_id::LIST) { element_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; } @@ -732,4 +735,4 @@ aggregate_reader_metadata::select_columns(std::optional std::move(input_columns), std::move(output_columns), std::move(output_column_schemas)); } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 9ee17f26a10..1a73e2f55ac 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -32,9 +32,7 @@ #include #include -namespace cudf::io::detail::parquet { - -using namespace cudf::io::parquet; +namespace cudf::io::parquet::detail { /** * @brief Function that translates Parquet datatype to cuDF type enum @@ -182,7 +180,7 @@ class aggregate_reader_metadata { * @return A tuple of corrected row_start, row_count and list of row group indexes and its * starting row */ - [[nodiscard]] std::tuple> select_row_groups( + [[nodiscard]] std::tuple> select_row_groups( host_span const> row_group_indices, int64_t row_start, std::optional const& row_count, @@ -202,12 +200,13 @@ class aggregate_reader_metadata { * @return input column information, output column information, list of output column schema * indices */ - [[nodiscard]] std:: - tuple, std::vector, std::vector> - select_columns(std::optional> const& use_names, - bool include_index, - bool strings_to_categorical, - type_id timestamp_type_id) const; + [[nodiscard]] std::tuple, + std::vector, + std::vector> + select_columns(std::optional> const& use_names, + bool include_index, + bool strings_to_categorical, + type_id timestamp_type_id) const; }; /** @@ -276,4 +275,4 @@ class named_to_reference_converter : public ast::detail::expression_transformer std::list _operators; }; -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index c731c467f2c..4bc6bb6f43b 100644 --- 
a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -43,7 +43,8 @@ #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { + namespace { /** @@ -185,11 +186,11 @@ template */ [[nodiscard]] std::tuple conversion_info(type_id column_type_id, type_id timestamp_type_id, - parquet::Type physical, + Type physical, int8_t converted, int32_t length) { - int32_t type_width = (physical == parquet::FIXED_LEN_BYTE_ARRAY) ? length : 0; + int32_t type_width = (physical == FIXED_LEN_BYTE_ARRAY) ? length : 0; int32_t clock_rate = 0; if (column_type_id == type_id::INT8 or column_type_id == type_id::UINT8) { type_width = 1; // I32 -> I8 @@ -202,9 +203,9 @@ template } int8_t converted_type = converted; - if (converted_type == parquet::DECIMAL && column_type_id != type_id::FLOAT64 && + if (converted_type == DECIMAL && column_type_id != type_id::FLOAT64 && not cudf::is_fixed_point(data_type{column_type_id})) { - converted_type = parquet::UNKNOWN; // Not converting to float64 or decimal + converted_type = UNKNOWN; // Not converting to float64 or decimal } return std::make_tuple(type_width, clock_rate, converted_type); } @@ -226,7 +227,7 @@ template [[nodiscard]] std::future read_column_chunks_async( std::vector> const& sources, std::vector>& page_data, - cudf::detail::hostdevice_vector& chunks, + cudf::detail::hostdevice_vector& chunks, size_t begin_chunk, size_t end_chunk, std::vector const& column_chunk_offsets, @@ -239,11 +240,10 @@ template size_t const io_offset = column_chunk_offsets[chunk]; size_t io_size = chunks[chunk].compressed_size; size_t next_chunk = chunk + 1; - bool const is_compressed = (chunks[chunk].codec != parquet::Compression::UNCOMPRESSED); + bool const is_compressed = (chunks[chunk].codec != Compression::UNCOMPRESSED); while (next_chunk < end_chunk) { - size_t const next_offset = column_chunk_offsets[next_chunk]; - bool const is_next_compressed = - (chunks[next_chunk].codec != parquet::Compression::UNCOMPRESSED); + size_t const next_offset = column_chunk_offsets[next_chunk]; + bool const is_next_compressed = (chunks[next_chunk].codec != Compression::UNCOMPRESSED); if (next_offset != io_offset + io_size || is_next_compressed != is_compressed || chunk_source_map[chunk] != chunk_source_map[next_chunk]) { // Can't merge if not contiguous or mixing compressed and uncompressed @@ -300,13 +300,13 @@ template * * @return The total number of pages */ -[[nodiscard]] size_t count_page_headers( - cudf::detail::hostdevice_vector& chunks, rmm::cuda_stream_view stream) +[[nodiscard]] size_t count_page_headers(cudf::detail::hostdevice_vector& chunks, + rmm::cuda_stream_view stream) { size_t total_pages = 0; chunks.host_to_device_async(stream); - gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); + DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); chunks.device_to_host_sync(stream); for (size_t c = 0; c < chunks.size(); c++) { @@ -337,8 +337,8 @@ constexpr bool is_supported_encoding(Encoding enc) * @param stream CUDA stream used for device memory operations and kernel launches * @returns The size in bytes of level type data required */ -int decode_page_headers(cudf::detail::hostdevice_vector& chunks, - cudf::detail::hostdevice_vector& pages, +int decode_page_headers(cudf::detail::hostdevice_vector& chunks, + cudf::detail::hostdevice_vector& pages, rmm::cuda_stream_view stream) { // IMPORTANT : if you change how pages are stored within a chunk (dist pages, then data pages), @@ -350,14 +350,14 @@ 
int decode_page_headers(cudf::detail::hostdevice_vector& c } chunks.host_to_device_async(stream); - gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); + DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); // compute max bytes needed for level data auto level_bit_size = cudf::detail::make_counting_transform_iterator(0, [chunks = chunks.begin()] __device__(int i) { auto c = chunks[i]; return static_cast( - max(c.level_bits[gpu::level_type::REPETITION], c.level_bits[gpu::level_type::DEFINITION])); + max(c.level_bits[level_type::REPETITION], c.level_bits[level_type::DEFINITION])); }); // max level data bit size. int const max_level_bits = thrust::reduce(rmm::exec_policy(stream), @@ -388,11 +388,11 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c * @return Device buffer to decompressed page data */ [[nodiscard]] rmm::device_buffer decompress_page_data( - cudf::detail::hostdevice_vector& chunks, - cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector& chunks, + cudf::detail::hostdevice_vector& pages, rmm::cuda_stream_view stream) { - auto for_each_codec_page = [&](parquet::Compression codec, std::function const& f) { + auto for_each_codec_page = [&](Compression codec, std::function const& f) { for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { const auto page_stride = chunks[c].max_num_pages; if (chunks[c].codec == codec) { @@ -412,19 +412,16 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c size_t total_decomp_size = 0; struct codec_stats { - parquet::Compression compression_type = UNCOMPRESSED; - size_t num_pages = 0; - int32_t max_decompressed_size = 0; - size_t total_decomp_size = 0; + Compression compression_type = UNCOMPRESSED; + size_t num_pages = 0; + int32_t max_decompressed_size = 0; + size_t total_decomp_size = 0; }; - std::array codecs{codec_stats{parquet::GZIP}, - codec_stats{parquet::SNAPPY}, - codec_stats{parquet::BROTLI}, - codec_stats{parquet::ZSTD}}; + std::array codecs{codec_stats{GZIP}, codec_stats{SNAPPY}, codec_stats{BROTLI}, codec_stats{ZSTD}}; auto is_codec_supported = [&codecs](int8_t codec) { - if (codec == parquet::UNCOMPRESSED) return true; + if (codec == UNCOMPRESSED) return true; return std::find_if(codecs.begin(), codecs.end(), [codec](auto& cstats) { return codec == cstats.compression_type; }) != codecs.end(); @@ -445,7 +442,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c codec.num_pages++; num_comp_pages++; }); - if (codec.compression_type == parquet::BROTLI && codec.num_pages > 0) { + if (codec.compression_type == BROTLI && codec.num_pages > 0) { debrotli_scratch.resize(get_gpu_debrotli_scratch_size(codec.num_pages), stream); } } @@ -482,7 +479,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c auto& page = pages[page_idx]; // offset will only be non-zero for V2 pages auto const offset = - page.lvl_bytes[gpu::level_type::DEFINITION] + page.lvl_bytes[gpu::level_type::REPETITION]; + page.lvl_bytes[level_type::DEFINITION] + page.lvl_bytes[level_type::REPETITION]; // for V2 need to copy def and rep level info into place, and then offset the // input and output buffers. otherwise we'd have to keep both the compressed // and decompressed data. 
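The codec table above is what gates GPU decompression: anything other than GZIP, SNAPPY, BROTLI, or ZSTD is rejected by `is_codec_supported`. A small sketch exercising one supported codec end to end (path is a placeholder):

```python
# ZSTD-compressed pages are handled by the decompression switch that follows.
import pyarrow as pa
import pyarrow.parquet as pq

import cudf

pq.write_table(
    pa.table({"x": list(range(1000))}),
    "/tmp/zstd.parquet",  # placeholder path
    compression="ZSTD",
)
assert cudf.read_parquet("/tmp/zstd.parquet")["x"].sum() == sum(range(1000))
```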
@@ -509,11 +506,11 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c device_span d_comp_res_view(comp_res.data() + start_pos, codec.num_pages); switch (codec.compression_type) { - case parquet::GZIP: + case GZIP: gpuinflate(d_comp_in, d_comp_out, d_comp_res_view, gzip_header_included::YES, stream); break; - case parquet::SNAPPY: - if (nvcomp_integration::is_stable_enabled()) { + case SNAPPY: + if (cudf::io::detail::nvcomp_integration::is_stable_enabled()) { nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, d_comp_in, d_comp_out, @@ -525,7 +522,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c gpu_unsnap(d_comp_in, d_comp_out, d_comp_res_view, stream); } break; - case parquet::ZSTD: + case ZSTD: nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, d_comp_in, d_comp_out, @@ -534,7 +531,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c codec.total_decomp_size, stream); break; - case parquet::BROTLI: + case BROTLI: gpu_debrotli(d_comp_in, d_comp_out, d_comp_res_view, @@ -594,9 +591,9 @@ void reader::impl::allocate_nesting_info() }); page_nesting_info = - cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; + cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; page_nesting_decode_info = - cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; + cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; // update pointers in the PageInfos int target_page_index = 0; @@ -653,10 +650,10 @@ void reader::impl::allocate_nesting_info() if (!cur_schema.is_stub()) { // initialize each page within the chunk for (int p_idx = 0; p_idx < chunks[idx].num_data_pages; p_idx++) { - gpu::PageNestingInfo* pni = + PageNestingInfo* pni = &page_nesting_info[nesting_info_index + (p_idx * per_page_nesting_info_size)]; - gpu::PageNestingDecodeInfo* nesting_info = + PageNestingDecodeInfo* nesting_info = &page_nesting_decode_info[nesting_info_index + (p_idx * per_page_nesting_info_size)]; // if we have lists, set our start and end depth remappings @@ -717,9 +714,9 @@ void reader::impl::allocate_level_decode_space() for (size_t idx = 0; idx < pages.size(); idx++) { auto& p = pages[idx]; - p.lvl_decode_buf[gpu::level_type::DEFINITION] = buf; + p.lvl_decode_buf[level_type::DEFINITION] = buf; buf += (LEVEL_DECODE_BUF_SIZE * _pass_itm_data->level_type_size); - p.lvl_decode_buf[gpu::level_type::REPETITION] = buf; + p.lvl_decode_buf[level_type::REPETITION] = buf; buf += (LEVEL_DECODE_BUF_SIZE * _pass_itm_data->level_type_size); } } @@ -824,25 +821,25 @@ void reader::impl::load_global_chunk_info() schema.converted_type, schema.type_length); - chunks.push_back(gpu::ColumnChunkDesc(col_meta.total_compressed_size, - nullptr, - col_meta.num_values, - schema.type, - type_width, - row_group_start, - row_group_rows, - schema.max_definition_level, - schema.max_repetition_level, - _metadata->get_output_nesting_depth(col.schema_idx), - required_bits(schema.max_definition_level), - required_bits(schema.max_repetition_level), - col_meta.codec, - converted_type, - schema.logical_type, - schema.decimal_precision, - clock_rate, - i, - col.schema_idx)); + chunks.push_back(ColumnChunkDesc(col_meta.total_compressed_size, + nullptr, + col_meta.num_values, + schema.type, + type_width, + row_group_start, + row_group_rows, + schema.max_definition_level, + schema.max_repetition_level, + _metadata->get_output_nesting_depth(col.schema_idx), + required_bits(schema.max_definition_level), + required_bits(schema.max_repetition_level), + 
col_meta.codec, + converted_type, + schema.logical_type, + schema.decimal_precision, + clock_rate, + i, + col.schema_idx)); } remaining_rows -= row_group_rows; @@ -909,7 +906,7 @@ void reader::impl::compute_input_pass_row_group_info() void reader::impl::setup_pass() { // this will also cause the previous pass information to be deleted - _pass_itm_data = std::make_unique(); + _pass_itm_data = std::make_unique(); // setup row groups to be loaded for this pass auto const row_group_start = _input_pass_row_group_offsets[_current_input_pass]; @@ -929,8 +926,7 @@ void reader::impl::setup_pass() auto chunk_start = _file_itm_data.chunks.begin() + (row_group_start * chunks_per_rowgroup); auto chunk_end = _file_itm_data.chunks.begin() + (row_group_end * chunks_per_rowgroup); - _pass_itm_data->chunks = - cudf::detail::hostdevice_vector(num_chunks, _stream); + _pass_itm_data->chunks = cudf::detail::hostdevice_vector(num_chunks, _stream); std::copy(chunk_start, chunk_end, _pass_itm_data->chunks.begin()); // adjust skip_rows and num_rows by what's available in the row groups we are processing @@ -970,7 +966,7 @@ void reader::impl::load_and_decompress_data() // Process dataset chunk pages into output columns auto const total_pages = count_page_headers(chunks, _stream); if (total_pages <= 0) { return; } - pages = cudf::detail::hostdevice_vector(total_pages, total_pages, _stream); + pages = cudf::detail::hostdevice_vector(total_pages, total_pages, _stream); // decoding of column/page information _pass_itm_data->level_type_size = decode_page_headers(chunks, pages, _stream); @@ -978,7 +974,7 @@ void reader::impl::load_and_decompress_data() decomp_page_data = decompress_page_data(chunks, pages, _stream); // Free compressed data for (size_t c = 0; c < chunks.size(); c++) { - if (chunks[c].codec != parquet::Compression::UNCOMPRESSED) { raw_page_data[c].reset(); } + if (chunks[c].codec != Compression::UNCOMPRESSED) { raw_page_data[c].reset(); } } } @@ -1019,14 +1015,13 @@ struct cumulative_row_info { }; #if defined(PREPROCESS_DEBUG) -void print_pages(cudf::detail::hostdevice_vector& pages, - rmm::cuda_stream_view _stream) +void print_pages(cudf::detail::hostdevice_vector& pages, rmm::cuda_stream_view _stream) { pages.device_to_host_sync(_stream); for (size_t idx = 0; idx < pages.size(); idx++) { auto const& p = pages[idx]; // skip dictionary pages - if (p.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { continue; } + if (p.flags & PAGEINFO_FLAGS_DICTIONARY) { continue; } printf( "P(%lu, s:%d): chunk_row(%d), num_rows(%d), skipped_values(%d), skipped_leaf_values(%d), " "str_bytes(%d)\n", @@ -1040,7 +1035,7 @@ void print_pages(cudf::detail::hostdevice_vector& pages, } } -void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages, +void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages, rmm::device_uvector const& page_index, rmm::device_uvector const& c_info, rmm::cuda_stream_view stream) @@ -1067,7 +1062,7 @@ void print_cumulative_page_info(cudf::detail::hostdevice_vector& printf("Schema %d\n", schemas[idx]); for (size_t pidx = 0; pidx < pages.size(); pidx++) { auto const& page = pages[h_page_index[pidx]]; - if (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { + if (page.flags & PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { continue; } printf("\tP: {%lu, %lu}\n", h_cinfo[pidx].row_count, h_cinfo[pidx].size_bytes); @@ -1075,10 +1070,9 @@ void print_cumulative_page_info(cudf::detail::hostdevice_vector& } } -void print_cumulative_row_info( 
- host_span sizes, - std::string const& label, - std::optional> splits = std::nullopt) +void print_cumulative_row_info(host_span sizes, + std::string const& label, + std::optional> splits = std::nullopt) { if (splits.has_value()) { printf("------------\nSplits\n"); @@ -1093,7 +1087,7 @@ void print_cumulative_row_info( if (splits.has_value()) { // if we have a split at this row count and this is the last instance of this row count auto start = thrust::make_transform_iterator( - splits->begin(), [](gpu::chunk_read_info const& i) { return i.skip_rows; }); + splits->begin(), [](chunk_read_info const& i) { return i.skip_rows; }); auto end = start + splits->size(); auto split = std::find(start, end, sizes[idx].row_count); auto const split_index = [&]() -> int { @@ -1180,12 +1174,12 @@ __device__ size_t row_size_functor::operator()(size_t num_rows, boo * Sums across all nesting levels. */ struct get_cumulative_row_info { - gpu::PageInfo const* const pages; + PageInfo const* const pages; __device__ cumulative_row_info operator()(size_type index) { auto const& page = pages[index]; - if (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { + if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { return cumulative_row_info{0, 0, page.src_col_schema}; } @@ -1250,15 +1244,15 @@ struct row_total_size { * @param num_rows Total number of rows to read * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns */ -std::vector find_splits(std::vector const& sizes, - size_t num_rows, - size_t chunk_read_limit) +std::vector find_splits(std::vector const& sizes, + size_t num_rows, + size_t chunk_read_limit) { // now we have an array of {row_count, real output bytes}. just walk through it and generate // splits. // TODO: come up with a clever way to do this entirely in parallel. 
For now, as long as batch // sizes are reasonably large, this shouldn't iterate too many times - std::vector splits; + std::vector splits; { size_t cur_pos = 0; size_t cur_cumulative_size = 0; @@ -1290,7 +1284,7 @@ std::vector find_splits(std::vector c auto const start_row = cur_row_count; cur_row_count = sizes[split_pos].row_count; - splits.push_back(gpu::chunk_read_info{start_row, cur_row_count - start_row}); + splits.push_back(chunk_read_info{start_row, cur_row_count - start_row}); cur_pos = split_pos; cur_cumulative_size = sizes[split_pos].size_bytes; } @@ -1311,12 +1305,11 @@ std::vector find_splits(std::vector c * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns * @param stream CUDA stream to use */ -std::vector compute_splits( - cudf::detail::hostdevice_vector& pages, - gpu::pass_intermediate_data const& id, - size_t num_rows, - size_t chunk_read_limit, - rmm::cuda_stream_view stream) +std::vector compute_splits(cudf::detail::hostdevice_vector& pages, + pass_intermediate_data const& id, + size_t num_rows, + size_t chunk_read_limit, + rmm::cuda_stream_view stream) { auto const& page_keys = id.page_keys; auto const& page_index = id.page_index; @@ -1395,16 +1388,16 @@ std::vector compute_splits( } struct get_page_chunk_idx { - __device__ size_type operator()(gpu::PageInfo const& page) { return page.chunk_idx; } + __device__ size_type operator()(PageInfo const& page) { return page.chunk_idx; } }; struct get_page_num_rows { - __device__ size_type operator()(gpu::PageInfo const& page) { return page.num_rows; } + __device__ size_type operator()(PageInfo const& page) { return page.num_rows; } }; struct get_page_column_index { - gpu::ColumnChunkDesc const* chunks; - __device__ size_type operator()(gpu::PageInfo const& page) + ColumnChunkDesc const* chunks; + __device__ size_type operator()(PageInfo const& page) { return chunks[page.chunk_idx].src_col_index; } @@ -1441,7 +1434,7 @@ struct get_page_nesting_size { input_col_info const* const input_cols; size_type const max_depth; size_t const num_pages; - gpu::PageInfo const* const pages; + PageInfo const* const pages; int const* page_indices; __device__ size_type operator()(size_t index) const @@ -1450,7 +1443,7 @@ struct get_page_nesting_size { auto const& page = pages[page_indices[indices.page_idx]]; if (page.src_col_schema != input_cols[indices.col_idx].schema_idx || - page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY || + page.flags & PAGEINFO_FLAGS_DICTIONARY || indices.depth_idx >= input_cols[indices.col_idx].nesting_depth) { return 0; } @@ -1468,7 +1461,7 @@ struct get_reduction_key { * @brief Writes to the chunk_row field of the PageInfo struct. */ struct chunk_row_output_iter { - gpu::PageInfo* p; + PageInfo* p; using value_type = size_type; using difference_type = size_type; using pointer = size_type*; @@ -1490,7 +1483,7 @@ struct chunk_row_output_iter { * @brief Writes to the page_start_value field of the PageNestingInfo struct, keyed by schema. 
*/ struct start_offset_output_iterator { - gpu::PageInfo const* pages; + PageInfo const* pages; int const* page_indices; size_t cur_index; input_col_info const* input_cols; @@ -1529,9 +1522,9 @@ struct start_offset_output_iterator { { auto const indices = reduction_indices{index, max_depth, num_pages}; - gpu::PageInfo const& p = pages[page_indices[indices.page_idx]]; + PageInfo const& p = pages[page_indices[indices.page_idx]]; if (p.src_col_schema != input_cols[indices.col_idx].schema_idx || - p.flags & gpu::PAGEINFO_FLAGS_DICTIONARY || + p.flags & PAGEINFO_FLAGS_DICTIONARY || indices.depth_idx >= input_cols[indices.col_idx].nesting_depth) { return empty; } @@ -1540,15 +1533,15 @@ struct start_offset_output_iterator { }; struct flat_column_num_rows { - gpu::PageInfo const* pages; - gpu::ColumnChunkDesc const* chunks; + PageInfo const* pages; + ColumnChunkDesc const* chunks; __device__ size_type operator()(size_type pindex) const { - gpu::PageInfo const& page = pages[pindex]; + PageInfo const& page = pages[pindex]; // ignore dictionary pages and pages belonging to any column containing repetition (lists) - if ((page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) || - (chunks[page.chunk_idx].max_level[gpu::level_type::REPETITION] > 0)) { + if ((page.flags & PAGEINFO_FLAGS_DICTIONARY) || + (chunks[page.chunk_idx].max_level[level_type::REPETITION] > 0)) { return 0; } return page.num_rows; @@ -1581,8 +1574,8 @@ struct row_counts_different { * @param expected_row_count Expected row count, if applicable * @param stream CUDA stream used for device memory operations and kernel launches */ -void detect_malformed_pages(cudf::detail::hostdevice_vector& pages, - cudf::detail::hostdevice_vector const& chunks, +void detect_malformed_pages(cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector const& chunks, device_span page_keys, device_span page_index, std::optional expected_row_count, @@ -1631,23 +1624,21 @@ void detect_malformed_pages(cudf::detail::hostdevice_vector& page } struct page_to_string_size { - gpu::PageInfo* pages; - gpu::ColumnChunkDesc const* chunks; + PageInfo* pages; + ColumnChunkDesc const* chunks; __device__ size_t operator()(size_type page_idx) const { auto const page = pages[page_idx]; auto const chunk = chunks[page.chunk_idx]; - if (not is_string_col(chunk) || (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) != 0) { - return 0; - } + if (not is_string_col(chunk) || (page.flags & PAGEINFO_FLAGS_DICTIONARY) != 0) { return 0; } return pages[page_idx].str_bytes; } }; struct page_offset_output_iter { - gpu::PageInfo* p; + PageInfo* p; size_type const* index; using value_type = size_type; @@ -1738,7 +1729,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re cols = &out_buf.children; // if this has a list parent, we have to get column sizes from the - // data computed during gpu::ComputePageSizes + // data computed during ComputePageSizes if (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) { has_lists = true; break; @@ -1749,7 +1740,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re // generate string dict indices if necessary { - auto is_dict_chunk = [](gpu::ColumnChunkDesc const& chunk) { + auto is_dict_chunk = [](ColumnChunkDesc const& chunk) { return (chunk.data_type & 0x7) == BYTE_ARRAY && chunk.num_dict_pages > 0; }; @@ -1785,7 +1776,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re if (total_str_dict_indexes > 0) { chunks.host_to_device_async(_stream); - 
gpu::BuildStringDictionaryIndex(chunks.device_ptr(), chunks.size(), _stream); + BuildStringDictionaryIndex(chunks.device_ptr(), chunks.size(), _stream); } } @@ -1800,14 +1791,14 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re // if: // - user has passed custom row bounds // - we will be doing a chunked read - gpu::ComputePageSizes(pages, - chunks, - 0, // 0-max size_t. process all possible rows - std::numeric_limits::max(), - true, // compute num_rows - chunk_read_limit > 0, // compute string sizes - _pass_itm_data->level_type_size, - _stream); + ComputePageSizes(pages, + chunks, + 0, // 0-max size_t. process all possible rows + std::numeric_limits::max(), + true, // compute num_rows + chunk_read_limit > 0, // compute string sizes + _pass_itm_data->level_type_size, + _stream); // computes: // PageInfo::chunk_row (the absolute start row index) for all pages @@ -1836,7 +1827,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re _pass_itm_data->output_chunk_read_info = _output_chunk_read_limit > 0 ? compute_splits(pages, *_pass_itm_data, num_rows, chunk_read_limit, _stream) - : std::vector{{skip_rows, num_rows}}; + : std::vector{{skip_rows, num_rows}}; } void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses_custom_row_bounds) @@ -1853,14 +1844,14 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses // respect the user bounds. It is only necessary to do this second pass if uses_custom_row_bounds // is set (if the user has specified artificial bounds). if (uses_custom_row_bounds) { - gpu::ComputePageSizes(pages, - chunks, - skip_rows, - num_rows, - false, // num_rows is already computed - false, // no need to compute string sizes - _pass_itm_data->level_type_size, - _stream); + ComputePageSizes(pages, + chunks, + skip_rows, + num_rows, + false, // num_rows is already computed + false, // no need to compute string sizes + _pass_itm_data->level_type_size, + _stream); // print_pages(pages, _stream); } @@ -1879,7 +1870,7 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses cols = &out_buf.children; // if this has a list parent, we have to get column sizes from the - // data computed during gpu::ComputePageSizes + // data computed during ComputePageSizes if (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) { has_lists = true; } @@ -2014,4 +2005,4 @@ std::vector reader::impl::calculate_page_string_offsets() return col_sizes; } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 2545a074a38..799d6d9fd64 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -20,7 +20,7 @@ #include #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { template constexpr int rle_stream_required_run_buffer_size() @@ -362,4 +362,4 @@ struct rle_stream { } }; -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index a124f352ee4..50589f23626 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -54,12 +54,9 @@ #include #include -namespace cudf { -namespace io { -namespace detail { -namespace parquet { -using namespace cudf::io::parquet; -using namespace cudf::io; +namespace cudf::io::parquet::detail { + +using namespace 
cudf::io::detail; struct aggregate_writer_metadata { aggregate_writer_metadata(host_span partitions, @@ -185,13 +182,13 @@ namespace { * @param compression The compression type * @return The supported Parquet compression */ -parquet::Compression to_parquet_compression(compression_type compression) +Compression to_parquet_compression(compression_type compression) { switch (compression) { case compression_type::AUTO: - case compression_type::SNAPPY: return parquet::Compression::SNAPPY; - case compression_type::ZSTD: return parquet::Compression::ZSTD; - case compression_type::NONE: return parquet::Compression::UNCOMPRESSED; + case compression_type::SNAPPY: return Compression::SNAPPY; + case compression_type::ZSTD: return Compression::ZSTD; + case compression_type::NONE: return Compression::UNCOMPRESSED; default: CUDF_FAIL("Unsupported compression type"); } } @@ -206,7 +203,7 @@ void update_chunk_encodings(std::vector& encodings, uint32_t enc_mask) { for (uint8_t enc = 0; enc < static_cast(Encoding::NUM_ENCODINGS); enc++) { auto const enc_enum = static_cast(enc); - if ((enc_mask & gpu::encoding_to_mask(enc_enum)) != 0) { encodings.push_back(enc_enum); } + if ((enc_mask & encoding_to_mask(enc_enum)) != 0) { encodings.push_back(enc_enum); } } } @@ -761,11 +758,11 @@ struct parquet_column_view { std::vector const& schema_tree, rmm::cuda_stream_view stream); - [[nodiscard]] gpu::parquet_column_device_view get_device_view(rmm::cuda_stream_view stream) const; + [[nodiscard]] parquet_column_device_view get_device_view(rmm::cuda_stream_view stream) const; [[nodiscard]] column_view cudf_column_view() const { return cudf_col; } - [[nodiscard]] parquet::Type physical_type() const { return schema_node.type; } - [[nodiscard]] parquet::ConvertedType converted_type() const { return schema_node.converted_type; } + [[nodiscard]] Type physical_type() const { return schema_node.type; } + [[nodiscard]] ConvertedType converted_type() const { return schema_node.converted_type; } std::vector const& get_path_in_schema() { return path_in_schema; } @@ -846,11 +843,11 @@ parquet_column_view::parquet_column_view(schema_tree_node const& schema_node, uint16_t max_rep_level = 0; curr_schema_node = schema_node; while (curr_schema_node.parent_idx != -1) { - if (curr_schema_node.repetition_type == parquet::REPEATED or - curr_schema_node.repetition_type == parquet::OPTIONAL) { + if (curr_schema_node.repetition_type == REPEATED or + curr_schema_node.repetition_type == OPTIONAL) { ++max_def_level; } - if (curr_schema_node.repetition_type == parquet::REPEATED) { ++max_rep_level; } + if (curr_schema_node.repetition_type == REPEATED) { ++max_rep_level; } curr_schema_node = schema_tree[curr_schema_node.parent_idx]; } CUDF_EXPECTS(max_def_level < 256, "Definition levels above 255 are not supported"); @@ -897,9 +894,9 @@ parquet_column_view::parquet_column_view(schema_tree_node const& schema_node, } } -gpu::parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_stream_view) const +parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_stream_view) const { - auto desc = gpu::parquet_column_device_view{}; // Zero out all fields + auto desc = parquet_column_device_view{}; // Zero out all fields desc.stats_dtype = schema_node.stats_dtype; desc.ts_scale = schema_node.ts_scale; @@ -931,8 +928,8 @@ gpu::parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_s * @param fragment_size Number of rows per fragment * @param stream CUDA stream used for device memory operations and kernel 
launches */ -void init_row_group_fragments(cudf::detail::hostdevice_2dvector& frag, - device_span col_desc, +void init_row_group_fragments(cudf::detail::hostdevice_2dvector& frag, + device_span col_desc, host_span partitions, device_span part_frag_offset, uint32_t fragment_size, @@ -940,7 +937,7 @@ void init_row_group_fragments(cudf::detail::hostdevice_2dvector frag, +void calculate_page_fragments(device_span frag, host_span frag_sizes, rmm::cuda_stream_view stream) { auto d_frag_sz = cudf::detail::make_device_uvector_async( frag_sizes, stream, rmm::mr::get_current_device_resource()); - gpu::CalculatePageFragments(frag, d_frag_sz, stream); + CalculatePageFragments(frag, d_frag_sz, stream); } /** @@ -972,13 +969,13 @@ void calculate_page_fragments(device_span frag, * @param stream CUDA stream used for device memory operations and kernel launches */ void gather_fragment_statistics(device_span frag_stats, - device_span frags, + device_span frags, bool int96_timestamps, rmm::cuda_stream_view stream) { rmm::device_uvector frag_stats_group(frag_stats.size(), stream); - gpu::InitFragmentStatistics(frag_stats_group, frags, stream); + InitFragmentStatistics(frag_stats_group, frags, stream); detail::calculate_group_statistics( frag_stats.data(), frag_stats_group.data(), frag_stats.size(), stream, int96_timestamps); stream.synchronize(); @@ -1008,8 +1005,8 @@ size_t max_compression_output_size(Compression codec, uint32_t compression_block return compress_max_output_chunk_size(to_nvcomp_compression_type(codec), compression_blocksize); } -auto init_page_sizes(hostdevice_2dvector& chunks, - device_span col_desc, +auto init_page_sizes(hostdevice_2dvector& chunks, + device_span col_desc, uint32_t num_columns, size_t max_page_size_bytes, size_type max_page_size_rows, @@ -1021,19 +1018,19 @@ auto init_page_sizes(hostdevice_2dvector& chunks, chunks.host_to_device_async(stream); // Calculate number of pages and store in respective chunks - gpu::InitEncoderPages(chunks, - {}, - {}, - {}, - col_desc, - num_columns, - max_page_size_bytes, - max_page_size_rows, - page_alignment(compression_codec), - write_v2_headers, - nullptr, - nullptr, - stream); + InitEncoderPages(chunks, + {}, + {}, + {}, + col_desc, + num_columns, + max_page_size_bytes, + max_page_size_rows, + page_alignment(compression_codec), + write_v2_headers, + nullptr, + nullptr, + stream); chunks.device_to_host_sync(stream); int num_pages = 0; @@ -1046,19 +1043,19 @@ auto init_page_sizes(hostdevice_2dvector& chunks, // Now that we know the number of pages, allocate an array to hold per page size and get it // populated cudf::detail::hostdevice_vector page_sizes(num_pages, stream); - gpu::InitEncoderPages(chunks, - {}, - page_sizes, - {}, - col_desc, - num_columns, - max_page_size_bytes, - max_page_size_rows, - page_alignment(compression_codec), - write_v2_headers, - nullptr, - nullptr, - stream); + InitEncoderPages(chunks, + {}, + page_sizes, + {}, + col_desc, + num_columns, + max_page_size_bytes, + max_page_size_rows, + page_alignment(compression_codec), + write_v2_headers, + nullptr, + nullptr, + stream); page_sizes.device_to_host_sync(stream); // Get per-page max compressed size @@ -1072,26 +1069,26 @@ auto init_page_sizes(hostdevice_2dvector& chunks, comp_page_sizes.host_to_device_async(stream); // Use per-page max compressed size to calculate chunk.compressed_size - gpu::InitEncoderPages(chunks, - {}, - {}, - comp_page_sizes, - col_desc, - num_columns, - max_page_size_bytes, - max_page_size_rows, - page_alignment(compression_codec), - 
write_v2_headers, - nullptr, - nullptr, - stream); + InitEncoderPages(chunks, + {}, + {}, + comp_page_sizes, + col_desc, + num_columns, + max_page_size_bytes, + max_page_size_rows, + page_alignment(compression_codec), + write_v2_headers, + nullptr, + nullptr, + stream); chunks.device_to_host_sync(stream); return comp_page_sizes; } size_t max_page_bytes(Compression compression, size_t max_page_size_bytes) { - if (compression == parquet::Compression::UNCOMPRESSED) { return max_page_size_bytes; } + if (compression == Compression::UNCOMPRESSED) { return max_page_size_bytes; } auto const ncomp_type = to_nvcomp_compression_type(compression); auto const nvcomp_limit = nvcomp::is_compression_disabled(ncomp_type) @@ -1104,9 +1101,9 @@ size_t max_page_bytes(Compression compression, size_t max_page_size_bytes) } std::pair>, std::vector>> -build_chunk_dictionaries(hostdevice_2dvector& chunks, - host_span col_desc, - device_2dspan frags, +build_chunk_dictionaries(hostdevice_2dvector& chunks, + host_span col_desc, + device_2dspan frags, Compression compression, dictionary_policy dict_policy, size_t max_dict_size, @@ -1130,7 +1127,7 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, } // Allocate slots for each chunk - std::vector> hash_maps_storage; + std::vector> hash_maps_storage; hash_maps_storage.reserve(h_chunks.size()); for (auto& chunk : h_chunks) { if (col_desc[chunk.col_desc_id].physical_type == Type::BOOLEAN || @@ -1149,8 +1146,8 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, chunks.host_to_device_async(stream); - gpu::initialize_chunk_hash_maps(chunks.device_view().flat_view(), stream); - gpu::populate_chunk_hash_maps(frags, stream); + initialize_chunk_hash_maps(chunks.device_view().flat_view(), stream); + populate_chunk_hash_maps(frags, stream); chunks.device_to_host_sync(stream); @@ -1197,8 +1194,8 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, chunk.dict_index = inserted_dict_index.data(); } chunks.host_to_device_async(stream); - gpu::collect_map_entries(chunks.device_view().flat_view(), stream); - gpu::get_dictionary_indices(frags, stream); + collect_map_entries(chunks.device_view().flat_view(), stream); + get_dictionary_indices(frags, stream); return std::pair(std::move(dict_data), std::move(dict_index)); } @@ -1221,9 +1218,9 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, * @param write_v2_headers True if version 2 page headers are to be written * @param stream CUDA stream used for device memory operations and kernel launches */ -void init_encoder_pages(hostdevice_2dvector& chunks, - device_span col_desc, - device_span pages, +void init_encoder_pages(hostdevice_2dvector& chunks, + device_span col_desc, + device_span pages, cudf::detail::hostdevice_vector& comp_page_sizes, statistics_chunk* page_stats, statistics_chunk* frag_stats, @@ -1286,8 +1283,8 @@ void init_encoder_pages(hostdevice_2dvector& chunks, * @param write_v2_headers True if V2 page headers should be written * @param stream CUDA stream used for device memory operations and kernel launches */ -void encode_pages(hostdevice_2dvector& chunks, - device_span pages, +void encode_pages(hostdevice_2dvector& chunks, + device_span pages, uint32_t pages_in_batch, uint32_t first_page_in_batch, uint32_t rowgroups_in_batch, @@ -1308,8 +1305,7 @@ void encode_pages(hostdevice_2dvector& chunks, ? device_span(page_stats + first_page_in_batch, pages_in_batch) : device_span(); - uint32_t max_comp_pages = - (compression != parquet::Compression::UNCOMPRESSED) ? 
pages_in_batch : 0; + uint32_t max_comp_pages = (compression != Compression::UNCOMPRESSED) ? pages_in_batch : 0; rmm::device_uvector> comp_in(max_comp_pages, stream); rmm::device_uvector> comp_out(max_comp_pages, stream); @@ -1319,9 +1315,9 @@ void encode_pages(hostdevice_2dvector& chunks, comp_res.end(), compression_result{0, compression_status::FAILURE}); - gpu::EncodePages(batch_pages, write_v2_headers, comp_in, comp_out, comp_res, stream); + EncodePages(batch_pages, write_v2_headers, comp_in, comp_out, comp_res, stream); switch (compression) { - case parquet::Compression::SNAPPY: + case Compression::SNAPPY: if (nvcomp::is_compression_disabled(nvcomp::compression_type::SNAPPY)) { gpu_snap(comp_in, comp_out, comp_res, stream); } else { @@ -1329,7 +1325,7 @@ void encode_pages(hostdevice_2dvector& chunks, nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_res, stream); } break; - case parquet::Compression::ZSTD: { + case Compression::ZSTD: { if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD); reason) { CUDF_FAIL("Compression error: " + reason.value()); @@ -1338,7 +1334,7 @@ void encode_pages(hostdevice_2dvector& chunks, break; } - case parquet::Compression::UNCOMPRESSED: break; + case Compression::UNCOMPRESSED: break; default: CUDF_FAIL("invalid compression type"); } @@ -1378,7 +1374,7 @@ void encode_pages(hostdevice_2dvector& chunks, * @param column_index_truncate_length maximum length of min or max values in column index, in bytes * @return Computed buffer size needed to encode the column index */ -size_t column_index_buffer_size(gpu::EncColumnChunk* ck, int32_t column_index_truncate_length) +size_t column_index_buffer_size(EncColumnChunk* ck, int32_t column_index_truncate_length) { // encoding the column index for a given chunk requires: // each list (4 of them) requires 6 bytes of overhead @@ -1499,8 +1495,8 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, std::vector this_table_schema(schema_tree.begin(), schema_tree.end()); // Initialize column description - cudf::detail::hostdevice_vector col_desc(parquet_columns.size(), - stream); + cudf::detail::hostdevice_vector col_desc(parquet_columns.size(), + stream); std::transform( parquet_columns.begin(), parquet_columns.end(), col_desc.host_ptr(), [&](auto const& pcol) { return pcol.get_device_view(stream); @@ -1576,7 +1572,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto d_part_frag_offset = cudf::detail::make_device_uvector_async( part_frag_offset, stream, rmm::mr::get_current_device_resource()); - cudf::detail::hostdevice_2dvector row_group_fragments( + cudf::detail::hostdevice_2dvector row_group_fragments( num_columns, num_fragments, stream); // Create table_device_view so that corresponding column_device_view data @@ -1588,7 +1584,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, if (num_fragments != 0) { // Move column info to device col_desc.host_to_device_async(stream); - leaf_column_views = create_leaf_column_device_views( + leaf_column_views = create_leaf_column_device_views( col_desc, *parent_column_table_device_view, stream); init_row_group_fragments(row_group_fragments, @@ -1662,7 +1658,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, // Initialize row groups and column chunks auto const num_chunks = num_rowgroups * num_columns; - hostdevice_2dvector chunks(num_rowgroups, num_columns, stream); + hostdevice_2dvector chunks(num_rowgroups, num_columns, stream); // total fragments per 
column (in case they are non-uniform) std::vector frags_per_column(num_columns, 0); @@ -1678,7 +1674,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, row_group.total_byte_size = 0; row_group.columns.resize(num_columns); for (int c = 0; c < num_columns; c++) { - gpu::EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; + EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; ck = {}; ck.col_desc = col_desc.device_ptr() + c; @@ -1700,7 +1696,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, return l + r.num_values; }); ck.plain_data_size = std::accumulate( - chunk_fragments.begin(), chunk_fragments.end(), 0, [](int sum, gpu::PageFragment frag) { + chunk_fragments.begin(), chunk_fragments.end(), 0, [](int sum, PageFragment frag) { return sum + frag.fragment_data_size; }); auto& column_chunk_meta = row_group.columns[c].meta_data; @@ -1731,7 +1727,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, frags_per_column.empty() ? 0 : frag_offsets.back() + frags_per_column.back(); rmm::device_uvector frag_stats(0, stream); - cudf::detail::hostdevice_vector page_fragments(total_frags, stream); + cudf::detail::hostdevice_vector page_fragments(total_frags, stream); // update fragments and/or prepare for fragment statistics calculation if necessary if (total_frags != 0) { @@ -1749,9 +1745,9 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto const& row_group = agg_meta->file(p).row_groups[global_r]; uint32_t const fragments_in_chunk = util::div_rounding_up_unsafe(row_group.num_rows, frag_size); - gpu::EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; - ck.fragments = page_fragments.device_ptr(frag_offset); - ck.first_fragment = frag_offset; + EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; + ck.fragments = page_fragments.device_ptr(frag_offset); + ck.first_fragment = frag_offset; // update the chunk pointer here for each fragment in chunk.fragments for (uint32_t i = 0; i < fragments_in_chunk; i++) { @@ -1817,8 +1813,8 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, size_t comp_rowgroup_size = 0; if (r < num_rowgroups) { for (int i = 0; i < num_columns; i++) { - gpu::EncColumnChunk* ck = &chunks[r][i]; - ck->first_page = num_pages; + EncColumnChunk* ck = &chunks[r][i]; + ck->first_page = num_pages; num_pages += ck->num_pages; pages_in_batch += ck->num_pages; rowgroup_size += ck->bfr_size; @@ -1850,7 +1846,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, } // Clear compressed buffer size if compression has been turned off - if (compression == parquet::Compression::UNCOMPRESSED) { max_comp_bfr_size = 0; } + if (compression == Compression::UNCOMPRESSED) { max_comp_bfr_size = 0; } // Initialize data pointers in batch uint32_t const num_stats_bfr = @@ -1864,7 +1860,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, stream); rmm::device_buffer col_idx_bfr(column_index_bfr_size, stream); - rmm::device_uvector pages(num_pages, stream); + rmm::device_uvector pages(num_pages, stream); // This contains stats for both the pages and the rowgroups. TODO: make them separate. 
rmm::device_uvector page_stats(num_stats_bfr, stream); @@ -1874,10 +1870,10 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto bfr_c = static_cast(comp_bfr.data()); for (auto j = 0; j < batch_list[b]; j++, r++) { for (auto i = 0; i < num_columns; i++) { - gpu::EncColumnChunk& ck = chunks[r][i]; - ck.uncompressed_bfr = bfr; - ck.compressed_bfr = bfr_c; - ck.column_index_blob = bfr_i; + EncColumnChunk& ck = chunks[r][i]; + ck.uncompressed_bfr = bfr; + ck.compressed_bfr = bfr_c; + ck.column_index_blob = bfr_i; bfr += ck.bfr_size; bfr_c += ck.compressed_size; if (stats_granularity == statistics_freq::STATISTICS_COLUMN) { @@ -1960,7 +1956,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, if (ck.ck_stat_size != 0) { std::vector const stats_blob = cudf::detail::make_std_vector_sync( device_span(dev_bfr, ck.ck_stat_size), stream); - cudf::io::parquet::CompactProtocolReader cp(stats_blob.data(), stats_blob.size()); + CompactProtocolReader cp(stats_blob.data(), stats_blob.size()); cp.read(&column_chunk_meta.statistics); need_sync = true; } @@ -2142,8 +2138,8 @@ void writer::impl::write(table_view const& input, std::vector co void writer::impl::write_parquet_data_to_sink( std::unique_ptr& updated_agg_meta, - device_span pages, - host_2dspan chunks, + device_span pages, + host_2dspan chunks, host_span global_rowgroup_base, host_span first_rg_in_part, host_span batch_list, @@ -2209,7 +2205,7 @@ void writer::impl::write_parquet_data_to_sink( int const global_r = global_rowgroup_base[p] + r - first_rg_in_part[p]; auto const& row_group = _agg_meta->file(p).row_groups[global_r]; for (std::size_t i = 0; i < num_columns; i++) { - gpu::EncColumnChunk const& ck = chunks[r][i]; + EncColumnChunk const& ck = chunks[r][i]; auto const& column_chunk_meta = row_group.columns[i].meta_data; // start transfer of the column index @@ -2392,7 +2388,4 @@ std::unique_ptr> writer::merge_row_group_metadata( return std::make_unique>(std::move(output)); } -} // namespace parquet -} // namespace detail -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 89ef85ba2bd..1d27a8400c8 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -38,15 +38,11 @@ #include #include -namespace cudf { -namespace io { -namespace detail { -namespace parquet { +namespace cudf::io::parquet::detail { + // Forward internal classes struct aggregate_writer_metadata; -using namespace cudf::io::parquet; -using namespace cudf::io; using cudf::detail::device_2dspan; using cudf::detail::host_2dspan; using cudf::detail::hostdevice_2dvector; @@ -66,7 +62,7 @@ class writer::impl { */ explicit impl(std::vector> sinks, parquet_writer_options const& options, - single_write_mode mode, + cudf::io::detail::single_write_mode mode, rmm::cuda_stream_view stream); /** @@ -79,7 +75,7 @@ class writer::impl { */ explicit impl(std::vector> sinks, chunked_parquet_writer_options const& options, - single_write_mode mode, + cudf::io::detail::single_write_mode mode, rmm::cuda_stream_view stream); /** @@ -139,8 +135,8 @@ class writer::impl { * @param[out] bounce_buffer Temporary host output buffer */ void write_parquet_data_to_sink(std::unique_ptr& updated_agg_meta, - device_span pages, - host_2dspan chunks, + device_span pages, + host_2dspan chunks, host_span global_rowgroup_base, host_span first_rg_in_part, host_span batch_list, @@ -164,9 +160,10 @@ class writer::impl { bool 
const _write_v2_headers; int32_t const _column_index_truncate_length; std::vector> const _kv_meta; // Optional user metadata. - single_write_mode const _single_write_mode; // Special parameter only used by `write()` to - // indicate that we are guaranteeing a single table - // write. This enables some internal optimizations. + cudf::io::detail::single_write_mode const + _single_write_mode; // Special parameter only used by `write()` to + // indicate that we are guaranteeing a single table + // write. This enables some internal optimizations. std::vector> const _out_sink; // Internal states, filled during `write()` and written to sink during `write` and `close()`. @@ -180,7 +177,4 @@ class writer::impl { bool _closed = false; // To track if the output has been written to sink. }; -} // namespace parquet -} // namespace detail -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 73c946a5feb..3e5d7033e60 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -200,29 +200,30 @@ std::unique_ptr make_parquet_list_list_col( // of the file to populate the FileMetaData pointed to by file_meta_data. // throws cudf::logic_error if the file or metadata is invalid. void read_footer(std::unique_ptr const& source, - cudf::io::parquet::FileMetaData* file_meta_data) + cudf::io::parquet::detail::FileMetaData* file_meta_data) { - constexpr auto header_len = sizeof(cudf::io::parquet::file_header_s); - constexpr auto ender_len = sizeof(cudf::io::parquet::file_ender_s); + constexpr auto header_len = sizeof(cudf::io::parquet::detail::file_header_s); + constexpr auto ender_len = sizeof(cudf::io::parquet::detail::file_ender_s); auto const len = source->size(); auto const header_buffer = source->host_read(0, header_len); auto const header = - reinterpret_cast(header_buffer->data()); + reinterpret_cast(header_buffer->data()); auto const ender_buffer = source->host_read(len - ender_len, ender_len); - auto const ender = reinterpret_cast(ender_buffer->data()); + auto const ender = + reinterpret_cast(ender_buffer->data()); // checks for valid header, footer, and file length ASSERT_GT(len, header_len + ender_len); - ASSERT_TRUE(header->magic == cudf::io::parquet::parquet_magic && - ender->magic == cudf::io::parquet::parquet_magic); + ASSERT_TRUE(header->magic == cudf::io::parquet::detail::parquet_magic && + ender->magic == cudf::io::parquet::detail::parquet_magic); ASSERT_TRUE(ender->footer_len != 0 && ender->footer_len <= (len - header_len - ender_len)); // parquet files end with 4-byte footer_length and 4-byte magic == "PAR1" // seek backwards from the end of the file (footer_length + 8 bytes of ender) auto const footer_buffer = source->host_read(len - ender->footer_len - ender_len, ender->footer_len); - cudf::io::parquet::CompactProtocolReader cp(footer_buffer->data(), ender->footer_len); + cudf::io::parquet::detail::CompactProtocolReader cp(footer_buffer->data(), ender->footer_len); // returns true on success bool res = cp.read(file_meta_data); @@ -233,14 +234,14 @@ void read_footer(std::unique_ptr const& source, // this assumes the data is uncompressed. // throws cudf::logic_error if the page_loc data is invalid. 
int read_dict_bits(std::unique_ptr const& source, - cudf::io::parquet::PageLocation const& page_loc) + cudf::io::parquet::detail::PageLocation const& page_loc) { CUDF_EXPECTS(page_loc.offset > 0, "Cannot find page header"); CUDF_EXPECTS(page_loc.compressed_page_size > 0, "Invalid page header length"); - cudf::io::parquet::PageHeader page_hdr; + cudf::io::parquet::detail::PageHeader page_hdr; auto const page_buf = source->host_read(page_loc.offset, page_loc.compressed_page_size); - cudf::io::parquet::CompactProtocolReader cp(page_buf->data(), page_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(page_buf->data(), page_buf->size()); bool res = cp.read(&page_hdr); CUDF_EXPECTS(res, "Cannot parse page header"); @@ -252,15 +253,16 @@ int read_dict_bits(std::unique_ptr const& source, // read column index from datasource at location indicated by chunk, // parse and return as a ColumnIndex struct. // throws cudf::logic_error if the chunk data is invalid. -cudf::io::parquet::ColumnIndex read_column_index( - std::unique_ptr const& source, cudf::io::parquet::ColumnChunk const& chunk) +cudf::io::parquet::detail::ColumnIndex read_column_index( + std::unique_ptr const& source, + cudf::io::parquet::detail::ColumnChunk const& chunk) { CUDF_EXPECTS(chunk.column_index_offset > 0, "Cannot find column index"); CUDF_EXPECTS(chunk.column_index_length > 0, "Invalid column index length"); - cudf::io::parquet::ColumnIndex colidx; + cudf::io::parquet::detail::ColumnIndex colidx; auto const ci_buf = source->host_read(chunk.column_index_offset, chunk.column_index_length); - cudf::io::parquet::CompactProtocolReader cp(ci_buf->data(), ci_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(ci_buf->data(), ci_buf->size()); bool res = cp.read(&colidx); CUDF_EXPECTS(res, "Cannot parse column index"); return colidx; @@ -269,22 +271,24 @@ cudf::io::parquet::ColumnIndex read_column_index( // read offset index from datasource at location indicated by chunk, // parse and return as an OffsetIndex struct. // throws cudf::logic_error if the chunk data is invalid. -cudf::io::parquet::OffsetIndex read_offset_index( - std::unique_ptr const& source, cudf::io::parquet::ColumnChunk const& chunk) +cudf::io::parquet::detail::OffsetIndex read_offset_index( + std::unique_ptr const& source, + cudf::io::parquet::detail::ColumnChunk const& chunk) { CUDF_EXPECTS(chunk.offset_index_offset > 0, "Cannot find offset index"); CUDF_EXPECTS(chunk.offset_index_length > 0, "Invalid offset index length"); - cudf::io::parquet::OffsetIndex offidx; + cudf::io::parquet::detail::OffsetIndex offidx; auto const oi_buf = source->host_read(chunk.offset_index_offset, chunk.offset_index_length); - cudf::io::parquet::CompactProtocolReader cp(oi_buf->data(), oi_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(oi_buf->data(), oi_buf->size()); bool res = cp.read(&offidx); CUDF_EXPECTS(res, "Cannot parse offset index"); return offidx; } // Return as a Statistics from the column chunk -cudf::io::parquet::Statistics const& get_statistics(cudf::io::parquet::ColumnChunk const& chunk) +cudf::io::parquet::detail::Statistics const& get_statistics( + cudf::io::parquet::detail::ColumnChunk const& chunk) { return chunk.meta_data.statistics; } @@ -292,15 +296,16 @@ cudf::io::parquet::Statistics const& get_statistics(cudf::io::parquet::ColumnChu // read page header from datasource at location indicated by page_loc, // parse and return as a PageHeader struct. // throws cudf::logic_error if the page_loc data is invalid. 
-cudf::io::parquet::PageHeader read_page_header(std::unique_ptr const& source, - cudf::io::parquet::PageLocation const& page_loc) +cudf::io::parquet::detail::PageHeader read_page_header( + std::unique_ptr const& source, + cudf::io::parquet::detail::PageLocation const& page_loc) { CUDF_EXPECTS(page_loc.offset > 0, "Cannot find page header"); CUDF_EXPECTS(page_loc.compressed_page_size > 0, "Invalid page header length"); - cudf::io::parquet::PageHeader page_hdr; + cudf::io::parquet::detail::PageHeader page_hdr; auto const page_buf = source->host_read(page_loc.offset, page_loc.compressed_page_size); - cudf::io::parquet::CompactProtocolReader cp(page_buf->data(), page_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(page_buf->data(), page_buf->size()); bool res = cp.read(&page_hdr); CUDF_EXPECTS(res, "Cannot parse page header"); return page_hdr; @@ -3686,7 +3691,7 @@ TEST_F(ParquetWriterTest, CheckPageRows) // check first page header and make sure it has only page_rows values auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_GT(fmd.row_groups.size(), 0); @@ -3697,7 +3702,7 @@ TEST_F(ParquetWriterTest, CheckPageRows) // read first data page header. sizeof(PageHeader) is not exact, but the thrift encoded // version should be smaller than size of the struct. auto const ph = read_page_header( - source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::PageHeader), 0}); + source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0}); EXPECT_EQ(ph.data_page_header.num_values, page_rows); } @@ -3722,7 +3727,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsAdjusted) // check first page header and make sure it has only page_rows values auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_GT(fmd.row_groups.size(), 0); @@ -3733,7 +3738,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsAdjusted) // read first data page header. sizeof(PageHeader) is not exact, but the thrift encoded // version should be smaller than size of the struct. auto const ph = read_page_header( - source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::PageHeader), 0}); + source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0}); EXPECT_LE(ph.data_page_header.num_values, rows_per_page); } @@ -3759,7 +3764,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsTooSmall) // check that file is written correctly when rows/page < fragment size auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_TRUE(fmd.row_groups.size() > 0); @@ -3770,7 +3775,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsTooSmall) // read first data page header. sizeof(PageHeader) is not exact, but the thrift encoded // version should be smaller than size of the struct. 
auto const ph = read_page_header( - source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::PageHeader), 0}); + source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0}); // there should be only one page since the fragment size is larger than rows_per_page EXPECT_EQ(ph.data_page_header.num_values, num_rows); @@ -3798,7 +3803,7 @@ TEST_F(ParquetWriterTest, Decimal128Stats) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4031,7 +4036,7 @@ TYPED_TEST(ParquetWriterComparableTypeTest, ThreeColumnSorted) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_GT(fmd.row_groups.size(), 0); @@ -4041,10 +4046,10 @@ TYPED_TEST(ParquetWriterComparableTypeTest, ThreeColumnSorted) // now check that the boundary order for chunk 1 is ascending, // chunk 2 is descending, and chunk 3 is unordered - cudf::io::parquet::BoundaryOrder expected_orders[] = { - cudf::io::parquet::BoundaryOrder::ASCENDING, - cudf::io::parquet::BoundaryOrder::DESCENDING, - cudf::io::parquet::BoundaryOrder::UNORDERED}; + cudf::io::parquet::detail::BoundaryOrder expected_orders[] = { + cudf::io::parquet::detail::BoundaryOrder::ASCENDING, + cudf::io::parquet::detail::BoundaryOrder::DESCENDING, + cudf::io::parquet::detail::BoundaryOrder::UNORDERED}; for (std::size_t i = 0; i < columns.size(); i++) { auto const ci = read_column_index(source, columns[i]); @@ -4067,15 +4072,15 @@ int32_t compare(T& v1, T& v2) // 1 if v1 > v2. int32_t compare_binary(std::vector const& v1, std::vector const& v2, - cudf::io::parquet::Type ptype, - cudf::io::parquet::ConvertedType ctype) + cudf::io::parquet::detail::Type ptype, + cudf::io::parquet::detail::ConvertedType ctype) { switch (ptype) { - case cudf::io::parquet::INT32: + case cudf::io::parquet::detail::INT32: switch (ctype) { - case cudf::io::parquet::UINT_8: - case cudf::io::parquet::UINT_16: - case cudf::io::parquet::UINT_32: + case cudf::io::parquet::detail::UINT_8: + case cudf::io::parquet::detail::UINT_16: + case cudf::io::parquet::detail::UINT_32: return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); default: @@ -4083,23 +4088,23 @@ int32_t compare_binary(std::vector const& v1, *(reinterpret_cast(v2.data()))); } - case cudf::io::parquet::INT64: - if (ctype == cudf::io::parquet::UINT_64) { + case cudf::io::parquet::detail::INT64: + if (ctype == cudf::io::parquet::detail::UINT_64) { return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); } return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); - case cudf::io::parquet::FLOAT: + case cudf::io::parquet::detail::FLOAT: return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); - case cudf::io::parquet::DOUBLE: + case cudf::io::parquet::detail::DOUBLE: return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); - case cudf::io::parquet::BYTE_ARRAY: { + case cudf::io::parquet::detail::BYTE_ARRAY: { int32_t v1sz = v1.size(); int32_t v2sz = v2.size(); int32_t ret = memcmp(v1.data(), v2.data(), std::min(v1sz, v2sz)); @@ -4142,7 +4147,7 @@ TEST_P(ParquetV2Test, LargeColumnIndex) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - 
cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4164,10 +4169,10 @@ TEST_P(ParquetV2Test, LargeColumnIndex) TEST_P(ParquetV2Test, CheckColumnOffsetIndex) { - constexpr auto num_rows = 100000; - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + constexpr auto num_rows = 100000; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { @@ -4210,7 +4215,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4255,10 +4260,10 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) { - constexpr auto num_rows = 100000; - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + constexpr auto num_rows = 100000; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { @@ -4311,7 +4316,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4362,10 +4367,10 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) { - constexpr auto num_rows = 100000; - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + constexpr auto num_rows = 100000; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { @@ -4403,7 +4408,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4458,9 +4463,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) { - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? 
cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; auto c0 = testdata::ascending(); @@ -4495,7 +4500,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4542,9 +4547,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls) { - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; auto validity2 = cudf::detail::make_counting_transform_iterator(0, [](cudf::size_type i) { return i % 2; }); @@ -4586,7 +4591,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4616,9 +4621,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls) TEST_P(ParquetV2Test, CheckColumnIndexListWithNulls) { - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; using cudf::test::iterators::null_at; using cudf::test::iterators::nulls_at; @@ -4711,7 +4716,7 @@ TEST_P(ParquetV2Test, CheckColumnIndexListWithNulls) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4812,7 +4817,7 @@ TEST_F(ParquetWriterTest, CheckColumnIndexTruncation) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4870,7 +4875,7 @@ TEST_F(ParquetWriterTest, BinaryColumnIndexTruncation) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -5030,10 +5035,10 @@ TEST_F(ParquetReaderTest, NestedByteArray) cudf::io::write_parquet(out_opts); auto source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); - EXPECT_EQ(fmd.schema[5].type, cudf::io::parquet::Type::BYTE_ARRAY); + EXPECT_EQ(fmd.schema[5].type, cudf::io::parquet::detail::Type::BYTE_ARRAY); std::vector md{ {}, @@ -5081,12 +5086,12 @@ TEST_F(ParquetWriterTest, ByteArrayStats) auto result = cudf::io::read_parquet(in_opts); auto source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); - EXPECT_EQ(fmd.schema[1].type, cudf::io::parquet::Type::BYTE_ARRAY); - EXPECT_EQ(fmd.schema[2].type, cudf::io::parquet::Type::BYTE_ARRAY); + 
EXPECT_EQ(fmd.schema[1].type, cudf::io::parquet::detail::Type::BYTE_ARRAY); + EXPECT_EQ(fmd.schema[2].type, cudf::io::parquet::detail::Type::BYTE_ARRAY); auto const stats0 = get_statistics(fmd.row_groups[0].columns[0]); auto const stats1 = get_statistics(fmd.row_groups[0].columns[1]); @@ -5137,9 +5142,9 @@ TEST_F(ParquetReaderTest, StructByteArray) TEST_F(ParquetReaderTest, NestingOptimizationTest) { - // test nesting levels > cudf::io::parquet::gpu::max_cacheable_nesting_decode_info deep. + // test nesting levels > cudf::io::parquet::detail::max_cacheable_nesting_decode_info deep. constexpr cudf::size_type num_nesting_levels = 16; - static_assert(num_nesting_levels > cudf::io::parquet::gpu::max_cacheable_nesting_decode_info); + static_assert(num_nesting_levels > cudf::io::parquet::detail::max_cacheable_nesting_decode_info); constexpr cudf::size_type rows_per_level = 2; constexpr cudf::size_type num_values = (1 << num_nesting_levels) * rows_per_level; @@ -5206,13 +5211,13 @@ TEST_F(ParquetWriterTest, SingleValueDictionaryTest) // make sure dictionary was used auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd]() { for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5252,13 +5257,13 @@ TEST_F(ParquetWriterTest, DictionaryNeverTest) // make sure dictionary was not used auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd]() { for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5303,13 +5308,13 @@ TEST_F(ParquetWriterTest, DictionaryAdaptiveTest) // make sure dictionary was used as expected. col0 should use one, // col1 should not. 
auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd](int col) { for (auto enc : fmd.row_groups[0].columns[col].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5354,13 +5359,13 @@ TEST_F(ParquetWriterTest, DictionaryAlwaysTest) // make sure dictionary was used for both columns auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd](int col) { for (auto enc : fmd.row_groups[0].columns[col].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5438,13 +5443,13 @@ TEST_P(ParquetSizedTest, DictionaryTest) // make sure dictionary was used auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd]() { for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -6664,7 +6669,7 @@ TEST_F(ParquetWriterTest, PreserveNullability) TEST_P(ParquetV2Test, CheckEncodings) { - using cudf::io::parquet::Encoding; + using cudf::io::parquet::detail::Encoding; constexpr auto num_rows = 100'000; auto const is_v2 = GetParam(); @@ -6697,7 +6702,7 @@ TEST_P(ParquetV2Test, CheckEncodings) }; auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto const& chunk0_enc = fmd.row_groups[0].columns[0].meta_data.encodings; From e345620ddaf5d8ac87e2428a84508ecfec2ba4f8 Mon Sep 17 00:00:00 2001 From: Suraj Aralihalli Date: Mon, 9 Oct 2023 10:13:25 -0700 Subject: [PATCH 013/118] Add stream parameter to List Manipulation and Operations APIs (#14248) I have organized the public List APIs into **three** distinct categories based on their functionality, simplifying the PRs for easier and shorter reviews. This particular PR introduces the `stream` parameter only to the `List Manipulation and Operations APIs`, which fall under `Section 1`. See next comment for other sections. 1. List Manipulation and Operations (`combine.hpp`, `contains.hpp`, `count_elements.hpp`) ``` concatenate_rows concatenate_list_elements contains_nulls contains - search_keys contains - search_key index_of - search_keys index_of - search_key count_elements ``` This PR addresses issues in the following files: 1. **column_wrapper.hpp**: - Corrects the improper passing of the stream value in the `make_lists_column` function. - Enables the missing cast to `lists_column_view`. - Substitutes `copy_bitmask` with `cudf::detail::copy_bitmask` to include the stream parameter. 2. 
**concatenate.cu:** - Substitutes `create_null_mask` with `cudf::detail::create_null_mask` to include the stream parameter. Authors: - Suraj Aralihalli (https://github.com/SurajAralihalli) - Vyas Ramasubramani (https://github.com/vyasr) - Yunsong Wang (https://github.com/PointKernel) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/14248 --- cpp/include/cudf/lists/combine.hpp | 4 + cpp/include/cudf/lists/contains.hpp | 14 ++- cpp/include/cudf/lists/count_elements.hpp | 2 + cpp/include/cudf_test/column_wrapper.hpp | 45 +++++++--- .../combine/concatenate_list_elements.cu | 3 +- cpp/src/lists/combine/concatenate_rows.cu | 3 +- cpp/src/lists/contains.cu | 37 ++++---- cpp/src/lists/copying/concatenate.cu | 5 +- cpp/src/lists/count_elements.cu | 3 +- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/lists_test.cpp | 87 +++++++++++++++++++ 11 files changed, 169 insertions(+), 35 deletions(-) create mode 100644 cpp/tests/streams/lists_test.cpp diff --git a/cpp/include/cudf/lists/combine.hpp b/cpp/include/cudf/lists/combine.hpp index 0bc76828fc3..0d9c1c157eb 100644 --- a/cpp/include/cudf/lists/combine.hpp +++ b/cpp/include/cudf/lists/combine.hpp @@ -57,6 +57,7 @@ enum class concatenate_null_policy { IGNORE, NULLIFY_OUTPUT_ROW }; * @param input Table of lists to be concatenated. * @param null_policy The parameter to specify whether a null list element will be ignored from * concatenation, or any concatenation involving a null element will result in a null list. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return A new column in which each row is a list resulted from concatenating all list elements in * the corresponding row of the input table. @@ -64,6 +65,7 @@ enum class concatenate_null_policy { IGNORE, NULLIFY_OUTPUT_ROW }; std::unique_ptr concatenate_rows( table_view const& input, concatenate_null_policy null_policy = concatenate_null_policy::IGNORE, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -86,6 +88,7 @@ std::unique_ptr concatenate_rows( * @param input The lists column containing lists of list elements to concatenate. * @param null_policy The parameter to specify whether a null list element will be ignored from * concatenation, or any concatenation involving a null element will result in a null list. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return A new column in which each row is a list resulted from concatenating all list elements in * the corresponding row of the input lists column. 
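 * [Editor's illustration, not part of this patch] Against the new signature
 * above, a call that passes the stream explicitly would look like this sketch,
 * where `lll` is a hypothetical cudf::column_view of lists-of-lists:
 * @code{.cpp}
 * auto out = cudf::lists::concatenate_list_elements(
 *   lll, cudf::lists::concatenate_null_policy::IGNORE, cudf::get_default_stream());
 * @endcode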
@@ -93,6 +96,7 @@ std::unique_ptr concatenate_rows( std::unique_ptr concatenate_list_elements( column_view const& input, concatenate_null_policy null_policy = concatenate_null_policy::IGNORE, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/lists/contains.hpp b/cpp/include/cudf/lists/contains.hpp index 21c2ca1d64e..7cf67ec9205 100644 --- a/cpp/include/cudf/lists/contains.hpp +++ b/cpp/include/cudf/lists/contains.hpp @@ -42,12 +42,14 @@ namespace lists { * * @param lists Lists column whose `n` rows are to be searched * @param search_key The scalar key to be looked up in each list row + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory * @return BOOL8 column of `n` rows with the result of the lookup */ std::unique_ptr contains( cudf::lists_column_view const& lists, cudf::scalar const& search_key, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -63,13 +65,15 @@ std::unique_ptr contains( * 2. The list row `lists[i]` is null * * @param lists Lists column whose `n` rows are to be searched - * @param search_keys Column of elements to be looked up in each list row + * @param search_keys Column of elements to be looked up in each list row. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory * @return BOOL8 column of `n` rows with the result of the lookup */ std::unique_ptr contains( cudf::lists_column_view const& lists, cudf::column_view const& search_keys, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -84,12 +88,14 @@ std::unique_ptr contains( * A row with an empty list will always return false. * Nulls inside non-null nested elements (such as lists or structs) are not considered. * - * @param lists Lists column whose `n` rows are to be searched + * @param lists Lists column whose `n` rows are to be searched. + * @param stream CUDA stream used for device memory operations and kernel launches. 
* @param mr Device memory resource used to allocate the returned column's device memory * @return BOOL8 column of `n` rows with the result of the lookup */ std::unique_ptr contains_nulls( cudf::lists_column_view const& lists, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -125,6 +131,7 @@ enum class duplicate_find_option : int32_t { * @param search_key The scalar key to be looked up in each list row * @param find_option Whether to return the position of the first match (`FIND_FIRST`) or * last (`FIND_LAST`) + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return column of `n` rows with the location of the `search_key` */ @@ -132,6 +139,7 @@ std::unique_ptr index_of( cudf::lists_column_view const& lists, cudf::scalar const& search_key, duplicate_find_option find_option = duplicate_find_option::FIND_FIRST, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -160,6 +168,7 @@ std::unique_ptr index_of( * `lists` * @param find_option Whether to return the position of the first match (`FIND_FIRST`) or * last (`FIND_LAST`) + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return column of `n` rows with the location of the `search_key` */ @@ -167,6 +176,7 @@ std::unique_ptr index_of( cudf::lists_column_view const& lists, cudf::column_view const& search_keys, duplicate_find_option find_option = duplicate_find_option::FIND_FIRST, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/lists/count_elements.hpp b/cpp/include/cudf/lists/count_elements.hpp index 552ba058b93..e4bd0dca9ae 100644 --- a/cpp/include/cudf/lists/count_elements.hpp +++ b/cpp/include/cudf/lists/count_elements.hpp @@ -45,11 +45,13 @@ namespace lists { * in the output column. * * @param input Input lists column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column with the number of elements for each row */ std::unique_ptr count_elements( lists_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of lists_elements group diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index c0932b81dc3..e94dfea9dcf 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -1281,6 +1282,11 @@ class dictionary_column_wrapper : public detail::column_wrapper { template class lists_column_wrapper : public detail::column_wrapper { public: + /** + * @brief Cast to lists_column_view + */ + operator lists_column_view() const { return cudf::lists_column_view{wrapped->view()}; } + /** * @brief Construct a lists column containing a single list of fixed-width * type from an initializer list of values. 
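 * [Editor's illustration, not part of this patch] For example, the following
 * builds a single list of two integers, i.e. [{0, 1}]:
 * @code{.cpp}
 * cudf::test::lists_column_wrapper<int32_t> l{0, 1};
 * @endcode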
@@ -1542,8 +1548,12 @@ class lists_column_wrapper : public detail::column_wrapper { rmm::device_buffer&& null_mask) { // construct the list column - wrapped = make_lists_column( - num_rows, std::move(offsets), std::move(values), null_count, std::move(null_mask)); + wrapped = make_lists_column(num_rows, + std::move(offsets), + std::move(values), + null_count, + std::move(null_mask), + cudf::test::get_default_stream()); } /** @@ -1618,8 +1628,12 @@ class lists_column_wrapper : public detail::column_wrapper { }(); // construct the list column - wrapped = make_lists_column( - cols.size(), std::move(offsets), std::move(data), null_count, std::move(null_mask)); + wrapped = make_lists_column(cols.size(), + std::move(offsets), + std::move(data), + null_count, + std::move(null_mask), + cudf::test::get_default_stream()); } /** @@ -1647,8 +1661,12 @@ class lists_column_wrapper : public detail::column_wrapper { depth = 0; size_type num_elements = offsets->size() == 0 ? 0 : offsets->size() - 1; - wrapped = - make_lists_column(num_elements, std::move(offsets), std::move(c), 0, rmm::device_buffer{}); + wrapped = make_lists_column(num_elements, + std::move(offsets), + std::move(c), + 0, + rmm::device_buffer{}, + cudf::test::get_default_stream()); } /** @@ -1697,12 +1715,15 @@ class lists_column_wrapper : public detail::column_wrapper { } lists_column_view lcv(col); - return make_lists_column(col.size(), - std::make_unique(lcv.offsets()), - normalize_column(lists_column_view(col).child(), - lists_column_view(expected_hierarchy).child()), - col.null_count(), - copy_bitmask(col)); + return make_lists_column( + col.size(), + std::make_unique(lcv.offsets()), + normalize_column(lists_column_view(col).child(), + lists_column_view(expected_hierarchy).child()), + col.null_count(), + cudf::detail::copy_bitmask( + col, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), + cudf::test::get_default_stream()); } std::pair, std::vector>> preprocess_columns( diff --git a/cpp/src/lists/combine/concatenate_list_elements.cu b/cpp/src/lists/combine/concatenate_list_elements.cu index fbe297765f8..99dbd55678b 100644 --- a/cpp/src/lists/combine/concatenate_list_elements.cu +++ b/cpp/src/lists/combine/concatenate_list_elements.cu @@ -271,10 +271,11 @@ std::unique_ptr concatenate_list_elements(column_view const& input, */ std::unique_ptr concatenate_list_elements(column_view const& input, concatenate_null_policy null_policy, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate_list_elements(input, null_policy, cudf::get_default_stream(), mr); + return detail::concatenate_list_elements(input, null_policy, stream, mr); } } // namespace lists diff --git a/cpp/src/lists/combine/concatenate_rows.cu b/cpp/src/lists/combine/concatenate_rows.cu index 658538b0195..49be7b5ff17 100644 --- a/cpp/src/lists/combine/concatenate_rows.cu +++ b/cpp/src/lists/combine/concatenate_rows.cu @@ -305,10 +305,11 @@ std::unique_ptr concatenate_rows(table_view const& input, */ std::unique_ptr concatenate_rows(table_view const& input, concatenate_null_policy null_policy, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate_rows(input, null_policy, cudf::get_default_stream(), mr); + return detail::concatenate_rows(input, null_policy, stream, mr); } } // namespace lists diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu index df1d043bdb6..4733a5d63a8 100644 --- a/cpp/src/lists/contains.cu +++ 
b/cpp/src/lists/contains.cu @@ -287,7 +287,7 @@ std::unique_ptr index_of(lists_column_view const& lists, } auto search_key_col = cudf::make_column_from_scalar(search_key, lists.size(), stream, mr); - return index_of(lists, search_key_col->view(), find_option, stream, mr); + return detail::index_of(lists, search_key_col->view(), find_option, stream, mr); } std::unique_ptr index_of(lists_column_view const& lists, @@ -306,11 +306,11 @@ std::unique_ptr contains(lists_column_view const& lists, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto key_indices = index_of(lists, - search_key, - duplicate_find_option::FIND_FIRST, - stream, - rmm::mr::get_current_device_resource()); + auto key_indices = detail::index_of(lists, + search_key, + duplicate_find_option::FIND_FIRST, + stream, + rmm::mr::get_current_device_resource()); return to_contains(std::move(key_indices), stream, mr); } @@ -322,11 +322,11 @@ std::unique_ptr contains(lists_column_view const& lists, CUDF_EXPECTS(search_keys.size() == lists.size(), "Number of search keys must match list column size."); - auto key_indices = index_of(lists, - search_keys, - duplicate_find_option::FIND_FIRST, - stream, - rmm::mr::get_current_device_resource()); + auto key_indices = detail::index_of(lists, + search_keys, + duplicate_find_option::FIND_FIRST, + stream, + rmm::mr::get_current_device_resource()); return to_contains(std::move(key_indices), stream, mr); } @@ -364,43 +364,48 @@ std::unique_ptr contains_nulls(lists_column_view const& lists, std::unique_ptr contains(lists_column_view const& lists, cudf::scalar const& search_key, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains(lists, search_key, cudf::get_default_stream(), mr); + return detail::contains(lists, search_key, stream, mr); } std::unique_ptr contains(lists_column_view const& lists, column_view const& search_keys, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains(lists, search_keys, cudf::get_default_stream(), mr); + return detail::contains(lists, search_keys, stream, mr); } std::unique_ptr contains_nulls(lists_column_view const& lists, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains_nulls(lists, cudf::get_default_stream(), mr); + return detail::contains_nulls(lists, stream, mr); } std::unique_ptr index_of(lists_column_view const& lists, cudf::scalar const& search_key, duplicate_find_option find_option, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::index_of(lists, search_key, find_option, cudf::get_default_stream(), mr); + return detail::index_of(lists, search_key, find_option, stream, mr); } std::unique_ptr index_of(lists_column_view const& lists, column_view const& search_keys, duplicate_find_option find_option, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::index_of(lists, search_keys, find_option, cudf::get_default_stream(), mr); + return detail::index_of(lists, search_keys, find_option, stream, mr); } } // namespace cudf::lists diff --git a/cpp/src/lists/copying/concatenate.cu b/cpp/src/lists/copying/concatenate.cu index ddd0dfbe084..5407b88236f 100644 --- a/cpp/src/lists/copying/concatenate.cu +++ b/cpp/src/lists/copying/concatenate.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -123,8 +124,8 @@ std::unique_ptr 
concatenate(host_span columns, // if any of the input columns have nulls, construct the output mask bool const has_nulls = std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.has_nulls(); }); - rmm::device_buffer null_mask = create_null_mask( - total_list_count, has_nulls ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED); + rmm::device_buffer null_mask = cudf::detail::create_null_mask( + total_list_count, has_nulls ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED, stream, mr); auto null_mask_data = static_cast(null_mask.data()); auto const null_count = has_nulls ? cudf::detail::concatenate_masks(columns, null_mask_data, stream) : size_type{0}; diff --git a/cpp/src/lists/count_elements.cu b/cpp/src/lists/count_elements.cu index 40a14d805e1..2fd0851067a 100644 --- a/cpp/src/lists/count_elements.cu +++ b/cpp/src/lists/count_elements.cu @@ -73,10 +73,11 @@ std::unique_ptr count_elements(lists_column_view const& input, // external APIS std::unique_ptr count_elements(lists_column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::count_elements(input, cudf::get_default_stream(), mr); + return detail::count_elements(input, stream, mr); } } // namespace lists diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index ac13c121530..ffaba7d6fa7 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -638,6 +638,7 @@ ConfigureTest( ) ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_TEXT_TEST streams/text/ngrams_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing) # ################################################################################################## # Install tests #################################################################################### diff --git a/cpp/tests/streams/lists_test.cpp b/cpp/tests/streams/lists_test.cpp new file mode 100644 index 00000000000..e292b551d83 --- /dev/null +++ b/cpp/tests/streams/lists_test.cpp @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include +#include + +class ListTest : public cudf::test::BaseFixture {}; + +TEST_F(ListTest, ConcatenateRows) +{ + cudf::test::lists_column_wrapper list_col_1{{0, 1}, {2, 3}, {4, 5}}; + cudf::test::lists_column_wrapper list_col_2{{0, 1}, {2, 3}, {4, 5}}; + cudf::table_view lists_table({list_col_1, list_col_2}); + cudf::lists::concatenate_rows( + lists_table, cudf::lists::concatenate_null_policy::IGNORE, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ConcatenateListElements) +{ + cudf::test::lists_column_wrapper ll_column{{{0, 1}, {2, 3}}, {{4, 5}, {6, 7}}}; + cudf::lists::concatenate_list_elements( + ll_column, cudf::lists::concatenate_null_policy::IGNORE, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ContainsNulls) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3}, {4, 5}}; + cudf::lists::contains_nulls(list_col, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ContainsSearchKey) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3}, {4, 5}}; + cudf::numeric_scalar search_key(2, true, cudf::test::get_default_stream()); + cudf::lists::contains(list_col, search_key, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ContainsSearchKeys) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3}, {4, 5}}; + cudf::test::fixed_width_column_wrapper search_keys({1, 2, 3}); + cudf::lists::contains(list_col, search_keys, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, IndexOfSearchKey) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3}, {4, 5}}; + cudf::numeric_scalar search_key(2, true, cudf::test::get_default_stream()); + cudf::lists::index_of(list_col, + search_key, + cudf::lists::duplicate_find_option::FIND_FIRST, + cudf::test::get_default_stream()); +} + +TEST_F(ListTest, IndexOfSearchKeys) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3}, {4, 5}}; + cudf::test::fixed_width_column_wrapper search_keys({1, 2, 3}); + cudf::lists::index_of(list_col, + search_keys, + cudf::lists::duplicate_find_option::FIND_FIRST, + cudf::test::get_default_stream()); +} + +TEST_F(ListTest, CountElements) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3, 7}, {4, 5}}; + cudf::lists::count_elements(list_col, cudf::test::get_default_stream()); +} From b4fd77b30311f3b1de39cac22423f2c3a32ec72d Mon Sep 17 00:00:00 2001 From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> Date: Tue, 10 Oct 2023 12:20:42 -0500 Subject: [PATCH 014/118] Centralize chunked reading code in the parquet reader to reader_impl_chunking.cu (#14262) As a precursor to further chunked reader work, this PR centralizes chunk-related code (mostly from the `reader::impl` class) into `reader_impl_chunking.cu` and `reader_impl_chunking.hpp`. Also cleans up some variable naming and locations in `reader::impl` and the `file_intermediate_data` and `pass_intermediate_data classes`. 
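[Editor's note, not part of this patch] For context, a minimal sketch of how the
pass/chunk limits that this refactor organizes are driven from the public API. The
file name and byte limits are illustrative, and it assumes the chunked_parquet_reader
overload that accepts both an output-chunk limit and an input-pass limit:

    auto opts = cudf::io::parquet_reader_options::builder(
                  cudf::io::source_info{"example.parquet"})
                  .build();
    // bound each returned table to ~512MB of output and each input pass to ~1GB
    // of temporary (compressed + decompressed) memory
    cudf::io::chunked_parquet_reader reader(
      512ul * 1024 * 1024, 1024ul * 1024 * 1024, opts);
    while (reader.has_next()) {
      auto chunk = reader.read_chunk();  // cudf::io::table_with_metadata
      // ... consume chunk.tbl ...
    }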
Authors: - https://github.com/nvdbaranec Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Robert Maynard (https://github.com/robertmaynard) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14262 --- cpp/CMakeLists.txt | 1 + cpp/src/io/parquet/parquet_gpu.hpp | 73 --- cpp/src/io/parquet/reader_impl.cpp | 12 +- cpp/src/io/parquet/reader_impl.hpp | 49 +- cpp/src/io/parquet/reader_impl_chunking.cu | 598 +++++++++++++++++++ cpp/src/io/parquet/reader_impl_chunking.hpp | 87 +++ cpp/src/io/parquet/reader_impl_helpers.hpp | 17 + cpp/src/io/parquet/reader_impl_preprocess.cu | 558 +---------------- cpp/src/io/utilities/column_buffer.cpp | 10 +- 9 files changed, 751 insertions(+), 654 deletions(-) create mode 100644 cpp/src/io/parquet/reader_impl_chunking.cu create mode 100644 cpp/src/io/parquet/reader_impl_chunking.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 000f80065ab..f8b9762f1d4 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -401,6 +401,7 @@ add_library( src/io/parquet/predicate_pushdown.cpp src/io/parquet/reader.cpp src/io/parquet/reader_impl.cpp + src/io/parquet/reader_impl_chunking.cu src/io/parquet/reader_impl_helpers.cpp src/io/parquet/reader_impl_preprocess.cu src/io/parquet/writer_impl.cu diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 767668cc65e..6a93fec0c46 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -318,79 +318,6 @@ struct ColumnChunkDesc { int32_t src_col_schema{}; // my schema index in the file }; -/** - * @brief The row_group_info class - */ -struct row_group_info { - size_type index; // row group index within a file. aggregate_reader_metadata::get_row_group() is - // called with index and source_index - size_t start_row; - size_type source_index; // file index. - - row_group_info() = default; - - row_group_info(size_type index, size_t start_row, size_type source_index) - : index{index}, start_row{start_row}, source_index{source_index} - { - } -}; - -/** - * @brief Struct to store file-level data that remains constant for - * all passes/chunks for the file. - */ -struct file_intermediate_data { - // all row groups to read - std::vector row_groups{}; - - // all chunks from the selected row groups. We may end up reading these chunks progressively - // instead of all at once - std::vector chunks{}; - - // skip_rows/num_rows values for the entire file. these need to be adjusted per-pass because we - // may not be visiting every row group that contains these bounds - size_t global_skip_rows; - size_t global_num_rows; -}; - -/** - * @brief Structs to identify the reading row range for each chunk of rows in chunked reading. - */ -struct chunk_read_info { - size_t skip_rows; - size_t num_rows; -}; - -/** - * @brief Struct to store pass-level data that remains constant for a single pass. - */ -struct pass_intermediate_data { - std::vector> raw_page_data; - rmm::device_buffer decomp_page_data; - - // rowgroup, chunk and page information for the current pass. 
- std::vector row_groups{}; - cudf::detail::hostdevice_vector chunks{}; - cudf::detail::hostdevice_vector pages_info{}; - cudf::detail::hostdevice_vector page_nesting_info{}; - cudf::detail::hostdevice_vector page_nesting_decode_info{}; - - rmm::device_uvector page_keys{0, rmm::cuda_stream_default}; - rmm::device_uvector page_index{0, rmm::cuda_stream_default}; - rmm::device_uvector str_dict_index{0, rmm::cuda_stream_default}; - - std::vector output_chunk_read_info; - std::size_t current_output_chunk{0}; - - rmm::device_buffer level_decode_data{}; - int level_type_size{0}; - - // skip_rows and num_rows values for this particular pass. these may be adjusted values from the - // global values stored in file_intermediate_data. - size_t skip_rows; - size_t num_rows; -}; - /** * @brief Struct describing an encoder column */ diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 26ec83d5946..db81222157a 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -349,14 +349,14 @@ void reader::impl::prepare_data(int64_t skip_rows, not _input_columns.empty()) { // fills in chunk information without physically loading or decompressing // the associated data - load_global_chunk_info(); + create_global_chunk_info(); // compute schedule of input reads. Each rowgroup contains 1 chunk per column. For now // we will read an entire row group at a time. However, it is possible to do // sub-rowgroup reads if we made some estimates on individual chunk sizes (tricky) and // changed the high level structure such that we weren't always reading an entire table's // worth of columns at once. - compute_input_pass_row_group_info(); + compute_input_passes(); } _file_preprocessed = true; @@ -364,7 +364,7 @@ void reader::impl::prepare_data(int64_t skip_rows, // if we have to start a new pass, do that now if (!_pass_preprocessed) { - auto const num_passes = _input_pass_row_group_offsets.size() - 1; + auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; // always create the pass struct, even if we end up with no passes. // this will also cause the previous pass information to be deleted @@ -373,7 +373,7 @@ void reader::impl::prepare_data(int64_t skip_rows, if (_file_itm_data.global_num_rows > 0 && not _file_itm_data.row_groups.empty() && not _input_columns.empty() && _current_input_pass < num_passes) { // setup the pass_intermediate_info for this pass. - setup_pass(); + setup_next_pass(); load_and_decompress_data(); preprocess_pages(uses_custom_row_bounds, _output_chunk_read_limit); @@ -541,8 +541,8 @@ bool reader::impl::has_next() {} /*row_group_indices, empty means read all row groups*/, std::nullopt /*filter*/); - auto const num_input_passes = - _input_pass_row_group_offsets.size() == 0 ? 
0 : _input_pass_row_group_offsets.size() - 1;
+  size_t const num_input_passes = std::max(
+    int64_t{0}, static_cast(_file_itm_data.input_pass_row_group_offsets.size()) - 1);
   return (_pass_itm_data->current_output_chunk < _pass_itm_data->output_chunk_read_info.size()) ||
          (_current_input_pass < num_input_passes);
 }
diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp
index 6003b931b04..cea4ba35606 100644
--- a/cpp/src/io/parquet/reader_impl.hpp
+++ b/cpp/src/io/parquet/reader_impl.hpp
@@ -22,6 +22,7 @@
 #pragma once

 #include "parquet_gpu.hpp"
+#include "reader_impl_chunking.hpp"
 #include "reader_impl_helpers.hpp"

 #include
@@ -136,10 +137,6 @@ class reader::impl {
     host_span const> row_group_indices,
     std::optional> filter);

-  void load_global_chunk_info();
-  void compute_input_pass_row_group_info();
-  void setup_pass();
-
   /**
    * @brief Create chunk information and start file reads
    *
@@ -250,6 +247,31 @@ class reader::impl {
    */
   void decode_page_data(size_t skip_rows, size_t num_rows);

+  /**
+   * @brief Creates file-wide parquet chunk information.
+   *
+   * Creates information about all chunks in the file, storing it in
+   * the file-wide _file_itm_data structure.
+   */
+  void create_global_chunk_info();
+
+  /**
+   * @brief Computes all of the passes we will perform over the file.
+   */
+  void compute_input_passes();
+
+  /**
+   * @brief Close out the existing pass (if any) and prepare for the next pass.
+   */
+  void setup_next_pass();
+
+  /**
+   * @brief Given a set of pages that have had their sizes computed by nesting level and
+   * a limit on total read size, generate a set of {skip_rows, num_rows} pairs representing
+   * a set of reads that will generate output columns of total size <= `chunk_read_limit` bytes.
+   */
+  void compute_splits_for_pass();
+
 private:
  rmm::cuda_stream_view _stream;
  rmm::mr::device_memory_resource* _mr = nullptr;
@@ -278,7 +300,7 @@ class reader::impl {

   // chunked reading happens in 2 parts:
   //
-  // At the top level there is the "pass" in which we try and limit the
+  // At the top level, the entire file is divided up into "passes" on which we try and limit the
- std::vector _input_pass_row_group_offsets{}; - std::vector _input_pass_row_count{}; - std::size_t _current_input_pass{0}; - std::size_t _chunk_count{0}; + std::size_t _output_chunk_read_limit{0}; // output chunk size limit in bytes + std::size_t _input_pass_read_limit{0}; // input pass memory usage limit in bytes - std::size_t _output_chunk_read_limit{0}; - std::size_t _input_pass_read_limit{0}; - bool _pass_preprocessed{false}; - bool _file_preprocessed{false}; + std::size_t _current_input_pass{0}; // current input pass index + std::size_t _chunk_count{0}; // how many output chunks we have produced }; } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu new file mode 100644 index 00000000000..ad52a7dfcc1 --- /dev/null +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -0,0 +1,598 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "reader_impl.hpp" +#include "reader_impl_chunking.hpp" + +#include +#include + +#include + +#include + +#include +#include +#include +#include + +namespace cudf::io::parquet::detail { + +namespace { + +struct cumulative_row_info { + size_t row_count; // cumulative row count + size_t size_bytes; // cumulative size in bytes + int key; // schema index +}; + +#if defined(CHUNKING_DEBUG) +void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages, + rmm::device_uvector const& page_index, + rmm::device_uvector const& c_info, + rmm::cuda_stream_view stream) +{ + pages.device_to_host_sync(stream); + + printf("------------\nCumulative sizes by page\n"); + + std::vector schemas(pages.size()); + std::vector h_page_index(pages.size()); + CUDF_CUDA_TRY(cudaMemcpy( + h_page_index.data(), page_index.data(), sizeof(int) * pages.size(), cudaMemcpyDefault)); + std::vector h_cinfo(pages.size()); + CUDF_CUDA_TRY(cudaMemcpy( + h_cinfo.data(), c_info.data(), sizeof(cumulative_row_info) * pages.size(), cudaMemcpyDefault)); + auto schema_iter = cudf::detail::make_counting_transform_iterator( + 0, [&](size_type i) { return pages[h_page_index[i]].src_col_schema; }); + thrust::copy(thrust::seq, schema_iter, schema_iter + pages.size(), schemas.begin()); + auto last = thrust::unique(thrust::seq, schemas.begin(), schemas.end()); + schemas.resize(last - schemas.begin()); + printf("Num schemas: %lu\n", schemas.size()); + + for (size_t idx = 0; idx < schemas.size(); idx++) { + printf("Schema %d\n", schemas[idx]); + for (size_t pidx = 0; pidx < pages.size(); pidx++) { + auto const& page = pages[h_page_index[pidx]]; + if (page.flags & PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { + continue; + } + printf("\tP: {%lu, %lu}\n", h_cinfo[pidx].row_count, h_cinfo[pidx].size_bytes); + } + } +} + +void print_cumulative_row_info(host_span sizes, + std::string const& label, + std::optional> splits = std::nullopt) +{ + if (splits.has_value()) { + printf("------------\nSplits\n"); + for (size_t idx = 0; idx < 
splits->size(); idx++) { + printf("{%lu, %lu}\n", splits.value()[idx].skip_rows, splits.value()[idx].num_rows); + } + } + + printf("------------\nCumulative sizes %s\n", label.c_str()); + for (size_t idx = 0; idx < sizes.size(); idx++) { + printf("{%lu, %lu, %d}", sizes[idx].row_count, sizes[idx].size_bytes, sizes[idx].key); + if (splits.has_value()) { + // if we have a split at this row count and this is the last instance of this row count + auto start = thrust::make_transform_iterator( + splits->begin(), [](chunk_read_info const& i) { return i.skip_rows; }); + auto end = start + splits->size(); + auto split = std::find(start, end, sizes[idx].row_count); + auto const split_index = [&]() -> int { + if (split != end && + ((idx == sizes.size() - 1) || (sizes[idx + 1].row_count > sizes[idx].row_count))) { + return static_cast(std::distance(start, split)); + } + return idx == 0 ? 0 : -1; + }(); + if (split_index >= 0) { + printf(" <-- split {%lu, %lu}", + splits.value()[split_index].skip_rows, + splits.value()[split_index].num_rows); + } + } + printf("\n"); + } +} +#endif // CHUNKING_DEBUG + +/** + * @brief Functor which reduces two cumulative_row_info structs of the same key. + */ +struct cumulative_row_sum { + cumulative_row_info operator() + __device__(cumulative_row_info const& a, cumulative_row_info const& b) const + { + return cumulative_row_info{a.row_count + b.row_count, a.size_bytes + b.size_bytes, a.key}; + } +}; + +/** + * @brief Functor which computes the total data size for a given type of cudf column. + * + * In the case of strings, the return size does not include the chars themselves. That + * information is tracked separately (see PageInfo::str_bytes). + */ +struct row_size_functor { + __device__ size_t validity_size(size_t num_rows, bool nullable) + { + return nullable ? (cudf::util::div_rounding_up_safe(num_rows, size_t{32}) * 4) : 0; + } + + template + __device__ size_t operator()(size_t num_rows, bool nullable) + { + auto const element_size = sizeof(device_storage_type_t); + return (element_size * num_rows) + validity_size(num_rows, nullable); + } +}; + +template <> +__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) +{ + auto const offset_size = sizeof(size_type); + // NOTE: Adding the + 1 offset here isn't strictly correct. There will only be 1 extra offset + // for the entire column, whereas this is adding an extra offset per page. So we will get a + // small over-estimate of the real size of the order : # of pages * 4 bytes. It seems better + // to overestimate size somewhat than to underestimate it and potentially generate chunks + // that are too large. + return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); +} + +template <> +__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) +{ + return validity_size(num_rows, nullable); +} + +template <> +__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) +{ + // only returns the size of offsets and validity. the size of the actual string chars + // is tracked separately. + auto const offset_size = sizeof(size_type); + // see note about offsets in the list_view template. + return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); +} + +/** + * @brief Functor which computes the total output cudf data size for all of + * the data in this page. + * + * Sums across all nesting levels. 
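 * [Editor's illustration, not part of this patch] For a LIST<INT32> page whose
 * per-level nesting sizes are {10 list rows, 40 leaf values}, the reduction
 * yields roughly 4 * (10 + 1) bytes of offsets plus validity for the list
 * level, 4 * 40 bytes plus validity for the leaf level, and then adds
 * page.str_bytes on top.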
+ */ +struct get_cumulative_row_info { + PageInfo const* const pages; + + __device__ cumulative_row_info operator()(size_type index) + { + auto const& page = pages[index]; + if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { + return cumulative_row_info{0, 0, page.src_col_schema}; + } + + // total nested size, not counting string data + auto iter = + cudf::detail::make_counting_transform_iterator(0, [page, index] __device__(size_type i) { + auto const& pni = page.nesting[i]; + return cudf::type_dispatcher( + data_type{pni.type}, row_size_functor{}, pni.size, pni.nullable); + }); + + size_t const row_count = static_cast(page.nesting[0].size); + return { + row_count, + thrust::reduce(thrust::seq, iter, iter + page.num_output_nesting_levels) + page.str_bytes, + page.src_col_schema}; + } +}; + +/** + * @brief Functor which computes the effective size of all input columns by page. + * + * For a given row, we want to find the cost of all pages for all columns involved + * in loading up to that row. The complication here is that not all pages are the + * same size between columns. Example: + * + * page row counts + * Column A: 0 <----> 100 <----> 200 + * Column B: 0 <---------------> 200 <--------> 400 + | + * if we decide to split at row 100, we don't really know the actual amount of bytes in column B + * at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that + * page. Essentially, a conservative over-estimate of the real size. + */ +struct row_total_size { + cumulative_row_info const* c_info; + size_type const* key_offsets; + size_t num_keys; + + __device__ cumulative_row_info operator()(cumulative_row_info const& i) + { + // sum sizes for each input column at this row + size_t sum = 0; + for (int idx = 0; idx < num_keys; idx++) { + auto const start = key_offsets[idx]; + auto const end = key_offsets[idx + 1]; + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [&] __device__(size_type i) { return c_info[i].row_count; }); + auto const page_index = + thrust::lower_bound(thrust::seq, iter + start, iter + end, i.row_count) - iter; + sum += c_info[page_index].size_bytes; + } + return {i.row_count, sum, i.key}; + } +}; + +/** + * @brief Given a vector of cumulative {row_count, byte_size} pairs and a chunk read + * limit, determine the set of splits. + * + * @param sizes Vector of cumulative {row_count, byte_size} pairs + * @param num_rows Total number of rows to read + * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns + */ +std::vector find_splits(std::vector const& sizes, + size_t num_rows, + size_t chunk_read_limit) +{ + // now we have an array of {row_count, real output bytes}. just walk through it and generate + // splits. + // TODO: come up with a clever way to do this entirely in parallel. For now, as long as batch + // sizes are reasonably large, this shouldn't iterate too many times + std::vector splits; + { + size_t cur_pos = 0; + size_t cur_cumulative_size = 0; + size_t cur_row_count = 0; + auto start = thrust::make_transform_iterator(sizes.begin(), [&](cumulative_row_info const& i) { + return i.size_bytes - cur_cumulative_size; + }); + auto end = start + sizes.size(); + while (cur_row_count < num_rows) { + int64_t split_pos = + thrust::lower_bound(thrust::seq, start + cur_pos, end, chunk_read_limit) - start; + + // if we're past the end, or if the returned bucket is > than the chunk_read_limit, move back + // one. 
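      // [Editor's illustration, not part of this patch] e.g. with cumulative
      // entries {100 rows, 900 B}, {200 rows, 1800 B}, {300 rows, 2700 B} and
      // chunk_read_limit = 2000 B, lower_bound lands on {300, 2700}; 2700
      // exceeds the limit, so split_pos steps back and this split ends at row 200.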
+ if (static_cast(split_pos) >= sizes.size() || + (sizes[split_pos].size_bytes - cur_cumulative_size > chunk_read_limit)) { + split_pos--; + } + + // best-try. if we can't find something that'll fit, we have to go bigger. we're doing this in + // a loop because all of the cumulative sizes for all the pages are sorted into one big list. + // so if we had two columns, both of which had an entry {1000, 10000}, that entry would be in + // the list twice. so we have to iterate until we skip past all of them. The idea is that we + // either do this, or we have to call unique() on the input first. + while (split_pos < (static_cast(sizes.size()) - 1) && + (split_pos < 0 || sizes[split_pos].row_count == cur_row_count)) { + split_pos++; + } + + auto const start_row = cur_row_count; + cur_row_count = sizes[split_pos].row_count; + splits.push_back(chunk_read_info{start_row, cur_row_count - start_row}); + cur_pos = split_pos; + cur_cumulative_size = sizes[split_pos].size_bytes; + } + } + // print_cumulative_row_info(sizes, "adjusted", splits); + + return splits; +} + +/** + * @brief Converts cuDF units to Parquet units. + * + * @return A tuple of Parquet type width, Parquet clock rate and Parquet decimal type. + */ +[[nodiscard]] std::tuple conversion_info(type_id column_type_id, + type_id timestamp_type_id, + Type physical, + int8_t converted, + int32_t length) +{ + int32_t type_width = (physical == FIXED_LEN_BYTE_ARRAY) ? length : 0; + int32_t clock_rate = 0; + if (column_type_id == type_id::INT8 or column_type_id == type_id::UINT8) { + type_width = 1; // I32 -> I8 + } else if (column_type_id == type_id::INT16 or column_type_id == type_id::UINT16) { + type_width = 2; // I32 -> I16 + } else if (column_type_id == type_id::INT32) { + type_width = 4; // str -> hash32 + } else if (is_chrono(data_type{column_type_id})) { + clock_rate = to_clockrate(timestamp_type_id); + } + + int8_t converted_type = converted; + if (converted_type == DECIMAL && column_type_id != type_id::FLOAT64 && + not cudf::is_fixed_point(data_type{column_type_id})) { + converted_type = UNKNOWN; // Not converting to float64 or decimal + } + return std::make_tuple(type_width, clock_rate, converted_type); +} + +/** + * @brief Return the required number of bits to store a value. 
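 * [Editor's illustration, not part of this patch] e.g. max_definition_level == 3
 * needs 2 bits per stored level value, while max_definition_level == 1 needs 1 bit.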
+ */ +template +[[nodiscard]] T required_bits(uint32_t max_level) +{ + return static_cast(CompactProtocolReader::NumRequiredBits(max_level)); +} + +struct row_count_compare { + __device__ bool operator()(cumulative_row_info const& a, cumulative_row_info const& b) + { + return a.row_count < b.row_count; + } +}; + +} // anonymous namespace + +void reader::impl::create_global_chunk_info() +{ + auto const num_rows = _file_itm_data.global_num_rows; + auto const& row_groups_info = _file_itm_data.row_groups; + auto& chunks = _file_itm_data.chunks; + + // Descriptors for all the chunks that make up the selected columns + auto const num_input_columns = _input_columns.size(); + auto const num_chunks = row_groups_info.size() * num_input_columns; + + // Initialize column chunk information + auto remaining_rows = num_rows; + for (auto const& rg : row_groups_info) { + auto const& row_group = _metadata->get_row_group(rg.index, rg.source_index); + auto const row_group_start = rg.start_row; + auto const row_group_rows = std::min(remaining_rows, row_group.num_rows); + + // generate ColumnChunkDesc objects for everything to be decoded (all input columns) + for (size_t i = 0; i < num_input_columns; ++i) { + auto col = _input_columns[i]; + // look up metadata + auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); + auto& schema = _metadata->get_schema(col.schema_idx); + + auto [type_width, clock_rate, converted_type] = + conversion_info(to_type_id(schema, _strings_to_categorical, _timestamp_type.id()), + _timestamp_type.id(), + schema.type, + schema.converted_type, + schema.type_length); + + chunks.push_back(ColumnChunkDesc(col_meta.total_compressed_size, + nullptr, + col_meta.num_values, + schema.type, + type_width, + row_group_start, + row_group_rows, + schema.max_definition_level, + schema.max_repetition_level, + _metadata->get_output_nesting_depth(col.schema_idx), + required_bits(schema.max_definition_level), + required_bits(schema.max_repetition_level), + col_meta.codec, + converted_type, + schema.logical_type, + schema.decimal_precision, + clock_rate, + i, + col.schema_idx)); + } + + remaining_rows -= row_group_rows; + } +} + +void reader::impl::compute_input_passes() +{ + // at this point, row_groups has already been filtered down to just the row groups we need to + // handle optional skip_rows/num_rows parameters. + auto const& row_groups_info = _file_itm_data.row_groups; + + // if the user hasn't specified an input size limit, read everything in a single pass. + if (_input_pass_read_limit == 0) { + _file_itm_data.input_pass_row_group_offsets.push_back(0); + _file_itm_data.input_pass_row_group_offsets.push_back(row_groups_info.size()); + return; + } + + // generate passes. make sure to account for the case where a single row group doesn't fit within + // + std::size_t const read_limit = + _input_pass_read_limit > 0 ? 
_input_pass_read_limit : std::numeric_limits::max(); + std::size_t cur_pass_byte_size = 0; + std::size_t cur_rg_start = 0; + std::size_t cur_row_count = 0; + _file_itm_data.input_pass_row_group_offsets.push_back(0); + _file_itm_data.input_pass_row_count.push_back(0); + + for (size_t cur_rg_index = 0; cur_rg_index < row_groups_info.size(); cur_rg_index++) { + auto const& rgi = row_groups_info[cur_rg_index]; + auto const& row_group = _metadata->get_row_group(rgi.index, rgi.source_index); + + // can we add this row group + if (cur_pass_byte_size + row_group.total_byte_size >= read_limit) { + // A single row group (the current one) is larger than the read limit: + // We always need to include at least one row group, so end the pass at the end of the current + // row group + if (cur_rg_start == cur_rg_index) { + _file_itm_data.input_pass_row_group_offsets.push_back(cur_rg_index + 1); + _file_itm_data.input_pass_row_count.push_back(cur_row_count + row_group.num_rows); + cur_rg_start = cur_rg_index + 1; + cur_pass_byte_size = 0; + } + // End the pass at the end of the previous row group + else { + _file_itm_data.input_pass_row_group_offsets.push_back(cur_rg_index); + _file_itm_data.input_pass_row_count.push_back(cur_row_count); + cur_rg_start = cur_rg_index; + cur_pass_byte_size = row_group.total_byte_size; + } + } else { + cur_pass_byte_size += row_group.total_byte_size; + } + cur_row_count += row_group.num_rows; + } + // add the last pass if necessary + if (_file_itm_data.input_pass_row_group_offsets.back() != row_groups_info.size()) { + _file_itm_data.input_pass_row_group_offsets.push_back(row_groups_info.size()); + _file_itm_data.input_pass_row_count.push_back(cur_row_count); + } +} + +void reader::impl::setup_next_pass() +{ + // this will also cause the previous pass information to be deleted + _pass_itm_data = std::make_unique(); + + // setup row groups to be loaded for this pass + auto const row_group_start = _file_itm_data.input_pass_row_group_offsets[_current_input_pass]; + auto const row_group_end = _file_itm_data.input_pass_row_group_offsets[_current_input_pass + 1]; + auto const num_row_groups = row_group_end - row_group_start; + _pass_itm_data->row_groups.resize(num_row_groups); + std::copy(_file_itm_data.row_groups.begin() + row_group_start, + _file_itm_data.row_groups.begin() + row_group_end, + _pass_itm_data->row_groups.begin()); + + auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; + CUDF_EXPECTS(_current_input_pass < num_passes, "Encountered an invalid read pass index"); + + auto const chunks_per_rowgroup = _input_columns.size(); + auto const num_chunks = chunks_per_rowgroup * num_row_groups; + + auto chunk_start = _file_itm_data.chunks.begin() + (row_group_start * chunks_per_rowgroup); + auto chunk_end = _file_itm_data.chunks.begin() + (row_group_end * chunks_per_rowgroup); + + _pass_itm_data->chunks = cudf::detail::hostdevice_vector(num_chunks, _stream); + std::copy(chunk_start, chunk_end, _pass_itm_data->chunks.begin()); + + // adjust skip_rows and num_rows by what's available in the row groups we are processing + if (num_passes == 1) { + _pass_itm_data->skip_rows = _file_itm_data.global_skip_rows; + _pass_itm_data->num_rows = _file_itm_data.global_num_rows; + } else { + auto const global_start_row = _file_itm_data.global_skip_rows; + auto const global_end_row = global_start_row + _file_itm_data.global_num_rows; + auto const start_row = + std::max(_file_itm_data.input_pass_row_count[_current_input_pass], global_start_row); + auto const end_row = 
+ std::min(_file_itm_data.input_pass_row_count[_current_input_pass + 1], global_end_row); + + // skip_rows is always global in the sense that it is relative to the first row of + // everything we will be reading, regardless of what pass we are on. + // num_rows is how many rows we are reading this pass. + _pass_itm_data->skip_rows = + global_start_row + _file_itm_data.input_pass_row_count[_current_input_pass]; + _pass_itm_data->num_rows = end_row - start_row; + } +} + +void reader::impl::compute_splits_for_pass() +{ + auto const skip_rows = _pass_itm_data->skip_rows; + auto const num_rows = _pass_itm_data->num_rows; + + // simple case : no chunk size, no splits + if (_output_chunk_read_limit <= 0) { + _pass_itm_data->output_chunk_read_info = std::vector{{skip_rows, num_rows}}; + return; + } + + auto& pages = _pass_itm_data->pages_info; + + auto const& page_keys = _pass_itm_data->page_keys; + auto const& page_index = _pass_itm_data->page_index; + + // generate cumulative row counts and sizes + rmm::device_uvector c_info(page_keys.size(), _stream); + // convert PageInfo to cumulative_row_info + auto page_input = thrust::make_transform_iterator(page_index.begin(), + get_cumulative_row_info{pages.device_ptr()}); + thrust::inclusive_scan_by_key(rmm::exec_policy(_stream), + page_keys.begin(), + page_keys.end(), + page_input, + c_info.begin(), + thrust::equal_to{}, + cumulative_row_sum{}); + // print_cumulative_page_info(pages, page_index, c_info, stream); + + // sort by row count + rmm::device_uvector c_info_sorted{c_info, _stream}; + thrust::sort( + rmm::exec_policy(_stream), c_info_sorted.begin(), c_info_sorted.end(), row_count_compare{}); + + // std::vector h_c_info_sorted(c_info_sorted.size()); + // CUDF_CUDA_TRY(cudaMemcpy(h_c_info_sorted.data(), + // c_info_sorted.data(), + // sizeof(cumulative_row_info) * c_info_sorted.size(), + // cudaMemcpyDefault)); + // print_cumulative_row_info(h_c_info_sorted, "raw"); + + // generate key offsets (offsets to the start of each partition of keys). worst case is 1 page per + // key + rmm::device_uvector key_offsets(page_keys.size() + 1, _stream); + auto const key_offsets_end = thrust::reduce_by_key(rmm::exec_policy(_stream), + page_keys.begin(), + page_keys.end(), + thrust::make_constant_iterator(1), + thrust::make_discard_iterator(), + key_offsets.begin()) + .second; + size_t const num_unique_keys = key_offsets_end - key_offsets.begin(); + thrust::exclusive_scan( + rmm::exec_policy(_stream), key_offsets.begin(), key_offsets.end(), key_offsets.begin()); + + // adjust the cumulative info such that for each row count, the size includes any pages that span + // that row count. this is so that if we have this case: + // page row counts + // Column A: 0 <----> 100 <----> 200 + // Column B: 0 <---------------> 200 <--------> 400 + // | + // if we decide to split at row 100, we don't really know the actual amount of bytes in column B + // at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that + // page. 
+ // + rmm::device_uvector aggregated_info(c_info.size(), _stream); + thrust::transform(rmm::exec_policy(_stream), + c_info_sorted.begin(), + c_info_sorted.end(), + aggregated_info.begin(), + row_total_size{c_info.data(), key_offsets.data(), num_unique_keys}); + + // bring back to the cpu + std::vector h_aggregated_info(aggregated_info.size()); + CUDF_CUDA_TRY(cudaMemcpyAsync(h_aggregated_info.data(), + aggregated_info.data(), + sizeof(cumulative_row_info) * c_info.size(), + cudaMemcpyDefault, + _stream.value())); + _stream.synchronize(); + + // generate the actual splits + _pass_itm_data->output_chunk_read_info = + find_splits(h_aggregated_info, num_rows, _output_chunk_read_limit); +} + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp new file mode 100644 index 00000000000..dfc239d8451 --- /dev/null +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "reader_impl_helpers.hpp" + +#include + +namespace cudf::io::parquet::detail { + +/** + * @brief Struct to store file-level data that remains constant for + * all passes/chunks in the file. + */ +struct file_intermediate_data { + // all row groups to read + std::vector row_groups{}; + + // all chunks from the selected row groups. We may end up reading these chunks progressively + // instead of all at once + std::vector chunks{}; + + // an array of offsets into _file_itm_data::global_chunks. Each pair of offsets represents + // the start/end of the chunks to be loaded for a given pass. + std::vector input_pass_row_group_offsets{}; + // row counts per input-pass + std::vector input_pass_row_count{}; + + // skip_rows/num_rows values for the entire file. these need to be adjusted per-pass because we + // may not be visiting every row group that contains these bounds + size_t global_skip_rows; + size_t global_num_rows; +}; + +/** + * @brief Struct to identify the range for each chunk of rows during a chunked reading pass. + */ +struct chunk_read_info { + size_t skip_rows; + size_t num_rows; +}; + +/** + * @brief Struct to store pass-level data that remains constant for a single pass. + */ +struct pass_intermediate_data { + std::vector> raw_page_data; + rmm::device_buffer decomp_page_data; + + // rowgroup, chunk and page information for the current pass. 
+ std::vector row_groups{}; + cudf::detail::hostdevice_vector chunks{}; + cudf::detail::hostdevice_vector pages_info{}; + cudf::detail::hostdevice_vector page_nesting_info{}; + cudf::detail::hostdevice_vector page_nesting_decode_info{}; + + rmm::device_uvector page_keys{0, rmm::cuda_stream_default}; + rmm::device_uvector page_index{0, rmm::cuda_stream_default}; + rmm::device_uvector str_dict_index{0, rmm::cuda_stream_default}; + + std::vector output_chunk_read_info; + std::size_t current_output_chunk{0}; + + rmm::device_buffer level_decode_data{}; + int level_type_size{0}; + + // skip_rows and num_rows values for this particular pass. these may be adjusted values from the + // global values stored in file_intermediate_data. + size_t skip_rows; + size_t num_rows; +}; + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 1a73e2f55ac..8d8ab8707be 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -34,6 +34,23 @@ namespace cudf::io::parquet::detail { +/** + * @brief The row_group_info class + */ +struct row_group_info { + size_type index; // row group index within a file. aggregate_reader_metadata::get_row_group() is + // called with index and source_index + size_t start_row; + size_type source_index; // file index. + + row_group_info() = default; + + row_group_info(size_type index, size_t start_row, size_type source_index) + : index{index}, start_row{start_row}, source_index{source_index} + { + } +}; + /** * @brief Function that translates Parquet datatype to cuDF type enum */ diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 4bc6bb6f43b..ce45f709ee1 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -18,7 +18,6 @@ #include #include -#include #include #include @@ -44,7 +43,6 @@ #include namespace cudf::io::parquet::detail { - namespace { /** @@ -170,46 +168,6 @@ void generate_depth_remappings(std::map, std::ve } } -/** - * @brief Return the required number of bits to store a value. - */ -template -[[nodiscard]] T required_bits(uint32_t max_level) -{ - return static_cast(CompactProtocolReader::NumRequiredBits(max_level)); -} - -/** - * @brief Converts cuDF units to Parquet units. - * - * @return A tuple of Parquet type width, Parquet clock rate and Parquet decimal type. - */ -[[nodiscard]] std::tuple conversion_info(type_id column_type_id, - type_id timestamp_type_id, - Type physical, - int8_t converted, - int32_t length) -{ - int32_t type_width = (physical == FIXED_LEN_BYTE_ARRAY) ? length : 0; - int32_t clock_rate = 0; - if (column_type_id == type_id::INT8 or column_type_id == type_id::UINT8) { - type_width = 1; // I32 -> I8 - } else if (column_type_id == type_id::INT16 or column_type_id == type_id::UINT16) { - type_width = 2; // I32 -> I16 - } else if (column_type_id == type_id::INT32) { - type_width = 4; // str -> hash32 - } else if (is_chrono(data_type{column_type_id})) { - clock_rate = to_clockrate(timestamp_type_id); - } - - int8_t converted_type = converted; - if (converted_type == DECIMAL && column_type_id != type_id::FLOAT64 && - not cudf::is_fixed_point(data_type{column_type_id})) { - converted_type = UNKNOWN; // Not converting to float64 or decimal - } - return std::make_tuple(type_width, clock_rate, converted_type); -} - /** * @brief Reads compressed page data to device memory. 
* @@ -790,163 +748,6 @@ std::pair>> reader::impl::read_and_decompres return {total_decompressed_size > 0, std::move(read_chunk_tasks)}; } -void reader::impl::load_global_chunk_info() -{ - auto const num_rows = _file_itm_data.global_num_rows; - auto const& row_groups_info = _file_itm_data.row_groups; - auto& chunks = _file_itm_data.chunks; - - // Descriptors for all the chunks that make up the selected columns - auto const num_input_columns = _input_columns.size(); - auto const num_chunks = row_groups_info.size() * num_input_columns; - - // Initialize column chunk information - auto remaining_rows = num_rows; - for (auto const& rg : row_groups_info) { - auto const& row_group = _metadata->get_row_group(rg.index, rg.source_index); - auto const row_group_start = rg.start_row; - auto const row_group_rows = std::min(remaining_rows, row_group.num_rows); - - // generate ColumnChunkDesc objects for everything to be decoded (all input columns) - for (size_t i = 0; i < num_input_columns; ++i) { - auto col = _input_columns[i]; - // look up metadata - auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); - auto& schema = _metadata->get_schema(col.schema_idx); - - auto [type_width, clock_rate, converted_type] = - conversion_info(to_type_id(schema, _strings_to_categorical, _timestamp_type.id()), - _timestamp_type.id(), - schema.type, - schema.converted_type, - schema.type_length); - - chunks.push_back(ColumnChunkDesc(col_meta.total_compressed_size, - nullptr, - col_meta.num_values, - schema.type, - type_width, - row_group_start, - row_group_rows, - schema.max_definition_level, - schema.max_repetition_level, - _metadata->get_output_nesting_depth(col.schema_idx), - required_bits(schema.max_definition_level), - required_bits(schema.max_repetition_level), - col_meta.codec, - converted_type, - schema.logical_type, - schema.decimal_precision, - clock_rate, - i, - col.schema_idx)); - } - - remaining_rows -= row_group_rows; - } -} - -void reader::impl::compute_input_pass_row_group_info() -{ - // at this point, row_groups has already been filtered down to just the row groups we need to - // handle optional skip_rows/num_rows parameters. - auto const& row_groups_info = _file_itm_data.row_groups; - - // if the user hasn't specified an input size limit, read everything in a single pass. - if (_input_pass_read_limit == 0) { - _input_pass_row_group_offsets.push_back(0); - _input_pass_row_group_offsets.push_back(row_groups_info.size()); - return; - } - - // generate passes. make sure to account for the case where a single row group doesn't fit within - // - std::size_t const read_limit = - _input_pass_read_limit > 0 ? 
_input_pass_read_limit : std::numeric_limits::max(); - std::size_t cur_pass_byte_size = 0; - std::size_t cur_rg_start = 0; - std::size_t cur_row_count = 0; - _input_pass_row_group_offsets.push_back(0); - _input_pass_row_count.push_back(0); - - for (size_t cur_rg_index = 0; cur_rg_index < row_groups_info.size(); cur_rg_index++) { - auto const& rgi = row_groups_info[cur_rg_index]; - auto const& row_group = _metadata->get_row_group(rgi.index, rgi.source_index); - - // can we add this row group - if (cur_pass_byte_size + row_group.total_byte_size >= read_limit) { - // A single row group (the current one) is larger than the read limit: - // We always need to include at least one row group, so end the pass at the end of the current - // row group - if (cur_rg_start == cur_rg_index) { - _input_pass_row_group_offsets.push_back(cur_rg_index + 1); - _input_pass_row_count.push_back(cur_row_count + row_group.num_rows); - cur_rg_start = cur_rg_index + 1; - cur_pass_byte_size = 0; - } - // End the pass at the end of the previous row group - else { - _input_pass_row_group_offsets.push_back(cur_rg_index); - _input_pass_row_count.push_back(cur_row_count); - cur_rg_start = cur_rg_index; - cur_pass_byte_size = row_group.total_byte_size; - } - } else { - cur_pass_byte_size += row_group.total_byte_size; - } - cur_row_count += row_group.num_rows; - } - // add the last pass if necessary - if (_input_pass_row_group_offsets.back() != row_groups_info.size()) { - _input_pass_row_group_offsets.push_back(row_groups_info.size()); - _input_pass_row_count.push_back(cur_row_count); - } -} - -void reader::impl::setup_pass() -{ - // this will also cause the previous pass information to be deleted - _pass_itm_data = std::make_unique(); - - // setup row groups to be loaded for this pass - auto const row_group_start = _input_pass_row_group_offsets[_current_input_pass]; - auto const row_group_end = _input_pass_row_group_offsets[_current_input_pass + 1]; - auto const num_row_groups = row_group_end - row_group_start; - _pass_itm_data->row_groups.resize(num_row_groups); - std::copy(_file_itm_data.row_groups.begin() + row_group_start, - _file_itm_data.row_groups.begin() + row_group_end, - _pass_itm_data->row_groups.begin()); - - auto const num_passes = _input_pass_row_group_offsets.size() - 1; - CUDF_EXPECTS(_current_input_pass < num_passes, "Encountered an invalid read pass index"); - - auto const chunks_per_rowgroup = _input_columns.size(); - auto const num_chunks = chunks_per_rowgroup * num_row_groups; - - auto chunk_start = _file_itm_data.chunks.begin() + (row_group_start * chunks_per_rowgroup); - auto chunk_end = _file_itm_data.chunks.begin() + (row_group_end * chunks_per_rowgroup); - - _pass_itm_data->chunks = cudf::detail::hostdevice_vector(num_chunks, _stream); - std::copy(chunk_start, chunk_end, _pass_itm_data->chunks.begin()); - - // adjust skip_rows and num_rows by what's available in the row groups we are processing - if (num_passes == 1) { - _pass_itm_data->skip_rows = _file_itm_data.global_skip_rows; - _pass_itm_data->num_rows = _file_itm_data.global_num_rows; - } else { - auto const global_start_row = _file_itm_data.global_skip_rows; - auto const global_end_row = global_start_row + _file_itm_data.global_num_rows; - auto const start_row = std::max(_input_pass_row_count[_current_input_pass], global_start_row); - auto const end_row = std::min(_input_pass_row_count[_current_input_pass + 1], global_end_row); - - // skip_rows is always global in the sense that it is relative to the first row of - // everything we will be 
reading, regardless of what pass we are on. - // num_rows is how many rows we are reading this pass. - _pass_itm_data->skip_rows = global_start_row + _input_pass_row_count[_current_input_pass]; - _pass_itm_data->num_rows = end_row - start_row; - } -} - void reader::impl::load_and_decompress_data() { // This function should never be called if `num_rows == 0`. @@ -1034,359 +835,8 @@ void print_pages(cudf::detail::hostdevice_vector& pages, rmm::cuda_str p.str_bytes); } } - -void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages, - rmm::device_uvector const& page_index, - rmm::device_uvector const& c_info, - rmm::cuda_stream_view stream) -{ - pages.device_to_host_sync(stream); - - printf("------------\nCumulative sizes by page\n"); - - std::vector schemas(pages.size()); - std::vector h_page_index(pages.size()); - CUDF_CUDA_TRY(cudaMemcpy( - h_page_index.data(), page_index.data(), sizeof(int) * pages.size(), cudaMemcpyDefault)); - std::vector h_cinfo(pages.size()); - CUDF_CUDA_TRY(cudaMemcpy( - h_cinfo.data(), c_info.data(), sizeof(cumulative_row_info) * pages.size(), cudaMemcpyDefault)); - auto schema_iter = cudf::detail::make_counting_transform_iterator( - 0, [&](size_type i) { return pages[h_page_index[i]].src_col_schema; }); - thrust::copy(thrust::seq, schema_iter, schema_iter + pages.size(), schemas.begin()); - auto last = thrust::unique(thrust::seq, schemas.begin(), schemas.end()); - schemas.resize(last - schemas.begin()); - printf("Num schemas: %lu\n", schemas.size()); - - for (size_t idx = 0; idx < schemas.size(); idx++) { - printf("Schema %d\n", schemas[idx]); - for (size_t pidx = 0; pidx < pages.size(); pidx++) { - auto const& page = pages[h_page_index[pidx]]; - if (page.flags & PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { - continue; - } - printf("\tP: {%lu, %lu}\n", h_cinfo[pidx].row_count, h_cinfo[pidx].size_bytes); - } - } -} - -void print_cumulative_row_info(host_span sizes, - std::string const& label, - std::optional> splits = std::nullopt) -{ - if (splits.has_value()) { - printf("------------\nSplits\n"); - for (size_t idx = 0; idx < splits->size(); idx++) { - printf("{%lu, %lu}\n", splits.value()[idx].skip_rows, splits.value()[idx].num_rows); - } - } - - printf("------------\nCumulative sizes %s\n", label.c_str()); - for (size_t idx = 0; idx < sizes.size(); idx++) { - printf("{%lu, %lu, %d}", sizes[idx].row_count, sizes[idx].size_bytes, sizes[idx].key); - if (splits.has_value()) { - // if we have a split at this row count and this is the last instance of this row count - auto start = thrust::make_transform_iterator( - splits->begin(), [](chunk_read_info const& i) { return i.skip_rows; }); - auto end = start + splits->size(); - auto split = std::find(start, end, sizes[idx].row_count); - auto const split_index = [&]() -> int { - if (split != end && - ((idx == sizes.size() - 1) || (sizes[idx + 1].row_count > sizes[idx].row_count))) { - return static_cast(std::distance(start, split)); - } - return idx == 0 ? 0 : -1; - }(); - if (split_index >= 0) { - printf(" <-- split {%lu, %lu}", - splits.value()[split_index].skip_rows, - splits.value()[split_index].num_rows); - } - } - printf("\n"); - } -} #endif // PREPROCESS_DEBUG -/** - * @brief Functor which reduces two cumulative_row_info structs of the same key. 
- */ -struct cumulative_row_sum { - cumulative_row_info operator() - __device__(cumulative_row_info const& a, cumulative_row_info const& b) const - { - return cumulative_row_info{a.row_count + b.row_count, a.size_bytes + b.size_bytes, a.key}; - } -}; - -/** - * @brief Functor which computes the total data size for a given type of cudf column. - * - * In the case of strings, the return size does not include the chars themselves. That - * information is tracked separately (see PageInfo::str_bytes). - */ -struct row_size_functor { - __device__ size_t validity_size(size_t num_rows, bool nullable) - { - return nullable ? (cudf::util::div_rounding_up_safe(num_rows, size_t{32}) * 4) : 0; - } - - template - __device__ size_t operator()(size_t num_rows, bool nullable) - { - auto const element_size = sizeof(device_storage_type_t); - return (element_size * num_rows) + validity_size(num_rows, nullable); - } -}; - -template <> -__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) -{ - auto const offset_size = sizeof(size_type); - // NOTE: Adding the + 1 offset here isn't strictly correct. There will only be 1 extra offset - // for the entire column, whereas this is adding an extra offset per page. So we will get a - // small over-estimate of the real size of the order : # of pages * 4 bytes. It seems better - // to overestimate size somewhat than to underestimate it and potentially generate chunks - // that are too large. - return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); -} - -template <> -__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) -{ - return validity_size(num_rows, nullable); -} - -template <> -__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) -{ - // only returns the size of offsets and validity. the size of the actual string chars - // is tracked separately. - auto const offset_size = sizeof(size_type); - // see note about offsets in the list_view template. - return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); -} - -/** - * @brief Functor which computes the total output cudf data size for all of - * the data in this page. - * - * Sums across all nesting levels. - */ -struct get_cumulative_row_info { - PageInfo const* const pages; - - __device__ cumulative_row_info operator()(size_type index) - { - auto const& page = pages[index]; - if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { - return cumulative_row_info{0, 0, page.src_col_schema}; - } - - // total nested size, not counting string data - auto iter = - cudf::detail::make_counting_transform_iterator(0, [page, index] __device__(size_type i) { - auto const& pni = page.nesting[i]; - return cudf::type_dispatcher( - data_type{pni.type}, row_size_functor{}, pni.size, pni.nullable); - }); - - size_t const row_count = static_cast(page.nesting[0].size); - return { - row_count, - thrust::reduce(thrust::seq, iter, iter + page.num_output_nesting_levels) + page.str_bytes, - page.src_col_schema}; - } -}; - -/** - * @brief Functor which computes the effective size of all input columns by page. - * - * For a given row, we want to find the cost of all pages for all columns involved - * in loading up to that row. The complication here is that not all pages are the - * same size between columns. 
Example: - * - * page row counts - * Column A: 0 <----> 100 <----> 200 - * Column B: 0 <---------------> 200 <--------> 400 - | - * if we decide to split at row 100, we don't really know the actual amount of bytes in column B - * at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that - * page. Essentially, a conservative over-estimate of the real size. - */ -struct row_total_size { - cumulative_row_info const* c_info; - size_type const* key_offsets; - size_t num_keys; - - __device__ cumulative_row_info operator()(cumulative_row_info const& i) - { - // sum sizes for each input column at this row - size_t sum = 0; - for (int idx = 0; idx < num_keys; idx++) { - auto const start = key_offsets[idx]; - auto const end = key_offsets[idx + 1]; - auto iter = cudf::detail::make_counting_transform_iterator( - 0, [&] __device__(size_type i) { return c_info[i].row_count; }); - auto const page_index = - thrust::lower_bound(thrust::seq, iter + start, iter + end, i.row_count) - iter; - sum += c_info[page_index].size_bytes; - } - return {i.row_count, sum, i.key}; - } -}; - -/** - * @brief Given a vector of cumulative {row_count, byte_size} pairs and a chunk read - * limit, determine the set of splits. - * - * @param sizes Vector of cumulative {row_count, byte_size} pairs - * @param num_rows Total number of rows to read - * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns - */ -std::vector find_splits(std::vector const& sizes, - size_t num_rows, - size_t chunk_read_limit) -{ - // now we have an array of {row_count, real output bytes}. just walk through it and generate - // splits. - // TODO: come up with a clever way to do this entirely in parallel. For now, as long as batch - // sizes are reasonably large, this shouldn't iterate too many times - std::vector splits; - { - size_t cur_pos = 0; - size_t cur_cumulative_size = 0; - size_t cur_row_count = 0; - auto start = thrust::make_transform_iterator(sizes.begin(), [&](cumulative_row_info const& i) { - return i.size_bytes - cur_cumulative_size; - }); - auto end = start + sizes.size(); - while (cur_row_count < num_rows) { - int64_t split_pos = - thrust::lower_bound(thrust::seq, start + cur_pos, end, chunk_read_limit) - start; - - // if we're past the end, or if the returned bucket is > than the chunk_read_limit, move back - // one. - if (static_cast(split_pos) >= sizes.size() || - (sizes[split_pos].size_bytes - cur_cumulative_size > chunk_read_limit)) { - split_pos--; - } - - // best-try. if we can't find something that'll fit, we have to go bigger. we're doing this in - // a loop because all of the cumulative sizes for all the pages are sorted into one big list. - // so if we had two columns, both of which had an entry {1000, 10000}, that entry would be in - // the list twice. so we have to iterate until we skip past all of them. The idea is that we - // either do this, or we have to call unique() on the input first. 
- while (split_pos < (static_cast(sizes.size()) - 1) && - (split_pos < 0 || sizes[split_pos].row_count == cur_row_count)) { - split_pos++; - } - - auto const start_row = cur_row_count; - cur_row_count = sizes[split_pos].row_count; - splits.push_back(chunk_read_info{start_row, cur_row_count - start_row}); - cur_pos = split_pos; - cur_cumulative_size = sizes[split_pos].size_bytes; - } - } - // print_cumulative_row_info(sizes, "adjusted", splits); - - return splits; -} - -/** - * @brief Given a set of pages that have had their sizes computed by nesting level and - * a limit on total read size, generate a set of {skip_rows, num_rows} pairs representing - * a set of reads that will generate output columns of total size <= `chunk_read_limit` bytes. - * - * @param pages All pages in the file - * @param id Additional intermediate information required to process the pages - * @param num_rows Total number of rows to read - * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns - * @param stream CUDA stream to use - */ -std::vector compute_splits(cudf::detail::hostdevice_vector& pages, - pass_intermediate_data const& id, - size_t num_rows, - size_t chunk_read_limit, - rmm::cuda_stream_view stream) -{ - auto const& page_keys = id.page_keys; - auto const& page_index = id.page_index; - - // generate cumulative row counts and sizes - rmm::device_uvector c_info(page_keys.size(), stream); - // convert PageInfo to cumulative_row_info - auto page_input = thrust::make_transform_iterator(page_index.begin(), - get_cumulative_row_info{pages.device_ptr()}); - thrust::inclusive_scan_by_key(rmm::exec_policy(stream), - page_keys.begin(), - page_keys.end(), - page_input, - c_info.begin(), - thrust::equal_to{}, - cumulative_row_sum{}); - // print_cumulative_page_info(pages, page_index, c_info, stream); - - // sort by row count - rmm::device_uvector c_info_sorted{c_info, stream}; - thrust::sort(rmm::exec_policy(stream), - c_info_sorted.begin(), - c_info_sorted.end(), - [] __device__(cumulative_row_info const& a, cumulative_row_info const& b) { - return a.row_count < b.row_count; - }); - - // std::vector h_c_info_sorted(c_info_sorted.size()); - // CUDF_CUDA_TRY(cudaMemcpy(h_c_info_sorted.data(), - // c_info_sorted.data(), - // sizeof(cumulative_row_info) * c_info_sorted.size(), - // cudaMemcpyDefault)); - // print_cumulative_row_info(h_c_info_sorted, "raw"); - - // generate key offsets (offsets to the start of each partition of keys). worst case is 1 page per - // key - rmm::device_uvector key_offsets(page_keys.size() + 1, stream); - auto const key_offsets_end = thrust::reduce_by_key(rmm::exec_policy(stream), - page_keys.begin(), - page_keys.end(), - thrust::make_constant_iterator(1), - thrust::make_discard_iterator(), - key_offsets.begin()) - .second; - size_t const num_unique_keys = key_offsets_end - key_offsets.begin(); - thrust::exclusive_scan( - rmm::exec_policy(stream), key_offsets.begin(), key_offsets.end(), key_offsets.begin()); - - // adjust the cumulative info such that for each row count, the size includes any pages that span - // that row count. this is so that if we have this case: - // page row counts - // Column A: 0 <----> 100 <----> 200 - // Column B: 0 <---------------> 200 <--------> 400 - // | - // if we decide to split at row 100, we don't really know the actual amount of bytes in column B - // at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that - // page. 
- // - rmm::device_uvector aggregated_info(c_info.size(), stream); - thrust::transform(rmm::exec_policy(stream), - c_info_sorted.begin(), - c_info_sorted.end(), - aggregated_info.begin(), - row_total_size{c_info.data(), key_offsets.data(), num_unique_keys}); - - // bring back to the cpu - std::vector h_aggregated_info(aggregated_info.size()); - CUDF_CUDA_TRY(cudaMemcpyAsync(h_aggregated_info.data(), - aggregated_info.data(), - sizeof(cumulative_row_info) * c_info.size(), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); - - return find_splits(h_aggregated_info, num_rows, chunk_read_limit); -} - struct get_page_chunk_idx { __device__ size_type operator()(PageInfo const& page) { return page.chunk_idx; } }; @@ -1822,12 +1272,8 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re _pass_itm_data->page_keys = std::move(page_keys); _pass_itm_data->page_index = std::move(page_index); - // compute splits if necessary. otherwise return a single split representing - // the whole file. - _pass_itm_data->output_chunk_read_info = - _output_chunk_read_limit > 0 - ? compute_splits(pages, *_pass_itm_data, num_rows, chunk_read_limit, _stream) - : std::vector{{skip_rows, num_rows}}; + // compute splits for the pass + compute_splits_for_pass(); } void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses_custom_row_bounds) diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index f3a43cbc63c..dd049d401cf 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -51,19 +51,21 @@ std::unique_ptr gather_column_buffer::make_string_column_impl(rmm::cuda_ return make_strings_column(*_strings, stream, _mr); } -void inline_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream) +void cudf::io::detail::inline_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream) { CUDF_EXPECTS(type.id() == type_id::STRING, "allocate_strings_data called for non-string column"); // size + 1 for final offset. _string_data will be initialized later. _data = create_data(data_type{type_id::INT32}, size + 1, stream, _mr); } -void inline_column_buffer::create_string_data(size_t num_bytes, rmm::cuda_stream_view stream) +void cudf::io::detail::inline_column_buffer::create_string_data(size_t num_bytes, + rmm::cuda_stream_view stream) { _string_data = rmm::device_buffer(num_bytes, stream, _mr); } -std::unique_ptr inline_column_buffer::make_string_column_impl(rmm::cuda_stream_view stream) +std::unique_ptr cudf::io::detail::inline_column_buffer::make_string_column_impl( + rmm::cuda_stream_view stream) { // no need for copies, just transfer ownership of the data_buffers to the columns auto const state = mask_state::UNALLOCATED; @@ -324,7 +326,7 @@ std::unique_ptr empty_like(column_buffer_base& buffer, } using pointer_type = gather_column_buffer; -using string_type = inline_column_buffer; +using string_type = cudf::io::detail::inline_column_buffer; using pointer_column_buffer = column_buffer_base; using string_column_buffer = column_buffer_base; From 053da82810ad78286602cfd09e37f8a22cb0a15b Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 10 Oct 2023 13:28:01 -0400 Subject: [PATCH 015/118] Make parquet schema index type consistent (#14256) While working on parquet schema issue I noticed that the parent and child index didn't match. Discussion ensued and `size_type` was decided. 
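
In code, the change boils down to the following before/after sketch (field names as in `SchemaElement`; this is an excerpt for illustration, not the full struct, and the original element type of `children_idx` is reconstructed here as `size_t`):

```cpp
// Before: parent and child indexes were stored with different integer types.
int parent_idx = 0;
std::vector<size_t> children_idx;

// After: both sides use cudf::size_type, so parent/child lookups no longer
// mix types.
size_type parent_idx = 0;
std::vector<size_type> children_idx;
```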
Authors:
  - Mike Wilson (https://github.com/hyperbolic2346)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - MithunR (https://github.com/mythrocks)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/14256
---
 cpp/src/io/parquet/parquet.hpp             | 6 ++++--
 cpp/src/io/parquet/reader_impl_helpers.cpp | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp
index c5993d73dec..dbec59670c7 100644
--- a/cpp/src/io/parquet/parquet.hpp
+++ b/cpp/src/io/parquet/parquet.hpp
@@ -18,6 +18,8 @@

 #include "parquet_common.hpp"

+#include <cudf/types.hpp>
+
 #include
 #include

@@ -152,8 +154,8 @@ struct SchemaElement {
   // The following fields are filled in later during schema initialization
   int max_definition_level = 0;
   int max_repetition_level = 0;
-  int parent_idx           = 0;
-  std::vector<size_t> children_idx;
+  size_type parent_idx     = 0;
+  std::vector<size_type> children_idx;

   bool operator==(SchemaElement const& other) const
   {
diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp
index 171cf07da3e..040c6403f57 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.cpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.cpp
@@ -213,7 +213,7 @@ void metadata::sanitize_schema()
       // add a struct child and move this element's children to the struct
       schema_elem.converted_type  = LIST;
       schema_elem.repetition_type = OPTIONAL;
-      auto const struct_node_idx = schema.size();
+      auto const struct_node_idx = static_cast<size_type>(schema.size());

       SchemaElement struct_elem;
       struct_elem.name = "struct_node";
@@ -51,6 +51,7 @@ namespace strings { * Default is pad right (left justify) * @param fill_char Single UTF-8 character to use for padding; * Default is the space character + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column with padded strings */ @@ -59,6 +60,7 @@ std::unique_ptr pad( size_type width, side_type side = side_type::RIGHT, std::string_view fill_char = " ", + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -79,14 +81,16 @@ std::unique_ptr pad( * r is now ['001234','-09876','+00.34','-342567', '0002+2'] * @endcode * - * @param input Strings instance for this operation. - * @param width The minimum number of characters for each string. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of strings. + * @param input Strings instance for this operation + * @param width The minimum number of characters for each string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of strings */ std::unique_ptr zfill( strings_column_view const& input, size_type width, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/slice.hpp b/cpp/include/cudf/strings/slice.hpp index 5f2c71725eb..f106663be9b 100644 --- a/cpp/include/cudf/strings/slice.hpp +++ b/cpp/include/cudf/strings/slice.hpp @@ -50,18 +50,20 @@ namespace strings { * r2 is now ["lo","ob"] * @endcode * - * @param strings Strings column for this operation. - * @param start First character position to begin the substring. - * @param stop Last character position (exclusive) to end the substring. - * @param step Distance between input characters retrieved. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column with sorted elements of this instance. + * @param input Strings column for this operation + * @param start First character position to begin the substring + * @param stop Last character position (exclusive) to end the substring + * @param step Distance between input characters retrieved + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column with sorted elements of this instance */ std::unique_ptr slice_strings( - strings_column_view const& strings, + strings_column_view const& input, numeric_scalar const& start = numeric_scalar(0, false), numeric_scalar const& stop = numeric_scalar(0, false), numeric_scalar const& step = numeric_scalar(1), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -95,16 +97,18 @@ std::unique_ptr slice_strings( * @throw cudf::logic_error if starts and stops are not same integer type. * @throw cudf::logic_error if starts or stops contains nulls. * - * @param strings Strings column for this operation. - * @param starts First character positions to begin the substring. 
- * @param stops Last character (exclusive) positions to end the substring. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column with sorted elements of this instance. + * @param input Strings column for this operation + * @param starts First character positions to begin the substring + * @param stops Last character (exclusive) positions to end the substring + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column with sorted elements of this instance */ std::unique_ptr slice_strings( - strings_column_view const& strings, + strings_column_view const& input, column_view const& starts, column_view const& stops, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/strip.hpp b/cpp/include/cudf/strings/strip.hpp index adf3b291144..556d6805ac3 100644 --- a/cpp/include/cudf/strings/strip.hpp +++ b/cpp/include/cudf/strings/strip.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -57,6 +57,7 @@ namespace strings { * string; Default is both * @param to_strip UTF-8 encoded characters to strip from each string; * Default is empty string which indicates strip whitespace characters + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column. */ @@ -64,6 +65,7 @@ std::unique_ptr strip( strings_column_view const& input, side_type side = side_type::BOTH, string_scalar const& to_strip = string_scalar(""), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/wrap.hpp b/cpp/include/cudf/strings/wrap.hpp index 8d2d43c7f0f..efdc3e62aff 100644 --- a/cpp/include/cudf/strings/wrap.hpp +++ b/cpp/include/cudf/strings/wrap.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -57,14 +57,16 @@ namespace strings { * wrapped_string_tbl = ["the quick\nbrown fox\njumped over\nthe lazy\nbrown dog", "hello, world"] * ``` * - * @param[in] strings String column. - * @param[in] width Maximum character width of a line within each string. - * @param[in] mr Device memory resource used to allocate the returned column's device memory - * @return Column of wrapped strings. 
+ * @param input String column + * @param width Maximum character width of a line within each string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Column of wrapped strings */ std::unique_ptr wrap( - strings_column_view const& strings, + strings_column_view const& input, size_type width, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu index c501a8bf7b4..850ccaa4535 100644 --- a/cpp/src/strings/padding.cu +++ b/cpp/src/strings/padding.cu @@ -168,18 +168,20 @@ std::unique_ptr pad(strings_column_view const& input, size_type width, side_type side, std::string_view fill_char, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::pad(input, width, side, fill_char, cudf::get_default_stream(), mr); + return detail::pad(input, width, side, fill_char, stream, mr); } std::unique_ptr zfill(strings_column_view const& input, size_type width, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::zfill(input, width, cudf::get_default_stream(), mr); + return detail::zfill(input, width, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/slice.cu b/cpp/src/strings/slice.cu index cce6a19a5a6..5a1fee92c7d 100644 --- a/cpp/src/strings/slice.cu +++ b/cpp/src/strings/slice.cu @@ -248,20 +248,21 @@ std::unique_ptr slice_strings(strings_column_view const& strings, numeric_scalar const& start, numeric_scalar const& stop, numeric_scalar const& step, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::slice_strings(strings, start, stop, step, cudf::get_default_stream(), mr); + return detail::slice_strings(strings, start, stop, step, stream, mr); } std::unique_ptr slice_strings(strings_column_view const& strings, column_view const& starts_column, column_view const& stops_column, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::slice_strings( - strings, starts_column, stops_column, cudf::get_default_stream(), mr); + return detail::slice_strings(strings, starts_column, stops_column, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/strip.cu b/cpp/src/strings/strip.cu index 6fb7c671a87..26df76850f7 100644 --- a/cpp/src/strings/strip.cu +++ b/cpp/src/strings/strip.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -86,10 +86,11 @@ std::unique_ptr strip(strings_column_view const& input, std::unique_ptr strip(strings_column_view const& input, side_type side, string_scalar const& to_strip, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::strip(input, side, to_strip, cudf::get_default_stream(), mr); + return detail::strip(input, side, to_strip, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/wrap.cu b/cpp/src/strings/wrap.cu index 335908d65d1..aa87a663964 100644 --- a/cpp/src/strings/wrap.cu +++ b/cpp/src/strings/wrap.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,10 +19,9 @@ #include #include #include -#include -#include #include #include +#include #include #include @@ -133,10 +132,11 @@ std::unique_ptr wrap(strings_column_view const& strings, std::unique_ptr wrap(strings_column_view const& strings, size_type width, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::wrap(strings, width, cudf::get_default_stream(), mr); + return detail::wrap(strings, width, stream, mr); } } // namespace strings diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index ffaba7d6fa7..b15a6c41d39 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -633,8 +633,8 @@ ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing) ConfigureTest( - STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/find_test.cpp STREAM_MODE - testing + STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/find_test.cpp + streams/strings/strings_tests.cpp STREAM_MODE testing ) ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_TEXT_TEST streams/text/ngrams_test.cpp STREAM_MODE testing) diff --git a/cpp/tests/streams/strings/strings_tests.cpp b/cpp/tests/streams/strings/strings_tests.cpp new file mode 100644 index 00000000000..0db467a6895 --- /dev/null +++ b/cpp/tests/streams/strings/strings_tests.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+
+class StringsTest : public cudf::test::BaseFixture {};
+
+TEST_F(StringsTest, Strip)
+{
+  auto input = cudf::test::strings_column_wrapper({" aBc ", " ", "aaaa ", "\tb"});
+  auto view  = cudf::strings_column_view(input);
+
+  auto const strip = cudf::string_scalar(" ", true, cudf::test::get_default_stream());
+  auto const side  = cudf::strings::side_type::BOTH;
+  cudf::strings::strip(view, side, strip, cudf::test::get_default_stream());
+}
+
+TEST_F(StringsTest, Pad)
+{
+  auto input = cudf::test::strings_column_wrapper({"333", "", "4444", "1"});
+  auto view  = cudf::strings_column_view(input);
+
+  auto const side = cudf::strings::side_type::BOTH;
+  cudf::strings::pad(view, 6, side, " ", cudf::test::get_default_stream());
+  cudf::strings::zfill(view, 6, cudf::test::get_default_stream());
+}
+
+TEST_F(StringsTest, Wrap)
+{
+  auto input = cudf::test::strings_column_wrapper({"the quick brown fox jumped"});
+  auto view  = cudf::strings_column_view(input);
+
+  cudf::strings::wrap(view, 6, cudf::test::get_default_stream());
+}
+
+TEST_F(StringsTest, Slice)
+{
+  auto input = cudf::test::strings_column_wrapper({"hello", "these", "are test strings"});
+  auto view  = cudf::strings_column_view(input);
+
+  auto start = cudf::numeric_scalar<cudf::size_type>(2, true, cudf::test::get_default_stream());
+  auto stop  = cudf::numeric_scalar<cudf::size_type>(5, true, cudf::test::get_default_stream());
+  auto step  = cudf::numeric_scalar<cudf::size_type>(1, true, cudf::test::get_default_stream());
+  cudf::strings::slice_strings(view, start, stop, step, cudf::test::get_default_stream());
+
+  auto starts = cudf::test::fixed_width_column_wrapper<cudf::size_type>({1, 2, 3});
+  auto stops  = cudf::test::fixed_width_column_wrapper<cudf::size_type>({4, 5, 6});
+  cudf::strings::slice_strings(view, starts, stops, cudf::test::get_default_stream());
+}

From c0c7ed8405c679752439081ee1b42b22658264c9 Mon Sep 17 00:00:00 2001
From: Martin Marenz
Date: Wed, 11 Oct 2023 00:04:58 +0200
Subject: [PATCH 017/118] Add `bytes_per_second` to transpose benchmark (#14170)

This patch relates to #13735.
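
As a rough sketch, the accounting introduced here boils down to the following (simplified from the diff below; `num_columns` and `num_rows` stand in for the benchmark's `input` view, and INT32 is the element type the benchmark uses):

```cpp
// Transpose reads every element once and writes every element once; input and
// output each also carry one null bitmask per column.
auto const bytes_read    = num_columns * num_rows * sizeof(int32_t);
auto const bytes_written = bytes_read;
auto const null_bytes    = 2 * num_columns * cudf::bitmask_allocation_size_bytes(num_rows);
state.SetBytesProcessed(state.iterations() * (bytes_read + bytes_written + null_bytes));
```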
Benchmark: [transpose_benchmark.txt](https://github.com/rapidsai/cudf/files/12699834/transpose_benchmark.txt)

Authors:
  - Martin Marenz (https://github.com/Blonck)
  - Mark Harris (https://github.com/harrism)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/14170
---
 cpp/benchmarks/transpose/transpose.cpp | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/cpp/benchmarks/transpose/transpose.cpp b/cpp/benchmarks/transpose/transpose.cpp
index 2f41bda4b88..c2737325462 100644
--- a/cpp/benchmarks/transpose/transpose.cpp
+++ b/cpp/benchmarks/transpose/transpose.cpp
@@ -20,17 +20,19 @@
 #include
 #include
 #include
+#include

 #include
 #include

 static void BM_transpose(benchmark::State& state)
 {
-  auto count = state.range(0);
+  auto count                    = state.range(0);
+  constexpr auto column_type_id = cudf::type_id::INT32;

   auto int_column_generator =
     thrust::make_transform_iterator(thrust::counting_iterator<int>(0), [count](int i) {
       return cudf::make_numeric_column(
-        cudf::data_type{cudf::type_id::INT32}, count, cudf::mask_state::ALL_VALID);
+        cudf::data_type{column_type_id}, count, cudf::mask_state::ALL_VALID);
     });

   auto input_table = cudf::table(std::vector(int_column_generator, int_column_generator + count));
@@ -40,6 +42,17 @@ static void BM_transpose(benchmark::State& state)
     cuda_event_timer raii(state, true);
     auto output = cudf::transpose(input);
   }
+
+  // Collect memory statistics.
+  auto const bytes_read = static_cast<uint64_t>(input.num_columns()) * input.num_rows() *
+                          sizeof(cudf::id_to_type<column_type_id>);
+  auto const bytes_written = bytes_read;
+  // Account for nullability in input and output.
+  auto const null_bytes = 2 * static_cast<uint64_t>(input.num_columns()) *
+                          cudf::bitmask_allocation_size_bytes(input.num_rows());
+
+  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) *
+                          (bytes_read + bytes_written + null_bytes));
 }

 class Transpose : public cudf::benchmark {};

From aa8b0f8e4e71a8e2b076656e0a8bf00bfc15ecb8 Mon Sep 17 00:00:00 2001
From: Martin Marenz
Date: Wed, 11 Oct 2023 00:06:23 +0200
Subject: [PATCH 018/118] Add `bytes_per_second` to shift benchmark (#13950)

Adds `bytes_per_second` to `SHIFT_BENCH`. This patch relates to #13735.
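
Roughly, the bytes accounted per iteration work out as follows (a sketch of the logic in the diff below, not a drop-in snippet):

```cpp
// Only the elements that survive the shift are read.
auto const bytes_read = (size - offset) * sizeof(column_type);
// A scalar fill writes the whole column; a null fill writes only the shifted
// elements but also touches a full null bitmask on both input and output.
auto const elems_written = use_validity ? (size - offset) : size;
auto const bytes_written = elems_written * sizeof(column_type);
auto const null_bytes    = use_validity ? 2 * cudf::bitmask_allocation_size_bytes(size) : 0;
```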
make_scalar() : make_scalar(777); + auto fill = use_validity ? make_scalar() : make_scalar(777); for (auto _ : state) { cuda_event_timer raii(state, true); auto output = cudf::shift(input, offset, *fill); } + + auto const elems_read = (size - offset); + auto const bytes_read = elems_read * sizeof(column_type); + + // If 'use_validity' is false, the fill value is a number, and the entire column + // (excluding the null bitmask) needs to be written. On the other hand, if 'use_validity' + // is true, only the elements that can be shifted are written, along with the full null bitmask. + auto const elems_written = use_validity ? (size - offset) : size; + auto const bytes_written = elems_written * sizeof(column_type); + auto const null_bytes = use_validity ? 2 * cudf::bitmask_allocation_size_bytes(size) : 0; + + state.SetBytesProcessed(static_cast(state.iterations()) * + (bytes_written + bytes_read + null_bytes)); } class Shift : public cudf::benchmark {}; From aa8b0f8e4e71a8e2b076656e0a8bf00bfc15ecb8 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 10 Oct 2023 16:14:51 -0700 Subject: [PATCH 019/118] Handle empty string correctly in Parquet statistics (#14257) An empty string should be a valid minimum value for a string column, but the current parquet writer considers an empty string to have no value when writing the column chunk statistics. This PR changes all fields in the Statistics struct to be `thrust::optional` to help distinguish between a valid empty string and no value. Authors: - Ed Seidl (https://github.com/etseidl) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14257 --- .../io/parquet/compact_protocol_reader.cpp | 15 ++-- .../io/parquet/compact_protocol_writer.cpp | 12 +-- cpp/src/io/parquet/parquet.hpp | 18 ++-- cpp/src/io/parquet/predicate_pushdown.cpp | 14 +-- cpp/tests/io/parquet_test.cpp | 85 +++++++++++++++---- 5 files changed, 104 insertions(+), 40 deletions(-) diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index 81d1be64a45..1a345ee0750 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -767,12 +767,15 @@ bool CompactProtocolReader::read(ColumnIndex* c) bool CompactProtocolReader::read(Statistics* s) { - auto op = std::make_tuple(parquet_field_binary(1, s->max), - parquet_field_binary(2, s->min), - parquet_field_int64(3, s->null_count), - parquet_field_int64(4, s->distinct_count), - parquet_field_binary(5, s->max_value), - parquet_field_binary(6, s->min_value)); + using optional_binary = parquet_field_optional, parquet_field_binary>; + using optional_int64 = parquet_field_optional; + + auto op = std::make_tuple(optional_binary(1, s->max), + optional_binary(2, s->min), + optional_int64(3, s->null_count), + optional_int64(4, s->distinct_count), + optional_binary(5, s->max_value), + optional_binary(6, s->min_value)); return function_builder(this, op); } diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp index 9adc8767880..00810269d3c 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.cpp +++ b/cpp/src/io/parquet/compact_protocol_writer.cpp @@ -195,12 +195,12 @@ size_t CompactProtocolWriter::write(ColumnChunkMetaData const& s) size_t CompactProtocolWriter::write(Statistics const& s) { CompactProtocolFieldWriter c(*this); - if (not s.max.empty()) { 
c.field_binary(1, s.max); } - if (not s.min.empty()) { c.field_binary(2, s.min); } - if (s.null_count != -1) { c.field_int(3, s.null_count); } - if (s.distinct_count != -1) { c.field_int(4, s.distinct_count); } - if (not s.max_value.empty()) { c.field_binary(5, s.max_value); } - if (not s.min_value.empty()) { c.field_binary(6, s.min_value); } + if (s.max.has_value()) { c.field_binary(1, s.max.value()); } + if (s.min.has_value()) { c.field_binary(2, s.min.value()); } + if (s.null_count.has_value()) { c.field_int(3, s.null_count.value()); } + if (s.distinct_count.has_value()) { c.field_int(4, s.distinct_count.value()); } + if (s.max_value.has_value()) { c.field_binary(5, s.max_value.value()); } + if (s.min_value.has_value()) { c.field_binary(6, s.min_value.value()); } return c.value(); } diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index dbec59670c7..1cd16ac6102 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -215,12 +215,18 @@ struct SchemaElement { * @brief Thrift-derived struct describing column chunk statistics */ struct Statistics { - std::vector max; // deprecated max value in signed comparison order - std::vector min; // deprecated min value in signed comparison order - int64_t null_count = -1; // count of null values in the column - int64_t distinct_count = -1; // count of distinct values occurring - std::vector max_value; // max value for column determined by ColumnOrder - std::vector min_value; // min value for column determined by ColumnOrder + // deprecated max value in signed comparison order + thrust::optional> max; + // deprecated min value in signed comparison order + thrust::optional> min; + // count of null values in the column + thrust::optional null_count; + // count of distinct values occurring + thrust::optional distinct_count; + // max value for column determined by ColumnOrder + thrust::optional> max_value; + // min value for column determined by ColumnOrder + thrust::optional> min_value; }; /** diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index 9083be1c2dd..a5851de3c20 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -150,12 +150,14 @@ struct stats_caster { { } - void set_index(size_type index, std::vector const& binary_value, Type const type) + void set_index(size_type index, + thrust::optional> const& binary_value, + Type const type) { - if (!binary_value.empty()) { - val[index] = convert(binary_value.data(), binary_value.size(), type); + if (binary_value.has_value()) { + val[index] = convert(binary_value.value().data(), binary_value.value().size(), type); } - if (binary_value.empty()) { + if (not binary_value.has_value()) { clear_bit_unsafe(null_mask.data(), index); null_count++; } @@ -210,10 +212,10 @@ struct stats_caster { auto const& row_group = per_file_metadata[src_idx].row_groups[rg_idx]; auto const& colchunk = row_group.columns[col_idx]; // To support deprecated min, max fields. - auto const& min_value = colchunk.meta_data.statistics.min_value.size() > 0 + auto const& min_value = colchunk.meta_data.statistics.min_value.has_value() ? colchunk.meta_data.statistics.min_value : colchunk.meta_data.statistics.min; - auto const& max_value = colchunk.meta_data.statistics.max_value.size() > 0 + auto const& max_value = colchunk.meta_data.statistics.max_value.has_value() ? 
colchunk.meta_data.statistics.max_value : colchunk.meta_data.statistics.max; // translate binary data to Type then to diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 3e5d7033e60..fa85e3a4a1d 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -4161,8 +4161,10 @@ TEST_P(ParquetV2Test, LargeColumnIndex) // check trunc(page.min) <= stats.min && trun(page.max) >= stats.max auto const ptype = fmd.schema[c + 1].type; auto const ctype = fmd.schema[c + 1].converted_type; - EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value, ptype, ctype) <= 0); - EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value, ptype, ctype) >= 0); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value.value(), ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value.value(), ptype, ctype) >= 0); } } } @@ -4242,6 +4244,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) auto const ci = read_column_index(source, chunk); auto const stats = get_statistics(chunk); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + // schema indexing starts at 1 auto const ptype = fmd.schema[c + 1].type; auto const ctype = fmd.schema[c + 1].converted_type; @@ -4250,10 +4255,10 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) EXPECT_FALSE(ci.null_pages[p]); // null_counts should always be 0 EXPECT_EQ(ci.null_counts[p], 0); - EXPECT_TRUE(compare_binary(stats.min_value, ci.min_values[p], ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0); } for (size_t p = 0; p < ci.max_values.size(); p++) - EXPECT_TRUE(compare_binary(stats.max_value, ci.max_values[p], ptype, ctype) >= 0); + EXPECT_TRUE(compare_binary(stats.max_value.value(), ci.max_values[p], ptype, ctype) >= 0); } } } @@ -4344,7 +4349,10 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) auto const stats = get_statistics(chunk); // should be half nulls, except no nulls in column 0 - EXPECT_EQ(stats.null_count, c == 0 ? 0 : num_rows / 2); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + ASSERT_TRUE(stats.null_count.has_value()); + EXPECT_EQ(stats.null_count.value(), c == 0 ? 0 : num_rows / 2); // schema indexing starts at 1 auto const ptype = fmd.schema[c + 1].type; @@ -4356,10 +4364,10 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) } else { EXPECT_EQ(ci.null_counts[p], 0); } - EXPECT_TRUE(compare_binary(stats.min_value, ci.min_values[p], ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0); } for (size_t p = 0; p < ci.max_values.size(); p++) { - EXPECT_TRUE(compare_binary(stats.max_value, ci.max_values[p], ptype, ctype) >= 0); + EXPECT_TRUE(compare_binary(stats.max_value.value(), ci.max_values[p], ptype, ctype) >= 0); } } } @@ -4436,7 +4444,12 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) auto const stats = get_statistics(chunk); // there should be no nulls except column 1 which is all nulls - EXPECT_EQ(stats.null_count, c == 1 ? num_rows : 0); + if (c != 1) { + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + } + ASSERT_TRUE(stats.null_count.has_value()); + EXPECT_EQ(stats.null_count.value(), c == 1 ? 
num_rows : 0); // schema indexing starts at 1 auto const ptype = fmd.schema[c + 1].type; @@ -4449,12 +4462,12 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) } if (not ci.null_pages[p]) { EXPECT_EQ(ci.null_counts[p], 0); - EXPECT_TRUE(compare_binary(stats.min_value, ci.min_values[p], ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0); } } for (size_t p = 0; p < ci.max_values.size(); p++) { if (not ci.null_pages[p]) { - EXPECT_TRUE(compare_binary(stats.max_value, ci.max_values[p], ptype, ctype) >= 0); + EXPECT_TRUE(compare_binary(stats.max_value.value(), ci.max_values[p], ptype, ctype) >= 0); } } } @@ -4533,13 +4546,16 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) auto const ci = read_column_index(source, chunk); auto const stats = get_statistics(chunk); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + auto const ptype = fmd.schema[colidx].type; auto const ctype = fmd.schema[colidx].converted_type; for (size_t p = 0; p < ci.min_values.size(); p++) { - EXPECT_TRUE(compare_binary(stats.min_value, ci.min_values[p], ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0); } for (size_t p = 0; p < ci.max_values.size(); p++) { - EXPECT_TRUE(compare_binary(stats.max_value, ci.max_values[p], ptype, ctype) >= 0); + EXPECT_TRUE(compare_binary(stats.max_value.value(), ci.max_values[p], ptype, ctype) >= 0); } } } @@ -4829,11 +4845,14 @@ TEST_F(ParquetWriterTest, CheckColumnIndexTruncation) auto const ci = read_column_index(source, chunk); auto const stats = get_statistics(chunk); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + // check trunc(page.min) <= stats.min && trun(page.max) >= stats.max auto const ptype = fmd.schema[c + 1].type; auto const ctype = fmd.schema[c + 1].converted_type; - EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value, ptype, ctype) <= 0); - EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value, ptype, ctype) >= 0); + EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value.value(), ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value.value(), ptype, ctype) >= 0); // check that truncated values == expected EXPECT_EQ(memcmp(ci.min_values[0].data(), truncated_min[c], ci.min_values[0].size()), 0); @@ -4890,8 +4909,10 @@ TEST_F(ParquetWriterTest, BinaryColumnIndexTruncation) // check trunc(page.min) <= stats.min && trun(page.max) >= stats.max auto const ptype = fmd.schema[c + 1].type; auto const ctype = fmd.schema[c + 1].converted_type; - EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value, ptype, ctype) <= 0); - EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value, ptype, ctype) >= 0); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value.value(), ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value.value(), ptype, ctype) >= 0); // check that truncated values == expected EXPECT_EQ(ci.min_values[0], truncated_min[c]); @@ -6737,6 +6758,38 @@ TEST_P(ParquetV2Test, CheckEncodings) } } +TEST_F(ParquetWriterTest, EmptyMinStringStatistics) +{ + char const* const min_val = ""; + char const* const max_val = "zzz"; + std::vector strings{min_val, max_val, "pining", "for", "the", "fjords"}; + + column_wrapper string_col{strings.begin(), strings.end()}; + auto const output = 
table_view{{string_col}}; + auto const filepath = temp_env->get_temp_filepath("EmptyMinStringStatistics.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, output); + cudf::io::write_parquet(out_opts); + + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + read_footer(source, &fmd); + + ASSERT_TRUE(fmd.row_groups.size() > 0); + ASSERT_TRUE(fmd.row_groups[0].columns.size() > 0); + auto const& chunk = fmd.row_groups[0].columns[0]; + auto const stats = get_statistics(chunk); + + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + auto const min_value = std::string{reinterpret_cast(stats.min_value.value().data()), + stats.min_value.value().size()}; + auto const max_value = std::string{reinterpret_cast(stats.max_value.value().data()), + stats.max_value.value().size()}; + EXPECT_EQ(min_value, std::string(min_val)); + EXPECT_EQ(max_value, std::string(max_val)); +} + TEST_F(ParquetReaderTest, RepeatedNoAnnotations) { constexpr unsigned char repeated_bytes[] = { From b17904dbaa4de1a162fcb4a0f64862f9f83b976f Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Tue, 10 Oct 2023 19:51:02 -0500 Subject: [PATCH 020/118] Add in java bindings for DataSource (#14254) This PR adds DataSource Java bindings. It also fixes a small bug in CUDF that made it so the bindings would not work for anything but CSV. Authors: - Robert (Bobby) Evans (https://github.com/revans2) Approvers: - Jason Lowe (https://github.com/jlowe) - Vukasin Milovanovic (https://github.com/vuule) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/14254 --- cpp/src/io/utilities/datasource.cpp | 8 + java/src/main/java/ai/rapids/cudf/Cuda.java | 24 +- .../main/java/ai/rapids/cudf/DataSource.java | 189 ++++++++++++++ .../java/ai/rapids/cudf/DataSourceHelper.java | 44 ++++ .../ai/rapids/cudf/DeviceMemoryBuffer.java | 6 +- .../ai/rapids/cudf/MultiBufferDataSource.java | 230 +++++++++++++++++ .../ai/rapids/cudf/ParquetChunkedReader.java | 45 +++- java/src/main/java/ai/rapids/cudf/Table.java | 99 +++++++- java/src/main/native/CMakeLists.txt | 1 + java/src/main/native/src/ChunkedReaderJni.cpp | 36 ++- java/src/main/native/src/CudfJni.cpp | 8 + .../main/native/src/DataSourceHelperJni.cpp | 237 ++++++++++++++++++ java/src/main/native/src/TableJni.cpp | 212 +++++++++++++++- java/src/main/native/src/cudf_jni_apis.hpp | 8 + .../test/java/ai/rapids/cudf/TableTest.java | 225 +++++++++++++++++ 15 files changed, 1358 insertions(+), 14 deletions(-) create mode 100644 java/src/main/java/ai/rapids/cudf/DataSource.java create mode 100644 java/src/main/java/ai/rapids/cudf/DataSourceHelper.java create mode 100644 java/src/main/java/ai/rapids/cudf/MultiBufferDataSource.java create mode 100644 java/src/main/native/src/DataSourceHelperJni.cpp diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 7a7121aa91d..5cdd92ce3b7 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -375,6 +375,14 @@ class user_datasource_wrapper : public datasource { return source->device_read(offset, size, stream); } + std::future device_read_async(size_t offset, + size_t size, + uint8_t* dst, + rmm::cuda_stream_view stream) override + { + return source->device_read_async(offset, size, dst, stream); + } + [[nodiscard]] size_t size() const override { return 
source->size(); } private: diff --git a/java/src/main/java/ai/rapids/cudf/Cuda.java b/java/src/main/java/ai/rapids/cudf/Cuda.java index e1298e29925..7cc3d30a9cf 100755 --- a/java/src/main/java/ai/rapids/cudf/Cuda.java +++ b/java/src/main/java/ai/rapids/cudf/Cuda.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,9 +15,6 @@ */ package ai.rapids.cudf; -import ai.rapids.cudf.NvtxColor; -import ai.rapids.cudf.NvtxRange; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -90,6 +87,21 @@ private Stream() { this.id = -1; } + private Stream(long id) { + this.cleaner = null; + this.id = id; + } + + /** + * Wrap a given stream ID to make it accessible. + */ + static Stream wrap(long id) { + if (id == -1) { + return DEFAULT_STREAM; + } + return new Stream(id); + } + /** * Have this stream not execute new work until the work recorded in event completes. * @param event the event to wait on. @@ -122,7 +134,9 @@ public synchronized void close() { cleaner.delRef(); } if (closed) { - cleaner.logRefCountDebug("double free " + this); + if (cleaner != null) { + cleaner.logRefCountDebug("double free " + this); + } throw new IllegalStateException("Close called too many times " + this); } if (cleaner != null) { diff --git a/java/src/main/java/ai/rapids/cudf/DataSource.java b/java/src/main/java/ai/rapids/cudf/DataSource.java new file mode 100644 index 00000000000..1e5893235df --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/DataSource.java @@ -0,0 +1,189 @@ +/* + * + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.HashMap; + +/** + * Base class that can be used to provide data dynamically to CUDF. This follows somewhat + * closely with cudf::io::datasource. There are a few main differences. + *
+ * First this does not expose async device reads. It will call the non-async device read API + * instead. This might be added in the future, but there was no direct use case for it in java + * right now to warrant the added complexity. + *
+ * Second there is no implementation of the device read API that returns a buffer instead of + * writing into one. This is not used by CUDF yet so testing an implementation that isn't used + * didn't feel ideal. If it is needed we will add one in the future. + */ +public abstract class DataSource implements AutoCloseable { + private static final Logger log = LoggerFactory.getLogger(DataSource.class); + + /** + * This is used to keep track of the HostMemoryBuffers in java land so the C++ layer + * does not have to do it. + */ + private final HashMap cachedBuffers = new HashMap<>(); + + @Override + public void close() { + if (!cachedBuffers.isEmpty()) { + throw new IllegalStateException("DataSource closed before all returned host buffers were closed"); + } + } + + /** + * Get the size of the source in bytes. + */ + public abstract long size(); + + /** + * Read data from the source at the given offset. Return a HostMemoryBuffer for the data + * that was read. + * @param offset where to start reading from. + * @param amount the maximum number of bytes to read. + * @return a buffer that points to the data. + * @throws IOException on any error. + */ + public abstract HostMemoryBuffer hostRead(long offset, long amount) throws IOException; + + + /** + * Called when the buffer returned from hostRead is done. The default is to close the buffer. + */ + protected void onHostBufferDone(HostMemoryBuffer buffer) { + if (buffer != null) { + buffer.close(); + } + } + + /** + * Read data from the source at the given offset into dest. Note that dest should not be closed, + * and no reference to it can outlive the call to hostRead. The target amount to read is + * dest's length. + * @param offset the offset to start reading from in the source. + * @param dest where to write the data. + * @return the actual number of bytes written to dest. + */ + public abstract long hostRead(long offset, HostMemoryBuffer dest) throws IOException; + + /** + * Return true if this supports reading directly to the device else false. The default is + * no device support. This cannot change dynamically. It is typically read just once. + */ + public boolean supportsDeviceRead() { + return false; + } + + /** + * Get the size cutoff between device reads and host reads when device reads are supported. + * Anything larger than the cutoff will be a device read and anything smaller will be a + * host read. By default, the cutoff is 0 so all reads will be device reads if device reads + * are supported. + */ + public long getDeviceReadCutoff() { + return 0; + } + + /** + * Read data from the source at the given offset into dest. Note that dest should not be closed, + * and no reference to it can outlive the call to hostRead. The target amount to read is + * dest's length. + * @param offset the offset to start reading from + * @param dest where to write the data. + * @param stream the stream to do the copy on. + * @return the actual number of bytes written to dest. 
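+ * <p>
+ * As an illustrative sketch only (not part of this PR), an override could stage
+ * the read through {@code hostRead} plus an async copy; it assumes the given
+ * stream can be synchronized before the temporary host buffer is released:
+ * <pre>{@code
+ * public long deviceRead(long offset, DeviceMemoryBuffer dest, Cuda.Stream stream)
+ *     throws IOException {
+ *   try (HostMemoryBuffer tmp = hostRead(offset, dest.getLength())) {
+ *     dest.copyFromHostBufferAsync(0, tmp, 0, tmp.getLength(), stream);
+ *     stream.sync(); // the copy must finish before tmp is closed
+ *     return tmp.getLength();
+ *   }
+ * }
+ * }</pre>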
+ */ + public long deviceRead(long offset, DeviceMemoryBuffer dest, + Cuda.Stream stream) throws IOException { + throw new IllegalStateException("Device read is not implemented"); + } + + ///////////////////////////////////////////////// + // Internal methods called from JNI + ///////////////////////////////////////////////// + + private static class NoopCleaner extends MemoryBuffer.MemoryBufferCleaner { + @Override + protected boolean cleanImpl(boolean logErrorIfNotClean) { + return true; + } + + @Override + public boolean isClean() { + return true; + } + } + private static final NoopCleaner cleaner = new NoopCleaner(); + + // Called from JNI + private void onHostBufferDone(long bufferId) { + HostMemoryBuffer hmb = cachedBuffers.remove(bufferId); + if (hmb != null) { + onHostBufferDone(hmb); + } else { + // Called from C++ destructor so avoid throwing... + log.warn("Got a close callback for a buffer we could not find " + bufferId); + } + } + + // Called from JNI + private long hostRead(long offset, long amount, long dst) throws IOException { + if (amount < 0) { + throw new IllegalArgumentException("Cannot allocate more than " + Long.MAX_VALUE + " bytes"); + } + try (HostMemoryBuffer dstBuffer = new HostMemoryBuffer(dst, amount, cleaner)) { + return hostRead(offset, dstBuffer); + } + } + + // Called from JNI + private long[] hostReadBuff(long offset, long amount) throws IOException { + if (amount < 0) { + throw new IllegalArgumentException("Cannot read more than " + Long.MAX_VALUE + " bytes"); + } + HostMemoryBuffer buff = hostRead(offset, amount); + long[] ret = new long[3]; + if (buff != null) { + long id = buff.id; + if (cachedBuffers.put(id, buff) != null) { + throw new IllegalStateException("Already had a buffer cached for " + buff); + } + ret[0] = buff.address; + ret[1] = buff.length; + ret[2] = id; + } // else they are all 0 because java does that already + return ret; + } + + // Called from JNI + private long deviceRead(long offset, long amount, long dst, long stream) throws IOException { + if (amount < 0) { + throw new IllegalArgumentException("Cannot read more than " + Long.MAX_VALUE + " bytes"); + } + Cuda.Stream strm = Cuda.Stream.wrap(stream); + try (DeviceMemoryBuffer dstBuffer = new DeviceMemoryBuffer(dst, amount, cleaner)) { + return deviceRead(offset, dstBuffer, strm); + } + } +} diff --git a/java/src/main/java/ai/rapids/cudf/DataSourceHelper.java b/java/src/main/java/ai/rapids/cudf/DataSourceHelper.java new file mode 100644 index 00000000000..5d4dcb8e4e7 --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/DataSourceHelper.java @@ -0,0 +1,44 @@ +/* + * + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +/** + * This is here because we need some JNI methods to work with a DataSource, but + * we also want to cache callback methods at startup for performance reasons. If + * we put both in the same class we will get a deadlock because of how we load + * the JNI. 
We have a static block that blocks loading the class until the JNI + * library is loaded and the JNI library cannot load until the class is loaded + * and cached. This breaks the loop. + */ +class DataSourceHelper { + static { + NativeDepsLoader.loadNativeDeps(); + } + + static long createWrapperDataSource(DataSource ds) { + return createWrapperDataSource(ds, ds.size(), ds.supportsDeviceRead(), + ds.getDeviceReadCutoff()); + } + + private static native long createWrapperDataSource(DataSource ds, long size, + boolean deviceReadSupport, + long deviceReadCutoff); + + static native void destroyWrapperDataSource(long handle); +} diff --git a/java/src/main/java/ai/rapids/cudf/DeviceMemoryBuffer.java b/java/src/main/java/ai/rapids/cudf/DeviceMemoryBuffer.java index c4d9bdb8f91..9eab607ed0b 100644 --- a/java/src/main/java/ai/rapids/cudf/DeviceMemoryBuffer.java +++ b/java/src/main/java/ai/rapids/cudf/DeviceMemoryBuffer.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -112,6 +112,10 @@ public static DeviceMemoryBuffer fromRmm(long address, long lengthInBytes, long return new DeviceMemoryBuffer(address, lengthInBytes, rmmBufferAddress); } + DeviceMemoryBuffer(long address, long lengthInBytes, MemoryBufferCleaner cleaner) { + super(address, lengthInBytes, cleaner); + } + DeviceMemoryBuffer(long address, long lengthInBytes, long rmmBufferAddress) { super(address, lengthInBytes, new RmmDeviceBufferCleaner(rmmBufferAddress)); } diff --git a/java/src/main/java/ai/rapids/cudf/MultiBufferDataSource.java b/java/src/main/java/ai/rapids/cudf/MultiBufferDataSource.java new file mode 100644 index 00000000000..6986b6a7fec --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/MultiBufferDataSource.java @@ -0,0 +1,230 @@ +/* + * + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +/** + * This is a DataSource that can take multiple HostMemoryBuffers. They + * are treated as if they are all part of a single file connected end to end. + */ +public class MultiBufferDataSource extends DataSource { + private final long sizeInBytes; + private final HostMemoryBuffer[] hostBuffers; + private final long[] startOffsets; + private final HostMemoryAllocator allocator; + + // Metrics + private long hostReads = 0; + private long hostReadBytes = 0; + private long devReads = 0; + private long devReadBytes = 0; + + /** + * Create a new data source backed by multiple buffers. + * @param buffers the buffers that will back the data source. + */ + public MultiBufferDataSource(HostMemoryBuffer ... buffers) { + this(DefaultHostMemoryAllocator.get(), buffers); + } + + /** + * Create a new data source backed by multiple buffers. + * @param allocator the allocator to use for host buffers, if needed. 
+ * @param buffers the buffers that will back the data source. + */ + public MultiBufferDataSource(HostMemoryAllocator allocator, HostMemoryBuffer ... buffers) { + int numBuffers = buffers.length; + hostBuffers = new HostMemoryBuffer[numBuffers]; + startOffsets = new long[numBuffers]; + + long currentOffset = 0; + for (int i = 0; i < numBuffers; i++) { + HostMemoryBuffer hmb = buffers[i]; + hmb.incRefCount(); + hostBuffers[i] = hmb; + startOffsets[i] = currentOffset; + currentOffset += hmb.getLength(); + } + sizeInBytes = currentOffset; + this.allocator = allocator; + } + + @Override + public long size() { + return sizeInBytes; + } + + private int getStartBufferIndexForOffset(long offset) { + assert (offset >= 0); + + // It is super common to read from the start or end of a file (the header or footer) + // so special case them + if (offset == 0) { + return 0; + } + int startIndex = 0; + int endIndex = startOffsets.length - 1; + if (offset >= startOffsets[endIndex]) { + return endIndex; + } + while (startIndex != endIndex) { + int midIndex = (int)(((long)startIndex + endIndex) / 2); + long midStartOffset = startOffsets[midIndex]; + if (offset >= midStartOffset) { + // It is either in mid or after mid. + if (midIndex == endIndex || offset <= startOffsets[midIndex + 1]) { + // We found it in mid + return midIndex; + } else { + // It is after mid + startIndex = midIndex + 1; + } + } else { + // It is before mid + endIndex = midIndex - 1; + } + } + return startIndex; + } + + + interface DoCopy { + void copyFromHostBuffer(T dest, long destOffset, HostMemoryBuffer src, + long srcOffset, long srcAmount); + } + + private long read(long offset, T dest, DoCopy doCopy) { + assert (offset >= 0); + long realOffset = Math.min(offset, sizeInBytes); + long realAmount = Math.min(sizeInBytes - realOffset, dest.getLength()); + + int index = getStartBufferIndexForOffset(realOffset); + + HostMemoryBuffer buffer = hostBuffers[index]; + long bufferOffset = realOffset - startOffsets[index]; + long bufferAmount = Math.min(buffer.length - bufferOffset, realAmount); + long remainingAmount = realAmount; + long currentOffset = realOffset; + long outputOffset = 0; + + while (remainingAmount > 0) { + doCopy.copyFromHostBuffer(dest, outputOffset, buffer, + bufferOffset, bufferAmount); + remainingAmount -= bufferAmount; + outputOffset += bufferAmount; + currentOffset += bufferAmount; + index++; + if (index < hostBuffers.length) { + buffer = hostBuffers[index]; + bufferOffset = currentOffset - startOffsets[index]; + bufferAmount = Math.min(buffer.length - bufferOffset, remainingAmount); + } + } + + return realAmount; + } + + @Override + public HostMemoryBuffer hostRead(long offset, long amount) { + assert (offset >= 0); + assert (amount >= 0); + long realOffset = Math.min(offset, sizeInBytes); + long realAmount = Math.min(sizeInBytes - realOffset, amount); + + int index = getStartBufferIndexForOffset(realOffset); + + HostMemoryBuffer buffer = hostBuffers[index]; + long bufferOffset = realOffset - startOffsets[index]; + long bufferAmount = Math.min(buffer.length - bufferOffset, realAmount); + if (bufferAmount == realAmount) { + hostReads += 1; + hostReadBytes += realAmount; + // It all fits in a single buffer, so do a zero copy operation + return buffer.slice(bufferOffset, bufferAmount); + } else { + // We will have to allocate a new buffer and copy data into it. 
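+      // Worked example (hypothetical sizes): with backing buffers of 100 and 50
+      // bytes, startOffsets is [0, 100]. A hostRead at offset 80 for 40 bytes has
+      // bufferOffset 80 and bufferAmount 20 in buffer 0, so it cannot be served by
+      // a zero-copy slice; read() copies 20 bytes from buffer 0 and the remaining
+      // 20 bytes from buffer 1 into the newly allocated buffer below.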
+ boolean success = false; + HostMemoryBuffer ret = allocator.allocate(realAmount, true); + try { + long amountRead = read(offset, ret, HostMemoryBuffer::copyFromHostBuffer); + assert(amountRead == realAmount); + hostReads += 1; + hostReadBytes += amountRead; + success = true; + return ret; + } finally { + if (!success) { + ret.close(); + } + } + } + } + + @Override + public long hostRead(long offset, HostMemoryBuffer dest) { + long ret = read(offset, dest, HostMemoryBuffer::copyFromHostBuffer); + hostReads += 1; + hostReadBytes += ret; + return ret; + } + + @Override + public boolean supportsDeviceRead() { + return true; + } + + @Override + public long deviceRead(long offset, DeviceMemoryBuffer dest, + Cuda.Stream stream) { + long ret = read(offset, dest, (destParam, destOffset, src, srcOffset, srcAmount) -> + destParam.copyFromHostBufferAsync(destOffset, src, srcOffset, srcAmount, stream)); + devReads += 1; + devReadBytes += ret; + return ret; + } + + + @Override + public void close() { + try { + super.close(); + } finally { + for (HostMemoryBuffer hmb: hostBuffers) { + if (hmb != null) { + hmb.close(); + } + } + } + } + + public long getHostReads() { + return hostReads; + } + + public long getHostReadBytes() { + return hostReadBytes; + } + + public long getDevReads() { + return devReads; + } + + public long getDevReadBytes() { + return devReadBytes; + } +} diff --git a/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java b/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java index c34336ac73f..17d59b757c3 100644 --- a/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java +++ b/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -51,7 +51,7 @@ public ParquetChunkedReader(long chunkSizeByteLimit, ParquetOptions opts, File f handle = create(chunkSizeByteLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), filePath.getAbsolutePath(), 0, 0, opts.timeUnit().typeId.getNativeId()); - if(handle == 0) { + if (handle == 0) { throw new IllegalStateException("Cannot create native chunked Parquet reader object."); } } @@ -71,18 +71,45 @@ public ParquetChunkedReader(long chunkSizeByteLimit, ParquetOptions opts, HostMe handle = create(chunkSizeByteLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), null, buffer.getAddress() + offset, len, opts.timeUnit().typeId.getNativeId()); - if(handle == 0) { + if (handle == 0) { throw new IllegalStateException("Cannot create native chunked Parquet reader object."); } } + /** + * Construct a reader instance from a DataSource + * @param chunkSizeByteLimit Limit on total number of bytes to be returned per read, + * or 0 if there is no limit. + * @param opts The options for Parquet reading. 
+ * @param ds the data source to read from + */ + public ParquetChunkedReader(long chunkSizeByteLimit, ParquetOptions opts, DataSource ds) { + dataSourceHandle = DataSourceHelper.createWrapperDataSource(ds); + if (dataSourceHandle == 0) { + throw new IllegalStateException("Cannot create native datasource object"); + } + + boolean passed = false; + try { + handle = createWithDataSource(chunkSizeByteLimit, opts.getIncludeColumnNames(), + opts.getReadBinaryAsString(), opts.timeUnit().typeId.getNativeId(), + dataSourceHandle); + passed = true; + } finally { + if (!passed) { + DataSourceHelper.destroyWrapperDataSource(dataSourceHandle); + dataSourceHandle = 0; + } + } + } + /** * Check if the given file has anything left to read. * * @return A boolean value indicating if there is more data to read from file. */ public boolean hasNext() { - if(handle == 0) { + if (handle == 0) { throw new IllegalStateException("Native chunked Parquet reader object may have been closed."); } @@ -104,7 +131,7 @@ public boolean hasNext() { * @return A table of new rows reading from the given file. */ public Table readChunk() { - if(handle == 0) { + if (handle == 0) { throw new IllegalStateException("Native chunked Parquet reader object may have been closed."); } @@ -118,6 +145,10 @@ public void close() { close(handle); handle = 0; } + if (dataSourceHandle != 0) { + DataSourceHelper.destroyWrapperDataSource(dataSourceHandle); + dataSourceHandle = 0; + } } @@ -131,6 +162,7 @@ public void close() { */ private long handle; + private long dataSourceHandle = 0; /** * Create a native chunked Parquet reader object on heap and return its memory address. @@ -147,6 +179,9 @@ public void close() { private static native long create(long chunkSizeByteLimit, String[] filterColumnNames, boolean[] binaryToString, String filePath, long bufferAddrs, long length, int timeUnit); + private static native long createWithDataSource(long chunkedSizeByteLimit, + String[] filterColumnNames, boolean[] binaryToString, int timeUnit, long dataSourceHandle); + private static native boolean hasNext(long handle); private static native long[] readChunk(long handle); diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 51a33ebb72f..3bd1e3f25a7 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -235,6 +235,14 @@ private static native long[] readCSV(String[] columnNames, byte comment, String[] nullValues, String[] trueValues, String[] falseValues) throws CudfException; + private static native long[] readCSVFromDataSource(String[] columnNames, + int[] dTypeIds, int[] dTypeScales, + String[] filterColumnNames, + int headerRow, byte delim, int quoteStyle, byte quote, + byte comment, String[] nullValues, + String[] trueValues, String[] falseValues, + long dataSourceHandle) throws CudfException; + /** * read JSON data and return a pointer to a TableWithMeta object. 
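 * The returned handle is owned by the TableWithMeta wrapper that the readJSON
 * overloads below construct around it, and is released when that wrapper closes.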
*/ @@ -244,6 +252,12 @@ private static native long readJSON(String[] columnNames, boolean dayFirst, boolean lines, boolean recoverWithNulls) throws CudfException; + private static native long readJSONFromDataSource(String[] columnNames, + int[] dTypeIds, int[] dTypeScales, + boolean dayFirst, boolean lines, + boolean recoverWithNulls, + long dsHandle) throws CudfException; + private static native long readAndInferJSON(long address, long length, boolean dayFirst, boolean lines, boolean recoverWithNulls) throws CudfException; @@ -260,6 +274,10 @@ private static native long readAndInferJSON(long address, long length, private static native long[] readParquet(String[] filterColumnNames, boolean[] binaryToString, String filePath, long address, long length, int timeUnit) throws CudfException; + private static native long[] readParquetFromDataSource(String[] filterColumnNames, + boolean[] binaryToString, int timeUnit, + long dataSourceHandle) throws CudfException; + /** * Read in Avro formatted data. * @param filterColumnNames name of the columns to read, or an empty array if we want to read @@ -271,6 +289,9 @@ private static native long[] readParquet(String[] filterColumnNames, boolean[] b private static native long[] readAvro(String[] filterColumnNames, String filePath, long address, long length) throws CudfException; + private static native long[] readAvroFromDataSource(String[] filterColumnNames, + long dataSourceHandle) throws CudfException; + /** * Setup everything to write parquet formatted data to a file. * @param columnNames names that correspond to the table columns @@ -372,6 +393,11 @@ private static native long[] readORC(String[] filterColumnNames, boolean usingNumPyTypes, int timeUnit, String[] decimal128Columns) throws CudfException; + private static native long[] readORCFromDataSource(String[] filterColumnNames, + boolean usingNumPyTypes, int timeUnit, + String[] decimal128Columns, + long dataSourceHandle) throws CudfException; + /** * Setup everything to write ORC formatted data to a file. * @param columnNames names that correspond to the table columns @@ -881,6 +907,27 @@ public static Table readCSV(Schema schema, CSVOptions opts, HostMemoryBuffer buf opts.getFalseValues())); } + public static Table readCSV(Schema schema, CSVOptions opts, DataSource ds) { + long dsHandle = DataSourceHelper.createWrapperDataSource(ds); + try { + return new Table(readCSVFromDataSource(schema.getColumnNames(), + schema.getTypeIds(), + schema.getTypeScales(), + opts.getIncludeColumnNames(), + opts.getHeaderRow(), + opts.getDelim(), + opts.getQuoteStyle().nativeId, + opts.getQuote(), + opts.getComment(), + opts.getNullValues(), + opts.getTrueValues(), + opts.getFalseValues(), + dsHandle)); + } finally { + DataSourceHelper.destroyWrapperDataSource(dsHandle); + } + } + private static native void writeCSVToFile(long table, String[] columnNames, boolean includeHeader, @@ -1128,6 +1175,24 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b } } + /** + * Read JSON formatted data. + * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. + * @param opts various JSON parsing options. + * @param ds the DataSource to read from. + * @return the data parsed as a table on the GPU. 
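+ * <p>
+ * Illustrative sketch (the buffer names are hypothetical):
+ * <pre>{@code
+ * JSONOptions opts = JSONOptions.builder().withLines(true).build();
+ * try (MultiBufferDataSource ds = new MultiBufferDataSource(buf1, buf2);
+ *      Table t = Table.readJSON(Schema.INFERRED, opts, ds)) {
+ *   // consume t
+ * }
+ * }</pre>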
+ */ + public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) { + long dsHandle = DataSourceHelper.createWrapperDataSource(ds); + try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getColumnNames(), + schema.getTypeIds(), schema.getTypeScales(), opts.isDayFirst(), opts.isLines(), + opts.isRecoverWithNull(), dsHandle))) { + return gatherJSONColumns(schema, twm); + } finally { + DataSourceHelper.destroyWrapperDataSource(dsHandle); + } + } + /** * Read a Parquet file using the default ParquetOptions. * @param path the local file to read. @@ -1214,6 +1279,17 @@ public static Table readParquet(ParquetOptions opts, HostMemoryBuffer buffer, null, buffer.getAddress() + offset, len, opts.timeUnit().typeId.getNativeId())); } + public static Table readParquet(ParquetOptions opts, DataSource ds) { + long dataSourceHandle = DataSourceHelper.createWrapperDataSource(ds); + try { + return new Table(readParquetFromDataSource(opts.getIncludeColumnNames(), + opts.getReadBinaryAsString(), opts.timeUnit().typeId.getNativeId(), + dataSourceHandle)); + } finally { + DataSourceHelper.destroyWrapperDataSource(dataSourceHandle); + } + } + /** * Read an Avro file using the default AvroOptions. * @param path the local file to read. @@ -1297,6 +1373,16 @@ public static Table readAvro(AvroOptions opts, HostMemoryBuffer buffer, null, buffer.getAddress() + offset, len)); } + public static Table readAvro(AvroOptions opts, DataSource ds) { + long dataSourceHandle = DataSourceHelper.createWrapperDataSource(ds); + try { + return new Table(readAvroFromDataSource(opts.getIncludeColumnNames(), + dataSourceHandle)); + } finally { + DataSourceHelper.destroyWrapperDataSource(dataSourceHandle); + } + } + /** * Read a ORC file using the default ORCOptions. * @param path the local file to read. @@ -1388,6 +1474,17 @@ public static Table readORC(ORCOptions opts, HostMemoryBuffer buffer, opts.getDecimal128Columns())); } + public static Table readORC(ORCOptions opts, DataSource ds) { + long dataSourceHandle = DataSourceHelper.createWrapperDataSource(ds); + try { + return new Table(readORCFromDataSource(opts.getIncludeColumnNames(), + opts.usingNumPyTypes(), opts.timeUnit().typeId.getNativeId(), + opts.getDecimal128Columns(), dataSourceHandle)); + } finally { + DataSourceHelper.destroyWrapperDataSource(dataSourceHandle); + } + } + private static class ParquetTableWriter extends TableWriter { HostBufferConsumer consumer; @@ -2262,7 +2359,7 @@ public Table dropDuplicates(int[] keyColumns, DuplicateKeepOption keep, boolean /** * Count how many rows in the table are distinct from one another. - * @param nullEqual if nulls should be considered equal to each other or not. + * @param nullsEqual if nulls should be considered equal to each other or not. 
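+ * <p>
+ * For example, a column holding [1, 1, null, null] has 2 distinct rows when nulls
+ * compare equal to each other and 3 when each null is treated as distinct.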
*/
  public int distinctCount(NullEquality nullsEqual) {
    return distinctCount(nativeHandle, nullsEqual.nullsEqual);
  }
diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt
index 0dcfee2cffe..01161a03dd4 100644
--- a/java/src/main/native/CMakeLists.txt
+++ b/java/src/main/native/CMakeLists.txt
@@ -135,6 +135,7 @@ add_library(
   src/ColumnViewJni.cu
   src/CompiledExpression.cpp
   src/ContiguousTableJni.cpp
+  src/DataSourceHelperJni.cpp
   src/HashJoinJni.cpp
   src/HostMemoryBufferNativeUtilsJni.cpp
   src/NvcompJni.cpp
diff --git a/java/src/main/native/src/ChunkedReaderJni.cpp b/java/src/main/native/src/ChunkedReaderJni.cpp
index 8d0a8bdbfe7..0044385f267 100644
--- a/java/src/main/native/src/ChunkedReaderJni.cpp
+++ b/java/src/main/native/src/ChunkedReaderJni.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -85,6 +85,40 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_create(
   CATCH_STD(env, 0);
 }

+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_createWithDataSource(
+    JNIEnv *env, jclass, jlong chunk_read_limit, jobjectArray filter_col_names,
+    jbooleanArray j_col_binary_read, jint unit, jlong ds_handle) {
+  JNI_NULL_CHECK(env, j_col_binary_read, "Null col_binary_read", 0);
+  JNI_NULL_CHECK(env, ds_handle, "Null DataSource", 0);
+
+  try {
+    cudf::jni::auto_set_device(env);
+
+    cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names);
+
+    // TODO: This variable is unused for now, but we do not yet know whether it will be
+    // needed, so it stays here a little longer until we decide to use it or remove it
+    // completely.
+    cudf::jni::native_jbooleanArray n_col_binary_read(env, j_col_binary_read);
+    (void)n_col_binary_read;
+
+    auto ds = reinterpret_cast<cudf::io::datasource *>(ds_handle);
+    cudf::io::source_info source{ds};
+
+    auto opts_builder = cudf::io::parquet_reader_options::builder(source);
+    if (n_filter_col_names.size() > 0) {
+      opts_builder = opts_builder.columns(n_filter_col_names.as_cpp_vector());
+    }
+    auto const read_opts = opts_builder.convert_strings_to_categories(false)
+                               .timestamp_type(cudf::data_type(static_cast<cudf::type_id>(unit)))
+                               .build();
+
+    return reinterpret_cast<jlong>(new cudf::io::chunked_parquet_reader(
+        static_cast<std::size_t>(chunk_read_limit), read_opts));
+  }
+  CATCH_STD(env, 0);
+}
+
 JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_hasNext(JNIEnv *env, jclass,
                                                                             jlong handle) {
   JNI_NULL_CHECK(env, handle, "handle is null", false);
diff --git a/java/src/main/native/src/CudfJni.cpp b/java/src/main/native/src/CudfJni.cpp
index 0f143086451..d0a25d449a6 100644
--- a/java/src/main/native/src/CudfJni.cpp
+++ b/java/src/main/native/src/CudfJni.cpp
@@ -175,6 +175,14 @@ JNIEXPORT jint JNI_OnLoad(JavaVM *vm, void *) {
     return JNI_ERR;
   }

+  if (!cudf::jni::cache_data_source_jni(env)) {
+    if (!env->ExceptionCheck()) {
+      env->ThrowNew(env->FindClass("java/lang/RuntimeException"),
+                    "Unable to locate data source helper methods needed by JNI");
+    }
+    return JNI_ERR;
+  }
+
   return cudf::jni::MINIMUM_JNI_VERSION;
 }

diff --git a/java/src/main/native/src/DataSourceHelperJni.cpp b/java/src/main/native/src/DataSourceHelperJni.cpp
new file mode 100644
index 00000000000..8d0e4d36413
--- /dev/null
+++ b/java/src/main/native/src/DataSourceHelperJni.cpp
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "cudf_jni_apis.hpp" +#include "jni_utils.hpp" + +namespace { + +#define DATA_SOURCE_CLASS "ai/rapids/cudf/DataSource" + +jclass DataSource_jclass; +jmethodID hostRead_method; +jmethodID hostReadBuff_method; +jmethodID onHostBufferDone_method; +jmethodID deviceRead_method; + +} // anonymous namespace + +namespace cudf { +namespace jni { +bool cache_data_source_jni(JNIEnv *env) { + jclass cls = env->FindClass(DATA_SOURCE_CLASS); + if (cls == nullptr) { + return false; + } + + hostRead_method = env->GetMethodID(cls, "hostRead", "(JJJ)J"); + if (hostRead_method == nullptr) { + return false; + } + + hostReadBuff_method = env->GetMethodID(cls, "hostReadBuff", "(JJ)[J"); + if (hostReadBuff_method == nullptr) { + return false; + } + + onHostBufferDone_method = env->GetMethodID(cls, "onHostBufferDone", "(J)V"); + if (onHostBufferDone_method == nullptr) { + return false; + } + + deviceRead_method = env->GetMethodID(cls, "deviceRead", "(JJJJ)J"); + if (deviceRead_method == nullptr) { + return false; + } + + // Convert local reference to global so it cannot be garbage collected. + DataSource_jclass = static_cast(env->NewGlobalRef(cls)); + if (DataSource_jclass == nullptr) { + return false; + } + return true; +} + +void release_data_source_jni(JNIEnv *env) { + DataSource_jclass = cudf::jni::del_global_ref(env, DataSource_jclass); +} + +class host_buffer_done_callback { +public: + explicit host_buffer_done_callback(JavaVM *jvm, jobject ds, long id) : jvm(jvm), ds(ds), id(id) {} + + host_buffer_done_callback(host_buffer_done_callback const &other) = delete; + host_buffer_done_callback(host_buffer_done_callback &&other) + : jvm(other.jvm), ds(other.ds), id(other.id) { + other.jvm = nullptr; + other.ds = nullptr; + other.id = -1; + } + + host_buffer_done_callback &operator=(host_buffer_done_callback &&other) = delete; + host_buffer_done_callback &operator=(host_buffer_done_callback const &other) = delete; + + ~host_buffer_done_callback() { + // because we are in a destructor we cannot throw an exception, so for now we are + // just going to keep the java exceptions around and have them be thrown when this + // thread returns to the JVM. It might be kind of confusing, but we will not lose + // them. 
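+    // (Any Java exception raised by the CallVoidMethod below simply stays pending
+    // on this thread and is surfaced by the JVM once this native frame returns.)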
+ if (jvm != nullptr) { + // We cannot throw an exception in the destructor, so this is really best effort + JNIEnv *env = nullptr; + if (jvm->GetEnv(reinterpret_cast(&env), cudf::jni::MINIMUM_JNI_VERSION) == JNI_OK) { + env->CallVoidMethod(this->ds, onHostBufferDone_method, id); + } + } + } + +private: + JavaVM *jvm; + jobject ds; + long id; +}; + +class jni_datasource : public cudf::io::datasource { +public: + explicit jni_datasource(JNIEnv *env, jobject ds, size_t ds_size, bool device_read_supported, + size_t device_read_cutoff) + : ds_size(ds_size), device_read_supported(device_read_supported), + device_read_cutoff(device_read_cutoff) { + if (env->GetJavaVM(&jvm) < 0) { + throw std::runtime_error("GetJavaVM failed"); + } + this->ds = add_global_ref(env, ds); + } + + virtual ~jni_datasource() { + JNIEnv *env = nullptr; + if (jvm->GetEnv(reinterpret_cast(&env), cudf::jni::MINIMUM_JNI_VERSION) == JNI_OK) { + ds = del_global_ref(env, ds); + } + ds = nullptr; + } + + std::unique_ptr host_read(size_t offset, size_t size) override { + JNIEnv *env = nullptr; + if (jvm->GetEnv(reinterpret_cast(&env), cudf::jni::MINIMUM_JNI_VERSION) != JNI_OK) { + throw cudf::jni::jni_exception("Could not load JNIEnv"); + } + + jlongArray jbuffer_info = + static_cast(env->CallObjectMethod(this->ds, hostReadBuff_method, offset, size)); + if (env->ExceptionOccurred()) { + throw cudf::jni::jni_exception("Java exception in hostRead"); + } + + cudf::jni::native_jlongArray buffer_info(env, jbuffer_info); + auto ptr = reinterpret_cast(buffer_info[0]); + size_t length = buffer_info[1]; + long id = buffer_info[2]; + + cudf::jni::host_buffer_done_callback cb(this->jvm, this->ds, id); + return std::make_unique>(std::move(cb), ptr, + length); + } + + size_t host_read(size_t offset, size_t size, uint8_t *dst) override { + JNIEnv *env = nullptr; + if (jvm->GetEnv(reinterpret_cast(&env), cudf::jni::MINIMUM_JNI_VERSION) != JNI_OK) { + throw cudf::jni::jni_exception("Could not load JNIEnv"); + } + + jlong amount_read = + env->CallLongMethod(this->ds, hostRead_method, offset, size, reinterpret_cast(dst)); + if (env->ExceptionOccurred()) { + throw cudf::jni::jni_exception("Java exception in hostRead"); + } + return amount_read; + } + + size_t size() const override { return ds_size; } + + bool supports_device_read() const override { return device_read_supported; } + + bool is_device_read_preferred(size_t size) const override { + return device_read_supported && size >= device_read_cutoff; + } + + size_t device_read(size_t offset, size_t size, uint8_t *dst, + rmm::cuda_stream_view stream) override { + JNIEnv *env = nullptr; + if (jvm->GetEnv(reinterpret_cast(&env), cudf::jni::MINIMUM_JNI_VERSION) != JNI_OK) { + throw cudf::jni::jni_exception("Could not load JNIEnv"); + } + + jlong amount_read = + env->CallLongMethod(this->ds, deviceRead_method, offset, size, reinterpret_cast(dst), + reinterpret_cast(stream.value())); + if (env->ExceptionOccurred()) { + throw cudf::jni::jni_exception("Java exception in deviceRead"); + } + return amount_read; + } + + std::future device_read_async(size_t offset, size_t size, uint8_t *dst, + rmm::cuda_stream_view stream) override { + auto amount_read = device_read(offset, size, dst, stream); + // This is a bit ugly, but we don't have a good way or a need to return + // a future for the read + std::promise ret; + ret.set_value(amount_read); + return ret.get_future(); + } + +private: + size_t ds_size; + bool device_read_supported; + size_t device_read_cutoff; + JavaVM *jvm; + jobject ds; +}; +} // 
namespace jni +} // namespace cudf + +extern "C" { + +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_DataSourceHelper_createWrapperDataSource( + JNIEnv *env, jclass, jobject ds, jlong ds_size, jboolean device_read_supported, + jlong device_read_cutoff) { + JNI_NULL_CHECK(env, ds, "Null data source", 0); + try { + cudf::jni::auto_set_device(env); + auto source = + new cudf::jni::jni_datasource(env, ds, ds_size, device_read_supported, device_read_cutoff); + return reinterpret_cast(source); + } + CATCH_STD(env, 0); +} + +JNIEXPORT void JNICALL Java_ai_rapids_cudf_DataSourceHelper_destroyWrapperDataSource(JNIEnv *env, + jclass, + jlong handle) { + try { + cudf::jni::auto_set_device(env); + if (handle != 0) { + auto source = reinterpret_cast(handle); + delete (source); + } + } + CATCH_STD(env, ); +} + +} // extern "C" diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index b208ef8f381..fad19bdf895 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1135,6 +1135,67 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_merge(JNIEnv *env, jclass CATCH_STD(env, NULL); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSVFromDataSource( + JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, + jobjectArray filter_col_names, jint header_row, jbyte delim, jint j_quote_style, jbyte quote, + jbyte comment, jobjectArray null_values, jobjectArray true_values, jobjectArray false_values, + jlong ds_handle) { + JNI_NULL_CHECK(env, null_values, "null_values must be supplied, even if it is empty", NULL); + JNI_NULL_CHECK(env, ds_handle, "no data source handle given", NULL); + + try { + cudf::jni::auto_set_device(env); + cudf::jni::native_jstringArray n_col_names(env, col_names); + cudf::jni::native_jintArray n_types(env, j_types); + cudf::jni::native_jintArray n_scales(env, j_scales); + if (n_types.is_null() != n_scales.is_null()) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "types and scales must match null", + NULL); + } + std::vector data_types; + if (!n_types.is_null()) { + if (n_types.size() != n_scales.size()) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "types and scales must match size", + NULL); + } + data_types.reserve(n_types.size()); + std::transform(n_types.begin(), n_types.end(), n_scales.begin(), + std::back_inserter(data_types), [](auto type, auto scale) { + return cudf::data_type{static_cast(type), scale}; + }); + } + + cudf::jni::native_jstringArray n_null_values(env, null_values); + cudf::jni::native_jstringArray n_true_values(env, true_values); + cudf::jni::native_jstringArray n_false_values(env, false_values); + cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names); + + auto ds = reinterpret_cast(ds_handle); + cudf::io::source_info source{ds}; + + auto const quote_style = static_cast(j_quote_style); + + cudf::io::csv_reader_options opts = cudf::io::csv_reader_options::builder(source) + .delimiter(delim) + .header(header_row) + .names(n_col_names.as_cpp_vector()) + .dtypes(data_types) + .use_cols_names(n_filter_col_names.as_cpp_vector()) + .true_values(n_true_values.as_cpp_vector()) + .false_values(n_false_values.as_cpp_vector()) + .na_values(n_null_values.as_cpp_vector()) + .keep_default_na(false) + .na_filter(n_null_values.size() > 0) + .quoting(quote_style) + .quotechar(quote) + .comment(comment) + .build(); + + return convert_table_for_return(env, cudf::io::read_csv(opts).tbl); + } + CATCH_STD(env, NULL); +} 
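+// Note: the Java-side entry point for the function above is
+// Table.readCSV(Schema, CSVOptions, DataSource), which wraps the DataSource via
+// DataSourceHelper.createWrapperDataSource() and passes the resulting native
+// handle in as ds_handle.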
+ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV( JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, jobjectArray filter_col_names, jstring inputfilepath, jlong buffer, jlong buffer_length, @@ -1407,6 +1468,72 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIE CATCH_STD(env, nullptr); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource( + JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, + jboolean day_first, jboolean lines, jboolean recover_with_null, jlong ds_handle) { + + JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); + + try { + cudf::jni::auto_set_device(env); + cudf::jni::native_jstringArray n_col_names(env, col_names); + cudf::jni::native_jintArray n_types(env, j_types); + cudf::jni::native_jintArray n_scales(env, j_scales); + if (n_types.is_null() != n_scales.is_null()) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "types and scales must match null", + 0); + } + std::vector data_types; + if (!n_types.is_null()) { + if (n_types.size() != n_scales.size()) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "types and scales must match size", + 0); + } + data_types.reserve(n_types.size()); + std::transform(n_types.begin(), n_types.end(), n_scales.begin(), + std::back_inserter(data_types), [](auto const &type, auto const &scale) { + return cudf::data_type{static_cast(type), scale}; + }); + } + + auto ds = reinterpret_cast(ds_handle); + cudf::io::source_info source{ds}; + + cudf::io::json_recovery_mode_t recovery_mode = + recover_with_null ? cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL : + cudf::io::json_recovery_mode_t::FAIL; + cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) + .dayfirst(static_cast(day_first)) + .lines(static_cast(lines)) + .recovery_mode(recovery_mode); + + if (!n_col_names.is_null() && data_types.size() > 0) { + if (n_col_names.size() != n_types.size()) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", + "types and column names must match size", 0); + } + + std::map map; + + auto col_names_vec = n_col_names.as_cpp_vector(); + std::transform(col_names_vec.begin(), col_names_vec.end(), data_types.begin(), + std::inserter(map, map.end()), + [](std::string a, cudf::data_type b) { return std::make_pair(a, b); }); + opts.dtypes(map); + } else if (data_types.size() > 0) { + opts.dtypes(data_types); + } else { + // should infer the types + } + + auto result = + std::make_unique(cudf::io::read_json(opts.build())); + + return reinterpret_cast(result.release()); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines, @@ -1489,6 +1616,36 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( CATCH_STD(env, 0); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquetFromDataSource( + JNIEnv *env, jclass, jobjectArray filter_col_names, jbooleanArray j_col_binary_read, jint unit, + jlong ds_handle) { + + JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); + JNI_NULL_CHECK(env, j_col_binary_read, "null col_binary_read", 0); + + try { + cudf::jni::auto_set_device(env); + + cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names); + cudf::jni::native_jbooleanArray n_col_binary_read(env, 
j_col_binary_read); + + auto ds = reinterpret_cast(ds_handle); + cudf::io::source_info source{ds}; + + auto builder = cudf::io::parquet_reader_options::builder(source); + if (n_filter_col_names.size() > 0) { + builder = builder.columns(n_filter_col_names.as_cpp_vector()); + } + + cudf::io::parquet_reader_options opts = + builder.convert_strings_to_categories(false) + .timestamp_type(cudf::data_type(static_cast(unit))) + .build(); + return convert_table_for_return(env, cudf::io::read_parquet(opts).tbl); + } + CATCH_STD(env, NULL); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet( JNIEnv *env, jclass, jobjectArray filter_col_names, jbooleanArray j_col_binary_read, jstring inputfilepath, jlong buffer, jlong buffer_length, jint unit) { @@ -1535,10 +1692,31 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet( CATCH_STD(env, NULL); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readAvroFromDataSource( + JNIEnv *env, jclass, jobjectArray filter_col_names, jlong ds_handle) { + + JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); + + try { + cudf::jni::auto_set_device(env); + + cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names); + + auto ds = reinterpret_cast(ds_handle); + cudf::io::source_info source{ds}; + + cudf::io::avro_reader_options opts = cudf::io::avro_reader_options::builder(source) + .columns(n_filter_col_names.as_cpp_vector()) + .build(); + return convert_table_for_return(env, cudf::io::read_avro(opts).tbl); + } + CATCH_STD(env, NULL); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readAvro(JNIEnv *env, jclass, jobjectArray filter_col_names, jstring inputfilepath, jlong buffer, - jlong buffer_length, jint unit) { + jlong buffer_length) { const bool read_buffer = (buffer != 0); if (!read_buffer) { @@ -1715,6 +1893,38 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeParquetEnd(JNIEnv *env, jc CATCH_STD(env, ) } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readORCFromDataSource( + JNIEnv *env, jclass, jobjectArray filter_col_names, jboolean usingNumPyTypes, jint unit, + jobjectArray dec128_col_names, jlong ds_handle) { + + JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); + + try { + cudf::jni::auto_set_device(env); + + cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names); + + cudf::jni::native_jstringArray n_dec128_col_names(env, dec128_col_names); + + auto ds = reinterpret_cast(ds_handle); + cudf::io::source_info source{ds}; + + auto builder = cudf::io::orc_reader_options::builder(source); + if (n_filter_col_names.size() > 0) { + builder = builder.columns(n_filter_col_names.as_cpp_vector()); + } + + cudf::io::orc_reader_options opts = + builder.use_index(false) + .use_np_dtypes(static_cast(usingNumPyTypes)) + .timestamp_type(cudf::data_type(static_cast(unit))) + .decimal128_columns(n_dec128_col_names.as_cpp_vector()) + .build(); + return convert_table_for_return(env, cudf::io::read_orc(opts).tbl); + } + CATCH_STD(env, NULL); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readORC( JNIEnv *env, jclass, jobjectArray filter_col_names, jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean usingNumPyTypes, jint unit, jobjectArray dec128_col_names) { diff --git a/java/src/main/native/src/cudf_jni_apis.hpp b/java/src/main/native/src/cudf_jni_apis.hpp index 867df80b722..bd82bbd2899 100644 --- a/java/src/main/native/src/cudf_jni_apis.hpp +++ b/java/src/main/native/src/cudf_jni_apis.hpp @@ -134,5 +134,13 @@ void 
auto_set_device(JNIEnv *env); */ void device_memset_async(JNIEnv *env, rmm::device_buffer &buf, char value); +// +// DataSource APIs +// + +bool cache_data_source_jni(JNIEnv *env); + +void release_data_source_jni(JNIEnv *env); + } // namespace jni } // namespace cudf diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index faa73ac4322..b0dd4122b0e 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -327,6 +327,25 @@ void testReadJSONFile() { } } + @Test + void testReadJSONFromDataSource() throws IOException { + Schema schema = Schema.builder() + .column(DType.STRING, "name") + .column(DType.INT32, "age") + .build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column("Michael", "Andy", "Justin") + .column(null, 30, 19) + .build(); + MultiBufferDataSource source = sourceFrom(TEST_SIMPLE_JSON_FILE); + Table table = Table.readJSON(schema, opts, source)) { + assertTablesAreEqual(expected, table); + } + } + @Test void testReadJSONFileWithInvalidLines() { Schema schema = Schema.builder() @@ -560,6 +579,126 @@ void testReadCSVBuffer() { } } + byte[][] sliceBytes(byte[] data, int slices) { + slices = Math.min(data.length, slices); + // We are not going to worry about making it super even here. + // The last one gets the extras. + int bytesPerSlice = data.length / slices; + byte[][] ret = new byte[slices][]; + int startingAt = 0; + for (int i = 0; i < (slices - 1); i++) { + ret[i] = new byte[bytesPerSlice]; + System.arraycopy(data, startingAt, ret[i], 0, bytesPerSlice); + startingAt += bytesPerSlice; + } + // Now for the last one + ret[slices - 1] = new byte[data.length - startingAt]; + System.arraycopy(data, startingAt, ret[slices - 1], 0, data.length - startingAt); + return ret; + } + + @Test + void testReadCSVBufferMultiBuffer() { + CSVOptions opts = CSVOptions.builder() + .includeColumn("A") + .includeColumn("B") + .hasHeader() + .withDelim('|') + .withQuote('\'') + .withNullValue("NULL") + .build(); + byte[][] data = sliceBytes(CSV_DATA_BUFFER, 10); + try (Table expected = new Table.TestBuilder() + .column(0, 1, 2, 3, 4, 5, 6, 7, 8, 9) + .column(110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, null, 118.2, 119.8) + .build(); + MultiBufferDataSource source = sourceFrom(data); + Table table = Table.readCSV(TableTest.CSV_DATA_BUFFER_SCHEMA, opts, source)) { + assertTablesAreEqual(expected, table); + } + } + + public static byte[] arrayFrom(File f) throws IOException { + long len = f.length(); + if (len > Integer.MAX_VALUE) { + throw new IllegalArgumentException("Sorry cannot read " + f + + " into an array it does not fit"); + } + int remaining = (int)len; + byte[] ret = new byte[remaining]; + try (java.io.FileInputStream fin = new java.io.FileInputStream(f)) { + int at = 0; + while (remaining > 0) { + int amount = fin.read(ret, at, remaining); + at += amount; + remaining -= amount; + } + } + return ret; + } + + public static MultiBufferDataSource sourceFrom(File f) throws IOException { + long len = f.length(); + byte[] tmp = new byte[(int)Math.min(32 * 1024, len)]; + try (HostMemoryBuffer buffer = HostMemoryBuffer.allocate(len)) { + try (java.io.FileInputStream fin = new java.io.FileInputStream(f)) { + long at = 0; + while (at < len) { + int amount = fin.read(tmp); + buffer.setBytes(at, tmp, 0, amount); + at += amount; + } + } + return new MultiBufferDataSource(buffer); + } + } + + public 
static MultiBufferDataSource sourceFrom(byte[] data) { + long len = data.length; + try (HostMemoryBuffer buffer = HostMemoryBuffer.allocate(len)) { + buffer.setBytes(0, data, 0, len); + return new MultiBufferDataSource(buffer); + } + } + + public static MultiBufferDataSource sourceFrom(byte[][] data) { + HostMemoryBuffer[] buffers = new HostMemoryBuffer[data.length]; + try { + for (int i = 0; i < data.length; i++) { + byte[] subData = data[i]; + buffers[i] = HostMemoryBuffer.allocate(subData.length); + buffers[i].setBytes(0, subData, 0, subData.length); + } + return new MultiBufferDataSource(buffers); + } finally { + for (HostMemoryBuffer buffer: buffers) { + if (buffer != null) { + buffer.close(); + } + } + } + } + + @Test + void testReadCSVDataSource() { + CSVOptions opts = CSVOptions.builder() + .includeColumn("A") + .includeColumn("B") + .hasHeader() + .withDelim('|') + .withQuote('\'') + .withNullValue("NULL") + .build(); + try (Table expected = new Table.TestBuilder() + .column(0, 1, 2, 3, 4, 5, 6, 7, 8, 9) + .column(110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, null, 118.2, 119.8) + .build(); + MultiBufferDataSource source = sourceFrom(TableTest.CSV_DATA_BUFFER); + Table table = Table.readCSV(TableTest.CSV_DATA_BUFFER_SCHEMA, opts, source)) { + assertTablesAreEqual(expected, table); + } + } + @Test void testReadCSVWithOffset() { CSVOptions opts = CSVOptions.builder() @@ -864,6 +1003,37 @@ void testReadParquet() { } } + @Test + void testReadParquetFromDataSource() throws IOException { + ParquetOptions opts = ParquetOptions.builder() + .includeColumn("loan_id") + .includeColumn("zip") + .includeColumn("num_units") + .build(); + try (MultiBufferDataSource source = sourceFrom(TEST_PARQUET_FILE); + Table table = Table.readParquet(opts, source)) { + long rows = table.getRowCount(); + assertEquals(1000, rows); + assertTableTypes(new DType[]{DType.INT64, DType.INT32, DType.INT32}, table); + } + } + + @Test + void testReadParquetMultiBuffer() throws IOException { + ParquetOptions opts = ParquetOptions.builder() + .includeColumn("loan_id") + .includeColumn("zip") + .includeColumn("num_units") + .build(); + byte [][] data = sliceBytes(arrayFrom(TEST_PARQUET_FILE), 10); + try (MultiBufferDataSource source = sourceFrom(data); + Table table = Table.readParquet(opts, source)) { + long rows = table.getRowCount(); + assertEquals(1000, rows); + assertTableTypes(new DType[]{DType.INT64, DType.INT32, DType.INT32}, table); + } + } + @Test void testReadParquetBinary() { ParquetOptions opts = ParquetOptions.builder() @@ -1018,6 +1188,23 @@ void testChunkedReadParquet() { } } + @Test + void testChunkedReadParquetFromDataSource() throws IOException { + try (MultiBufferDataSource source = sourceFrom(TEST_PARQUET_FILE_CHUNKED_READ); + ParquetChunkedReader reader = new ParquetChunkedReader(240000, ParquetOptions.DEFAULT, source)) { + int numChunks = 0; + long totalRows = 0; + while(reader.hasNext()) { + ++numChunks; + try(Table chunk = reader.readChunk()) { + totalRows += chunk.getRowCount(); + } + } + assertEquals(2, numChunks); + assertEquals(40000, totalRows); + } + } + @Test void testReadAvro() { AvroOptions opts = AvroOptions.builder() @@ -1037,6 +1224,26 @@ void testReadAvro() { } } + @Test + void testReadAvroFromDataSource() throws IOException { + AvroOptions opts = AvroOptions.builder() + .includeColumn("bool_col") + .includeColumn("int_col") + .includeColumn("timestamp_col") + .build(); + + try (Table expected = new Table.TestBuilder() + .column(true, false, true, false, true, false, true, false) + 
.column(0, 1, 0, 1, 0, 1, 0, 1) + .column(1235865600000000L, 1235865660000000L, 1238544000000000L, 1238544060000000L, + 1233446400000000L, 1233446460000000L, 1230768000000000L, 1230768060000000L) + .build(); + MultiBufferDataSource source = sourceFrom(TEST_ALL_TYPES_PLAIN_AVRO_FILE); + Table table = Table.readAvro(opts, source)) { + assertTablesAreEqual(expected, table); + } + } + @Test void testReadAvroBuffer() throws IOException{ AvroOptions opts = AvroOptions.builder() @@ -1094,6 +1301,24 @@ void testReadORC() { } } + @Test + void testReadORCFromDataSource() throws IOException { + ORCOptions opts = ORCOptions.builder() + .includeColumn("string1") + .includeColumn("float1") + .includeColumn("int1") + .build(); + try (Table expected = new Table.TestBuilder() + .column("hi","bye") + .column(1.0f,2.0f) + .column(65536,65536) + .build(); + MultiBufferDataSource source = sourceFrom(TEST_ORC_FILE); + Table table = Table.readORC(opts, source)) { + assertTablesAreEqual(expected, table); + } + } + @Test void testReadORCBuffer() throws IOException { ORCOptions opts = ORCOptions.builder() From 15baa00693ab4aa59f99ccb417c613880789d047 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 11 Oct 2023 11:31:39 +0200 Subject: [PATCH 021/118] Fixes behaviour for incomplete lines when `recover_with_nulls` is enabled (#14252) Closes https://github.com/rapidsai/cudf/issues/14227. Adapts the behaviour of the JSON finite-state transducer (FST) when `recover_with_nulls` is `true` to be more strict and reject lines that contain incomplete JSON objects (aka records) or JSON arrays (aka lists). Authors: - Elias Stehle (https://github.com/elstehle) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14252 --- cpp/src/io/json/nested_json_gpu.cu | 703 +++++++++++++++-------------- cpp/tests/io/json_test.cpp | 45 +- cpp/tests/io/nested_json_test.cpp | 23 +- 3 files changed, 401 insertions(+), 370 deletions(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 06ac11485cb..c9107357239 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -660,13 +660,13 @@ auto get_transition_table(json_format_cfg_t format) PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_BOV, PD_LON, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_BOV, PD_LON}; pda_tt[static_cast(pda_state_t::PD_STR)] = { - PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR}; + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_BOV, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_BOV, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_BOV, PD_STR}; pda_tt[static_cast(pda_state_t::PD_SCE)] = { - PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR}; + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_BOV, PD_STR, + PD_STR, 
PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_BOV, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_BOV, PD_STR}; pda_tt[static_cast(pda_state_t::PD_PVL)] = { PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_BOV, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_BOV, PD_ERR, @@ -680,9 +680,9 @@ auto get_transition_table(json_format_cfg_t format) PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_PFN, PD_FNE, PD_FLN, PD_FLN, PD_FLN, PD_BOV, PD_FLN}; pda_tt[static_cast(pda_state_t::PD_FNE)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN}; + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_ERR, + PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_BOV, PD_FLN}; pda_tt[static_cast(pda_state_t::PD_PFN)] = { PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_ERR, @@ -697,8 +697,11 @@ auto get_transition_table(json_format_cfg_t format) /** * @brief Getting the translation table + * @param recover_from_error Whether or not the tokenizer should recover from invalid lines. If + * `recover_from_error` is true, invalid JSON lines end with the token sequence (`ErrorBegin`, + * `LineEn`) and incomplete JSON lines (e.g., `{"a":123\n`) are treated as invalid lines. */ -auto get_translation_table(bool include_line_delimiter) +auto get_translation_table(bool recover_from_error) { constexpr auto StructBegin = token_t::StructBegin; constexpr auto StructEnd = token_t::StructEnd; @@ -715,76 +718,83 @@ auto get_translation_table(bool include_line_delimiter) constexpr auto ErrorBegin = token_t::ErrorBegin; /** - * @brief Appends token_t::LineEnd token to the given token sequence, if and only if - * `include_line_delimiter` is true. + * @brief Instead of specifying the verbose translation tables twice (i.e., once when + * `recover_from_error` is true and once when it is false), we use `nl_tokens` to specialize the + * translation table where it differs depending on the `recover_from_error` option. If and only if + * `recover_from_error` is true, `recovering_tokens` are returned along with a token_t::LineEnd + * token, otherwise `regular_tokens` is returned. 
*/ - auto nl_tokens = [include_line_delimiter](std::vector tokens) { - if (include_line_delimiter) { tokens.push_back(token_t::LineEnd); } - return tokens; + auto nl_tokens = [recover_from_error](std::vector regular_tokens, + std::vector recovering_tokens) { + if (recover_from_error) { + recovering_tokens.push_back(token_t::LineEnd); + return recovering_tokens; + } + return regular_tokens; }; std::array, NUM_PDA_SGIDS>, PD_NUM_STATES> pda_tlt; - pda_tlt[static_cast(pda_state_t::PD_BOV)] = {{ /*ROOT*/ + pda_tlt[static_cast(pda_state_t::PD_BOV)] = {{ /*ROOT*/ + {StructBegin}, // OPENING_BRACE + {ListBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {StringBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {}), // LINE_BREAK + {ValueBegin}, // OTHER + /*LIST*/ {StructBegin}, // OPENING_BRACE {ListBegin}, // OPENING_BRACKET {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET + {ListEnd}, // CLOSING_BRACKET {StringBegin}, // QUOTE {ErrorBegin}, // ESCAPE {ErrorBegin}, // COMMA {ErrorBegin}, // COLON {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {ValueBegin}, // OTHER - /*LIST*/ + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {ValueBegin}, // OTHER + /*STRUCT*/ {StructBegin}, // OPENING_BRACE {ListBegin}, // OPENING_BRACKET {ErrorBegin}, // CLOSING_BRACE - {ListEnd}, // CLOSING_BRACKET + {ErrorBegin}, // CLOSING_BRACKET {StringBegin}, // QUOTE {ErrorBegin}, // ESCAPE {ErrorBegin}, // COMMA {ErrorBegin}, // COLON {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {ValueBegin}, // OTHER - /*STRUCT*/ - {StructBegin}, // OPENING_BRACE - {ListBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {StringBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {ValueBegin}}}; // OTHER + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {ValueBegin}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_BOA)] = { - { /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER /*LIST*/ - {StructBegin}, // OPENING_BRACE - {ListBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ListEnd}, // CLOSING_BRACKET - {StringBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {ValueBegin}, // OTHER + {StructBegin}, // OPENING_BRACE + {ListBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ListEnd}, // CLOSING_BRACKET + {StringBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {ValueBegin}, // OTHER /*STRUCT*/ {ErrorBegin}, // OPENING_BRACE {ErrorBegin}, // OPENING_BRACKET @@ -795,33 +805,33 @@ auto 
get_translation_table(bool include_line_delimiter) {ErrorBegin}, // COMMA {ErrorBegin}, // COLON {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK + nl_tokens({}, {ErrorBegin}), // LINE_BREAK {ErrorBegin}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_LON)] = { - { /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ValueEnd}, // WHITE_SPACE - nl_tokens({ValueEnd}), // LINE_BREAK - {}, // OTHER + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ValueEnd}, // WHITE_SPACE + nl_tokens({ValueEnd}, {ErrorBegin}), // LINE_BREAK + {}, // OTHER /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ValueEnd, ListEnd}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ValueEnd}, // COMMA - {ErrorBegin}, // COLON - {ValueEnd}, // WHITE_SPACE - nl_tokens({ValueEnd}), // LINE_BREAK - {}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ValueEnd, ListEnd}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ValueEnd}, // COMMA + {ErrorBegin}, // COLON + {ValueEnd}, // WHITE_SPACE + nl_tokens({ValueEnd}, {ErrorBegin}), // LINE_BREAK + {}, // OTHER /*STRUCT*/ {ErrorBegin}, // OPENING_BRACE {ErrorBegin}, // OPENING_BRACKET @@ -832,108 +842,108 @@ auto get_translation_table(bool include_line_delimiter) {ValueEnd, StructMemberEnd}, // COMMA {ErrorBegin}, // COLON {ValueEnd}, // WHITE_SPACE - nl_tokens({ValueEnd}), // LINE_BREAK + nl_tokens({ValueEnd}, {ErrorBegin}), // LINE_BREAK {}}}; // OTHER - pda_tlt[static_cast(pda_state_t::PD_STR)] = {{ /*ROOT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {StringEnd}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}, // OTHER + pda_tlt[static_cast(pda_state_t::PD_STR)] = {{ /*ROOT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {StringEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {}, // OTHER /*LIST*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {StringEnd}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}, // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {StringEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {}, // OTHER /*STRUCT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {StringEnd}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}}}; // OTHER - - pda_tlt[static_cast(pda_state_t::PD_SCE)] = {{ /*ROOT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // 
LINE_BREAK - {}, // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {StringEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_SCE)] = {{ /*ROOT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {}, // OTHER /*LIST*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}, // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {}, // OTHER /*STRUCT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}}}; // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_PVL)] = { - { /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {ErrorBegin}, // OTHER + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {}), // LINE_BREAK + {ErrorBegin}, // OTHER /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ListEnd}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {}, // COMMA - {ErrorBegin}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {ErrorBegin}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ListEnd}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER /*STRUCT*/ {ErrorBegin}, // OPENING_BRACE {ErrorBegin}, // OPENING_BRACKET @@ -944,34 +954,34 @@ auto get_translation_table(bool include_line_delimiter) {StructMemberEnd}, // COMMA {ErrorBegin}, // COLON {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK + nl_tokens({}, {ErrorBegin}), // LINE_BREAK {ErrorBegin}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_BFN)] = { - { /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - 
{ErrorBegin}, // OTHER + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER /*STRUCT*/ {ErrorBegin}, // OPENING_BRACE {ErrorBegin}, // OPENING_BRACKET @@ -982,156 +992,159 @@ auto get_translation_table(bool include_line_delimiter) {ErrorBegin}, // COMMA {ErrorBegin}, // COLON {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK + nl_tokens({}, {ErrorBegin}), // LINE_BREAK {ErrorBegin}}}; // OTHER - pda_tlt[static_cast(pda_state_t::PD_FLN)] = {{ /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER - /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER - /*STRUCT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {FieldNameEnd}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}}}; // OTHER - - pda_tlt[static_cast(pda_state_t::PD_FNE)] = {{ /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER - /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER - /*STRUCT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}}}; // OTHER - - pda_tlt[static_cast(pda_state_t::PD_PFN)] = {{ /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // 
CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER - /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER - /*STRUCT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {ErrorBegin}}}; // OTHER - - pda_tlt[static_cast(pda_state_t::PD_ERR)] = {{ /*ROOT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}, // OTHER + pda_tlt[static_cast(pda_state_t::PD_FLN)] = { + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER + /*LIST*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER + /*STRUCT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {FieldNameEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_FNE)] = { + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER + /*LIST*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER + /*STRUCT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_PFN)] = { + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + 
nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER + /*LIST*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER + /*STRUCT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_ERR)] = {{ /*ROOT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {}), // LINE_BREAK + {}, // OTHER /*LIST*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}, // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {}), // LINE_BREAK + {}, // OTHER /*STRUCT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}}}; // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {}), // LINE_BREAK + {}}}; // OTHER return pda_tlt; } diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 7c911ac2e04..2ddb0b76544 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -1962,7 +1962,31 @@ TEST_F(JsonReaderTest, JSONLinesRecovering) "\n" "\n" // 4 -> a: 123 (valid) - R"({"a":123})"; + R"({"a":4})" + "\n" + // 5 -> (invalid) + R"({"a":5)" + "\n" + // 6 -> (invalid) + R"({"a":6 )" + "\n" + // 7 -> (invalid) + R"({"b":[7 )" + "\n" + // 8 -> a: 8 (valid) + R"({"a":8})" + "\n" + // 9 -> (invalid) + R"({"d":{"unterminated_field_name)" + "\n" + // 10 -> (invalid) + R"({"d":{)" + "\n" + // 11 -> (invalid) + R"({"d":{"123",)" + "\n" + // 12 -> a: 12 (valid) + R"({"a":12})"; auto filepath = temp_env->get_temp_dir() + "RecoveringLines.json"; { @@ -1978,17 +2002,22 @@ TEST_F(JsonReaderTest, JSONLinesRecovering) cudf::io::table_with_metadata result = cudf::io::read_json(in_options); EXPECT_EQ(result.tbl->num_columns(), 2); - EXPECT_EQ(result.tbl->num_rows(), 5); + EXPECT_EQ(result.tbl->num_rows(), 13); EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64); EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::FLOAT64); - std::vector a_validity{true, false, false, false, true}; - std::vector c_validity{false, false, false, true, false}; + std::vector a_validity{ + true, false, false, false, true, false, false, false, true, false, false, false, true}; + std::vector c_validity{ + false, false, false, true, false, false, false, false, false, false, false, false, false}; - 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), - int64_wrapper{{-2, 0, 0, 0, 123}, a_validity.cbegin()}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), - float64_wrapper{{0.0, 0.0, 0.0, 1.2, 0.0}, c_validity.cbegin()}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + result.tbl->get_column(0), + int64_wrapper{{-2, 0, 0, 0, 4, 0, 0, 0, 8, 0, 0, 0, 12}, a_validity.cbegin()}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + result.tbl->get_column(1), + float64_wrapper{{0.0, 0.0, 0.0, 1.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + c_validity.cbegin()}); } CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index 00d657108b8..3cb7e1f287a 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -569,23 +569,12 @@ TEST_F(JsonTest, RecoveringTokenStream) // Line 0 (invalid) {0, token_t::StructBegin}, {0, token_t::StructEnd}, - // Line 1 (valid) - {10, token_t::StructBegin}, - {11, token_t::StructMemberBegin}, - {11, token_t::FieldNameBegin}, - {13, token_t::FieldNameEnd}, - // Line 2 (valid) - {16, token_t::StructBegin}, - {17, token_t::StructMemberBegin}, - {17, token_t::FieldNameBegin}, - {19, token_t::FieldNameEnd}, - {21, token_t::StructBegin}, - {22, token_t::StructMemberBegin}, - {22, token_t::FieldNameBegin}, - {24, token_t::FieldNameEnd}, - {26, token_t::ListBegin}, - {27, token_t::ValueBegin}, - {30, token_t::ValueEnd}, + // Line 1 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + // Line 2 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, // Line 3 (valid) {31, token_t::StructBegin}, {32, token_t::StructMemberBegin}, From aa598bc28e6e2459ca6bcfa58f2056134e6591ea Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 11 Oct 2023 15:34:59 -0400 Subject: [PATCH 022/118] Expose stream parameter in public strings split/partition APIs (#14247) Follow on to PR #13997 which did not include all the split APIs or a stream test. Add stream parameter to public APIs: - `cudf::strings::partition()` - `cudf::strings::rpartition()` - `cudf::strings::split_re()` - `cudf::strings::rsplit_re()` - `cudf::strings::split_record_re()` - `cudf::strings::rsplit_record_re()` Also cleaned up some of the doxygen comments. Reference #13744 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Mark Harris (https://github.com/harrism) - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14247 --- cpp/include/cudf/strings/split/partition.hpp | 22 +++++---- cpp/include/cudf/strings/split/split_re.hpp | 16 +++++-- cpp/src/strings/split/partition.cu | 10 ++-- cpp/src/strings/split/split_re.cu | 12 +++-- cpp/tests/CMakeLists.txt | 2 +- cpp/tests/streams/strings/split_test.cpp | 49 ++++++++++++++++++++ 6 files changed, 89 insertions(+), 22 deletions(-) create mode 100644 cpp/tests/streams/strings/split_test.cpp diff --git a/cpp/include/cudf/strings/split/partition.hpp b/cpp/include/cudf/strings/split/partition.hpp index 52ffb735eb7..25eedf1e86b 100644 --- a/cpp/include/cudf/strings/split/partition.hpp +++ b/cpp/include/cudf/strings/split/partition.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -51,15 +51,17 @@ namespace strings {
  * r[2] is ["cd","g_h"]
  * @endcode
  *
- * @param strings Strings instance for this operation.
+ * @param input Strings instance for this operation
  * @param delimiter UTF-8 encoded string indicating where to split each string.
  *        Default of empty string indicates split on whitespace.
- * @param mr Device memory resource used to allocate the returned table's device memory.
- * @return New table of strings columns.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned table's device memory
+ * @return New table of strings columns
  */
 std::unique_ptr<table> partition(
-  strings_column_view const& strings,
+  strings_column_view const& input,
   string_scalar const& delimiter      = string_scalar(""),
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -83,15 +85,17 @@ std::unique_ptr<table> partition(
  * r[2] is ["cd","h"]
  * @endcode
  *
- * @param strings Strings instance for this operation.
+ * @param input Strings instance for this operation
  * @param delimiter UTF-8 encoded string indicating where to split each string.
  *        Default of empty string indicates split on whitespace.
- * @param mr Device memory resource used to allocate the returned table's device memory.
- * @return New strings columns.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned table's device memory
+ * @return New strings columns
  */
 std::unique_ptr<table> rpartition(
-  strings_column_view const& strings,
+  strings_column_view const& input,
   string_scalar const& delimiter      = string_scalar(""),
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
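The `partition.hpp` changes above only add a defaulted `stream` argument, so existing call sites keep compiling. A minimal usage sketch, not part of this patch — the helper name, delimiter, and stream are invented for illustration:

```cpp
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/split/partition.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/table/table.hpp>
#include <rmm/cuda_stream.hpp>

// Sketch only: partition each string on a "_" delimiter using a
// caller-owned stream instead of cudf::get_default_stream().
std::unique_ptr<cudf::table> partition_on_stream(cudf::strings_column_view const& input)
{
  rmm::cuda_stream stream;
  // Build the delimiter scalar on the same stream the API call will use.
  auto const delimiter = cudf::string_scalar("_", true, stream.view());
  // New in this patch: the stream is passed explicitly, between the
  // delimiter and the memory resource (which keeps its default here).
  return cudf::strings::partition(input, delimiter, stream.view());
}
```

Constructing the `string_scalar` on the same stream avoids an implicit synchronization with the default stream before the call.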
diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp
index 14fcfaecdcd..f1736cb7e0c 100644
--- a/cpp/include/cudf/strings/split/split_re.hpp
+++ b/cpp/include/cudf/strings/split/split_re.hpp
@@ -75,6 +75,7 @@ struct regex_program;
  * @param prog Regex program instance
  * @param maxsplit Maximum number of splits to perform.
  *        Default of -1 indicates all possible splits on each string.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned result's device memory
  * @return A table of columns of strings
  */
@@ -82,6 +83,7 @@ std::unique_ptr<table> split_re(
   strings_column_view const& input,
   regex_program const& prog,
   size_type maxsplit                  = -1,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -125,17 +127,19 @@ std::unique_ptr<table> split_re(
  *
  * @throw cudf::logic_error if `pattern` is empty.
  *
- * @param input A column of string elements to be split.
+ * @param input A column of string elements to be split
  * @param prog Regex program instance
  * @param maxsplit Maximum number of splits to perform.
  *        Default of -1 indicates all possible splits on each string.
- * @param mr Device memory resource used to allocate the returned result's device memory.
- * @return A table of columns of strings.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned result's device memory
+ * @return A table of columns of strings
  */
 std::unique_ptr<table> rsplit_re(
   strings_column_view const& input,
   regex_program const& prog,
   size_type maxsplit                  = -1,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -185,13 +189,15 @@ std::unique_ptr<table> rsplit_re(
  * @param prog Regex program instance
  * @param maxsplit Maximum number of splits to perform.
  *        Default of -1 indicates all possible splits on each string.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned result's device memory
- * @return Lists column of strings.
+ * @return Lists column of strings
  */
 std::unique_ptr<column> split_record_re(
   strings_column_view const& input,
   regex_program const& prog,
   size_type maxsplit                  = -1,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -243,6 +249,7 @@ std::unique_ptr<column> rsplit_record_re(
  * @param prog Regex program instance
  * @param maxsplit Maximum number of splits to perform.
  *        Default of -1 indicates all possible splits on each string.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned result's device memory
  * @return Lists column of strings
  */
@@ -250,6 +257,7 @@ std::unique_ptr<column> rsplit_record_re(
   strings_column_view const& input,
   regex_program const& prog,
   size_type maxsplit                  = -1,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
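For the regex variants the calling pattern is the same. A hedged sketch of `split_re` on a caller-provided stream — the pattern and helper name are assumptions, not from the patch:

```cpp
#include <cudf/strings/regex/regex_program.hpp>
#include <cudf/strings/split/split_re.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/table/table.hpp>
#include <rmm/cuda_stream_view.hpp>

// Sketch only: split every string on runs of whitespace, on the given stream.
std::unique_ptr<cudf::table> split_on_whitespace(cudf::strings_column_view const& input,
                                                 rmm::cuda_stream_view stream)
{
  auto const prog = cudf::strings::regex_program::create("\\s+");
  // maxsplit = -1 keeps the documented default of "all possible splits";
  // the trailing stream argument is the parameter added by this patch.
  return cudf::strings::split_re(input, *prog, -1, stream);
}
```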
diff --git a/cpp/src/strings/split/partition.cu b/cpp/src/strings/split/partition.cu
index 0c7d119ea38..16e6402cfef 100644
--- a/cpp/src/strings/split/partition.cu
+++ b/cpp/src/strings/split/partition.cu
@@ -239,20 +239,22 @@ std::unique_ptr<table> rpartition(strings_column_view const& strings,
 
 // external APIs
 
-std::unique_ptr<table> partition(strings_column_view const& strings,
+std::unique_ptr<table> partition(strings_column_view const& input,
                                  string_scalar const& delimiter,
+                                 rmm::cuda_stream_view stream,
                                  rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::partition(strings, delimiter, cudf::get_default_stream(), mr);
+  return detail::partition(input, delimiter, stream, mr);
 }
 
-std::unique_ptr<table> rpartition(strings_column_view const& strings,
+std::unique_ptr<table> rpartition(strings_column_view const& input,
                                   string_scalar const& delimiter,
+                                  rmm::cuda_stream_view stream,
                                   rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::rpartition(strings, delimiter, cudf::get_default_stream(), mr);
+  return detail::rpartition(input, delimiter, stream, mr);
 }
 
 }  // namespace strings
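The `_record_` variants covered by the next file differ only in their return type: a LIST column instead of a table. An illustrative-only sketch, with an invented pattern, maxsplit, and helper name:

```cpp
#include <cudf/column/column.hpp>
#include <cudf/strings/regex/regex_program.hpp>
#include <cudf/strings/split/split_re.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <rmm/cuda_stream_view.hpp>

// Sketch only: right-split on "," keeping at most two splits per row, so
// each row produces up to three list elements.
std::unique_ptr<cudf::column> rsplit_last_fields(cudf::strings_column_view const& input,
                                                 rmm::cuda_stream_view stream)
{
  auto const prog = cudf::strings::regex_program::create(",");
  return cudf::strings::rsplit_record_re(input, *prog, 2, stream);
}
```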
diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu
index 3be5937297f..913aec79758 100644
--- a/cpp/src/strings/split/split_re.cu
+++ b/cpp/src/strings/split/split_re.cu
@@ -340,37 +340,41 @@ std::unique_ptr<column> rsplit_record_re(strings_column_view const& input,
 std::unique_ptr<table> split_re(strings_column_view const& input,
                                 regex_program const& prog,
                                 size_type maxsplit,
+                                rmm::cuda_stream_view stream,
                                 rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::split_re(input, prog, maxsplit, cudf::get_default_stream(), mr);
+  return detail::split_re(input, prog, maxsplit, stream, mr);
 }
 
 std::unique_ptr<column> split_record_re(strings_column_view const& input,
                                         regex_program const& prog,
                                         size_type maxsplit,
+                                        rmm::cuda_stream_view stream,
                                         rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::split_record_re(input, prog, maxsplit, cudf::get_default_stream(), mr);
+  return detail::split_record_re(input, prog, maxsplit, stream, mr);
 }
 
 std::unique_ptr<table> rsplit_re(strings_column_view const& input,
                                  regex_program const& prog,
                                  size_type maxsplit,
+                                 rmm::cuda_stream_view stream,
                                  rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::rsplit_re(input, prog, maxsplit, cudf::get_default_stream(), mr);
+  return detail::rsplit_re(input, prog, maxsplit, stream, mr);
 }
 
 std::unique_ptr<column> rsplit_record_re(strings_column_view const& input,
                                          regex_program const& prog,
                                          size_type maxsplit,
+                                         rmm::cuda_stream_view stream,
                                          rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::rsplit_record_re(input, prog, maxsplit, cudf::get_default_stream(), mr);
+  return detail::rsplit_record_re(input, prog, maxsplit, stream, mr);
 }
 
 }  // namespace strings
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index b15a6c41d39..4de18fceac1 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -634,7 +634,7 @@ ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing)
 ConfigureTest(
   STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/find_test.cpp
-  streams/strings/strings_tests.cpp STREAM_MODE testing
+  streams/strings/split_test.cpp streams/strings/strings_tests.cpp STREAM_MODE testing
 )
 ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_TEXT_TEST streams/text/ngrams_test.cpp STREAM_MODE testing)
diff --git a/cpp/tests/streams/strings/split_test.cpp b/cpp/tests/streams/strings/split_test.cpp
new file mode 100644
index 00000000000..24247f6f79c
--- /dev/null
+++ b/cpp/tests/streams/strings/split_test.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include +#include +#include + +#include +#include +#include +#include + +#include + +class StringsSplitTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsSplitTest, SplitPartition) +{ + auto input = cudf::test::strings_column_wrapper({"Héllo thesé", "tést strings", ""}); + auto view = cudf::strings_column_view(input); + + auto const delimiter = cudf::string_scalar("é", true, cudf::test::get_default_stream()); + cudf::strings::split(view, delimiter, -1, cudf::test::get_default_stream()); + cudf::strings::rsplit(view, delimiter, -1, cudf::test::get_default_stream()); + cudf::strings::split_record(view, delimiter, -1, cudf::test::get_default_stream()); + cudf::strings::rsplit_record(view, delimiter, -1, cudf::test::get_default_stream()); + cudf::strings::partition(view, delimiter, cudf::test::get_default_stream()); + cudf::strings::rpartition(view, delimiter, cudf::test::get_default_stream()); + + auto const pattern = std::string("\\s"); + auto const prog = cudf::strings::regex_program::create(pattern); + cudf::strings::split_re(view, *prog, -1, cudf::test::get_default_stream()); + cudf::strings::split_record_re(view, *prog, -1, cudf::test::get_default_stream()); + cudf::strings::rsplit_re(view, *prog, -1, cudf::test::get_default_stream()); + cudf::strings::rsplit_record_re(view, *prog, -1, cudf::test::get_default_stream()); +} From 737b7593a58679fc59fd68e23eaf92195b9bd34c Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 12 Oct 2023 08:13:11 -0700 Subject: [PATCH 023/118] Use branch-23.12 workflows. (#14271) This PR switches back to using `branch-23.12` for CI workflows because the CUDA 12 ARM conda migration is complete. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Jake Awe (https://github.com/AyodeAwe) URL: https://github.com/rapidsai/cudf/pull/14271 --- .github/workflows/build.yaml | 16 ++++++++-------- .github/workflows/pr.yaml | 28 ++++++++++++++-------------- .github/workflows/test.yaml | 16 ++++++++-------- 3 files changed, 30 insertions(+), 30 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index dc2c81d1c77..ab028eb89cc 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm + uses: 
rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -89,7 +89,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: ${{ inputs.build_type || 'branch' }} @@ -100,7 +100,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 047b80f2e5c..214f9c90b41 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -26,34 +26,34 @@ jobs: - wheel-build-dask-cudf - wheel-tests-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.12 checks: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.12 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12 with: build_type: pull-request conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 with: build_type: pull-request conda-python-cudf-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 with: build_type: pull-request test_script: "ci/test_python_cudf.sh" @@ -61,14 +61,14 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI 
parallelism needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 with: build_type: pull-request test_script: "ci/test_python_other.sh" conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -78,7 +78,7 @@ jobs: conda-notebook-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -88,7 +88,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -98,21 +98,21 @@ jobs: wheel-build-cudf: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 with: build_type: pull-request script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: build_type: pull-request script: ci/test_wheel_cudf.sh wheel-build-dask-cudf: needs: wheel-tests-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: pull-request @@ -120,7 +120,7 @@ jobs: wheel-tests-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: pull-request diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index e58227c30dc..9ca32bcfe03 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +24,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -36,7 +36,7 @@ jobs: run_script: 
"ci/test_cpp_memcheck.sh" conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -55,7 +55,7 @@ jobs: test_script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -67,7 +67,7 @@ jobs: run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -88,7 +88,7 @@ jobs: script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@cuda-120-arm + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: nightly From fa4e8ab1af4acfd2c88a619b4d9693f4a5fda168 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 12 Oct 2023 17:11:51 -0400 Subject: [PATCH 024/118] Expose stream parameter in public strings replace APIs (#14261) Add stream parameter to public APIs: - `cudf::strings::replace()` (x2) - `cudf::strings::replace_slice()` - `cudf::strings::replace_re()` (x2) - `cudf::strings::replace_with_backrefs()` Also cleaned up some of the doxygen comments and added stream-tests. 
Reference #13744 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Mike Wilson (https://github.com/hyperbolic2346) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14261 --- cpp/include/cudf/strings/replace.hpp | 42 +++++++----- cpp/include/cudf/strings/replace_re.hpp | 28 +++++--- cpp/src/strings/replace/backref_re.cu | 3 +- cpp/src/strings/replace/multi.cu | 3 +- cpp/src/strings/replace/multi_re.cu | 3 +- cpp/src/strings/replace/replace.cu | 8 ++- cpp/src/strings/replace/replace_re.cu | 4 +- cpp/tests/CMakeLists.txt | 10 ++- cpp/tests/streams/strings/replace_test.cpp | 80 ++++++++++++++++++++++ 9 files changed, 142 insertions(+), 39 deletions(-) create mode 100644 cpp/tests/streams/strings/replace_test.cpp diff --git a/cpp/include/cudf/strings/replace.hpp b/cpp/include/cudf/strings/replace.hpp index 22818f7542e..2476a41e886 100644 --- a/cpp/include/cudf/strings/replace.hpp +++ b/cpp/include/cudf/strings/replace.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -54,19 +54,21 @@ namespace strings { * * @throw cudf::logic_error if target is an empty string. * - * @param strings Strings column for this operation. - * @param target String to search for within each string. - * @param repl Replacement string if target is found. + * @param input Strings column for this operation + * @param target String to search for within each string + * @param repl Replacement string if target is found * @param maxrepl Maximum times to replace if target appears multiple times in the input string. * Default of -1 specifies replace all occurrences of target in each string. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column */ std::unique_ptr replace( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& target, string_scalar const& repl, - int32_t maxrepl = -1, + cudf::size_type maxrepl = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -92,21 +94,23 @@ std::unique_ptr replace( * * @throw cudf::logic_error if start is greater than stop. * - * @param strings Strings column for this operation. + * @param input Strings column for this operation. * @param repl Replacement string for specified positions found. * Default is empty string. * @param start Start position where repl will be added. * Default is 0, first character position. * @param stop End position (exclusive) to use for replacement. * Default of -1 specifies the end of each string. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column. 
+ * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column */ std::unique_ptr replace_slice( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& repl = string_scalar(""), size_type start = 0, size_type stop = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -141,16 +145,18 @@ std::unique_ptr replace_slice( * if repls is a single string. * @throw cudf::logic_error if targets or repls contain null entries. * - * @param strings Strings column for this operation. - * @param targets Strings to search for in each string. - * @param repls Corresponding replacement strings for target strings. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column. + * @param input Strings column for this operation + * @param targets Strings to search for in each string + * @param repls Corresponding replacement strings for target strings + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column */ std::unique_ptr replace( - strings_column_view const& strings, + strings_column_view const& input, strings_column_view const& targets, strings_column_view const& repls, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/replace_re.hpp b/cpp/include/cudf/strings/replace_re.hpp index bc6659835c3..77db2882253 100644 --- a/cpp/include/cudf/strings/replace_re.hpp +++ b/cpp/include/cudf/strings/replace_re.hpp @@ -43,20 +43,22 @@ struct regex_program; * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * - * @param strings Strings instance for this operation + * @param input Strings instance for this operation * @param prog Regex program instance * @param replacement The string used to replace the matched sequence in each string. * Default is an empty string. * @param max_replace_count The maximum number of times to replace the matched pattern * within each string. Default replaces every substring that is matched. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New strings column */ std::unique_ptr replace_re( - strings_column_view const& strings, + strings_column_view const& input, regex_program const& prog, string_scalar const& replacement = string_scalar(""), std::optional max_replace_count = std::nullopt, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -67,18 +69,20 @@ std::unique_ptr replace_re( * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * - * @param strings Strings instance for this operation. - * @param patterns The regular expression patterns to search within each string. - * @param replacements The strings used for replacement. - * @param flags Regex flags for interpreting special characters in the patterns. 
- * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column. + * @param input Strings instance for this operation + * @param patterns The regular expression patterns to search within each string + * @param replacements The strings used for replacement + * @param flags Regex flags for interpreting special characters in the patterns + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column */ std::unique_ptr replace_re( - strings_column_view const& strings, + strings_column_view const& input, std::vector const& patterns, strings_column_view const& replacements, regex_flags const flags = regex_flags::DEFAULT, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -92,16 +96,18 @@ std::unique_ptr replace_re( * @throw cudf::logic_error if capture index values in `replacement` are not in range 0-99, and also * if the index exceeds the group count specified in the pattern * - * @param strings Strings instance for this operation + * @param input Strings instance for this operation * @param prog Regex program instance * @param replacement The replacement template for creating the output string + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New strings column */ std::unique_ptr replace_with_backrefs( - strings_column_view const& strings, + strings_column_view const& input, regex_program const& prog, std::string_view replacement, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace strings diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu index 31e06aac72b..74f38cbcc20 100644 --- a/cpp/src/strings/replace/backref_re.cu +++ b/cpp/src/strings/replace/backref_re.cu @@ -148,10 +148,11 @@ std::unique_ptr replace_with_backrefs(strings_column_view const& input, std::unique_ptr replace_with_backrefs(strings_column_view const& strings, regex_program const& prog, std::string_view replacement, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_with_backrefs(strings, prog, replacement, cudf::get_default_stream(), mr); + return detail::replace_with_backrefs(strings, prog, replacement, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index 92ace4e7bc7..ee47932100a 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -490,10 +490,11 @@ std::unique_ptr replace(strings_column_view const& input, std::unique_ptr replace(strings_column_view const& strings, strings_column_view const& targets, strings_column_view const& repls, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace(strings, targets, repls, cudf::get_default_stream(), mr); + return detail::replace(strings, targets, repls, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index 867b443c036..3375cb7a789 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu 
@@ -206,10 +206,11 @@ std::unique_ptr replace_re(strings_column_view const& strings, std::vector const& patterns, strings_column_view const& replacements, regex_flags const flags, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_re(strings, patterns, replacements, flags, cudf::get_default_stream(), mr); + return detail::replace_re(strings, patterns, replacements, flags, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu index acc1502f4d6..a6a14f27dec 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -751,21 +751,23 @@ std::unique_ptr replace_nulls(strings_column_view const& strings, std::unique_ptr replace(strings_column_view const& strings, string_scalar const& target, string_scalar const& repl, - int32_t maxrepl, + cudf::size_type maxrepl, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace(strings, target, repl, maxrepl, cudf::get_default_stream(), mr); + return detail::replace(strings, target, repl, maxrepl, stream, mr); } std::unique_ptr replace_slice(strings_column_view const& strings, string_scalar const& repl, size_type start, size_type stop, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_slice(strings, repl, start, stop, cudf::get_default_stream(), mr); + return detail::replace_slice(strings, repl, start, stop, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/replace/replace_re.cu b/cpp/src/strings/replace/replace_re.cu index 81ddb937be5..502d5f1a52e 100644 --- a/cpp/src/strings/replace/replace_re.cu +++ b/cpp/src/strings/replace/replace_re.cu @@ -134,11 +134,11 @@ std::unique_ptr replace_re(strings_column_view const& strings, regex_program const& prog, string_scalar const& replacement, std::optional max_replace_count, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_re( - strings, prog, replacement, max_replace_count, cudf::get_default_stream(), mr); + return detail::replace_re(strings, prog, replacement, max_replace_count, stream, mr); } } // namespace strings diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 4de18fceac1..f36fcbc9246 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -633,8 +633,14 @@ ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing) ConfigureTest( - STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/find_test.cpp - streams/strings/split_test.cpp streams/strings/strings_tests.cpp STREAM_MODE testing + STREAM_STRINGS_TEST + streams/strings/case_test.cpp + streams/strings/find_test.cpp + streams/strings/replace_test.cpp + streams/strings/split_test.cpp + streams/strings/strings_tests.cpp + STREAM_MODE + testing ) ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_TEXT_TEST streams/text/ngrams_test.cpp STREAM_MODE testing) diff --git a/cpp/tests/streams/strings/replace_test.cpp b/cpp/tests/streams/strings/replace_test.cpp new file mode 100644 index 00000000000..fc87460b706 --- /dev/null +++ b/cpp/tests/streams/strings/replace_test.cpp @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2023, 
NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include + +#include + +class StringsReplaceTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsReplaceTest, Replace) +{ + auto input = cudf::test::strings_column_wrapper({"Héllo", "thesé", "tést strings", ""}); + auto view = cudf::strings_column_view(input); + + auto const target = cudf::string_scalar("é", true, cudf::test::get_default_stream()); + auto const repl = cudf::string_scalar(" ", true, cudf::test::get_default_stream()); + cudf::strings::replace(view, target, repl, -1, cudf::test::get_default_stream()); + cudf::strings::replace(view, view, view, cudf::test::get_default_stream()); + cudf::strings::replace_slice(view, repl, 1, 2, cudf::test::get_default_stream()); + + auto const pattern = std::string("[a-z]"); + auto const prog = cudf::strings::regex_program::create(pattern); + cudf::strings::replace_re(view, *prog, repl, 1, cudf::test::get_default_stream()); + + cudf::test::strings_column_wrapper repls({"1", "a", " "}); + cudf::strings::replace_re(view, + {pattern, pattern, pattern}, + cudf::strings_column_view(repls), + cudf::strings::regex_flags::DEFAULT, + cudf::test::get_default_stream()); +} + +TEST_F(StringsReplaceTest, ReplaceRegex) +{ + auto input = cudf::test::strings_column_wrapper({"Héllo", "thesé", "tést strings", ""}); + auto view = cudf::strings_column_view(input); + + auto const repl = cudf::string_scalar(" ", true, cudf::test::get_default_stream()); + auto const pattern = std::string("[a-z]"); + auto const prog = cudf::strings::regex_program::create(pattern); + cudf::strings::replace_re(view, *prog, repl, 1, cudf::test::get_default_stream()); + + cudf::test::strings_column_wrapper repls({"1", "a", " "}); + cudf::strings::replace_re(view, + {pattern, pattern, pattern}, + cudf::strings_column_view(repls), + cudf::strings::regex_flags::DEFAULT, + cudf::test::get_default_stream()); +} + +TEST_F(StringsReplaceTest, ReplaceRegexBackref) +{ + auto input = cudf::test::strings_column_wrapper({"Héllo thesé", "tést strings"}); + auto view = cudf::strings_column_view(input); + + auto const repl_template = std::string("\\2-\\1"); + auto const pattern = std::string("(\\w) (\\w)"); + auto const prog = cudf::strings::regex_program::create(pattern); + cudf::strings::replace_with_backrefs( + view, *prog, repl_template, cudf::test::get_default_stream()); +} From 6e00ad06abb1152816ed6edda698cb26f08a64d2 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 12 Oct 2023 22:32:25 -0400 Subject: [PATCH 025/118] Return error if BOOL8 column-type is used with integers-to-hex (#14208) Removes support to convert BOOL8 column-type to hex using `cudf::strings::integers_to_hex`. Also fixed other integer to string conversions to remove this unsupported type. Added gtests to verify an error is thrown for this case. 
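For illustration only (not part of this patch), a sketch of guarding the conversion with the trait added here. The `to_hex_checked` helper name is hypothetical; `cudf::is_integral_not_bool` and the thrown `cudf::logic_error` come from this change:

    #include <cudf/column/column.hpp>
    #include <cudf/column/column_view.hpp>
    #include <cudf/strings/convert/convert_integers.hpp>
    #include <cudf/utilities/error.hpp>
    #include <cudf/utilities/traits.hpp>

    #include <memory>

    // Illustrative helper (not part of libcudf): integers_to_hex now throws
    // cudf::logic_error for BOOL8 (and any other non-integer type), so the
    // input type can be validated up front with the new trait.
    std::unique_ptr<cudf::column> to_hex_checked(cudf::column_view const& input)
    {
      CUDF_EXPECTS(cudf::is_integral_not_bool(input.type()),
                   "integers_to_hex supports only non-bool integer types");
      return cudf::strings::integers_to_hex(input);
    }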
Closes #14232 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) - Karthikeyan (https://github.com/karthikeyann) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/14208 --- .../cudf/strings/convert/convert_integers.hpp | 4 +- cpp/include/cudf/utilities/traits.hpp | 24 ++++++++++++ cpp/src/strings/convert/convert_hex.cu | 27 ++++++------- cpp/src/strings/convert/convert_integers.cu | 38 ++++++------------- cpp/src/utilities/traits.cpp | 15 +++++++- cpp/tests/strings/integers_tests.cpp | 26 +++++++++++++ 6 files changed, 89 insertions(+), 45 deletions(-) diff --git a/cpp/include/cudf/strings/convert/convert_integers.hpp b/cpp/include/cudf/strings/convert/convert_integers.hpp index 44213b84139..756ce48645d 100644 --- a/cpp/include/cudf/strings/convert/convert_integers.hpp +++ b/cpp/include/cudf/strings/convert/convert_integers.hpp @@ -199,14 +199,14 @@ std::unique_ptr is_hex( * * @code{.pseudo} * Example: - * input = [123, -1, 0, 27, 342718233] // int32 type input column + * input = [1234, -1, 0, 27, 342718233] // int32 type input column * s = integers_to_hex(input) * s is [ '04D2', 'FFFFFFFF', '00', '1B', '146D7719'] * @endcode * * The example above shows an `INT32` type column where each integer is 4 bytes. * Leading zeros are suppressed unless filling out a complete byte as in - * `123 -> '04D2'` instead of `000004D2` or `4D2`. + * `1234 -> '04D2'` instead of `000004D2` or `4D2`. * * @throw cudf::logic_error if the input column is not integral type. * diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index 51f5d9d571a..2dda0740b96 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -279,6 +279,30 @@ constexpr inline bool is_integral() */ bool is_integral(data_type type); +/** + * @brief Indicates whether the type `T` is an integral type but not bool type. + * + * @tparam T The type to verify + * @return true `T` is integral but not bool + * @return false `T` is not integral or is bool + */ +template +constexpr inline bool is_integral_not_bool() +{ + return cuda::std::is_integral_v and not std::is_same_v; +} + +/** + * @brief Indicates whether `type` is a integral `data_type` and not BOOL8 + * + * "Integral" types are fundamental integer types such as `INT*` and `UINT*`. + * + * @param type The `data_type` to verify + * @return true `type` is integral but not bool + * @return false `type` is integral or is bool + */ +bool is_integral_not_bool(data_type type); + /** * @brief Indicates whether the type `T` is a floating point type. * diff --git a/cpp/src/strings/convert/convert_hex.cu b/cpp/src/strings/convert/convert_hex.cu index bed682aba71..f5bdbcbd199 100644 --- a/cpp/src/strings/convert/convert_hex.cu +++ b/cpp/src/strings/convert/convert_hex.cu @@ -93,7 +93,8 @@ struct hex_to_integer_fn { * The output_column is expected to be one of the integer types only. */ struct dispatch_hex_to_integers_fn { - template >* = nullptr> + template ()>* = nullptr> void operator()(column_device_view const& strings_column, mutable_column_view& output_column, rmm::cuda_stream_view stream) const @@ -105,22 +106,14 @@ struct dispatch_hex_to_integers_fn { d_results, hex_to_integer_fn{strings_column}); } - // non-integral types throw an exception + // non-integer types throw an exception template - std::enable_if_t, void> operator()(Args&&...) 
const + std::enable_if_t(), void> operator()(Args&&...) const { - CUDF_FAIL("Output for hex_to_integers must be an integral type."); + CUDF_FAIL("Output for hex_to_integers must be an integer type."); } }; -template <> -void dispatch_hex_to_integers_fn::operator()(column_device_view const&, - mutable_column_view&, - rmm::cuda_stream_view) const -{ - CUDF_FAIL("Output for hex_to_integers must not be a boolean type."); -} - /** * @brief Functor to convert integers to hexadecimal strings * @@ -179,7 +172,8 @@ struct integer_to_hex_fn { }; struct dispatch_integers_to_hex_fn { - template >* = nullptr> + template ()>* = nullptr> std::unique_ptr operator()(column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const @@ -195,11 +189,12 @@ struct dispatch_integers_to_hex_fn { input.null_count(), cudf::detail::copy_bitmask(input, stream, mr)); } - // non-integral types throw an exception + // non-integer types throw an exception template - std::enable_if_t, std::unique_ptr> operator()(Args...) const + std::enable_if_t(), std::unique_ptr> operator()( + Args...) const { - CUDF_FAIL("integers_to_hex only supports integral type columns"); + CUDF_FAIL("integers_to_hex only supports integer type columns"); } }; diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu index 5597d2831c0..2c21fc5d790 100644 --- a/cpp/src/strings/convert/convert_integers.cu +++ b/cpp/src/strings/convert/convert_integers.cu @@ -111,7 +111,7 @@ inline __device__ bool is_integer(string_view const& d_str) * @brief The dispatch functions for checking if strings are valid integers. */ struct dispatch_is_integer_fn { - template >* = nullptr> + template ()>* = nullptr> std::unique_ptr operator()(strings_column_view const& strings, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const @@ -145,7 +145,7 @@ struct dispatch_is_integer_fn { return results; } - template >* = nullptr> + template ()>* = nullptr> std::unique_ptr operator()(strings_column_view const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) const @@ -243,7 +243,8 @@ struct string_to_integer_fn { * The output_column is expected to be one of the integer types only. */ struct dispatch_to_integers_fn { - template >* = nullptr> + template ()>* = nullptr> void operator()(column_device_view const& strings_column, mutable_column_view& output_column, rmm::cuda_stream_view stream) const @@ -254,22 +255,14 @@ struct dispatch_to_integers_fn { output_column.data(), string_to_integer_fn{strings_column}); } - // non-integral types throw an exception - template >* = nullptr> + // non-integer types throw an exception + template ()>* = nullptr> void operator()(column_device_view const&, mutable_column_view&, rmm::cuda_stream_view) const { - CUDF_FAIL("Output for to_integers must be an integral type."); + CUDF_FAIL("Output for to_integers must be an integer type."); } }; -template <> -void dispatch_to_integers_fn::operator()(column_device_view const&, - mutable_column_view&, - rmm::cuda_stream_view) const -{ - CUDF_FAIL("Output for to_integers must not be a boolean type."); -} - } // namespace // This will convert a strings column into any integer column type. @@ -351,7 +344,8 @@ struct from_integers_fn { * The template function declaration ensures only integer types are used. 
*/ struct dispatch_from_integers_fn { - template >* = nullptr> + template ()>* = nullptr> std::unique_ptr operator()(column_view const& integers, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const @@ -373,23 +367,15 @@ struct dispatch_from_integers_fn { std::move(null_mask)); } - // non-integral types throw an exception - template >* = nullptr> + // non-integer types throw an exception + template ()>* = nullptr> std::unique_ptr operator()(column_view const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) const { - CUDF_FAIL("Values for from_integers function must be an integral type."); + CUDF_FAIL("Values for from_integers function must be an integer type."); } }; - -template <> -std::unique_ptr dispatch_from_integers_fn::operator()( - column_view const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) const -{ - CUDF_FAIL("Input for from_integers must not be a boolean type."); -} - } // namespace // This will convert all integer column types into a strings column. diff --git a/cpp/src/utilities/traits.cpp b/cpp/src/utilities/traits.cpp index bc10dd7845a..b0078ff85a2 100644 --- a/cpp/src/utilities/traits.cpp +++ b/cpp/src/utilities/traits.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -158,6 +158,19 @@ struct is_integral_impl { bool is_integral(data_type type) { return cudf::type_dispatcher(type, is_integral_impl{}); } +struct is_integral_not_bool_impl { + template + constexpr bool operator()() + { + return is_integral_not_bool(); + } +}; + +bool is_integral_not_bool(data_type type) +{ + return cudf::type_dispatcher(type, is_integral_not_bool_impl{}); +} + struct is_floating_point_impl { template constexpr bool operator()() diff --git a/cpp/tests/strings/integers_tests.cpp b/cpp/tests/strings/integers_tests.cpp index 59805f9cb6d..c8f292f55b2 100644 --- a/cpp/tests/strings/integers_tests.cpp +++ b/cpp/tests/strings/integers_tests.cpp @@ -456,3 +456,29 @@ TEST_F(StringsConvertTest, IntegerToHexWithNull) auto results = cudf::strings::integers_to_hex(integers); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } + +TEST_F(StringsConvertTest, IntegerConvertErrors) +{ + cudf::test::fixed_width_column_wrapper bools( + {true, true, false, false, true, true, false, true}); + cudf::test::fixed_width_column_wrapper floats( + {123456.0, -1.0, 0.0, 0.0, 12.0, 12345.0, 123456789.0}); + EXPECT_THROW(cudf::strings::integers_to_hex(bools), cudf::logic_error); + EXPECT_THROW(cudf::strings::integers_to_hex(floats), cudf::logic_error); + EXPECT_THROW(cudf::strings::from_integers(bools), cudf::logic_error); + EXPECT_THROW(cudf::strings::from_integers(floats), cudf::logic_error); + + auto input = cudf::test::strings_column_wrapper({"123456", "-1", "0"}); + auto view = cudf::strings_column_view(input); + EXPECT_THROW(cudf::strings::to_integers(view, cudf::data_type(cudf::type_id::BOOL8)), + cudf::logic_error); + EXPECT_THROW(cudf::strings::to_integers(view, cudf::data_type(cudf::type_id::FLOAT32)), + cudf::logic_error); + EXPECT_THROW(cudf::strings::to_integers(view, cudf::data_type(cudf::type_id::TIMESTAMP_SECONDS)), + cudf::logic_error); + EXPECT_THROW( + cudf::strings::to_integers(view, cudf::data_type(cudf::type_id::DURATION_MILLISECONDS)), + cudf::logic_error); + EXPECT_THROW(cudf::strings::to_integers(view, cudf::data_type(cudf::type_id::DECIMAL32)), + 
cudf::logic_error); +} From d590e0bde9389b8a403b2b7ae4c5372ae6728016 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 16 Oct 2023 10:37:00 -0400 Subject: [PATCH 026/118] Expose stream parameter in public strings convert APIs (#14255) Add stream parameter to public APIs: - `cudf::strings::to_booleans()` - `cudf::strings::from_booleans()` - `cudf::strings::to_timestamps()` - `cudf::strings::from_timestamps()` - `cudf::strings::is_timestamp()` - `cudf::strings::to_durations()` - `cudf::strings::from_durations()` - `cudf::strings::to_fixed_point()` - `cudf::strings::from_fixed_point()` - `cudf::strings::to_floats()` - `cudf::strings::from_floats()` - `cudf::strings::is_float()` - `cudf::strings::to_integers()` - `cudf::strings::from_integers()` - `cudf::strings::is_integer()` - `cudf::strings::hex_to_integers()` - `cudf::strings::integers_to_hex()` - `cudf::strings::is_hex()` - `cudf::strings::ipv4_to_integers()` - `cudf::strings::integers_to_ipv4()` - `cudf::strings::is_ipv4()` - `cudf::strings::url_encode()` - `cudf::strings::url_decode()` - `cudf::strings::format_list_column()` Also cleaned up some of the doxygen comments and removed some default parameters. Reference #13744 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - MithunR (https://github.com/mythrocks) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14255 --- .../cudf/strings/convert/convert_booleans.hpp | 32 ++-- .../cudf/strings/convert/convert_datetime.hpp | 34 ++-- .../strings/convert/convert_durations.hpp | 26 ++-- .../strings/convert/convert_fixed_point.hpp | 30 ++-- .../cudf/strings/convert/convert_floats.hpp | 30 ++-- .../cudf/strings/convert/convert_integers.hpp | 72 +++++---- .../cudf/strings/convert/convert_ipv4.hpp | 30 ++-- .../cudf/strings/convert/convert_lists.hpp | 14 +- .../cudf/strings/convert/convert_urls.hpp | 22 +-- cpp/src/strings/convert/convert_booleans.cu | 20 +-- cpp/src/strings/convert/convert_datetime.cu | 9 +- cpp/src/strings/convert/convert_durations.cu | 20 +-- .../strings/convert/convert_fixed_point.cu | 11 +- cpp/src/strings/convert/convert_floats.cu | 42 ++--- cpp/src/strings/convert/convert_hex.cu | 9 +- cpp/src/strings/convert/convert_integers.cu | 64 ++++---- cpp/src/strings/convert/convert_ipv4.cu | 39 ++--- cpp/src/strings/convert/convert_lists.cu | 3 +- cpp/src/strings/convert/convert_urls.cu | 10 +- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/strings/convert_test.cpp | 146 ++++++++++++++++++ cpp/tests/strings/booleans_tests.cpp | 33 +++- cpp/tests/strings/format_lists_tests.cpp | 9 +- java/src/main/native/src/ColumnViewJni.cpp | 11 +- 24 files changed, 487 insertions(+), 230 deletions(-) create mode 100644 cpp/tests/streams/strings/convert_test.cpp diff --git a/cpp/include/cudf/strings/convert/convert_booleans.hpp b/cpp/include/cudf/strings/convert/convert_booleans.hpp index ab63503f166..9e9f25e800a 100644 --- a/cpp/include/cudf/strings/convert/convert_booleans.hpp +++ b/cpp/include/cudf/strings/convert/convert_booleans.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,14 +35,16 @@ namespace strings { * * Any null entries will result in corresponding null entries in the output column. 
* - * @param strings Strings instance for this operation. - * @param true_string String to expect for true. Non-matching strings are false. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New BOOL8 column converted from strings. + * @param input Strings instance for this operation + * @param true_string String to expect for true. Non-matching strings are false + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New BOOL8 column converted from strings */ std::unique_ptr to_booleans( - strings_column_view const& strings, - string_scalar const& true_string = string_scalar("true"), + strings_column_view const& input, + string_scalar const& true_string, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -53,16 +55,18 @@ std::unique_ptr to_booleans( * * @throw cudf::logic_error if the input column is not BOOL8 type. * - * @param booleans Boolean column to convert. - * @param true_string String to use for true in the output column. - * @param false_string String to use for false in the output column. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column. + * @param booleans Boolean column to convert + * @param true_string String to use for true in the output column + * @param false_string String to use for false in the output column + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column */ std::unique_ptr from_booleans( column_view const& booleans, - string_scalar const& true_string = string_scalar("true"), - string_scalar const& false_string = string_scalar("false"), + string_scalar const& true_string, + string_scalar const& false_string, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/convert/convert_datetime.hpp b/cpp/include/cudf/strings/convert/convert_datetime.hpp index fa729d26734..81cce14b53b 100644 --- a/cpp/include/cudf/strings/convert/convert_datetime.hpp +++ b/cpp/include/cudf/strings/convert/convert_datetime.hpp @@ -77,16 +77,18 @@ namespace strings { * * @throw cudf::logic_error if timestamp_type is not a timestamp type. * - * @param strings Strings instance for this operation. - * @param timestamp_type The timestamp type used for creating the output column. - * @param format String specifying the timestamp format in strings. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New datetime column. 
+ * @param input Strings instance for this operation + * @param timestamp_type The timestamp type used for creating the output column + * @param format String specifying the timestamp format in strings + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New datetime column */ std::unique_ptr to_timestamps( - strings_column_view const& strings, + strings_column_view const& input, data_type timestamp_type, std::string_view format, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -124,14 +126,16 @@ std::unique_ptr to_timestamps( * This will return a column of type BOOL8 where a `true` row indicates the corresponding * input string can be parsed correctly with the given format. * - * @param strings Strings instance for this operation. - * @param format String specifying the timestamp format in strings. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New BOOL8 column. + * @param input Strings instance for this operation + * @param format String specifying the timestamp format in strings + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New BOOL8 column */ std::unique_ptr is_timestamp( - strings_column_view const& strings, + strings_column_view const& input, std::string_view format, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -231,19 +235,21 @@ std::unique_ptr is_timestamp( * @throw cudf::logic_error if the `format` string is empty * @throw cudf::logic_error if `names.size()` is an invalid size. Must be 0 or 40 strings. * - * @param timestamps Timestamp values to convert. + * @param timestamps Timestamp values to convert * @param format The string specifying output format. * Default format is "%Y-%m-%dT%H:%M:%SZ". * @param names The string names to use for weekdays ("%a", "%A") and months ("%b", "%B") * Default is an empty `strings_column_view`. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column with formatted timestamps. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column with formatted timestamps */ std::unique_ptr from_timestamps( column_view const& timestamps, std::string_view format = "%Y-%m-%dT%H:%M:%SZ", strings_column_view const& names = strings_column_view(column_view{ data_type{type_id::STRING}, 0, nullptr, nullptr, 0}), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/convert/convert_durations.hpp b/cpp/include/cudf/strings/convert/convert_durations.hpp index e915ec26279..a1f4e4ead1d 100644 --- a/cpp/include/cudf/strings/convert/convert_durations.hpp +++ b/cpp/include/cudf/strings/convert/convert_durations.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -65,16 +65,18 @@ namespace strings { * * @throw cudf::logic_error if duration_type is not a duration type. * - * @param strings Strings instance for this operation. - * @param duration_type The duration type used for creating the output column. - * @param format String specifying the duration format in strings. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New duration column. + * @param input Strings instance for this operation + * @param duration_type The duration type used for creating the output column + * @param format String specifying the duration format in strings + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New duration column */ std::unique_ptr to_durations( - strings_column_view const& strings, + strings_column_view const& input, data_type duration_type, std::string_view format, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -115,15 +117,17 @@ std::unique_ptr to_durations( * * @throw cudf::logic_error if `durations` column parameter is not a duration type. * - * @param durations Duration values to convert. + * @param durations Duration values to convert * @param format The string specifying output format. - * Default format is ""%d days %H:%M:%S". - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column with formatted durations. + * Default format is ""%D days %H:%M:%S". + * @param mr Device memory resource used to allocate the returned column's device memory + * @param stream CUDA stream used for device memory operations and kernel launches + * @return New strings column with formatted durations */ std::unique_ptr from_durations( column_view const& durations, std::string_view format = "%D days %H:%M:%S", + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp index 3852dc8e81a..8f37715967a 100644 --- a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp +++ b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -53,14 +53,16 @@ namespace strings { * * @throw cudf::logic_error if `output_type` is not a fixed-point decimal type. * - * @param input Strings instance for this operation. - * @param output_type Type of fixed-point column to return including the scale value. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of `output_type`. 
+ * @param input Strings instance for this operation + * @param output_type Type of fixed-point column to return including the scale value + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of `output_type` */ std::unique_ptr to_fixed_point( strings_column_view const& input, data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -83,12 +85,14 @@ std::unique_ptr to_fixed_point( * * @throw cudf::logic_error if the `input` column is not a fixed-point decimal type. * - * @param input Fixed-point column to convert. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column. + * @param input Fixed-point column to convert + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column */ std::unique_ptr from_fixed_point( column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -111,14 +115,16 @@ std::unique_ptr from_fixed_point( * * @throw cudf::logic_error if the `decimal_type` is not a fixed-point decimal type. * - * @param input Strings instance for this operation. - * @param decimal_type Fixed-point type (with scale) used only for checking overflow. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. + * @param input Strings instance for this operation + * @param decimal_type Fixed-point type (with scale) used only for checking overflow + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of boolean results for each string */ std::unique_ptr is_fixed_point( strings_column_view const& input, data_type decimal_type = data_type{type_id::DECIMAL64}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/convert/convert_floats.hpp b/cpp/include/cudf/strings/convert/convert_floats.hpp index 38a84fc1548..a35cb68ef4e 100644 --- a/cpp/include/cudf/strings/convert/convert_floats.hpp +++ b/cpp/include/cudf/strings/convert/convert_floats.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -39,14 +39,16 @@ namespace strings { * * @throw cudf::logic_error if output_type is not float type. * - * @param strings Strings instance for this operation. - * @param output_type Type of float numeric column to return. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column with floats converted from strings. 
+ * @param strings Strings instance for this operation + * @param output_type Type of float numeric column to return + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column with floats converted from strings */ std::unique_ptr to_floats( strings_column_view const& strings, data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -62,12 +64,14 @@ std::unique_ptr to_floats( * * @throw cudf::logic_error if floats column is not float type. * - * @param floats Numeric column to convert. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column with floats as strings. + * @param floats Numeric column to convert + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column with floats as strings */ std::unique_ptr from_floats( column_view const& floats, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -86,12 +90,14 @@ std::unique_ptr from_floats( * * Any null row results in a null entry for that row in the output column. * - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. + * @param input Strings instance for this operation + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of boolean results for each string */ std::unique_ptr is_float( - strings_column_view const& strings, + strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/convert/convert_integers.hpp b/cpp/include/cudf/strings/convert/convert_integers.hpp index 756ce48645d..74ec5d315a2 100644 --- a/cpp/include/cudf/strings/convert/convert_integers.hpp +++ b/cpp/include/cudf/strings/convert/convert_integers.hpp @@ -46,14 +46,16 @@ namespace strings { * * @throw cudf::logic_error if output_type is not integral type. * - * @param strings Strings instance for this operation. - * @param output_type Type of integer numeric column to return. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column with integers converted from strings. 
+ * @param input Strings instance for this operation + * @param output_type Type of integer numeric column to return + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column with integers converted from strings */ std::unique_ptr to_integers( - strings_column_view const& strings, + strings_column_view const& input, data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -67,12 +69,14 @@ std::unique_ptr to_integers( * * @throw cudf::logic_error if integers column is not integral type. * - * @param integers Numeric column to convert. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column with integers as strings. + * @param integers Numeric column to convert + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column with integers as strings */ std::unique_ptr from_integers( column_view const& integers, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -94,12 +98,14 @@ std::unique_ptr from_integers( * * Any null row results in a null entry for that row in the output column. * - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. + * @param input Strings instance for this operation + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of boolean results for each string */ std::unique_ptr is_integer( - strings_column_view const& strings, + strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -124,14 +130,16 @@ std::unique_ptr is_integer( * * Any null row results in a null entry for that row in the output column. * - * @param strings Strings instance for this operation. - * @param int_type Integer type used for checking underflow and overflow. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. + * @param input Strings instance for this operation + * @param int_type Integer type used for checking underflow and overflow + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of boolean results for each string */ std::unique_ptr is_integer( - strings_column_view const& strings, + strings_column_view const& input, data_type int_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -152,14 +160,16 @@ std::unique_ptr is_integer( * * @throw cudf::logic_error if output_type is not integral type. * - * @param strings Strings instance for this operation. 
- * @param output_type Type of integer numeric column to return.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New column with integers converted from strings.
+ * @param input Strings instance for this operation
+ * @param output_type Type of integer numeric column to return
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New column with integers converted from strings
 */
 std::unique_ptr<column> hex_to_integers(
-  strings_column_view const& strings,
+  strings_column_view const& input,
   data_type output_type,
+  rmm::cuda_stream_view stream = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

 /**
@@ -179,12 +189,14 @@ std::unique_ptr<column> hex_to_integers(
 *
 * Any null row results in a null entry for that row in the output column.
 *
- * @param strings Strings instance for this operation.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New column of boolean results for each string.
+ * @param input Strings instance for this operation
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New column of boolean results for each string
 */
 std::unique_ptr<column> is_hex(
-  strings_column_view const& strings,
+  strings_column_view const& input,
+  rmm::cuda_stream_view stream = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

 /**
@@ -210,12 +222,14 @@ std::unique_ptr<column> is_hex(
 *
 * @throw cudf::logic_error if the input column is not integral type.
 *
- * @param input Integer column to convert to hex.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New strings column with hexadecimal characters.
+ * @param input Integer column to convert to hex
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New strings column with hexadecimal characters
 */
 std::unique_ptr<column> integers_to_hex(
   column_view const& input,
+  rmm::cuda_stream_view stream = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */  // end of doxygen group
diff --git a/cpp/include/cudf/strings/convert/convert_ipv4.hpp b/cpp/include/cudf/strings/convert/convert_ipv4.hpp
index 22272af74fc..25ad7b86748 100644
--- a/cpp/include/cudf/strings/convert/convert_ipv4.hpp
+++ b/cpp/include/cudf/strings/convert/convert_ipv4.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -48,12 +48,14 @@ namespace strings {
 *
 * Any null entries will result in corresponding null entries in the output column.
 *
- * @param strings Strings instance for this operation.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New INT64 column converted from strings.
+ * @param input Strings instance for this operation
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New INT64 column converted from strings
 */
 std::unique_ptr<column> ipv4_to_integers(
-  strings_column_view const& strings,
+  strings_column_view const& input,
+  rmm::cuda_stream_view stream = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

 /**
@@ -71,12 +73,14 @@ std::unique_ptr<column> ipv4_to_integers(
 *
 * @throw cudf::logic_error if the input column is not INT64 type.
 *
- * @param integers Integer (INT64) column to convert.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New strings column.
+ * @param integers Integer (INT64) column to convert
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New strings column
 */
 std::unique_ptr<column> integers_to_ipv4(
   column_view const& integers,
+  rmm::cuda_stream_view stream = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

 /**
@@ -96,12 +100,14 @@ std::unique_ptr<column> integers_to_ipv4(
 *
 * Any null row results in a null entry for that row in the output column.
 *
- * @param strings Strings instance for this operation.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New column of boolean results for each string.
+ * @param input Strings instance for this operation
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New column of boolean results for each string
 */
 std::unique_ptr<column> is_ipv4(
-  strings_column_view const& strings,
+  strings_column_view const& input,
+  rmm::cuda_stream_view stream = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */  // end of doxygen group
diff --git a/cpp/include/cudf/strings/convert/convert_lists.hpp b/cpp/include/cudf/strings/convert/convert_lists.hpp
index 7ab1bf47b0a..dedf4e95138 100644
--- a/cpp/include/cudf/strings/convert/convert_lists.hpp
+++ b/cpp/include/cudf/strings/convert/convert_lists.hpp
@@ -50,17 +50,19 @@ namespace strings {
 *
 * @throw cudf::logic_error if the input column is not a LIST type with a STRING child.
 *
- * @param input Lists column to format.
- * @param na_rep Replacement string for null elements.
- * @param separators Strings to use for enclosing list components and separating elements.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New strings column.
+ * @param input Lists column to format
+ * @param na_rep Replacement string for null elements
+ * @param separators Strings to use for enclosing list components and separating elements
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New strings column
 */
 std::unique_ptr<column> format_list_column(
   lists_column_view const& input,
-  string_scalar const& na_rep = string_scalar("NULL"),
+  string_scalar const& na_rep = string_scalar(""),
   strings_column_view const& separators = strings_column_view(column_view{
     data_type{type_id::STRING}, 0, nullptr, nullptr, 0}),
+  rmm::cuda_stream_view stream = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */  // end of doxygen group
diff --git a/cpp/include/cudf/strings/convert/convert_urls.hpp b/cpp/include/cudf/strings/convert/convert_urls.hpp
index 7f29a0d2149..902835081af 100644
--- a/cpp/include/cudf/strings/convert/convert_urls.hpp
+++ b/cpp/include/cudf/strings/convert/convert_urls.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -39,12 +39,14 @@ namespace strings {
 *
 * Any null entries will result in corresponding null entries in the output column.
 *
- * @param strings Strings instance for this operation.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New strings column.
+ * @param input Strings instance for this operation
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New strings column
 */
 std::unique_ptr<column> url_encode(
-  strings_column_view const& strings,
+  strings_column_view const& input,
+  rmm::cuda_stream_view stream = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

 /**
@@ -60,12 +62,14 @@ std::unique_ptr<column> url_encode(
 *
 * Any null entries will result in corresponding null entries in the output column.
 *
- * @param strings Strings instance for this operation.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New strings column.
+ * @param input Strings instance for this operation
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New strings column
 */
 std::unique_ptr<column> url_decode(
-  strings_column_view const& strings,
+  strings_column_view const& input,
+  rmm::cuda_stream_view stream = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */  // end of doxygen group
diff --git a/cpp/src/strings/convert/convert_booleans.cu b/cpp/src/strings/convert/convert_booleans.cu
index 0d04fc74b0c..8196e1d90fb 100644
--- a/cpp/src/strings/convert/convert_booleans.cu
+++ b/cpp/src/strings/convert/convert_booleans.cu
@@ -39,25 +39,25 @@ namespace cudf {
 namespace strings {
 namespace detail {
 // Convert strings column to boolean column
-std::unique_ptr<column> to_booleans(strings_column_view const& strings,
+std::unique_ptr<column> to_booleans(strings_column_view const& input,
                                     string_scalar const& true_string,
                                     rmm::cuda_stream_view stream,
                                     rmm::mr::device_memory_resource* mr)
 {
-  size_type strings_count = strings.size();
+  size_type strings_count = input.size();
   if (strings_count == 0) return make_numeric_column(data_type{type_id::BOOL8}, 0);

   CUDF_EXPECTS(true_string.is_valid(stream) && true_string.size() > 0,
                "Parameter true_string must not be empty.");
   auto d_true = string_view(true_string.data(), true_string.size());

-  auto strings_column = column_device_view::create(strings.parent(), stream);
+  auto strings_column = column_device_view::create(input.parent(), stream);
   auto d_strings      = *strings_column;

   // create output column copying the strings' null-mask
   auto results = make_numeric_column(data_type{type_id::BOOL8},
                                      strings_count,
-                                     cudf::detail::copy_bitmask(strings.parent(), stream, mr),
-                                     strings.null_count(),
+                                     cudf::detail::copy_bitmask(input.parent(), stream, mr),
+                                     input.null_count(),
                                      stream,
                                      mr);
   auto results_view = results->mutable_view();
@@ -73,19 +73,20 @@ std::unique_ptr<column> to_booleans(strings_column_view const& strings,
                       result = d_strings.element<string_view>(idx).compare(d_true) == 0;
                       return result;
                     });
-  results->set_null_count(strings.null_count());
+  results->set_null_count(input.null_count());
   return results;
 }

 }  // namespace detail

 // external API
-std::unique_ptr<column> to_booleans(strings_column_view const& strings,
+std::unique_ptr<column> to_booleans(strings_column_view const& input,
                                     string_scalar const& true_string,
+                                    rmm::cuda_stream_view stream,
                                     rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::to_booleans(strings, true_string, cudf::get_default_stream(), mr);
+  return detail::to_booleans(input, true_string, stream, mr);
 }

 namespace detail {
@@ -156,10 +157,11 @@ std::unique_ptr<column> from_booleans(column_view const& booleans,
 std::unique_ptr<column> from_booleans(column_view const& booleans,
                                       string_scalar const& true_string,
                                       string_scalar const& false_string,
+                                      rmm::cuda_stream_view stream,
                                       rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::from_booleans(booleans, true_string, false_string, cudf::get_default_stream(), mr);
+  return detail::from_booleans(booleans, true_string, false_string, stream, mr);
 }

 }  // namespace strings
diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu
index 8a953d778ed..d2609441d72 100644
--- a/cpp/src/strings/convert/convert_datetime.cu
+++ b/cpp/src/strings/convert/convert_datetime.cu
@@ -710,18 +710,20 @@ std::unique_ptr<column> is_timestamp(strings_column_view const& input,
 std::unique_ptr<column> to_timestamps(strings_column_view const& input,
                                       data_type timestamp_type,
                                       std::string_view format,
+                                      rmm::cuda_stream_view stream,
                                       rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::to_timestamps(input, timestamp_type, format, cudf::get_default_stream(), mr);
+  return detail::to_timestamps(input, timestamp_type, format, stream, mr);
 }

 std::unique_ptr<column> is_timestamp(strings_column_view const& input,
                                      std::string_view format,
+                                     rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::is_timestamp(input, format, cudf::get_default_stream(), mr);
+  return detail::is_timestamp(input, format, stream, mr);
 }

 namespace detail {
@@ -1168,10 +1170,11 @@ std::unique_ptr<column> from_timestamps(column_view const& timestamps,
 std::unique_ptr<column> from_timestamps(column_view const& timestamps,
                                         std::string_view format,
                                         strings_column_view const& names,
+                                        rmm::cuda_stream_view stream,
                                         rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::from_timestamps(timestamps, format, names, cudf::get_default_stream(), mr);
+  return detail::from_timestamps(timestamps, format, names, stream, mr);
 }

 }  // namespace strings
diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu
index 6ab70825a6b..e781581b378 100644
--- a/cpp/src/strings/convert/convert_durations.cu
+++ b/cpp/src/strings/convert/convert_durations.cu
@@ -690,30 +690,30 @@ std::unique_ptr<column> from_durations(column_view const& durations,
     durations.type(), dispatch_from_durations_fn{}, durations, format, stream, mr);
 }

-std::unique_ptr<column> to_durations(strings_column_view const& strings,
+std::unique_ptr<column> to_durations(strings_column_view const& input,
                                      data_type duration_type,
                                      std::string_view format,
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr)
 {
-  size_type strings_count = strings.size();
+  size_type strings_count = input.size();
   if (strings_count == 0) return make_duration_column(duration_type, 0);

   CUDF_EXPECTS(!format.empty(), "Format parameter must not be empty.");

-  auto strings_column = column_device_view::create(strings.parent(), stream);
+  auto strings_column = column_device_view::create(input.parent(), stream);
   auto d_column       = *strings_column;

   auto results = make_duration_column(duration_type,
                                       strings_count,
-                                      cudf::detail::copy_bitmask(strings.parent(), stream, mr),
-                                      strings.null_count(),
+                                      cudf::detail::copy_bitmask(input.parent(), stream, mr),
+                                      input.null_count(),
                                       stream,
                                       mr);
   auto results_view = results->mutable_view();
   cudf::type_dispatcher(
     duration_type, dispatch_to_durations_fn(), d_column, format, results_view, stream);
-  results->set_null_count(strings.null_count());
+  results->set_null_count(input.null_count());
   return results;
 }

@@ -721,19 +721,21 @@ std::unique_ptr<column> to_durations(strings_column_view const& strings,
 std::unique_ptr<column> from_durations(column_view const& durations,
                                        std::string_view format,
+                                       rmm::cuda_stream_view stream,
                                        rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::from_durations(durations, format, cudf::get_default_stream(), mr);
+  return detail::from_durations(durations, format, stream, mr);
 }

-std::unique_ptr<column> to_durations(strings_column_view const& strings,
+std::unique_ptr<column> to_durations(strings_column_view const& input,
                                      data_type duration_type,
                                      std::string_view format,
+                                     rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::to_durations(strings, duration_type, format, cudf::get_default_stream(), mr);
+  return detail::to_durations(input, duration_type, format, stream, mr);
 }

 }  // namespace strings
diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu
index 51aab9faeba..2c59f6dcd29 100644
--- a/cpp/src/strings/convert/convert_fixed_point.cu
+++ b/cpp/src/strings/convert/convert_fixed_point.cu
@@ -184,12 +184,13 @@ std::unique_ptr<column> to_fixed_point(strings_column_view const& input,
 }  // namespace detail

 // external API
-std::unique_ptr<column> to_fixed_point(strings_column_view const& strings,
+std::unique_ptr<column> to_fixed_point(strings_column_view const& input,
                                        data_type output_type,
+                                       rmm::cuda_stream_view stream,
                                        rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::to_fixed_point(strings, output_type, cudf::get_default_stream(), mr);
+  return detail::to_fixed_point(input, output_type, stream, mr);
 }

 namespace detail {
@@ -277,10 +278,11 @@ std::unique_ptr<column> from_fixed_point(column_view const& input,
 // external API
 std::unique_ptr<column> from_fixed_point(column_view const& input,
+                                         rmm::cuda_stream_view stream,
                                          rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::from_fixed_point(input, cudf::get_default_stream(), mr);
+  return detail::from_fixed_point(input, stream, mr);
 }

 namespace detail {
@@ -341,10 +343,11 @@ std::unique_ptr<column> is_fixed_point(strings_column_view const& input,
 std::unique_ptr<column> is_fixed_point(strings_column_view const& input,
                                        data_type decimal_type,
+                                       rmm::cuda_stream_view stream,
                                        rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::is_fixed_point(input, decimal_type, cudf::get_default_stream(), mr);
+  return detail::is_fixed_point(input, decimal_type, stream, mr);
 }

 }  // namespace strings
diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu
index 32167589ab4..81d686d690c 100644
--- a/cpp/src/strings/convert/convert_floats.cu
+++ b/cpp/src/strings/convert/convert_floats.cu
@@ -91,26 +91,26 @@ struct dispatch_to_floats_fn {
 }  // namespace

 // This will convert a strings column into any float column type.
-std::unique_ptr<column> to_floats(strings_column_view const& strings,
+std::unique_ptr<column> to_floats(strings_column_view const& input,
                                   data_type output_type,
                                   rmm::cuda_stream_view stream,
                                   rmm::mr::device_memory_resource* mr)
 {
-  size_type strings_count = strings.size();
+  size_type strings_count = input.size();
   if (strings_count == 0) return make_numeric_column(output_type, 0);
-  auto strings_column = column_device_view::create(strings.parent(), stream);
+  auto strings_column = column_device_view::create(input.parent(), stream);
   auto d_strings      = *strings_column;
   // create float output column copying the strings null-mask
   auto results = make_numeric_column(output_type,
                                      strings_count,
-                                     cudf::detail::copy_bitmask(strings.parent(), stream, mr),
-                                     strings.null_count(),
+                                     cudf::detail::copy_bitmask(input.parent(), stream, mr),
+                                     input.null_count(),
                                      stream,
                                      mr);
   auto results_view = results->mutable_view();
   // fill output column with floats
   type_dispatcher(output_type, dispatch_to_floats_fn{}, d_strings, results_view, stream);
-  results->set_null_count(strings.null_count());
+  results->set_null_count(input.null_count());
   return results;
 }

@@ -118,12 +118,13 @@ std::unique_ptr<column> to_floats(strings_column_view const& strings,

 // external API
-std::unique_ptr<column> to_floats(strings_column_view const& strings,
+std::unique_ptr<column> to_floats(strings_column_view const& input,
                                   data_type output_type,
+                                  rmm::cuda_stream_view stream,
                                   rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::to_floats(strings, output_type, cudf::get_default_stream(), mr);
+  return detail::to_floats(input, output_type, stream, mr);
 }

 namespace detail {
@@ -436,48 +437,51 @@ std::unique_ptr<column> from_floats(column_view const& floats,
 }  // namespace detail

 // external API
-std::unique_ptr<column> from_floats(column_view const& floats, rmm::mr::device_memory_resource* mr)
+std::unique_ptr<column> from_floats(column_view const& floats,
+                                    rmm::cuda_stream_view stream,
+                                    rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::from_floats(floats, cudf::get_default_stream(), mr);
+  return detail::from_floats(floats, stream, mr);
 }

 namespace detail {
-std::unique_ptr<column> is_float(strings_column_view const& strings,
+std::unique_ptr<column> is_float(strings_column_view const& input,
                                  rmm::cuda_stream_view stream,
                                  rmm::mr::device_memory_resource* mr)
 {
-  auto strings_column = column_device_view::create(strings.parent(), stream);
+  auto strings_column = column_device_view::create(input.parent(), stream);
   auto d_column       = *strings_column;
   // create output column
   auto results = make_numeric_column(data_type{type_id::BOOL8},
-                                     strings.size(),
-                                     cudf::detail::copy_bitmask(strings.parent(), stream, mr),
-                                     strings.null_count(),
+                                     input.size(),
+                                     cudf::detail::copy_bitmask(input.parent(), stream, mr),
+                                     input.null_count(),
                                      stream,
                                      mr);
   auto d_results = results->mutable_view().data<bool>();
   // check strings for valid float chars
   thrust::transform(rmm::exec_policy(stream),
                     thrust::make_counting_iterator<size_type>(0),
-                    thrust::make_counting_iterator<size_type>(strings.size()),
+                    thrust::make_counting_iterator<size_type>(input.size()),
                     d_results,
                     [d_column] __device__(size_type idx) {
                       if (d_column.is_null(idx)) return false;
                       return is_float(d_column.element<string_view>(idx));
                     });
-  results->set_null_count(strings.null_count());
+  results->set_null_count(input.null_count());
   return results;
 }

 }  // namespace detail

 // external API
-std::unique_ptr<column> is_float(strings_column_view const& strings,
+std::unique_ptr<column> is_float(strings_column_view const& input,
+                                 rmm::cuda_stream_view stream,
                                  rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::is_float(strings, cudf::get_default_stream(), mr);
+  return detail::is_float(input, stream, mr);
 }

 }  // namespace strings
diff --git a/cpp/src/strings/convert/convert_hex.cu b/cpp/src/strings/convert/convert_hex.cu
index f5bdbcbd199..8f656b149a5 100644
--- a/cpp/src/strings/convert/convert_hex.cu
+++ b/cpp/src/strings/convert/convert_hex.cu
@@ -275,24 +275,27 @@ std::unique_ptr<column> integers_to_hex(column_view const& input,
 // external API
 std::unique_ptr<column> hex_to_integers(strings_column_view const& strings,
                                         data_type output_type,
+                                        rmm::cuda_stream_view stream,
                                         rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::hex_to_integers(strings, output_type, cudf::get_default_stream(), mr);
+  return detail::hex_to_integers(strings, output_type, stream, mr);
 }

 std::unique_ptr<column> is_hex(strings_column_view const& strings,
+                               rmm::cuda_stream_view stream,
                                rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::is_hex(strings, cudf::get_default_stream(), mr);
+  return detail::is_hex(strings, stream, mr);
 }

 std::unique_ptr<column> integers_to_hex(column_view const& input,
+                                        rmm::cuda_stream_view stream,
                                         rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::integers_to_hex(input, cudf::get_default_stream(), mr);
+  return detail::integers_to_hex(input, stream, mr);
 }

 }  // namespace strings
diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu
index 2c21fc5d790..4839e83d5dd 100644
--- a/cpp/src/strings/convert/convert_integers.cu
+++ b/cpp/src/strings/convert/convert_integers.cu
@@ -112,20 +112,20 @@ inline __device__ bool is_integer(string_view const& d_str)
 */
 struct dispatch_is_integer_fn {
   template <typename T, std::enable_if_t<cudf::is_integral_not_bool<T>()>* = nullptr>
-  std::unique_ptr<column> operator()(strings_column_view const& strings,
+  std::unique_ptr<column> operator()(strings_column_view const& input,
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr) const
   {
-    auto const d_column = column_device_view::create(strings.parent(), stream);
+    auto const d_column = column_device_view::create(input.parent(), stream);
     auto results        = make_numeric_column(data_type{type_id::BOOL8},
-                                       strings.size(),
-                                       cudf::detail::copy_bitmask(strings.parent(), stream, mr),
-                                       strings.null_count(),
+                                       input.size(),
+                                       cudf::detail::copy_bitmask(input.parent(), stream, mr),
+                                       input.null_count(),
                                        stream,
                                        mr);

     auto d_results = results->mutable_view().data<bool>();
-    if (strings.has_nulls()) {
+    if (input.has_nulls()) {
       thrust::transform(rmm::exec_policy(stream),
                         d_column->pair_begin<string_view, true>(),
                         d_column->pair_end<string_view, true>(),
@@ -140,7 +140,7 @@ struct dispatch_is_integer_fn {
     }

     // Calling mutable_view() on a column invalidates it's null count so we need to set it back
-    results->set_null_count(strings.null_count());
+    results->set_null_count(input.null_count());

     return results;
   }
@@ -156,20 +156,20 @@ struct dispatch_is_integer_fn {

 }  // namespace

-std::unique_ptr<column> is_integer(strings_column_view const& strings,
+std::unique_ptr<column> is_integer(strings_column_view const& input,
                                    rmm::cuda_stream_view stream,
                                    rmm::mr::device_memory_resource* mr)
 {
-  auto const d_column = column_device_view::create(strings.parent(), stream);
+  auto const d_column = column_device_view::create(input.parent(), stream);
   auto results        = make_numeric_column(data_type{type_id::BOOL8},
-                                     strings.size(),
-                                     cudf::detail::copy_bitmask(strings.parent(), stream, mr),
-                                     strings.null_count(),
+                                     input.size(),
+                                     cudf::detail::copy_bitmask(input.parent(), stream, mr),
+                                     input.null_count(),
                                      stream,
                                      mr);

   auto d_results = results->mutable_view().data<bool>();
-  if (strings.has_nulls()) {
+  if (input.has_nulls()) {
     thrust::transform(
       rmm::exec_policy(stream),
       d_column->pair_begin<string_view, true>(),
@@ -185,36 +185,38 @@ std::unique_ptr<column> is_integer(strings_column_view const& strings,
   }

   // Calling mutable_view() on a column invalidates it's null count so we need to set it back
-  results->set_null_count(strings.null_count());
+  results->set_null_count(input.null_count());

   return results;
 }

-std::unique_ptr<column> is_integer(strings_column_view const& strings,
+std::unique_ptr<column> is_integer(strings_column_view const& input,
                                    data_type int_type,
                                    rmm::cuda_stream_view stream,
                                    rmm::mr::device_memory_resource* mr)
 {
-  if (strings.is_empty()) { return cudf::make_empty_column(type_id::BOOL8); }
-  return type_dispatcher(int_type, dispatch_is_integer_fn{}, strings, stream, mr);
+  if (input.is_empty()) { return cudf::make_empty_column(type_id::BOOL8); }
+  return type_dispatcher(int_type, dispatch_is_integer_fn{}, input, stream, mr);
 }

 }  // namespace detail

 // external APIs
-std::unique_ptr<column> is_integer(strings_column_view const& strings,
+std::unique_ptr<column> is_integer(strings_column_view const& input,
+                                   rmm::cuda_stream_view stream,
                                    rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::is_integer(strings, cudf::get_default_stream(), mr);
+  return detail::is_integer(input, stream, mr);
 }

-std::unique_ptr<column> is_integer(strings_column_view const& strings,
+std::unique_ptr<column> is_integer(strings_column_view const& input,
                                    data_type int_type,
+                                   rmm::cuda_stream_view stream,
                                    rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::is_integer(strings, int_type, cudf::get_default_stream(), mr);
+  return detail::is_integer(input, int_type, stream, mr);
 }

 namespace detail {
@@ -266,28 +268,28 @@ struct dispatch_to_integers_fn {
 }  // namespace

 // This will convert a strings column into any integer column type.
-std::unique_ptr<column> to_integers(strings_column_view const& strings,
+std::unique_ptr<column> to_integers(strings_column_view const& input,
                                     data_type output_type,
                                     rmm::cuda_stream_view stream,
                                     rmm::mr::device_memory_resource* mr)
 {
-  size_type strings_count = strings.size();
+  size_type strings_count = input.size();
   if (strings_count == 0) return make_numeric_column(output_type, 0);

   // Create integer output column copying the strings null-mask
   auto results = make_numeric_column(output_type,
                                      strings_count,
-                                     cudf::detail::copy_bitmask(strings.parent(), stream, mr),
-                                     strings.null_count(),
+                                     cudf::detail::copy_bitmask(input.parent(), stream, mr),
+                                     input.null_count(),
                                      stream,
                                      mr);

   // Fill output column with integers
-  auto const strings_dev_view = column_device_view::create(strings.parent(), stream);
+  auto const strings_dev_view = column_device_view::create(input.parent(), stream);
   auto results_view           = results->mutable_view();
   type_dispatcher(output_type, dispatch_to_integers_fn{}, *strings_dev_view, results_view, stream);

   // Calling mutable_view() on a column invalidates it's null count so we need to set it back
-  results->set_null_count(strings.null_count());
+  results->set_null_count(input.null_count());

   return results;
 }

@@ -295,12 +297,13 @@ std::unique_ptr<column> to_integers(strings_column_view const& strings,
 }  // namespace detail

 // external API
-std::unique_ptr<column> to_integers(strings_column_view const& strings,
+std::unique_ptr<column> to_integers(strings_column_view const& input,
                                     data_type output_type,
+                                    rmm::cuda_stream_view stream,
                                     rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::to_integers(strings, output_type, cudf::get_default_stream(), mr);
+  return detail::to_integers(input, output_type, stream, mr);
 }

 namespace detail {
@@ -393,10 +396,11 @@ std::unique_ptr<column> from_integers(column_view const& integers,
 // external API
 std::unique_ptr<column> from_integers(column_view const& integers,
+                                      rmm::cuda_stream_view stream,
                                       rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::from_integers(integers, cudf::get_default_stream(), mr);
+  return detail::from_integers(integers, stream, mr);
 }

 }  // namespace strings
diff --git a/cpp/src/strings/convert/convert_ipv4.cu b/cpp/src/strings/convert/convert_ipv4.cu
index adb72cb0263..07e4b3e5b17 100644
--- a/cpp/src/strings/convert/convert_ipv4.cu
+++ b/cpp/src/strings/convert/convert_ipv4.cu
@@ -72,19 +72,19 @@ struct ipv4_to_integers_fn {
 }  // namespace

 // Convert strings column of IPv4 addresses to integers column
-std::unique_ptr<column> ipv4_to_integers(strings_column_view const& strings,
+std::unique_ptr<column> ipv4_to_integers(strings_column_view const& input,
                                          rmm::cuda_stream_view stream,
                                          rmm::mr::device_memory_resource* mr)
 {
-  size_type strings_count = strings.size();
+  size_type strings_count = input.size();
   if (strings_count == 0) return make_numeric_column(data_type{type_id::INT64}, 0);

-  auto strings_column = column_device_view::create(strings.parent(), stream);
+  auto strings_column = column_device_view::create(input.parent(), stream);

   // create output column copying the strings' null-mask
   auto results = make_numeric_column(data_type{type_id::INT64},
                                      strings_count,
-                                     cudf::detail::copy_bitmask(strings.parent(), stream, mr),
-                                     strings.null_count(),
+                                     cudf::detail::copy_bitmask(input.parent(), stream, mr),
+                                     input.null_count(),
                                      stream,
                                      mr);
   auto d_results = results->mutable_view().data<int64_t>();
@@ -95,18 +95,19 @@ std::unique_ptr<column> ipv4_to_integers(strings_column_view const& strings,
                     d_results,
                     ipv4_to_integers_fn{*strings_column});
   // done
-  results->set_null_count(strings.null_count());
+  results->set_null_count(input.null_count());
   return results;
 }

 }  // namespace detail

 // external API
-std::unique_ptr<column> ipv4_to_integers(strings_column_view const& strings,
+std::unique_ptr<column> ipv4_to_integers(strings_column_view const& input,
+                                         rmm::cuda_stream_view stream,
                                          rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::ipv4_to_integers(strings, cudf::get_default_stream(), mr);
+  return detail::ipv4_to_integers(input, stream, mr);
 }

 namespace detail {
@@ -173,23 +174,23 @@ std::unique_ptr<column> integers_to_ipv4(column_view const& integers,
                                 cudf::detail::copy_bitmask(integers, stream, mr));
 }

-std::unique_ptr<column> is_ipv4(strings_column_view const& strings,
+std::unique_ptr<column> is_ipv4(strings_column_view const& input,
                                 rmm::cuda_stream_view stream,
                                 rmm::mr::device_memory_resource* mr)
 {
-  auto strings_column = column_device_view::create(strings.parent(), stream);
+  auto strings_column = column_device_view::create(input.parent(), stream);
   auto d_column       = *strings_column;
   // create output column
   auto results = make_numeric_column(data_type{type_id::BOOL8},
-                                     strings.size(),
-                                     cudf::detail::copy_bitmask(strings.parent(), stream, mr),
-                                     strings.null_count(),
+                                     input.size(),
+                                     cudf::detail::copy_bitmask(input.parent(), stream, mr),
+                                     input.null_count(),
                                      stream,
                                      mr);
   auto d_results = results->mutable_view().data<bool>();
   thrust::transform(rmm::exec_policy(stream),
                     thrust::make_counting_iterator<size_type>(0),
-                    thrust::make_counting_iterator<size_type>(strings.size()),
+                    thrust::make_counting_iterator<size_type>(input.size()),
                     d_results,
                     [d_column] __device__(size_type idx) {
                       if (d_column.is_null(idx)) return false;
@@ -214,7 +215,7 @@ std::unique_ptr<column> is_ipv4(strings_column_view const& strings,
                       return ip_vals[0] >= 0 && ip_vals[1] >= 0 && ip_vals[2] >= 0 &&
                              ip_vals[3] >= 0;
                     });
-  results->set_null_count(strings.null_count());
+  results->set_null_count(input.null_count());
   return results;
 }

@@ -223,17 +224,19 @@ std::unique_ptr<column> is_ipv4(strings_column_view const& strings,
 // external API
 std::unique_ptr<column> integers_to_ipv4(column_view const& integers,
+                                         rmm::cuda_stream_view stream,
                                          rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::integers_to_ipv4(integers, cudf::get_default_stream(), mr);
+  return detail::integers_to_ipv4(integers, stream, mr);
 }

-std::unique_ptr<column> is_ipv4(strings_column_view const& strings,
+std::unique_ptr<column> is_ipv4(strings_column_view const& input,
+                                rmm::cuda_stream_view stream,
                                 rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::is_ipv4(strings, cudf::get_default_stream(), mr);
+  return detail::is_ipv4(input, stream, mr);
 }

 }  // namespace strings
diff --git a/cpp/src/strings/convert/convert_lists.cu b/cpp/src/strings/convert/convert_lists.cu
index 3aef37914fd..f9f2b91eb12 100644
--- a/cpp/src/strings/convert/convert_lists.cu
+++ b/cpp/src/strings/convert/convert_lists.cu
@@ -233,10 +233,11 @@ std::unique_ptr<column> format_list_column(lists_column_view const& input,
 std::unique_ptr<column> format_list_column(lists_column_view const& input,
                                            string_scalar const& na_rep,
                                            strings_column_view const& separators,
+                                           rmm::cuda_stream_view stream,
                                            rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::format_list_column(input, na_rep, separators, cudf::get_default_stream(), mr);
+  return detail::format_list_column(input, na_rep, separators, stream, mr);
 }

 }  // namespace strings
diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu
index 9efa148cfd2..9e847131be2 100644
--- a/cpp/src/strings/convert/convert_urls.cu
+++ b/cpp/src/strings/convert/convert_urls.cu
@@ -148,11 +148,12 @@ std::unique_ptr<column> url_encode(strings_column_view const& input,
 }  // namespace detail

 // external API
-std::unique_ptr<column> url_encode(strings_column_view const& strings,
+std::unique_ptr<column> url_encode(strings_column_view const& input,
+                                   rmm::cuda_stream_view stream,
                                    rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::url_encode(strings, cudf::get_default_stream(), mr);
+  return detail::url_encode(input, stream, mr);
 }

 namespace detail {
@@ -428,11 +429,12 @@ std::unique_ptr<column> url_decode(strings_column_view const& strings,

 // external API
-std::unique_ptr<column> url_decode(strings_column_view const& strings,
+std::unique_ptr<column> url_decode(strings_column_view const& input,
+                                   rmm::cuda_stream_view stream,
                                    rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::url_decode(strings, cudf::get_default_stream(), mr);
+  return detail::url_decode(input, stream, mr);
 }

 }  // namespace strings
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index f36fcbc9246..3e30db7abcb 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -635,6 +635,7 @@ ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE tes
 ConfigureTest(
   STREAM_STRINGS_TEST
   streams/strings/case_test.cpp
+  streams/strings/convert_test.cpp
   streams/strings/find_test.cpp
   streams/strings/replace_test.cpp
   streams/strings/split_test.cpp
diff --git a/cpp/tests/streams/strings/convert_test.cpp b/cpp/tests/streams/strings/convert_test.cpp
new file mode 100644
index 00000000000..8dc3f625746
--- /dev/null
+++ b/cpp/tests/streams/strings/convert_test.cpp
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+
+#include <cudf/strings/convert/convert_booleans.hpp>
+#include <cudf/strings/convert/convert_datetime.hpp>
+#include <cudf/strings/convert/convert_durations.hpp>
+#include <cudf/strings/convert/convert_fixed_point.hpp>
+#include <cudf/strings/convert/convert_floats.hpp>
+#include <cudf/strings/convert/convert_integers.hpp>
+#include <cudf/strings/convert/convert_ipv4.hpp>
+#include <cudf/strings/convert/convert_lists.hpp>
+#include <cudf/strings/convert/convert_urls.hpp>
+
+#include <string>
+
+class StringsConvertTest : public cudf::test::BaseFixture {};
+
+TEST_F(StringsConvertTest, Booleans)
+{
+  auto input = cudf::test::strings_column_wrapper({"true", "false", "True", ""});
+  auto view  = cudf::strings_column_view(input);
+
+  auto true_scalar  = cudf::string_scalar("true", true, cudf::test::get_default_stream());
+  auto false_scalar = cudf::string_scalar("false", true, cudf::test::get_default_stream());
+
+  auto bools = cudf::strings::to_booleans(view, true_scalar, cudf::test::get_default_stream());
+  cudf::strings::from_booleans(
+    bools->view(), true_scalar, false_scalar, cudf::test::get_default_stream());
+}
+
+TEST_F(StringsConvertTest, Timestamps)
+{
+  auto input = cudf::test::strings_column_wrapper({"2019-03-20T12:34:56Z", "2020-02-29T00:00:00Z"});
+  auto view  = cudf::strings_column_view(input);
+
+  std::string format = "%Y-%m-%dT%H:%M:%SZ";
+  auto dtype         = cudf::data_type{cudf::type_id::TIMESTAMP_SECONDS};
+
+  cudf::strings::is_timestamp(view, format, cudf::test::get_default_stream());
+  auto timestamps =
+    cudf::strings::to_timestamps(view, dtype, format, cudf::test::get_default_stream());
+
+  auto empty = cudf::test::strings_column_wrapper();
+  cudf::strings::from_timestamps(
+    timestamps->view(), format, cudf::strings_column_view(empty), cudf::test::get_default_stream());
+}
+
+TEST_F(StringsConvertTest, Durations)
+{
+  auto input = cudf::test::strings_column_wrapper({"17975 days 12:34:56", "18321 days 00:00:00"});
+  auto view  = cudf::strings_column_view(input);
+
+  std::string format = "%D days %H:%M:%S";
+  auto dtype         = cudf::data_type{cudf::type_id::DURATION_SECONDS};
+
+  auto durations =
+    cudf::strings::to_durations(view, dtype, format, cudf::test::get_default_stream());
+  cudf::strings::from_durations(durations->view(), format, cudf::test::get_default_stream());
+}
+
+TEST_F(StringsConvertTest, FixedPoint)
+{
+  auto input = cudf::test::strings_column_wrapper({"1.234E3", "-876", "543.2"});
+  auto view  = cudf::strings_column_view(input);
+
+  auto dtype = cudf::data_type{cudf::type_id::DECIMAL64, numeric::scale_type{-3}};
+
+  auto values = cudf::strings::to_fixed_point(view, dtype, cudf::test::get_default_stream());
+  cudf::strings::from_fixed_point(values->view(), cudf::test::get_default_stream());
+}
+
+TEST_F(StringsConvertTest, Floats)
+{
+  auto input = cudf::test::strings_column_wrapper({"1.234E3", "-876", "543.2"});
+  auto view  = cudf::strings_column_view(input);
+
+  auto dtype = cudf::data_type{cudf::type_id::FLOAT32};
+
+  auto values = cudf::strings::to_floats(view, dtype, cudf::test::get_default_stream());
+  cudf::strings::from_floats(values->view(), cudf::test::get_default_stream());
+  cudf::strings::is_float(view, cudf::test::get_default_stream());
+}
+
+TEST_F(StringsConvertTest, Integers)
+{
+  auto input = cudf::test::strings_column_wrapper({"1234", "-876", "5432"});
+  auto view  = cudf::strings_column_view(input);
+
+  auto dtype = cudf::data_type{cudf::type_id::INT32};
+
+  auto values = cudf::strings::to_integers(view, dtype, cudf::test::get_default_stream());
+  cudf::strings::from_integers(values->view(), cudf::test::get_default_stream());
+  cudf::strings::is_integer(view, cudf::test::get_default_stream());
+  cudf::strings::is_hex(view, cudf::test::get_default_stream());
+  cudf::strings::hex_to_integers(view, dtype, cudf::test::get_default_stream());
+  cudf::strings::integers_to_hex(values->view(), cudf::test::get_default_stream());
+}
+
+TEST_F(StringsConvertTest, IPv4)
+{
+  auto input = cudf::test::strings_column_wrapper({"192.168.0.1", "10.0.0.1"});
+  auto view  = cudf::strings_column_view(input);
+
+  auto values = cudf::strings::ipv4_to_integers(view, cudf::test::get_default_stream());
+  cudf::strings::integers_to_ipv4(values->view(), cudf::test::get_default_stream());
+  cudf::strings::is_ipv4(view, cudf::test::get_default_stream());
+}
+
+TEST_F(StringsConvertTest, URLs)
+{
+  auto input = cudf::test::strings_column_wrapper({"www.nvidia.com/rapids?p=é", "/_file-7.txt"});
+  auto view  = cudf::strings_column_view(input);
+
+  auto values = cudf::strings::url_encode(view, cudf::test::get_default_stream());
+  cudf::strings::url_decode(values->view(), cudf::test::get_default_stream());
+}
+
+TEST_F(StringsConvertTest, ListsFormat)
+{
+  using STR_LISTS = cudf::test::lists_column_wrapper<cudf::string_view>;
+  auto const input =
+    STR_LISTS{{STR_LISTS{"a", "bb", "ccc"}, STR_LISTS{}, STR_LISTS{"ddd", "ee", "f"}},
+              {STR_LISTS{"gg", "hhh"}, STR_LISTS{"i", "", "", "jj"}}};
+  auto view        = cudf::lists_column_view(input);
+  auto null_scalar = cudf::string_scalar("NULL", true, cudf::test::get_default_stream());
+  auto separators  = cudf::strings_column_view(cudf::test::strings_column_wrapper());
+  cudf::strings::format_list_column(
+    view, null_scalar, separators, cudf::test::get_default_stream());
+}
diff --git a/cpp/tests/strings/booleans_tests.cpp b/cpp/tests/strings/booleans_tests.cpp
index 0c7fc992065..469ca77a4c5 100644
--- a/cpp/tests/strings/booleans_tests.cpp
+++ b/cpp/tests/strings/booleans_tests.cpp
@@ -36,7 +36,8 @@ TEST_F(StringsConvertTest, ToBooleans)
     thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));

   auto strings_view = cudf::strings_column_view(strings);
-  auto results      = cudf::strings::to_booleans(strings_view);
+  auto true_scalar  = cudf::string_scalar("true");
+  auto results      = cudf::strings::to_booleans(strings_view, true_scalar);

   std::vector<bool> h_expected{false, false, false, true, false, false};
   cudf::test::fixed_width_column_wrapper<bool> expected(
@@ -60,26 +61,46 @@ TEST_F(StringsConvertTest, FromBooleans)
     h_column.end(),
     thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));

-  auto results = cudf::strings::from_booleans(column);
+  auto true_scalar  = cudf::string_scalar("true");
+  auto false_scalar = cudf::string_scalar("false");
+  auto results      = cudf::strings::from_booleans(column, true_scalar, false_scalar);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings);
 }

 TEST_F(StringsConvertTest, ZeroSizeStringsColumnBoolean)
 {
   auto const zero_size_column = cudf::make_empty_column(cudf::type_id::BOOL8)->view();
-  auto results                = cudf::strings::from_booleans(zero_size_column);
+  auto true_scalar            = cudf::string_scalar("true");
+  auto false_scalar           = cudf::string_scalar("false");
+  auto results = cudf::strings::from_booleans(zero_size_column, true_scalar, false_scalar);
   cudf::test::expect_column_empty(results->view());
 }

 TEST_F(StringsConvertTest, ZeroSizeBooleansColumn)
 {
   auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view();
-  auto results = cudf::strings::to_booleans(zero_size_strings_column);
+  auto true_scalar = cudf::string_scalar("true");
+  auto results     = cudf::strings::to_booleans(zero_size_strings_column, true_scalar);
   EXPECT_EQ(0, results->size());
 }

 TEST_F(StringsConvertTest, BooleanError)
 {
-  auto column = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32}, 100);
-  EXPECT_THROW(cudf::strings::from_booleans(column->view()), cudf::logic_error);
+  auto int_column   = cudf::test::fixed_width_column_wrapper<int32_t>({1, 2, 3});
+  auto true_scalar  = cudf::string_scalar("true");
+  auto false_scalar = cudf::string_scalar("false");
+  EXPECT_THROW(cudf::strings::from_booleans(int_column, true_scalar, false_scalar),
+               cudf::logic_error);
+
+  auto bool_column = cudf::test::fixed_width_column_wrapper<bool>({1, 0, 1});
+  auto null_scalar = cudf::string_scalar("", false);
+  EXPECT_THROW(cudf::strings::from_booleans(bool_column, null_scalar, false_scalar),
+               cudf::logic_error);
+  EXPECT_THROW(cudf::strings::from_booleans(bool_column, true_scalar, null_scalar),
+               cudf::logic_error);
+  auto empty_scalar = cudf::string_scalar("", true);
+  EXPECT_THROW(cudf::strings::from_booleans(int_column, empty_scalar, false_scalar),
+               cudf::logic_error);
+  EXPECT_THROW(cudf::strings::from_booleans(int_column, true_scalar, empty_scalar),
+               cudf::logic_error);
 }
diff --git a/cpp/tests/strings/format_lists_tests.cpp b/cpp/tests/strings/format_lists_tests.cpp
index 95dc9725afc..6196b8ed6ad 100644
--- a/cpp/tests/strings/format_lists_tests.cpp
+++ b/cpp/tests/strings/format_lists_tests.cpp
@@ -60,8 +60,9 @@ TEST_F(StringsFormatListsTest, WithNulls)
     cudf::test::iterators::null_at(1)};
   auto const view = cudf::lists_column_view(input);

-  auto results  = cudf::strings::format_list_column(view);
-  auto expected = cudf::test::strings_column_wrapper(
+  auto null_scalar = cudf::string_scalar("NULL");
+  auto results     = cudf::strings::format_list_column(view, null_scalar);
+  auto expected    = cudf::test::strings_column_wrapper(
     {"[a,NULL,ccc]", "NULL", "[NULL,bb,ddd]", "[zzz,xxxxx]", "[v,,NULL,w]"});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 }
@@ -132,11 +133,13 @@ TEST_F(StringsFormatListsTest, SlicedLists)
      "[ééé,12345abcdef]",
      "[www,12345]"});

+  auto null_scalar = cudf::string_scalar("NULL");
+
   // set of slice intervals: covers slicing the front, back, and middle
   std::vector<std::pair<cudf::size_type, cudf::size_type>> index_pairs(
     {{0, 11}, {0, 4}, {3, 8}, {5, 11}});
   for (auto indexes : index_pairs) {
     auto sliced  = cudf::lists_column_view(cudf::slice(input, {indexes.first, indexes.second})[0]);
-    auto results = cudf::strings::format_list_column(sliced);
+    auto results = cudf::strings::format_list_column(sliced, null_scalar);
     auto expected = cudf::test::strings_column_wrapper(h_expected.begin() + indexes.first,
                                                        h_expected.begin() + indexes.second);
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
index 0ddaa2c15b5..462f0d8eac9 100644
--- a/java/src/main/native/src/ColumnViewJni.cpp
+++ b/java/src/main/native/src/ColumnViewJni.cpp
@@ -1130,7 +1130,11 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_castTo(JNIEnv *env, jclas
   }
   if (n_data_type.id() == cudf::type_id::STRING) {
     switch (column->type().id()) {
-      case cudf::type_id::BOOL8: return release_as_jlong(cudf::strings::from_booleans(*column));
+      case cudf::type_id::BOOL8: {
+        auto const true_scalar = cudf::string_scalar("true");
+        auto const false_scalar = cudf::string_scalar("false");
+        return release_as_jlong(cudf::strings::from_booleans(*column, true_scalar, false_scalar));
+      }
       case cudf::type_id::FLOAT32:
       case cudf::type_id::FLOAT64: return release_as_jlong(cudf::strings::from_floats(*column));
       case cudf::type_id::INT8:
@@ -1149,7 +1153,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_castTo(JNIEnv *env, jclas
     }
   } else if (column->type().id() == cudf::type_id::STRING) {
     switch (n_data_type.id()) {
-      case cudf::type_id::BOOL8: return release_as_jlong(cudf::strings::to_booleans(*column));
+      case cudf::type_id::BOOL8: {
+        auto const true_scalar = cudf::string_scalar("true");
+        return release_as_jlong(cudf::strings::to_booleans(*column, true_scalar));
+      }
       case cudf::type_id::FLOAT32:
       case cudf::type_id::FLOAT64:
         return release_as_jlong(cudf::strings::to_floats(*column, n_data_type));

From 655f3a4659653e95b9f12ed924c7e887be41c5d4 Mon Sep 17 00:00:00 2001
From: Robert Maynard
Date: Mon, 16 Oct 2023 12:46:48 -0400
Subject: [PATCH 027/118] Update rapids-cmake functions to non-deprecated
 signatures (#14265)

Update to use non deprecated signatures for `rapids_export` functions

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/14265
---
 cpp/cmake/thirdparty/get_arrow.cmake      | 18 +++++++++++++-----
 cpp/cmake/thirdparty/get_cufile.cmake     |  4 ++--
 cpp/cmake/thirdparty/get_gtest.cmake      |  4 ++--
 cpp/cmake/thirdparty/get_kvikio.cmake     | 12 +++++++-----
 cpp/cmake/thirdparty/get_libcudacxx.cmake | 18 ++++++++----------
 cpp/cmake/thirdparty/get_spdlog.cmake     |  4 +++-
 cpp/cmake/thirdparty/get_thrust.cmake     | 15 +++++++--------
 7 files changed, 42 insertions(+), 33 deletions(-)

diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake
index 894dc9649e2..10d3145a36f 100644
--- a/cpp/cmake/thirdparty/get_arrow.cmake
+++ b/cpp/cmake/thirdparty/get_arrow.cmake
@@ -387,11 +387,19 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
   endif()

   include("${rapids-cmake-dir}/export/find_package_root.cmake")
-  rapids_export_find_package_root(BUILD Arrow [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports)
-  if(ENABLE_PARQUET)
-    rapids_export_find_package_root(BUILD Parquet [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports)
-    rapids_export_find_package_root(BUILD ArrowDataset [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports)
-  endif()
+  rapids_export_find_package_root(
+    BUILD Arrow [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cudf-exports
+  )
+  rapids_export_find_package_root(
+    BUILD Parquet [=[${CMAKE_CURRENT_LIST_DIR}]=]
+    EXPORT_SET cudf-exports
+    CONDITION ENABLE_PARQUET
+  )
+  rapids_export_find_package_root(
+    BUILD ArrowDataset [=[${CMAKE_CURRENT_LIST_DIR}]=]
+    EXPORT_SET cudf-exports
+    CONDITION ENABLE_PARQUET
+  )

   set(ARROW_LIBRARIES
       "${ARROW_LIBRARIES}"
diff --git a/cpp/cmake/thirdparty/get_cufile.cmake b/cpp/cmake/thirdparty/get_cufile.cmake
index c0235eba508..bfdff3a99ff 100644
--- a/cpp/cmake/thirdparty/get_cufile.cmake
+++ b/cpp/cmake/thirdparty/get_cufile.cmake
@@ -21,10 +21,10 @@ function(find_and_configure_cufile)
   if(cuFile_FOUND AND NOT BUILD_SHARED_LIBS)
     include("${rapids-cmake-dir}/export/find_package_file.cmake")
     rapids_export_find_package_file(
-      BUILD "${CUDF_SOURCE_DIR}/cmake/Modules/FindcuFile.cmake" cudf-exports
+      BUILD "${CUDF_SOURCE_DIR}/cmake/Modules/FindcuFile.cmake" EXPORT_SET cudf-exports
     )
     rapids_export_find_package_file(
-      INSTALL "${CUDF_SOURCE_DIR}/cmake/Modules/FindcuFile.cmake" cudf-exports
+      INSTALL "${CUDF_SOURCE_DIR}/cmake/Modules/FindcuFile.cmake" EXPORT_SET cudf-exports
     )
   endif()
 endfunction()
diff --git a/cpp/cmake/thirdparty/get_gtest.cmake b/cpp/cmake/thirdparty/get_gtest.cmake
index 1363f43fae2..cfb219448f1 100644
--- a/cpp/cmake/thirdparty/get_gtest.cmake
+++ b/cpp/cmake/thirdparty/get_gtest.cmake
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -30,7 +30,7 @@ function(find_and_configure_gtest)

     include("${rapids-cmake-dir}/export/find_package_root.cmake")
     rapids_export_find_package_root(
-      BUILD GTest [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-testing-exports
+      BUILD GTest [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cudf-testing-exports
     )
   endif()

diff --git a/cpp/cmake/thirdparty/get_kvikio.cmake b/cpp/cmake/thirdparty/get_kvikio.cmake
index e94e024d6c9..20712beec41 100644
--- a/cpp/cmake/thirdparty/get_kvikio.cmake
+++ b/cpp/cmake/thirdparty/get_kvikio.cmake
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -25,10 +25,12 @@ function(find_and_configure_kvikio VERSION)
     OPTIONS "KvikIO_BUILD_EXAMPLES OFF"
   )

-  if(KvikIO_BINARY_DIR)
-    include("${rapids-cmake-dir}/export/find_package_root.cmake")
-    rapids_export_find_package_root(BUILD KvikIO "${KvikIO_BINARY_DIR}" cudf-exports)
-  endif()
+  include("${rapids-cmake-dir}/export/find_package_root.cmake")
+  rapids_export_find_package_root(
+    BUILD KvikIO "${KvikIO_BINARY_DIR}"
+    EXPORT_SET cudf-exports
+    CONDITION KvikIO_BINARY_DIR
+  )

 endfunction()

diff --git a/cpp/cmake/thirdparty/get_libcudacxx.cmake b/cpp/cmake/thirdparty/get_libcudacxx.cmake
index 0e03352c335..285d66287f3 100644
--- a/cpp/cmake/thirdparty/get_libcudacxx.cmake
+++ b/cpp/cmake/thirdparty/get_libcudacxx.cmake
@@ -22,16 +22,14 @@ function(find_and_configure_libcudacxx)
   include(${rapids-cmake-dir}/cpm/libcudacxx.cmake)

   rapids_cpm_libcudacxx(BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports)

-  if(libcudacxx_SOURCE_DIR)
-    # Store where CMake can find our custom Thrust install
-    include("${rapids-cmake-dir}/export/find_package_root.cmake")
-    rapids_export_find_package_root(
-      INSTALL
-      libcudacxx
-      [=[${CMAKE_CURRENT_LIST_DIR}/../../../include/libcudf/lib/rapids/cmake/libcudacxx]=]
-      cudf-exports
-    )
-  endif()
+  # Store where CMake can find our custom Thrust install
+  include("${rapids-cmake-dir}/export/find_package_root.cmake")
+  rapids_export_find_package_root(
+    INSTALL libcudacxx
+    [=[${CMAKE_CURRENT_LIST_DIR}/../../../include/libcudf/lib/rapids/cmake/libcudacxx]=]
+    EXPORT_SET cudf-exports
+    CONDITION libcudacxx_SOURCE_DIR
+  )
 endfunction()

 find_and_configure_libcudacxx()
diff --git a/cpp/cmake/thirdparty/get_spdlog.cmake b/cpp/cmake/thirdparty/get_spdlog.cmake
index fff5b84af0d..c0e07d02d94 100644
--- a/cpp/cmake/thirdparty/get_spdlog.cmake
+++ b/cpp/cmake/thirdparty/get_spdlog.cmake
@@ -27,7 +27,9 @@ function(find_and_configure_spdlog)
                       NAMESPACE spdlog::
     )
     include("${rapids-cmake-dir}/export/find_package_root.cmake")
-    rapids_export_find_package_root(BUILD spdlog [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports)
+    rapids_export_find_package_root(
+      BUILD spdlog [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cudf-exports
+    )
   endif()
 endfunction()

diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_thrust.cmake
index 39a9de15fa6..67ed4287d7b 100644
--- a/cpp/cmake/thirdparty/get_thrust.cmake
+++ b/cpp/cmake/thirdparty/get_thrust.cmake
@@ -33,14 +33,13 @@ function(find_and_configure_thrust)
     INSTALL_EXPORT_SET cudf-exports
   )

-  if(Thrust_SOURCE_DIR)
-    # Store where CMake can find our custom Thrust install
-    include("${rapids-cmake-dir}/export/find_package_root.cmake")
-    rapids_export_find_package_root(
-      INSTALL Thrust
-      [=[${CMAKE_CURRENT_LIST_DIR}/../../../include/libcudf/lib/rapids/cmake/thrust]=] cudf-exports
-    )
-  endif()
+  # Store where CMake can find our custom Thrust install
+  include("${rapids-cmake-dir}/export/find_package_root.cmake")
+  rapids_export_find_package_root(
+    INSTALL Thrust [=[${CMAKE_CURRENT_LIST_DIR}/../../../include/libcudf/lib/rapids/cmake/thrust]=]
+    EXPORT_SET cudf-exports
+    CONDITION Thrust_SOURCE_DIR
+  )
 endfunction()

 find_and_configure_thrust()

From ef92310155eb663859a47b73b6e8fc119d0bebc6 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 16 Oct 2023 15:04:06 -0400
Subject: [PATCH 028/118] Fix memset error in nvtext::edit_distance_matrix
 (#14283)

Fixes a bug in `nvtext::edit_distance_matrix` where the internal offsets vector
is initialized to 0 using the wrong element size. This error was introduced in #13912.
The bug was found while working on a different PR which re-ordered the nvtext gtests
execution, causing device memory to be reused from the rmm pool in a different way.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Mark Harris (https://github.com/harrism)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/14283
---
 cpp/src/text/edit_distance.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/text/edit_distance.cu b/cpp/src/text/edit_distance.cu
index 1460be4fcf5..3d5f2d72e6f 100644
--- a/cpp/src/text/edit_distance.cu
+++ b/cpp/src/text/edit_distance.cu
@@ -224,7 +224,7 @@ std::unique_ptr<cudf::column> edit_distance_matrix(cudf::strings_column_view con
   cudf::size_type n_upper = (strings_count * (strings_count - 1)) / 2;
   rmm::device_uvector<std::ptrdiff_t> offsets(n_upper, stream);
   auto d_offsets = offsets.data();
-  CUDF_CUDA_TRY(cudaMemsetAsync(d_offsets, 0, n_upper * sizeof(cudf::size_type), stream.value()));
+  CUDF_CUDA_TRY(cudaMemsetAsync(d_offsets, 0, n_upper * sizeof(std::ptrdiff_t), stream.value()));
   thrust::for_each_n(
     rmm::exec_policy(stream),
     thrust::make_counting_iterator<cudf::size_type>(0),

From c47546ed1907a47d22acefcf7e854ef8788d0d4c Mon Sep 17 00:00:00 2001
From: Suraj Aralihalli
Date: Mon, 16 Oct 2023 16:12:17 -0700
Subject: [PATCH 029/118] Add stream parameter to List Sort and Filter APIs
 (#14272)

This PR introduces the stream parameter to the List Sorting and Filtering APIs.
Sorting and Filtering (`extract.hpp`, `filling.hpp`, `gather.hpp`, `reverse.hpp`, `sorting.hpp`, `stream_compaction.hpp`)
```
extract_list_element - index
extract_list_element - indices
segmented_gather
sequences - without steps
sequences - with steps
reverse
sort_lists
stable_sort_lists
apply_boolean_mask
distinct
```
Reference [13744](https://github.com/rapidsai/cudf/issues/13744)

Authors:
  - Suraj Aralihalli (https://github.com/SurajAralihalli)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/14272
---
 cpp/include/cudf/lists/extract.hpp                     |  6 +-
 cpp/include/cudf/lists/filling.hpp                     |  6 ++
 cpp/include/cudf/lists/gather.hpp                      |  2 +
 cpp/include/cudf/lists/reverse.hpp                     |  4 +-
 cpp/include/cudf/lists/sorting.hpp                     |  5 +-
 cpp/include/cudf/lists/stream_compaction.hpp           |  6 +-
 cpp/src/lists/copying/segmented_gather.cu              |  4 +-
 cpp/src/lists/extract.cu                               |  6 +-
 cpp/src/lists/reverse.cu                               |  6 +-
 cpp/src/lists/segmented_sort.cu                        |  7 +-
 cpp/src/lists/sequences.cu                             |  6 +-
 cpp/src/lists/stream_compaction/apply_boolean_mask.cu  |  3 +-
 cpp/src/lists/stream_compaction/distinct.cu            |  3 +-
 cpp/tests/groupby/histogram_tests.cpp                  |  1 +
 cpp/tests/streams/lists_test.cpp                       | 81 +++++++++++++++++++
 15 files changed, 129 insertions(+), 17 deletions(-)

diff --git a/cpp/include/cudf/lists/extract.hpp b/cpp/include/cudf/lists/extract.hpp
index e92354134e8..14c0f59e17d 100644
--- a/cpp/include/cudf/lists/extract.hpp
+++ b/cpp/include/cudf/lists/extract.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -59,12 +59,14 @@ namespace lists {
 *
 * @param lists_column Column to extract elements from.
 * @param index The row within each sublist to retrieve.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
 * @param mr Device memory resource used to allocate the returned column's device memory.
 * @return Column of extracted elements.
 */
 std::unique_ptr<column> extract_list_element(
   lists_column_view const& lists_column,
   size_type index,
+  rmm::cuda_stream_view stream = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

 /**
@@ -97,6 +99,7 @@ std::unique_ptr<column> extract_list_element(
 * @param lists_column Column to extract elements from.
 * @param indices The column whose rows indicate the element index to be retrieved from each list
 * row.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
 * @param mr Device memory resource used to allocate the returned column's device memory.
 * @return Column of extracted elements.
 * @throws cudf::logic_error If the sizes of `lists_column` and `indices` do not match.
@@ -104,6 +107,7 @@ std::unique_ptr<column> extract_list_element(
 std::unique_ptr<column> extract_list_element(
   lists_column_view const& lists_column,
   column_view const& indices,
+  rmm::cuda_stream_view stream = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */  // end of group
diff --git a/cpp/include/cudf/lists/filling.hpp b/cpp/include/cudf/lists/filling.hpp
index 059ed5ffd33..3730e16482d 100644
--- a/cpp/include/cudf/lists/filling.hpp
+++ b/cpp/include/cudf/lists/filling.hpp
@@ -17,7 +17,9 @@
 #pragma once

 #include <cudf/types.hpp>
+#include <cudf/utilities/default_stream.hpp>

+#include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device_memory_resource.hpp>

 #include <memory>
@@ -57,12 +59,14 @@ namespace cudf::lists {
 *
 * @param starts First values in the result sequences.
 * @param sizes Numbers of values in the result sequences.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
 * @param mr Device memory resource used to allocate the returned column's device memory.
 * @return The result column containing generated sequences.
 */
 std::unique_ptr<column> sequences(
   column_view const& starts,
   column_view const& sizes,
+  rmm::cuda_stream_view stream = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

 /**
@@ -96,6 +100,7 @@ std::unique_ptr<column> sequences(
 * @param starts First values in the result sequences.
 * @param steps Increment values for the result sequences.
 * @param sizes Numbers of values in the result sequences.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
 * @param mr Device memory resource used to allocate the returned column's device memory.
 * @return The result column containing generated sequences.
 */
 std::unique_ptr<column> sequences(
   column_view const& starts,
   column_view const& steps,
   column_view const& sizes,
+  rmm::cuda_stream_view stream = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */  // end of group
diff --git a/cpp/include/cudf/lists/gather.hpp b/cpp/include/cudf/lists/gather.hpp
index 38bed9ede43..5e6ab6816e6 100644
--- a/cpp/include/cudf/lists/gather.hpp
+++ b/cpp/include/cudf/lists/gather.hpp
@@ -65,6 +65,7 @@ namespace lists {
 * @param bounds_policy Can be `DONT_CHECK` or `NULLIFY`. Selects whether or not to nullify the
 * output list row's element, when the gather index falls outside the range `[-n, n)`,
 * where `n` is the number of elements in list row corresponding to the gather-map row.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
 * @param mr Device memory resource to allocate any returned objects
 * @return column with elements in list of rows gathered based on `gather_map_list`
 *
@@ -73,6 +74,7 @@ std::unique_ptr<column> segmented_gather(
   lists_column_view const& source_column,
   lists_column_view const& gather_map_list,
   out_of_bounds_policy bounds_policy = out_of_bounds_policy::DONT_CHECK,
+  rmm::cuda_stream_view stream = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */  // end of group
diff --git a/cpp/include/cudf/lists/reverse.hpp b/cpp/include/cudf/lists/reverse.hpp
index 226d417c53a..864cd796f72 100644
--- a/cpp/include/cudf/lists/reverse.hpp
+++ b/cpp/include/cudf/lists/reverse.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -42,11 +42,13 @@ namespace cudf::lists {
 * @endcode
 *
 * @param input Lists column for this operation
+ * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned column's device memory
 * @return New lists column with reversed lists
 */
 std::unique_ptr<column> reverse(
   lists_column_view const& input,
+  rmm::cuda_stream_view stream = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */  // end of doxygen group
diff --git a/cpp/include/cudf/lists/sorting.hpp b/cpp/include/cudf/lists/sorting.hpp
index c203c452b0d..39a52c75a98 100644
--- a/cpp/include/cudf/lists/sorting.hpp
+++ b/cpp/include/cudf/lists/sorting.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -46,6 +46,7 @@ namespace lists {
 * @param source_column View of the list column of numeric types to sort
 * @param column_order The desired sort order
 * @param null_precedence The desired order of null compared to other elements in the list
+ * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource to allocate any returned objects
 * @return list column with elements in each list sorted.
 *
@@ -54,6 +55,7 @@ std::unique_ptr<column> sort_lists(
   lists_column_view const& source_column,
   order column_order,
   null_order null_precedence,
+  rmm::cuda_stream_view stream = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

 /**
@@ -66,6 +68,7 @@ std::unique_ptr<column> stable_sort_lists(
   lists_column_view const& source_column,
   order column_order,
   null_order null_precedence,
+  rmm::cuda_stream_view stream = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */  // end of group
diff --git a/cpp/include/cudf/lists/stream_compaction.hpp b/cpp/include/cudf/lists/stream_compaction.hpp
index 5ddaa992184..3ac4f6861ec 100644
--- a/cpp/include/cudf/lists/stream_compaction.hpp
+++ b/cpp/include/cudf/lists/stream_compaction.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -54,12 +54,14 @@ namespace cudf::lists {
 *
 * @param input The input list column view to be filtered
 * @param boolean_mask A nullable list of bools column used to filter `input` elements
+ * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned table's device memory
 * @return List column of the same type as `input`, containing filtered list rows
 */
 std::unique_ptr<column> apply_boolean_mask(
   lists_column_view const& input,
   lists_column_view const& boolean_mask,
+  rmm::cuda_stream_view stream = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

 /**
@@ -78,6 +80,7 @@ std::unique_ptr<column> apply_boolean_mask(
 * @param input The input lists column
 * @param nulls_equal Flag to specify whether null elements should be considered as equal
 * @param nans_equal Flag to specify whether floating-point NaNs should be considered as equal
+ * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned object
 * @return The resulting lists column containing lists without duplicates
 */
 std::unique_ptr<column> distinct(
   lists_column_view const& input,
   null_equality nulls_equal = null_equality::EQUAL,
   nan_equality nans_equal   = nan_equality::ALL_EQUAL,
+  rmm::cuda_stream_view stream = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */  // end of group
diff --git a/cpp/src/lists/copying/segmented_gather.cu b/cpp/src/lists/copying/segmented_gather.cu
index 79d33e7c17d..855ceadf33f 100644
--- a/cpp/src/lists/copying/segmented_gather.cu
+++ b/cpp/src/lists/copying/segmented_gather.cu
@@ -116,11 +116,11 @@ std::unique_ptr<column> segmented_gather(lists_column_view const& value_column,
 std::unique_ptr<column> segmented_gather(lists_column_view const& source_column,
                                          lists_column_view const& gather_map_list,
                                          out_of_bounds_policy bounds_policy,
+                                         rmm::cuda_stream_view stream,
                                          rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::segmented_gather(
-    source_column, gather_map_list, bounds_policy, cudf::get_default_stream(), mr);
+  return detail::segmented_gather(source_column, gather_map_list, bounds_policy, stream, mr);
 }

 }  // namespace lists
diff --git a/cpp/src/lists/extract.cu b/cpp/src/lists/extract.cu
index 5d4a20d1cb8..365e9ef8255 100644
--- a/cpp/src/lists/extract.cu
+++ b/cpp/src/lists/extract.cu
@@ -196,10 +196,11 @@ std::unique_ptr<column> extract_list_element(lists_column_view lists_column,
 */
 std::unique_ptr<column> extract_list_element(lists_column_view const& lists_column,
                                              size_type index,
+                                             rmm::cuda_stream_view stream,
                                              rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::extract_list_element(lists_column, index, cudf::get_default_stream(), mr);
+  return detail::extract_list_element(lists_column, index, stream, mr);
 }

 /**
@@ -209,12 +210,13 @@ std::unique_ptr<column> extract_list_element(lists_column_view const& lists_colu
 */
 std::unique_ptr<column> extract_list_element(lists_column_view const& lists_column,
                                              column_view const& indices,
+                                             rmm::cuda_stream_view stream,
                                              rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(indices.size() == lists_column.size(),
                "Index column must have as many elements as lists column.");
-  return detail::extract_list_element(lists_column, indices, cudf::get_default_stream(), mr);
+  return detail::extract_list_element(lists_column, indices, stream, mr);
 }

 }  // namespace lists
diff --git a/cpp/src/lists/reverse.cu b/cpp/src/lists/reverse.cu index a2af85b5dad..6c00f8b64b4 100644 --- a/cpp/src/lists/reverse.cu +++ b/cpp/src/lists/reverse.cu @@ -86,10 +86,12 @@ std::unique_ptr reverse(lists_column_view const& input, } // namespace detail -std::unique_ptr reverse(lists_column_view const& input, rmm::mr::device_memory_resource* mr) +std::unique_ptr reverse(lists_column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::reverse(input, cudf::get_default_stream(), mr); + return detail::reverse(input, stream, mr); } } // namespace cudf::lists diff --git a/cpp/src/lists/segmented_sort.cu b/cpp/src/lists/segmented_sort.cu index 49054ebb046..0b70773f4b2 100644 --- a/cpp/src/lists/segmented_sort.cu +++ b/cpp/src/lists/segmented_sort.cu @@ -119,20 +119,21 @@ std::unique_ptr stable_sort_lists(lists_column_view const& input, std::unique_ptr sort_lists(lists_column_view const& input, order column_order, null_order null_precedence, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::sort_lists(input, column_order, null_precedence, cudf::get_default_stream(), mr); + return detail::sort_lists(input, column_order, null_precedence, stream, mr); } std::unique_ptr stable_sort_lists(lists_column_view const& input, order column_order, null_order null_precedence, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::stable_sort_lists( - input, column_order, null_precedence, cudf::get_default_stream(), mr); + return detail::stable_sort_lists(input, column_order, null_precedence, stream, mr); } } // namespace lists diff --git a/cpp/src/lists/sequences.cu b/cpp/src/lists/sequences.cu index aaee5608cc3..f92ba782da7 100644 --- a/cpp/src/lists/sequences.cu +++ b/cpp/src/lists/sequences.cu @@ -208,19 +208,21 @@ std::unique_ptr sequences(column_view const& starts, std::unique_ptr sequences(column_view const& starts, column_view const& sizes, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::sequences(starts, sizes, cudf::get_default_stream(), mr); + return detail::sequences(starts, sizes, stream, mr); } std::unique_ptr sequences(column_view const& starts, column_view const& steps, column_view const& sizes, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::sequences(starts, steps, sizes, cudf::get_default_stream(), mr); + return detail::sequences(starts, steps, sizes, stream, mr); } } // namespace cudf::lists diff --git a/cpp/src/lists/stream_compaction/apply_boolean_mask.cu b/cpp/src/lists/stream_compaction/apply_boolean_mask.cu index ad43fbd5b00..ce972d89150 100644 --- a/cpp/src/lists/stream_compaction/apply_boolean_mask.cu +++ b/cpp/src/lists/stream_compaction/apply_boolean_mask.cu @@ -101,10 +101,11 @@ std::unique_ptr apply_boolean_mask(lists_column_view const& input, std::unique_ptr apply_boolean_mask(lists_column_view const& input, lists_column_view const& boolean_mask, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::apply_boolean_mask(input, boolean_mask, cudf::get_default_stream(), mr); + return detail::apply_boolean_mask(input, boolean_mask, stream, mr); } } // namespace cudf::lists diff --git a/cpp/src/lists/stream_compaction/distinct.cu b/cpp/src/lists/stream_compaction/distinct.cu index 48d8babb4fa..eb21787b3fa 100644 --- 
a/cpp/src/lists/stream_compaction/distinct.cu +++ b/cpp/src/lists/stream_compaction/distinct.cu @@ -76,10 +76,11 @@ std::unique_ptr distinct(lists_column_view const& input, std::unique_ptr distinct(lists_column_view const& input, null_equality nulls_equal, nan_equality nans_equal, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::distinct(input, nulls_equal, nans_equal, cudf::get_default_stream(), mr); + return detail::distinct(input, nulls_equal, nans_equal, stream, mr); } } // namespace cudf::lists diff --git a/cpp/tests/groupby/histogram_tests.cpp b/cpp/tests/groupby/histogram_tests.cpp index c5833f40cf2..612486d8e5c 100644 --- a/cpp/tests/groupby/histogram_tests.cpp +++ b/cpp/tests/groupby/histogram_tests.cpp @@ -67,6 +67,7 @@ auto groupby_histogram(cudf::column_view const& keys, auto sorted_histograms = cudf::lists::sort_lists(cudf::lists_column_view{*sorted_vals}, cudf::order::ASCENDING, cudf::null_order::BEFORE, + cudf::get_default_stream(), rmm::mr::get_current_device_resource()); return std::pair{std::move(sorted_keys), std::move(sorted_histograms)}; diff --git a/cpp/tests/streams/lists_test.cpp b/cpp/tests/streams/lists_test.cpp index e292b551d83..82a4cb8aa4a 100644 --- a/cpp/tests/streams/lists_test.cpp +++ b/cpp/tests/streams/lists_test.cpp @@ -21,6 +21,12 @@ #include #include #include +#include +#include +#include +#include +#include +#include class ListTest : public cudf::test::BaseFixture {}; @@ -85,3 +91,78 @@ TEST_F(ListTest, CountElements) cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3, 7}, {4, 5}}; cudf::lists::count_elements(list_col, cudf::test::get_default_stream()); } + +TEST_F(ListTest, ExtractListElementFromIndex) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3, 7}, {4, 5}}; + cudf::lists::extract_list_element(list_col, -1, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ExtractListElementFromIndices) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3, 7}, {4, 5}}; + cudf::test::fixed_width_column_wrapper indices({-1, -2, -1}); + cudf::lists::extract_list_element(list_col, indices, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, SegmentedGather) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3, 7, 8}, {4, 5}}; + cudf::test::lists_column_wrapper gather_map_list{{0}, {1, 2}, {1}}; + cudf::lists::segmented_gather(list_col, + gather_map_list, + cudf::out_of_bounds_policy::DONT_CHECK, + cudf::test::get_default_stream()); +} + +TEST_F(ListTest, Sequences) +{ + cudf::test::fixed_width_column_wrapper starts({0, 1, 2, 3, 4}); + cudf::test::fixed_width_column_wrapper sizes({0, 1, 2, 2, 1}); + cudf::lists::sequences(starts, sizes, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, SequencesWithSteps) +{ + cudf::test::fixed_width_column_wrapper starts({0, 1, 2, 3, 4}); + cudf::test::fixed_width_column_wrapper steps({2, 1, 1, 1, -3}); + cudf::test::fixed_width_column_wrapper sizes({0, 1, 2, 2, 1}); + cudf::lists::sequences(starts, steps, sizes, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, Reverse) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3, 7, 8}, {4, 5}}; + cudf::lists::reverse(list_col, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, SortLists) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3, 7, 8}, {4, 5}}; + cudf::lists::sort_lists( + list_col, cudf::order::DESCENDING, cudf::null_order::AFTER, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, StableSortLists) +{ + 
cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3, 7, 8}, {4, 5}}; + cudf::lists::stable_sort_lists( + list_col, cudf::order::DESCENDING, cudf::null_order::AFTER, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ApplyBooleanMask) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3, 7, 8}, {4, 5}}; + cudf::test::lists_column_wrapper boolean_mask{{0, 1}, {1, 1, 1, 0}, {0, 1}}; + cudf::lists::apply_boolean_mask(list_col, boolean_mask, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, Distinct) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3, 7, 8}, {4, 5}}; + cudf::test::lists_column_wrapper boolean_mask{{0, 1}, {1, 1, 1, 0}, {0, 1}}; + cudf::lists::distinct(list_col, + cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL, + cudf::test::get_default_stream()); +} From 5f05c180b80b70fc09ea58aef2494c57edc44b9c Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 17 Oct 2023 11:32:12 -0400 Subject: [PATCH 030/118] Enable indexalator for device code (#14206) Enables indexalator to be instantiated from device code. Also adds gtests for the output indexalator. This change helps enable the offset-normalizing-iterator (#14234). Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/14206 --- .../cudf/detail/normalizing_iterator.cuh | 40 +++++-- cpp/tests/iterator/indexalator_test.cu | 100 ++++++++++++++++++ 2 files changed, 131 insertions(+), 9 deletions(-) diff --git a/cpp/include/cudf/detail/normalizing_iterator.cuh b/cpp/include/cudf/detail/normalizing_iterator.cuh index 51b3133f84f..35a695d47df 100644 --- a/cpp/include/cudf/detail/normalizing_iterator.cuh +++ b/cpp/include/cudf/detail/normalizing_iterator.cuh @@ -34,7 +34,7 @@ namespace detail { */ template struct base_normalator { - static_assert(std::is_integral_v); + static_assert(cudf::is_index_type()); using difference_type = std::ptrdiff_t; using value_type = Integer; using pointer = Integer*; @@ -202,13 +202,34 @@ struct base_normalator { return static_cast(*this).p_ >= rhs.p_; } + private: + struct integer_sizeof_fn { + template ()>* = nullptr> + CUDF_HOST_DEVICE constexpr std::size_t operator()() const + { +#ifndef __CUDA_ARCH__ + CUDF_FAIL("only integral types are supported"); +#else + CUDF_UNREACHABLE("only integral types are supported"); +#endif + } + template ()>* = nullptr> + CUDF_HOST_DEVICE constexpr std::size_t operator()() const noexcept + { + return sizeof(T); + } + }; + protected: /** * @brief Constructor assigns width and type member variables for base class.
*/ - explicit base_normalator(data_type dtype) : width_(size_of(dtype)), dtype_(dtype) {} + explicit CUDF_HOST_DEVICE base_normalator(data_type dtype) : dtype_(dtype) + { + width_ = static_cast(type_dispatcher(dtype, integer_sizeof_fn{})); + } - int width_; /// integer type width = 1,2,4, or 8 + int32_t width_; /// integer type width = 1,2,4, or 8 data_type dtype_; /// for type-dispatcher calls }; @@ -244,12 +265,12 @@ struct input_normalator : base_normalator, Integer> { * @brief Dispatch functor for resolving a Integer value from any integer type */ struct normalize_type { - template >* = nullptr> + template ()>* = nullptr> __device__ Integer operator()(void const* tp) { return static_cast(*static_cast(tp)); } - template >* = nullptr> + template ()>* = nullptr> __device__ Integer operator()(void const*) { CUDF_UNREACHABLE("only integral types are supported"); @@ -274,9 +295,10 @@ struct input_normalator : base_normalator, Integer> { * @param data Pointer to an integer array in device memory. * @param data_type Type of data in data */ - input_normalator(void const* data, data_type dtype) + CUDF_HOST_DEVICE input_normalator(void const* data, data_type dtype, cudf::size_type offset = 0) : base_normalator, Integer>(dtype), p_{static_cast(data)} { + p_ += offset * this->width_; } char const* p_; /// pointer to the integer data in device memory @@ -327,12 +349,12 @@ struct output_normalator : base_normalator, Integer> * @brief Dispatch functor for setting the index value from a size_type value. */ struct normalize_type { - template >* = nullptr> + template ()>* = nullptr> __device__ void operator()(void* tp, Integer const value) { (*static_cast(tp)) = static_cast(value); } - template >* = nullptr> + template ()>* = nullptr> __device__ void operator()(void*, Integer const) { CUDF_UNREACHABLE("only index types are supported"); @@ -355,7 +377,7 @@ struct output_normalator : base_normalator, Integer> * @param data Pointer to an integer array in device memory. 
* @param data_type Type of data in data */ - output_normalator(void* data, data_type dtype) + CUDF_HOST_DEVICE output_normalator(void* data, data_type dtype) : base_normalator, Integer>(dtype), p_{static_cast(data)} { } diff --git a/cpp/tests/iterator/indexalator_test.cu b/cpp/tests/iterator/indexalator_test.cu index 1ff7f4c42a5..3e8bcd5cb0d 100644 --- a/cpp/tests/iterator/indexalator_test.cu +++ b/cpp/tests/iterator/indexalator_test.cu @@ -20,9 +20,13 @@ #include +#include +#include #include #include #include +#include +#include using TestingTypes = cudf::test::IntegralTypesNotBool; @@ -94,3 +98,99 @@ TYPED_TEST(IndexalatorTest, optional_iterator) auto it_dev = cudf::detail::indexalator_factory::make_input_optional_iterator(d_col); this->iterator_test_thrust(expected_values, it_dev, host_values.size()); } + +template +struct transform_fn { + __device__ cudf::size_type operator()(Integer v) + { + return static_cast(v) + static_cast(v); + } +}; + +TYPED_TEST(IndexalatorTest, output_iterator) +{ + using T = TypeParam; + + auto d_col1 = + cudf::test::fixed_width_column_wrapper({0, 6, 7, 14, 23, 33, 43, 45, 63}); + auto d_col2 = + cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 0, 0, 0, 0}); + auto itr = cudf::detail::indexalator_factory::make_output_iterator(d_col2); + auto input = cudf::column_view(d_col1); + auto stream = cudf::get_default_stream(); + + auto map = cudf::test::fixed_width_column_wrapper({0, 2, 4, 6, 8, 1, 3, 5, 7}); + auto d_map = cudf::column_view(map); + thrust::gather( + rmm::exec_policy_nosync(stream), d_map.begin(), d_map.end(), input.begin(), itr); + auto expected = + cudf::test::fixed_width_column_wrapper({0, 7, 23, 43, 63, 6, 14, 33, 45}); + thrust::scatter( + rmm::exec_policy_nosync(stream), input.begin(), input.end(), d_map.begin(), itr); + expected = + cudf::test::fixed_width_column_wrapper({0, 33, 6, 43, 7, 45, 14, 63, 23}); + + thrust::transform( + rmm::exec_policy(stream), input.begin(), input.end(), itr, transform_fn{}); + expected = + cudf::test::fixed_width_column_wrapper({0, 12, 14, 28, 46, 66, 86, 90, 126}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); + + thrust::fill(rmm::exec_policy(stream), itr, itr + input.size(), 77); + expected = + cudf::test::fixed_width_column_wrapper({77, 77, 77, 77, 77, 77, 77, 77, 77}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); + + thrust::sequence(rmm::exec_policy(stream), itr, itr + input.size()); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 2, 3, 4, 5, 6, 7, 8}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); + + auto indices = + cudf::test::fixed_width_column_wrapper({0, 10, 20, 30, 40, 50, 60, 70, 80}); + auto d_indices = cudf::column_view(indices); + thrust::lower_bound(rmm::exec_policy(stream), + d_indices.begin(), + d_indices.end(), + input.begin(), + input.end(), + itr); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 1, 2, 3, 4, 5, 5, 7}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); +} + +/** + * For testing creating and using the indexalator in device code. 
+ */ +struct device_functor_fn { + cudf::column_device_view const d_col; + __device__ cudf::size_type operator()(cudf::size_type idx) + { + auto itr = cudf::detail::input_indexalator(d_col.head(), d_col.type()); + return itr[idx] * 3; + } +}; + +TYPED_TEST(IndexalatorTest, device_indexalator) +{ + using T = TypeParam; + + auto d_col1 = + cudf::test::fixed_width_column_wrapper({0, 6, 7, 14, 23, 33, 43, 45, 63}); + auto d_col2 = + cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 0, 0, 0, 0}); + auto input = cudf::column_view(d_col1); + auto output = cudf::mutable_column_view(d_col2); + auto stream = cudf::get_default_stream(); + + auto d_input = cudf::column_device_view::create(input, stream); + + thrust::transform(rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(input.size()), + output.begin(), + device_functor_fn{*d_input}); + + auto expected = + cudf::test::fixed_width_column_wrapper({0, 18, 21, 42, 69, 99, 129, 135, 189}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); +} From 29b25373bb0074ebec18653b28b7fd1dc0196b1a Mon Sep 17 00:00:00 2001 From: Jake Awe <50372925+AyodeAwe@users.noreply.github.com> Date: Wed, 18 Oct 2023 08:35:31 -0500 Subject: [PATCH 031/118] update repo refs (#14289) --- .github/workflows/build.yaml | 16 ++++++++-------- .github/workflows/pr.yaml | 28 ++++++++++++++-------------- .github/workflows/test.yaml | 16 ++++++++-------- ci/release/update-version.sh | 2 +- 4 files changed, 31 insertions(+), 31 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index ab028eb89cc..1a7aa00aebf 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.12 + 
uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -89,7 +89,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: ${{ inputs.build_type || 'branch' }} @@ -100,7 +100,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 214f9c90b41..8d6c471c912 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -26,34 +26,34 @@ jobs: - wheel-build-dask-cudf - wheel-tests-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-23.12 checks: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-23.12 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12 with: build_type: pull-request conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 with: build_type: pull-request conda-python-cudf-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 with: build_type: pull-request test_script: "ci/test_python_cudf.sh" @@ -61,14 +61,14 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 with: build_type: pull-request test_script: "ci/test_python_other.sh" conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -78,7 +78,7 @@ jobs: conda-notebook-tests: needs: conda-python-build secrets: inherit - uses: 
rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -88,7 +88,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -98,21 +98,21 @@ jobs: wheel-build-cudf: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12 with: build_type: pull-request script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: build_type: pull-request script: ci/test_wheel_cudf.sh wheel-build-dask-cudf: needs: wheel-tests-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: pull-request @@ -120,7 +120,7 @@ jobs: wheel-tests-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: pull-request diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 9ca32bcfe03..db86e035067 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +24,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -36,7 +36,7 @@ jobs: run_script: "ci/test_cpp_memcheck.sh" conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -55,7 +55,7 @@ jobs: test_script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit 
- uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -67,7 +67,7 @@ jobs: run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -88,7 +88,7 @@ jobs: script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: nightly diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 5e735a71994..eac64fe1a0f 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -113,7 +113,7 @@ sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_ # CI files for FILE in .github/workflows/*.yaml; do - sed_runner "/shared-action-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" + sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" ${FILE}; done sed_runner "s/RAPIDS_VERSION_NUMBER=\".*/RAPIDS_VERSION_NUMBER=\"${NEXT_SHORT_TAG}\"/g" ci/build_docs.sh From 7aa757959b5b597a0258c955fbeadb92cfb2e762 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 19 Oct 2023 00:10:19 +0100 Subject: [PATCH 032/118] Add nvtx annotations to spilling-based data movement (#14196) In nsys traces, it is useful to be able to see when memory allocations are provoking cudf-managed spilling. Do this by adding appropriate nvtx annotations. To enable this, we must move the nvtx decorators to a separate file to avoid circular imports.
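For illustration, this is the shape of the decorator pattern the diff below introduces, as a minimal sketch: `nvtx.annotate` is the real nvtx API, but `_nvtx_annotate` and the `spill_device_memory` stub are illustrative stand-ins rather than the exact cudf internals (those are in the diff that follows).

```python
# Minimal sketch of the annotation pattern; names other than nvtx.annotate
# are illustrative stand-ins for the cudf internals added in this change.
from functools import partial

from nvtx import annotate


def _nvtx_annotate(func, domain="cudf_python"):
    # Wrap ``func`` so every call shows up as an NVTX range in nsys traces.
    return annotate(message=func.__qualname__, domain=domain)(func)


# Spilling gets a dedicated domain so spill activity is easy to filter.
_spill_nvtx_annotate = partial(_nvtx_annotate, domain="cudf_python-spill")


@_spill_nvtx_annotate
def spill_device_memory(nbytes: int) -> int:
    # Stand-in body; the real method lives on cudf's SpillManager.
    return 0
```

Keeping these helpers in their own module lets both the spilling code and the rest of cudf import them without importing each other.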
Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Mark Harris (https://github.com/harrism) - Vyas Ramasubramani (https://github.com/vyasr) - Richard (Rick) Zamora (https://github.com/rjzamora) URL: https://github.com/rapidsai/cudf/pull/14196 --- python/cudf/cudf/core/buffer/spill_manager.py | 7 +++++ .../cudf/cudf/core/buffer/spillable_buffer.py | 24 +++++++++++---- python/cudf/cudf/core/dataframe.py | 7 ++--- python/cudf/cudf/core/frame.py | 7 ++--- python/cudf/cudf/core/groupby/groupby.py | 3 +- python/cudf/cudf/core/index.py | 8 ++--- python/cudf/cudf/core/indexed_frame.py | 3 +- python/cudf/cudf/core/multiindex.py | 3 +- python/cudf/cudf/core/series.py | 2 +- python/cudf/cudf/core/single_column_frame.py | 3 +- python/cudf/cudf/core/udf/groupby_utils.py | 2 +- python/cudf/cudf/core/udf/utils.py | 3 +- python/cudf/cudf/io/csv.py | 2 +- python/cudf/cudf/io/parquet.py | 2 +- python/cudf/cudf/io/text.py | 4 +-- python/cudf/cudf/utils/nvtx_annotation.py | 30 +++++++++++++++++++ python/cudf/cudf/utils/utils.py | 27 ----------------- python/dask_cudf/dask_cudf/backends.py | 2 +- python/dask_cudf/dask_cudf/core.py | 2 +- python/dask_cudf/dask_cudf/groupby.py | 2 +- python/dask_cudf/dask_cudf/sorting.py | 2 +- 21 files changed, 82 insertions(+), 63 deletions(-) create mode 100644 python/cudf/cudf/utils/nvtx_annotation.py diff --git a/python/cudf/cudf/core/buffer/spill_manager.py b/python/cudf/cudf/core/buffer/spill_manager.py index f056a0fd592..91f3b2cd544 100644 --- a/python/cudf/cudf/core/buffer/spill_manager.py +++ b/python/cudf/cudf/core/buffer/spill_manager.py @@ -11,14 +11,20 @@ import weakref from collections import defaultdict from dataclasses import dataclass +from functools import partial from typing import Dict, List, Optional, Tuple import rmm.mr from cudf.core.buffer.spillable_buffer import SpillableBuffer from cudf.options import get_option +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.string import format_bytes +_spill_cudf_nvtx_annotate = partial( + _cudf_nvtx_annotate, domain="cudf_python-spill" +) + def get_traceback() -> str: """Pretty print current traceback to a string""" @@ -329,6 +335,7 @@ def buffers( ret = tuple(sorted(ret, key=lambda b: b.last_accessed)) return ret + @_spill_cudf_nvtx_annotate def spill_device_memory(self, nbytes: int) -> int: """Try to spill device memory diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index 84fb2044c62..1856bec1876 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -20,6 +20,7 @@ get_ptr_and_size, host_memory_allocation, ) +from cudf.utils.nvtx_annotation import _get_color_for_nvtx, annotate from cudf.utils.string import format_bytes if TYPE_CHECKING: @@ -291,8 +292,15 @@ def spill(self, target: str = "cpu") -> None: ) if (ptr_type, target) == ("gpu", "cpu"): - host_mem = host_memory_allocation(self.size) - rmm._lib.device_buffer.copy_ptr_to_host(self._ptr, host_mem) + with annotate( + message="SpillDtoH", + color=_get_color_for_nvtx("SpillDtoH"), + domain="cudf_python-spill", + ): + host_mem = host_memory_allocation(self.size) + rmm._lib.device_buffer.copy_ptr_to_host( + self._ptr, host_mem + ) self._ptr_desc["memoryview"] = host_mem self._ptr = 0 self._owner = None @@ -302,9 +310,15 @@ def spill(self, target: str = "cpu") -> None: # trigger a new call to this buffer's `spill()`. 
# Therefore, it is important that spilling-on-demand doesn't # try to unspill an already locked buffer! - dev_mem = rmm.DeviceBuffer.to_device( - self._ptr_desc.pop("memoryview") - ) + with annotate( + message="SpillHtoD", + color=_get_color_for_nvtx("SpillHtoD"), + domain="cudf_python-spill", + ): + + dev_mem = rmm.DeviceBuffer.to_device( + self._ptr_desc.pop("memoryview") + ) self._ptr = dev_mem.ptr self._owner = dev_mem assert self._size == dev_mem.size diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index ead2f182e2d..b38345af83d 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -95,11 +95,8 @@ min_scalar_type, numeric_normalize_types, ) -from cudf.utils.utils import ( - GetAttrGetItemMixin, - _cudf_nvtx_annotate, - _external_only_api, -) +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.utils import GetAttrGetItemMixin, _external_only_api _cupy_nan_methods_map = { "min": "nanmin", diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 1e6d177f8ca..7cb78bc8d1f 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -47,11 +47,8 @@ from cudf.utils import ioutils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import find_common_type -from cudf.utils.utils import ( - _array_ufunc, - _cudf_nvtx_annotate, - _warn_no_dask_cudf, -) +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.utils import _array_ufunc, _warn_no_dask_cudf # TODO: It looks like Frame is missing a declaration of `copy`, need to add diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index e1740140b44..3b8f0f3824a 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -29,7 +29,8 @@ from cudf.core.mixins import Reducible, Scannable from cudf.core.multiindex import MultiIndex from cudf.core.udf.groupby_utils import _can_be_jitted, jit_groupby_apply -from cudf.utils.utils import GetAttrGetItemMixin, _cudf_nvtx_annotate +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.utils import GetAttrGetItemMixin # The three functions below return the quantiles [25%, 50%, 75%] diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index de8a5948033..5c323bda9ea 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -64,12 +64,8 @@ is_mixed_with_object_dtype, numeric_normalize_types, ) -from cudf.utils.utils import ( - _cudf_nvtx_annotate, - _is_same_name, - _warn_no_dask_cudf, - search_range, -) +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.utils import _is_same_name, _warn_no_dask_cudf, search_range def _lexsorted_equal_range( diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 1008cbdb67f..b1fb47eb790 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -69,7 +69,8 @@ ) from cudf.utils import docutils from cudf.utils._numba import _CUDFNumbaConfig -from cudf.utils.utils import _cudf_nvtx_annotate, _warn_no_dask_cudf +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.utils import _warn_no_dask_cudf doc_reset_index_template = """ Reset the index of the {klass}, or a level of it. 
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 21380bb841c..87a11478870 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -26,7 +26,8 @@ from cudf.core._compat import PANDAS_GE_150 from cudf.core.frame import Frame from cudf.core.index import BaseIndex, _lexsorted_equal_range, as_index -from cudf.utils.utils import NotIterable, _cudf_nvtx_annotate, _is_same_name +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.utils import NotIterable, _is_same_name def _maybe_indices_to_slice(indices: cp.ndarray) -> Union[slice, cp.ndarray]: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index a195738af54..00ba722136e 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -90,7 +90,7 @@ is_mixed_with_object_dtype, to_cudf_compatible_scalar, ) -from cudf.utils.utils import _cudf_nvtx_annotate +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate def _format_percentile_names(percentiles): diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 6a56ab8f3a5..e30e1c747f5 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -19,7 +19,8 @@ ) from cudf.core.column import ColumnBase, as_column from cudf.core.frame import Frame -from cudf.utils.utils import NotIterable, _cudf_nvtx_annotate +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.utils import NotIterable class SingleColumnFrame(Frame, NotIterable): diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index b18720f5db5..5dbcf455e33 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -28,7 +28,7 @@ _supported_dtypes_from_frame, ) from cudf.utils._numba import _CUDFNumbaConfig -from cudf.utils.utils import _cudf_nvtx_annotate +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate def _get_frame_groupby_type(dtype, index_dtype): diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index 35a3f6c1ffd..7b7ac2b3070 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -39,7 +39,8 @@ STRING_TYPES, TIMEDELTA_TYPES, ) -from cudf.utils.utils import _cudf_nvtx_annotate, initfunc +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.utils import initfunc # Maximum size of a string column is 2 GiB _STRINGS_UDF_DEFAULT_HEAP_SIZE = os.environ.get( diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index bacc0641639..764885dd7b6 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -11,7 +11,7 @@ from cudf.api.types import is_scalar from cudf.utils import ioutils from cudf.utils.dtypes import _maybe_convert_to_default_type -from cudf.utils.utils import _cudf_nvtx_annotate +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index d8510cf8e95..d84aff66d7b 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -22,7 +22,7 @@ from cudf.api.types import is_list_like from cudf.core.column import build_categorical_column, column_empty, full from cudf.utils import ioutils -from cudf.utils.utils import _cudf_nvtx_annotate +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate BYTE_SIZES 
= { "kb": 1000, diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py index eb2c7fa7ef6..0e19972f6e0 100644 --- a/python/cudf/cudf/io/text.py +++ b/python/cudf/cudf/io/text.py @@ -1,11 +1,11 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. from io import BytesIO, StringIO import cudf from cudf._lib import text as libtext from cudf.utils import ioutils -from cudf.utils.utils import _cudf_nvtx_annotate +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/utils/nvtx_annotation.py b/python/cudf/cudf/utils/nvtx_annotation.py new file mode 100644 index 00000000000..a4404e51232 --- /dev/null +++ b/python/cudf/cudf/utils/nvtx_annotation.py @@ -0,0 +1,30 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +import hashlib +from functools import partial + +from nvtx import annotate + +_NVTX_COLORS = ["green", "blue", "purple", "rapids"] + + +def _get_color_for_nvtx(name): + m = hashlib.sha256() + m.update(name.encode()) + hash_value = int(m.hexdigest(), 16) + idx = hash_value % len(_NVTX_COLORS) + return _NVTX_COLORS[idx] + + +def _cudf_nvtx_annotate(func, domain="cudf_python"): + """Decorator for applying nvtx annotations to methods in cudf.""" + return annotate( + message=func.__qualname__, + color=_get_color_for_nvtx(func.__qualname__), + domain=domain, + )(func) + + +_dask_cudf_nvtx_annotate = partial( + _cudf_nvtx_annotate, domain="dask_cudf_python" +) diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index e2cb3f145a1..0ff23bd37c6 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -2,15 +2,12 @@ import decimal import functools -import hashlib import os import traceback import warnings -from functools import partial from typing import FrozenSet, Set, Union import numpy as np -from nvtx import annotate import rmm @@ -119,8 +116,6 @@ def _array_ufunc(obj, ufunc, method, inputs, kwargs): "__ge__", } -_NVTX_COLORS = ["green", "blue", "purple", "rapids"] - # The test root is set by pytest to support situations where tests are run from # a source tree on a built version of cudf. 
NO_EXTERNAL_ONLY_APIS = os.getenv("NO_EXTERNAL_ONLY_APIS") @@ -353,28 +348,6 @@ def is_na_like(obj): return obj is None or obj is cudf.NA or obj is cudf.NaT -def _get_color_for_nvtx(name): - m = hashlib.sha256() - m.update(name.encode()) - hash_value = int(m.hexdigest(), 16) - idx = hash_value % len(_NVTX_COLORS) - return _NVTX_COLORS[idx] - - -def _cudf_nvtx_annotate(func, domain="cudf_python"): - """Decorator for applying nvtx annotations to methods in cudf.""" - return annotate( - message=func.__qualname__, - color=_get_color_for_nvtx(func.__qualname__), - domain=domain, - )(func) - - -_dask_cudf_nvtx_annotate = partial( - _cudf_nvtx_annotate, domain="dask_cudf_python" -) - - def _warn_no_dask_cudf(fn): @functools.wraps(fn) def wrapper(self): diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 344b03c631d..7b35c71ff09 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -42,7 +42,7 @@ import cudf from cudf.api.types import is_string_dtype -from cudf.utils.utils import _dask_cudf_nvtx_annotate +from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate from .core import DataFrame, Index, Series diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 5b37e6e825c..17650c9b70d 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -22,7 +22,7 @@ import cudf from cudf import _lib as libcudf -from cudf.utils.utils import _dask_cudf_nvtx_annotate +from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate from dask_cudf import sorting from dask_cudf.accessors import ListMethods, StructMethods diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index f4bbcaf4dd1..b1fdf443a17 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -15,7 +15,7 @@ from dask.utils import funcname import cudf -from cudf.utils.utils import _dask_cudf_nvtx_annotate +from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate # aggregations that are dask-cudf optimized OPTIMIZED_AGGS = ( diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py index d6c9c1be73c..27ba82c390c 100644 --- a/python/dask_cudf/dask_cudf/sorting.py +++ b/python/dask_cudf/dask_cudf/sorting.py @@ -16,7 +16,7 @@ import cudf as gd from cudf.api.types import is_categorical_dtype -from cudf.utils.utils import _dask_cudf_nvtx_annotate +from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate _SHUFFLE_SUPPORT = ("tasks", "p2p") # "disk" not supported From d36904b16fe6b0244fe63973f3a7ae0987062beb Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 19 Oct 2023 14:01:50 -0700 Subject: [PATCH 033/118] Downgrade to Arrow 12.0.0 for aws-sdk-cpp and fix cudf_kafka builds for new CI containers (#14296) The aws-sdk-cpp pinning introduced in https://github.com/rapidsai/cudf/pull/14173 causes problems because newer builds of libarrow require a newer version of aws-sdk-cpp. Even though we restrict to libarrow 12.0.1, this restriction is insufficient to create solvable environments because the conda (mamba) solver doesn't seem to consistently reach far back enough into the history of builds to pull the last build that was compatible with the aws-sdk-cpp version that we need. 
For now, the safest way for us to avoid this problem is to downgrade to arrow 12.0.0, for which all conda package builds are pinned to the older version of aws-sdk-cpp that does not have the bug in question. Separately, while the above issue was encountered, we also got new builds of our CI images [that removed system installs of CTK packages from CUDA 12 images](https://github.com/rapidsai/ci-imgs/pull/77). This change was made because for CUDA 12 we can get all the necessary pieces of the CTK from conda-forge. However, it turns out that the cudf_kafka builds were implicitly relying on system CTK packages, and the cudf_kafka build is in fact not fully compatible with conda-forge CTK packages because it is not using CMake via scikit-build (nor any other more sophisticated library discovery mechanism like pkg-config) and therefore does not know how to find conda-forge CTK headers/libraries. This PR introduces a set of temporary patches to get around this limitation. These patches are not a long-term fix, and are only put in place assuming that #14292 is merged in the near future before we cut a 23.12 release. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/14296 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 4 ++-- conda/environments/all_cuda-120_arch-x86_64.yaml | 5 +++-- conda/recipes/cudf/meta.yaml | 2 +- conda/recipes/cudf_kafka/build.sh | 14 +++++++++++++- conda/recipes/cudf_kafka/meta.yaml | 11 +++++++++++ conda/recipes/libcudf/conda_build_config.yaml | 4 ++-- conda/recipes/libcudf/meta.yaml | 4 ++-- cpp/cmake/thirdparty/get_arrow.cmake | 2 +- dependencies.yaml | 16 +++++++++------- python/cudf/pyproject.toml | 4 ++-- python/cudf_kafka/pyproject.toml | 2 +- 11 files changed, 47 insertions(+), 21 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 8976a4b14cb..2c79cbb6b6c 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -40,7 +40,7 @@ dependencies: - hypothesis - identify>=2.5.20 - ipython -- libarrow==12.0.1.* +- libarrow==12.0.0.* - libcufile-dev=1.4.0.31 - libcufile=1.4.0.31 - libcurand-dev=10.3.0.86 @@ -69,7 +69,7 @@ dependencies: - pre-commit - protobuf>=4.21,<5 - ptxcompiler -- pyarrow==12.0.1.* +- pyarrow==12.0.0.* - pydata-sphinx-theme - pyorc - pytest diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index f54d78593c3..c96b7428882 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -17,6 +17,7 @@ dependencies: - cachetools - cmake>=3.26.4 - cuda-cudart-dev +- cuda-gdb - cuda-nvcc - cuda-nvrtc-dev - cuda-nvtx-dev @@ -41,7 +42,7 @@ dependencies: - hypothesis - identify>=2.5.20 - ipython -- libarrow==12.0.1.* +- libarrow==12.0.0.* - libcufile-dev - libcurand-dev - libkvikio==23.12.* @@ -66,7 +67,7 @@ dependencies: - pip - pre-commit - protobuf>=4.21,<5 -- pyarrow==12.0.1.* +- pyarrow==12.0.0.* - pydata-sphinx-theme - pyorc - pytest diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 54b687faa69..16b064a262e 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -61,7 +61,7 @@ requirements: - scikit-build >=0.13.1 - setuptools - dlpack >=0.5,<0.6.0a0 - -
pyarrow =12 + - pyarrow =12.0.0 - libcudf ={{ version }} - rmm ={{ minor_version }} {% if cuda_major == "11" %} diff --git a/conda/recipes/cudf_kafka/build.sh b/conda/recipes/cudf_kafka/build.sh index 5d8720f1c98..f4bb6e1bc91 100644 --- a/conda/recipes/cudf_kafka/build.sh +++ b/conda/recipes/cudf_kafka/build.sh @@ -1,4 +1,16 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. # This assumes the script is executed from the root of the repo directory +# Need to set CUDA_HOME inside conda environments because the hacked together +# setup.py for cudf-kafka searches that way. +# TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates +# cudf_kafka to use scikit-build +CUDA_MAJOR=${RAPIDS_CUDA_VERSION%%.*} +if [[ ${CUDA_MAJOR} == "12" ]]; then + target_name="x86_64-linux" + if [[ ! $(arch) == "x86_64" ]]; then + target_name="sbsa-linux" + fi + export CUDA_HOME="${PREFIX}/targets/${target_name}/" +fi ./build.sh -v cudf_kafka diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index ec0cc402511..a79c23b7d98 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -33,6 +33,9 @@ build: - SCCACHE_S3_KEY_PREFIX=cudf-kafka-linux64 # [linux64] - SCCACHE_S3_USE_SSL - SCCACHE_S3_NO_CREDENTIALS + # TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates + # cudf_kafka to use scikit-build + - RAPIDS_CUDA_VERSION requirements: build: @@ -41,6 +44,11 @@ requirements: - {{ compiler('cxx') }} - ninja - sysroot_{{ target_platform }} {{ sysroot_version }} + # TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates + # cudf_kafka to use scikit-build + {% if cuda_major == "12" %} + - cuda-gdb + {% endif %} host: - python - cython >=3.0.0 @@ -48,6 +56,9 @@ requirements: - cudf ={{ version }} - libcudf_kafka ={{ version }} - setuptools + {% if cuda_major == "12" %} + - cuda-cudart-dev + {% endif %} run: - python - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index b1f5b083e06..4d33bb89220 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -25,8 +25,8 @@ gtest_version: aws_sdk_cpp_version: - "<1.11" -libarrow_version: - - "=12" +libarrow: + - "==12.0.0" dlpack_version: - ">=0.5,<0.6.0a0" diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 28357f0d96d..b9aff2a9c82 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -65,7 +65,7 @@ requirements: {% endif %} - cuda-version ={{ cuda_version }} - nvcomp {{ nvcomp_version }} - - libarrow {{ libarrow_version }} + - libarrow {{ libarrow }} - dlpack {{ dlpack_version }} - librdkafka {{ librdkafka_version }} - fmt {{ fmt_version }} @@ -104,7 +104,7 @@ outputs: - nvcomp {{ nvcomp_version }} - librmm ={{ minor_version }} - libkvikio ={{ minor_version }} - - libarrow {{ libarrow_version }} + - libarrow {{ libarrow }} - dlpack {{ dlpack_version }} - gtest {{ gtest_version }} - gmock {{ gtest_version }} diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 10d3145a36f..c2d5cfbaf78 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -411,7 +411,7 @@ if(NOT DEFINED CUDF_VERSION_Arrow) set(CUDF_VERSION_Arrow # This version must be kept in sync with the libarrow version 
pinned for builds in # dependencies.yaml. - 12.0.1 + 12.0.0 CACHE STRING "The version of Arrow to find (or build)" ) endif() diff --git a/dependencies.yaml b/dependencies.yaml index c19e8765be3..e8114fa5615 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -62,6 +62,7 @@ files: includes: - cudatoolkit - docs + - libarrow_run - py_version py_build_cudf: output: pyproject @@ -225,7 +226,7 @@ dependencies: - &gmock gmock>=1.13.0 # Hard pin the patch version used during the build. This must be kept # in sync with the version pinned in get_arrow.cmake. - - libarrow==12.0.1.* + - &libarrow libarrow==12.0.0.* - librdkafka>=1.9.0,<1.10.0a0 # Align nvcomp version with rapids-cmake - nvcomp==2.6.1 @@ -243,7 +244,7 @@ dependencies: - cython>=3.0.0 # Hard pin the patch version used during the build. This must be kept # in sync with the version pinned in get_arrow.cmake. - - pyarrow==12.0.1.* + - &pyarrow pyarrow==12.0.0.* # TODO: Pin to numpy<1.25 until cudf requires pandas 2 - &numpy numpy>=1.21,<1.25 build_python: @@ -260,16 +261,14 @@ dependencies: - protoc-wheel libarrow_run: common: - - output_types: [conda, requirements] + - output_types: conda packages: - # Allow runtime version to float up to minor version - - libarrow==12.* + - *libarrow pyarrow_run: common: - output_types: [conda, requirements, pyproject] packages: - # Allow runtime version to float up to minor version - - pyarrow==12.* + - *pyarrow cudatoolkit: specific: - output_types: conda @@ -282,6 +281,9 @@ dependencies: - cuda-nvrtc-dev - cuda-nvtx-dev - libcurand-dev + # TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates + # cudf_kafka to use scikit-build + - cuda-gdb - matrix: cuda: "11.8" packages: diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 39a8dca0267..ccb5d5d4416 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -8,7 +8,7 @@ requires = [ "ninja", "numpy>=1.21,<1.25", "protoc-wheel", - "pyarrow==12.0.1.*", + "pyarrow==12.0.0.*", "rmm==23.12.*", "scikit-build>=0.13.1", "setuptools", @@ -38,7 +38,7 @@ dependencies = [ "pandas>=1.3,<1.6.0dev0", "protobuf>=4.21,<5", "ptxcompiler", - "pyarrow==12.*", + "pyarrow==12.0.0.*", "rmm==23.12.*", "typing_extensions>=4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 78a7a83ac3a..ff475e5a72e 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -5,7 +5,7 @@ requires = [ "cython>=3.0.0", "numpy>=1.21,<1.25", - "pyarrow==12.0.1.*", + "pyarrow==12.0.0.*", "setuptools", "wheel", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. From 50e2211506bf3de88bb625b2b6d684d0d799c274 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Fri, 20 Oct 2023 10:00:08 +0200 Subject: [PATCH 034/118] Changes JSON reader's recovery option's behaviour to ignore all characters after a valid JSON record (#14279) Closes https://github.com/rapidsai/cudf/issues/14226. The new behavior of `JSON_LINES_RECOVER` is to ignore excess characters after the first valid JSON record on each JSON line.
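For illustration only (this snippet is not part of the patch and the helper name is made up; the reader options mirror those exercised by the tests added below), a caller opts into this behavior through the existing recovery mode:
```
// Minimal sketch: read a JSON-lines file with error recovery enabled, so that
// invalid lines become null rows and trailing characters after the first
// record on each line are ignored.
#include <cudf/io/json.hpp>

#include <string>

cudf::io::table_with_metadata read_recovered_json_lines(std::string const& filepath)
{
  cudf::io::json_reader_options opts =
    cudf::io::json_reader_options::builder(cudf::io::source_info{filepath})
      .lines(true)
      .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL);
  return cudf::io::read_json(opts);
}
```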
For example, every line of the following input is now accepted, with everything after the first record on each line being ignored:
```
{ "number": 1 }
{ "number": 1 } xyz
{ "number": 1 } {}
{ "number": 1 }
{ "number": 4 }
```
**Implementation details:** The JSON parser pushdown automaton was changed for the `JSON_LINES_RECOVER` format such that, when in state `PD_PVL` (`post-value`, "I have just finished parsing a value") and the stack context is `ROOT` ("I'm not somewhere within a list or struct"), we treat all characters as "white space" until encountering a newline character. `post-value` in stack context `ROOT` is exactly the condition we are in after having parsed the first valid record of a JSON line. _Thanks to @karthikeyann for suggesting the use of `PD_PVL` as the capturing state._ Because the stack context is generated upfront, it has to be fixed up afterwards so that all of these excess characters are assigned the `ROOT` stack context. I.e. (`_` means `ROOT` stack context, `{` means within a `STRUCT` stack context):
```
in:    {"a":1}{"this is supposed to be ignored"}
stack: _{{{{{{_{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{
```
This needs to be fixed up to become:
```
in:    {"a":1}{"this is supposed to be ignored"}
stack: _{{{{{{__________________________________
```
Authors: - Elias Stehle (https://github.com/elstehle) - Karthikeyan (https://github.com/karthikeyann) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/14279 --- cpp/src/io/fst/lookup_tables.cuh | 4 +- cpp/src/io/json/nested_json_gpu.cu | 165 +++++++++++++++++++++++++---- cpp/tests/io/json_test.cpp | 71 ++++++++++++- cpp/tests/io/nested_json_test.cpp | 2 +- 4 files changed, 218 insertions(+), 24 deletions(-) diff --git a/cpp/src/io/fst/lookup_tables.cuh b/cpp/src/io/fst/lookup_tables.cuh index 37c99453361..42036b79751 100644 --- a/cpp/src/io/fst/lookup_tables.cuh +++ b/cpp/src/io/fst/lookup_tables.cuh @@ -753,7 +753,7 @@ class TranslationOp { RelativeOffsetT const relative_offset, SymbolT const read_symbol) const { - return translation_op(*this, state_id, match_id, relative_offset, read_symbol); + return translation_op(state_id, match_id, relative_offset, read_symbol); } template @@ -761,7 +761,7 @@ class TranslationOp { SymbolIndexT const match_id, SymbolT const read_symbol) const { - return translation_op(*this, state_id, match_id, read_symbol); + return translation_op(state_id, match_id, read_symbol); } }; diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index c9107357239..3702d94fd2b 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -91,6 +91,98 @@ void check_input_size(std::size_t input_size) namespace cudf::io::json { +// FST to help fixing the stack context of characters that follow the first record on each JSON line +namespace fix_stack_of_excess_chars { + +// Type used to represent the target state in the transition table +using StateT = char; + +// Type used to represent a symbol group id +using SymbolGroupT = uint8_t; + +/** + * @brief Definition of the DFA's states.
+ */ +enum class dfa_states : StateT { + // Before the first record on the JSON line + BEFORE, + // Within the first record on the JSON line + WITHIN, + // Excess data that follows the first record on the JSON line + EXCESS, + // Total number of states + NUM_STATES +}; + +/** + * @brief Definition of the symbol groups + */ +enum class dfa_symbol_group_id : SymbolGroupT { + ROOT, ///< Symbol for root stack context + DELIMITER, ///< Line delimiter symbol group + OTHER, ///< Symbol group that implicitly matches all other tokens + NUM_SYMBOL_GROUPS ///< Total number of symbol groups +}; + +constexpr auto TT_NUM_STATES = static_cast(dfa_states::NUM_STATES); +constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NUM_SYMBOL_GROUPS); + +/** + * @brief Function object to map (input_symbol,stack_context) tuples to a symbol group. + */ +struct SymbolPairToSymbolGroupId { + CUDF_HOST_DEVICE SymbolGroupT operator()(thrust::tuple symbol) const + { + auto const input_symbol = thrust::get<0>(symbol); + auto const stack_symbol = thrust::get<1>(symbol); + return static_cast( + input_symbol == '\n' + ? dfa_symbol_group_id::DELIMITER + : (stack_symbol == '_' ? dfa_symbol_group_id::ROOT : dfa_symbol_group_id::OTHER)); + } +}; + +/** + * @brief Translation function object that fixes the stack context of excess data that follows after + * the first JSON record on each line. + */ +struct TransduceInputOp { + template + constexpr CUDF_HOST_DEVICE StackSymbolT operator()(StateT const state_id, + SymbolGroupT const match_id, + RelativeOffsetT const relative_offset, + SymbolT const read_symbol) const + { + if (state_id == static_cast(dfa_states::EXCESS)) { return '_'; } + return thrust::get<1>(read_symbol); + } + + template + constexpr CUDF_HOST_DEVICE int32_t operator()(StateT const state_id, + SymbolGroupT const match_id, + SymbolT const read_symbol) const + { + constexpr int32_t single_output_item = 1; + return single_output_item; + } +}; + +// Aliases for readability of the transition table +constexpr auto TT_BEFORE = dfa_states::BEFORE; +constexpr auto TT_INSIDE = dfa_states::WITHIN; +constexpr auto TT_EXCESS = dfa_states::EXCESS; + +// Transition table +std::array, TT_NUM_STATES> constexpr transition_table{ + {/* IN_STATE ROOT NEWLINE OTHER */ + /* TT_BEFORE */ {{TT_BEFORE, TT_BEFORE, TT_INSIDE}}, + /* TT_INSIDE */ {{TT_EXCESS, TT_BEFORE, TT_INSIDE}}, + /* TT_EXCESS */ {{TT_EXCESS, TT_BEFORE, TT_EXCESS}}}}; + +// The DFA's starting state +constexpr auto start_state = static_cast(dfa_states::BEFORE); +} // namespace fix_stack_of_excess_chars + // FST to prune tokens of invalid lines for recovering JSON lines format namespace token_filter { @@ -146,9 +238,8 @@ struct UnwrapTokenFromSymbolOp { * invalid lines. 
*/ struct TransduceToken { - template - constexpr CUDF_HOST_DEVICE SymbolT operator()(TransducerTableT const&, - StateT const state_id, + template + constexpr CUDF_HOST_DEVICE SymbolT operator()(StateT const state_id, SymbolGroupT const match_id, RelativeOffsetT const relative_offset, SymbolT const read_symbol) const @@ -165,9 +256,8 @@ struct TransduceToken { } } - template - constexpr CUDF_HOST_DEVICE int32_t operator()(TransducerTableT const&, - StateT const state_id, + template + constexpr CUDF_HOST_DEVICE int32_t operator()(StateT const state_id, SymbolGroupT const match_id, SymbolT const read_symbol) const { @@ -643,6 +733,11 @@ auto get_transition_table(json_format_cfg_t format) // PD_ANL describes the target state after a new line after encountering error state auto const PD_ANL = (format == json_format_cfg_t::JSON_LINES_RECOVER) ? PD_BOV : PD_ERR; + // Target state after having parsed the first JSON value on a JSON line + // Spark has the special need to ignore everything that comes after the first JSON object + // on a JSON line instead of marking those as invalid + auto const PD_AFS = (format == json_format_cfg_t::JSON_LINES_RECOVER) ? PD_PVL : PD_ERR; + // First row: empty stack ("root" level of the JSON) // Second row: '[' on top of stack (we're parsing a list value) // Third row: '{' on top of stack (we're parsing a struct value) @@ -668,7 +763,7 @@ auto get_transition_table(json_format_cfg_t format) PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_BOV, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_BOV, PD_STR}; pda_tt[static_cast(pda_state_t::PD_PVL)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_BOV, PD_ERR, + PD_AFS, PD_AFS, PD_AFS, PD_AFS, PD_AFS, PD_AFS, PD_AFS, PD_AFS, PD_PVL, PD_BOV, PD_AFS, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_BOV, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_BOV, PD_ERR}; pda_tt[static_cast(pda_state_t::PD_BFN)] = { @@ -733,6 +828,18 @@ auto get_translation_table(bool recover_from_error) return regular_tokens; }; + /** + * @brief Helper function that returns `recovering_tokens` if `recover_from_error` is true and + * returns `regular_tokens` otherwise. This is used to ignore excess characters after the first + * value in the case of JSON lines that recover from invalid lines, as Spark ignores any excess + * characters that follow the first record on a JSON line. 
+ */ + auto alt_tokens = [recover_from_error](std::vector regular_tokens, + std::vector recovering_tokens) { + if (recover_from_error) { return recovering_tokens; } + return regular_tokens; + }; + std::array, NUM_PDA_SGIDS>, PD_NUM_STATES> pda_tlt; pda_tlt[static_cast(pda_state_t::PD_BOV)] = {{ /*ROOT*/ {StructBegin}, // OPENING_BRACE @@ -920,18 +1027,18 @@ auto get_translation_table(bool recover_from_error) {}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_PVL)] = { - { /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {}, // WHITE_SPACE - nl_tokens({}, {}), // LINE_BREAK - {ErrorBegin}, // OTHER + { /*ROOT*/ + {alt_tokens({ErrorBegin}, {})}, // OPENING_BRACE + {alt_tokens({ErrorBegin}, {})}, // OPENING_BRACKET + {alt_tokens({ErrorBegin}, {})}, // CLOSING_BRACE + {alt_tokens({ErrorBegin}, {})}, // CLOSING_BRACKET + {alt_tokens({ErrorBegin}, {})}, // QUOTE + {alt_tokens({ErrorBegin}, {})}, // ESCAPE + {alt_tokens({ErrorBegin}, {})}, // COMMA + {alt_tokens({ErrorBegin}, {})}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {}), // LINE_BREAK + {alt_tokens({ErrorBegin}, {})}, // OTHER /*LIST*/ {ErrorBegin}, // OPENING_BRACE {ErrorBegin}, // OPENING_BRACKET @@ -1446,6 +1553,26 @@ std::pair, rmm::device_uvector> ge // character. auto zip_in = thrust::make_zip_iterator(json_in.data(), stack_symbols.data()); + // Spark, as the main stakeholder in the `recover_from_error` option, has the specific need to + // ignore any characters that follow the first value on each JSON line. This is an FST that + // fixes the stack context for those excess characters. That is, that all those excess characters + // will be interpreted in the root stack context + if (recover_from_error) { + auto fix_stack_of_excess_chars = fst::detail::make_fst( + fst::detail::make_symbol_group_lookup_op( + fix_stack_of_excess_chars::SymbolPairToSymbolGroupId{}), + fst::detail::make_transition_table(fix_stack_of_excess_chars::transition_table), + fst::detail::make_translation_functor(fix_stack_of_excess_chars::TransduceInputOp{}), + stream); + fix_stack_of_excess_chars.Transduce(zip_in, + static_cast(json_in.size()), + stack_symbols.data(), + thrust::make_discard_iterator(), + thrust::make_discard_iterator(), + fix_stack_of_excess_chars::start_state, + stream); + } + constexpr auto max_translation_table_size = tokenizer_pda::NUM_PDA_SGIDS * static_cast(tokenizer_pda::pda_state_t::PD_NUM_STATES); diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 2ddb0b76544..0149a467c32 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -1957,11 +1957,11 @@ TEST_F(JsonReaderTest, JSONLinesRecovering) // 2 -> (invalid) R"({"b":{"a":[321})" "\n" - // 3 -> c: [1] (valid) + // 3 -> c: 1.2 (valid) R"({"c":1.2})" "\n" "\n" - // 4 -> a: 123 (valid) + // 4 -> a: 4 (valid) R"({"a":4})" "\n" // 5 -> (invalid) @@ -2020,4 +2020,71 @@ TEST_F(JsonReaderTest, JSONLinesRecovering) c_validity.cbegin()}); } +TEST_F(JsonReaderTest, JSONLinesRecoveringIgnoreExcessChars) +{ + /** + * @brief Spark has the specific need to ignore extra characters that come after the first record + * on a JSON line + */ + std::string data = + // 0 -> a: -2 (valid) + R"({"a":-2}{})" + "\n" + // 1 -> (invalid) + R"({"b":{}should_be_invalid})" + "\n" + // 2 -> b (valid) + R"({"b":{"a":3} })" + "\n" + // 3 -> c: (valid) + R"({"c":1.2 } )" + "\n" + 
"\n" + // 4 -> (valid) + R"({"a":4} 123)" + "\n" + // 5 -> (valid) + R"({"a":5}//Comment after record)" + "\n" + // 6 -> (valid) + R"({"a":6} //Comment after whitespace)" + "\n" + // 7 -> (invalid) + R"({"a":5 //Invalid Comment within record})"; + + auto filepath = temp_env->get_temp_dir() + "RecoveringLinesExcessChars.json"; + { + std::ofstream outfile(filepath, std::ofstream::out); + outfile << data; + } + + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) + .lines(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL); + + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + + EXPECT_EQ(result.tbl->num_columns(), 3); + EXPECT_EQ(result.tbl->num_rows(), 8); + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64); + EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::STRUCT); + EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::FLOAT64); + + std::vector a_validity{true, false, false, false, true, true, true, false}; + std::vector b_validity{false, false, true, false, false, false, false, false}; + std::vector c_validity{false, false, false, true, false, false, false, false}; + + // Child column b->a + auto b_a_col = int64_wrapper({0, 0, 3, 0, 0, 0, 0, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), + int64_wrapper{{-2, 0, 0, 0, 4, 5, 6, 0}, a_validity.cbegin()}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + result.tbl->get_column(1), cudf::test::structs_column_wrapper({b_a_col}, b_validity.cbegin())); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + result.tbl->get_column(2), + float64_wrapper{{0.0, 0.0, 0.0, 1.2, 0.0, 0.0, 0.0, 0.0}, c_validity.cbegin()}); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index 3cb7e1f287a..5f79d5b862b 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -543,7 +543,7 @@ TEST_F(JsonTest, RecoveringTokenStream) { // Test input. Inline comments used to indicate character indexes // 012345678 <= line 0 - std::string const input = R"({"a":-2},)" + std::string const input = R"({"a":2 {})" // 9 "\n" // 01234 <= line 1 From f7ad66f440168fd4eceb3cc900301661023e42a1 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 20 Oct 2023 09:31:16 -0700 Subject: [PATCH 035/118] Add DELTA_BINARY_PACKED encoder for Parquet writer (#14100) Part of #13501. Adds ability to fall back on DELTA_BINARY_PACKED encoding when V2 page headers are selected and dictionary encoding is not possible. 
Authors: - Ed Seidl (https://github.com/etseidl) - Yunsong Wang (https://github.com/PointKernel) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14100 --- cpp/src/io/parquet/delta_binary.cuh | 6 - cpp/src/io/parquet/delta_enc.cuh | 290 ++++++++ cpp/src/io/parquet/page_enc.cu | 998 ++++++++++++++++++++-------- cpp/src/io/parquet/parquet_gpu.hpp | 51 +- cpp/tests/io/parquet_test.cpp | 108 ++- 5 files changed, 1156 insertions(+), 297 deletions(-) create mode 100644 cpp/src/io/parquet/delta_enc.cuh diff --git a/cpp/src/io/parquet/delta_binary.cuh b/cpp/src/io/parquet/delta_binary.cuh index a513e6674b4..e3b23f4c0a0 100644 --- a/cpp/src/io/parquet/delta_binary.cuh +++ b/cpp/src/io/parquet/delta_binary.cuh @@ -46,12 +46,6 @@ namespace cudf::io::parquet::detail { // encoded with DELTA_LENGTH_BYTE_ARRAY encoding, which is a DELTA_BINARY_PACKED list of suffix // lengths, followed by the concatenated suffix data. -// TODO: The delta encodings use ULEB128 integers, but for now we're only -// using max 64 bits. Need to see what the performance impact is of using -// __int128_t rather than int64_t. -using uleb128_t = uint64_t; -using zigzag128_t = int64_t; - // we decode one mini-block at a time. max mini-block size seen is 64. constexpr int delta_rolling_buf_size = 128; diff --git a/cpp/src/io/parquet/delta_enc.cuh b/cpp/src/io/parquet/delta_enc.cuh new file mode 100644 index 00000000000..28f8cdfe2c1 --- /dev/null +++ b/cpp/src/io/parquet/delta_enc.cuh @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "parquet_gpu.hpp" + +#include +#include + +#include + +namespace cudf::io::parquet::detail { + +namespace delta { + +inline __device__ void put_uleb128(uint8_t*& p, uleb128_t v) +{ + while (v > 0x7f) { + *(p++) = v | 0x80; + v >>= 7; + } + *(p++) = v; +} + +inline __device__ void put_zz128(uint8_t*& p, zigzag128_t v) +{ + zigzag128_t s = (v < 0); + put_uleb128(p, (v ^ -s) * 2 + s); +} + +// A block size of 128, with 4 mini-blocks of 32 values each fits nicely without consuming +// too much shared memory. +// The parquet spec requires block_size to be a multiple of 128, and values_per_mini_block +// to be a multiple of 32. +constexpr int block_size = 128; +constexpr int num_mini_blocks = 4; +constexpr int values_per_mini_block = block_size / num_mini_blocks; +constexpr int buffer_size = 2 * block_size; + +// An extra sanity checks to enforce compliance with the parquet specification. 
+static_assert(block_size % 128 == 0); +static_assert(values_per_mini_block % 32 == 0); + +using block_reduce = cub::BlockReduce; +using warp_reduce = cub::WarpReduce; +using index_scan = cub::BlockScan; + +constexpr int rolling_idx(int index) { return rolling_index(index); } + +// Version of bit packer that can handle up to 64 bits values. +// T is the type to use for processing. if nbits <= 32 use uint32_t, otherwise unsigned long long +// (not uint64_t because of atomicOr's typing). allowing this to be selectable since there's a +// measurable impact to using the wider types. +template +inline __device__ void bitpack_mini_block( + uint8_t* dst, uleb128_t val, uint32_t count, uint8_t nbits, void* temp_space) +{ + using wide_type = + std::conditional_t, __uint128_t, uint64_t>; + using cudf::detail::warp_size; + scratch_type constexpr mask = sizeof(scratch_type) * 8 - 1; + auto constexpr div = sizeof(scratch_type) * 8; + + auto const lane_id = threadIdx.x % warp_size; + auto const warp_id = threadIdx.x / warp_size; + + auto const scratch = reinterpret_cast(temp_space) + warp_id * warp_size; + + // zero out scratch + scratch[lane_id] = 0; + __syncwarp(); + + // TODO: see if there is any savings using special packing for easy bitwidths (1,2,4,8,16...) + // like what's done for the RLE encoder. + if (nbits == div) { + if (lane_id < count) { + for (int i = 0; i < sizeof(scratch_type); i++) { + dst[lane_id * sizeof(scratch_type) + i] = val & 0xff; + val >>= 8; + } + } + return; + } + + if (lane_id <= count) { + // Shift symbol left by up to mask bits. + wide_type v2 = val; + v2 <<= (lane_id * nbits) & mask; + + // Copy N bit word into two N/2 bit words while following C++ strict aliasing rules. + scratch_type v1[2]; + memcpy(&v1, &v2, sizeof(wide_type)); + + // Atomically write result to scratch. + if (v1[0]) { atomicOr(scratch + ((lane_id * nbits) / div), v1[0]); } + if (v1[1]) { atomicOr(scratch + ((lane_id * nbits) / div) + 1, v1[1]); } + } + __syncwarp(); + + // Copy scratch data to final destination. + auto const available_bytes = util::div_rounding_up_safe(count * nbits, 8U); + auto const scratch_bytes = reinterpret_cast(scratch); + + for (uint32_t i = lane_id; i < available_bytes; i += warp_size) { + dst[i] = scratch_bytes[i]; + } + __syncwarp(); +} + +} // namespace delta + +// Object used to turn a stream of integers into a DELTA_BINARY_PACKED stream. This takes as input +// 128 values with validity at a time, saving them until there are enough values for a block +// to be written. +// T is the input data type (either zigzag128_t or uleb128_t). +template +class delta_binary_packer { + private: + uint8_t* _dst; // sink to dump encoded values to + T* _buffer; // buffer to store values to be encoded + size_type _current_idx; // index of first value in buffer + uint32_t _num_values; // total number of values to encode + size_type _values_in_buffer; // current number of values stored in _buffer + uint8_t _mb_bits[delta::num_mini_blocks]; // bitwidth for each mini-block + + // pointers to shared scratch memory for the warp and block scans/reduces + delta::index_scan::TempStorage* _scan_tmp; + delta::warp_reduce::TempStorage* _warp_tmp; + delta::block_reduce::TempStorage* _block_tmp; + + void* _bitpack_tmp; // pointer to shared scratch memory used in bitpacking + + // Write the delta binary header. Only call from thread 0. 
+ inline __device__ void write_header() + { + delta::put_uleb128(_dst, delta::block_size); + delta::put_uleb128(_dst, delta::num_mini_blocks); + delta::put_uleb128(_dst, _num_values); + delta::put_zz128(_dst, _buffer[0]); + } + + // Write the block header. Only call from thread 0. + inline __device__ void write_block_header(zigzag128_t block_min) + { + delta::put_zz128(_dst, block_min); + memcpy(_dst, _mb_bits, 4); + _dst += 4; + } + + // Signed subtraction with defined wrapping behavior. + inline __device__ zigzag128_t subtract(zigzag128_t a, zigzag128_t b) + { + return static_cast(static_cast(a) - static_cast(b)); + } + + public: + inline __device__ auto num_values() const { return _num_values; } + + // Initialize the object. Only call from thread 0. + inline __device__ void init(uint8_t* dest, uint32_t num_values, T* buffer, void* temp_storage) + { + _dst = dest; + _num_values = num_values; + _buffer = buffer; + _scan_tmp = reinterpret_cast(temp_storage); + _warp_tmp = reinterpret_cast(temp_storage); + _block_tmp = reinterpret_cast(temp_storage); + _bitpack_tmp = _buffer + delta::buffer_size; + _current_idx = 0; + _values_in_buffer = 0; + } + + // Each thread calls this to add its current value. + inline __device__ void add_value(T value, bool is_valid) + { + // Figure out the correct position for the given value. + size_type const valid = is_valid; + size_type pos; + size_type num_valid; + delta::index_scan(*_scan_tmp).ExclusiveSum(valid, pos, num_valid); + + if (is_valid) { _buffer[delta::rolling_idx(pos + _current_idx + _values_in_buffer)] = value; } + __syncthreads(); + + if (threadIdx.x == 0) { + _values_in_buffer += num_valid; + // if first pass write header + if (_current_idx == 0) { + write_header(); + _current_idx = 1; + _values_in_buffer -= 1; + } + } + __syncthreads(); + + if (_values_in_buffer >= delta::block_size) { flush(); } + } + + // Called by each thread to flush data to the sink. + inline __device__ uint8_t const* flush() + { + using cudf::detail::warp_size; + __shared__ zigzag128_t block_min; + + int const t = threadIdx.x; + int const warp_id = t / warp_size; + int const lane_id = t % warp_size; + + if (_values_in_buffer <= 0) { return _dst; } + + // Calculate delta for this thread. + size_type const idx = _current_idx + t; + zigzag128_t const delta = idx < _num_values ? subtract(_buffer[delta::rolling_idx(idx)], + _buffer[delta::rolling_idx(idx - 1)]) + : std::numeric_limits::max(); + + // Find min delta for the block. + auto const min_delta = delta::block_reduce(*_block_tmp).Reduce(delta, cub::Min()); + + if (t == 0) { block_min = min_delta; } + __syncthreads(); + + // Compute frame of reference for the block. + uleb128_t const norm_delta = idx < _num_values ? subtract(delta, block_min) : 0; + + // Get max normalized delta for each warp, and use that to determine how many bits to use + // for the bitpacking of this warp. + zigzag128_t const warp_max = + delta::warp_reduce(_warp_tmp[warp_id]).Reduce(norm_delta, cub::Max()); + __syncwarp(); + + if (lane_id == 0) { _mb_bits[warp_id] = sizeof(zigzag128_t) * 8 - __clzll(warp_max); } + __syncthreads(); + + // write block header + if (t == 0) { write_block_header(block_min); } + __syncthreads(); + + // Now each warp encodes its data...can calculate starting offset with _mb_bits. + // NOTE: using a switch here rather than a loop because the compiler produces code that + // uses fewer registers. 
+ int cumulative_bits = 0; + switch (warp_id) { + case 3: cumulative_bits += _mb_bits[2]; [[fallthrough]]; + case 2: cumulative_bits += _mb_bits[1]; [[fallthrough]]; + case 1: cumulative_bits += _mb_bits[0]; + } + uint8_t* const mb_ptr = _dst + cumulative_bits * delta::values_per_mini_block / 8; + + // encoding happens here + auto const warp_idx = _current_idx + warp_id * delta::values_per_mini_block; + if (warp_idx < _num_values) { + auto const num_enc = min(delta::values_per_mini_block, _num_values - warp_idx); + if (_mb_bits[warp_id] > 32) { + delta::bitpack_mini_block( + mb_ptr, norm_delta, num_enc, _mb_bits[warp_id], _bitpack_tmp); + } else { + delta::bitpack_mini_block( + mb_ptr, norm_delta, num_enc, _mb_bits[warp_id], _bitpack_tmp); + } + } + __syncthreads(); + + // Last warp updates global delta ptr. + if (warp_id == delta::num_mini_blocks - 1 && lane_id == 0) { + _dst = mb_ptr + _mb_bits[warp_id] * delta::values_per_mini_block / 8; + _current_idx = min(warp_idx + delta::values_per_mini_block, _num_values); + _values_in_buffer = max(_values_in_buffer - delta::block_size, 0U); + } + __syncthreads(); + + return _dst; + } +}; + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 78873d5e8ca..1e4f061d2e0 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "delta_enc.cuh" #include "parquet_gpu.cuh" #include @@ -21,6 +22,7 @@ #include #include #include +#include #include #include @@ -41,13 +43,19 @@ #include #include +#include + namespace cudf::io::parquet::detail { namespace { using ::cudf::detail::device_2dspan; -constexpr uint32_t rle_buffer_size = (1 << 9); +constexpr int encode_block_size = 128; +constexpr int rle_buffer_size = 2 * encode_block_size; +constexpr int num_encode_warps = encode_block_size / cudf::detail::warp_size; + +constexpr int rolling_idx(int pos) { return rolling_index(pos); } // do not truncate statistics constexpr int32_t NO_TRUNC_STATS = 0; @@ -69,6 +77,7 @@ struct frag_init_state_s { PageFragment frag; }; +template struct page_enc_state_s { uint8_t* cur; //!< current output ptr uint8_t* rle_out; //!< current RLE write ptr @@ -81,14 +90,15 @@ struct page_enc_state_s { uint32_t rle_rpt_count; uint32_t page_start_val; uint32_t chunk_start_val; - volatile uint32_t rpt_map[4]; - volatile uint32_t scratch_red[32]; + volatile uint32_t rpt_map[num_encode_warps]; EncPage page; EncColumnChunk ck; parquet_column_device_view col; - uint32_t vals[rle_buffer_size]; + uint32_t vals[rle_buf_size]; }; +using rle_page_enc_state_s = page_enc_state_s; + /** * @brief Returns the size of the type in the Parquet file. */ @@ -205,6 +215,12 @@ void __device__ calculate_frag_size(frag_init_state_s* const s, int t) } } +/** + * @brief Determine the correct page encoding for the given page parameters. + * + * This is only used by the plain and dictionary encoders. Delta encoders will set the page + * encoding directly. + */ Encoding __device__ determine_encoding(PageType page_type, Type physical_type, bool use_dictionary, @@ -216,7 +232,6 @@ Encoding __device__ determine_encoding(PageType page_type, switch (page_type) { case PageType::DATA_PAGE: return use_dictionary ? Encoding::PLAIN_DICTIONARY : Encoding::PLAIN; case PageType::DATA_PAGE_V2: - // TODO need to work in delta encodings here when they're added return physical_type == BOOLEAN ? Encoding::RLE : use_dictionary ? 
Encoding::RLE_DICTIONARY : Encoding::PLAIN; @@ -236,6 +251,50 @@ struct BitwiseOr { } }; +// I is the column type from the input table +template +__device__ uint8_t const* delta_encode(page_enc_state_s<0>* s, + uint32_t valid_count, + uint64_t* buffer, + void* temp_space) +{ + using output_type = std::conditional_t, zigzag128_t, uleb128_t>; + __shared__ delta_binary_packer packer; + + auto const t = threadIdx.x; + if (t == 0) { + packer.init(s->cur, valid_count, reinterpret_cast(buffer), temp_space); + } + __syncthreads(); + + // TODO(ets): in the plain encoder the scaling is a little different for INT32 than INT64. + // might need to modify this if there's a big performance hit in the 32-bit case. + int32_t const scale = s->col.ts_scale == 0 ? 1 : s->col.ts_scale; + for (uint32_t cur_val_idx = 0; cur_val_idx < s->page.num_leaf_values;) { + uint32_t const nvals = min(s->page.num_leaf_values - cur_val_idx, delta::block_size); + + size_type const val_idx_in_block = cur_val_idx + t; + size_type const val_idx = s->page_start_val + val_idx_in_block; + + bool const is_valid = + (val_idx < s->col.leaf_column->size() && val_idx_in_block < s->page.num_leaf_values) + ? s->col.leaf_column->is_valid(val_idx) + : false; + + cur_val_idx += nvals; + + output_type v = s->col.leaf_column->element(val_idx); + if (scale < 0) { + v /= -scale; + } else { + v *= scale; + } + packer.add_value(v, is_valid); + } + + return packer.flush(); +} + } // anonymous namespace // blockDim {512,1,1} @@ -323,6 +382,29 @@ __global__ void __launch_bounds__(128) } } +__device__ size_t delta_data_len(Type physical_type, cudf::type_id type_id, uint32_t num_values) +{ + auto const dtype_len_out = physical_type_len(physical_type, type_id); + auto const dtype_len = [&]() -> uint32_t { + if (physical_type == INT32) { return int32_logical_len(type_id); } + if (physical_type == INT96) { return sizeof(int64_t); } + return dtype_len_out; + }(); + + auto const vals_per_block = delta::block_size; + size_t const num_blocks = util::div_rounding_up_unsafe(num_values, vals_per_block); + // need max dtype_len + 1 bytes for min_delta + // one byte per mini block for the bitwidth + // and block_size * dtype_len bytes for the actual encoded data + auto const block_size = dtype_len + 1 + delta::num_mini_blocks + vals_per_block * dtype_len; + + // delta header is 2 bytes for the block_size, 1 byte for number of mini-blocks, + // max 5 bytes for number of values, and max dtype_len + 1 for first value. + auto const header_size = 2 + 1 + 5 + dtype_len + 1; + + return header_size + num_blocks * block_size; +} + // blockDim {128,1,1} __global__ void __launch_bounds__(128) gpuInitPages(device_2dspan chunks, @@ -354,6 +436,14 @@ __global__ void __launch_bounds__(128) page_g = {}; } __syncthreads(); + + // if writing delta encoded values, we're going to need to know the data length to get a guess + // at the worst case number of bytes needed to encode. 
+ auto const physical_type = col_g.physical_type; + auto const type_id = col_g.leaf_column->type().id(); + auto const is_use_delta = + write_v2_headers && !ck_g.use_dictionary && (physical_type == INT32 || physical_type == INT64); + if (t < 32) { uint32_t fragments_in_chunk = 0; uint32_t rows_in_page = 0; @@ -403,9 +493,12 @@ __global__ void __launch_bounds__(128) } __syncwarp(); if (t == 0) { - if (not pages.empty()) pages[ck_g.first_page] = page_g; - if (not page_sizes.empty()) page_sizes[ck_g.first_page] = page_g.max_data_size; - if (page_grstats) page_grstats[ck_g.first_page] = pagestats_g; + if (not pages.empty()) { + page_g.kernel_mask = encode_kernel_mask::PLAIN; + pages[ck_g.first_page] = page_g; + } + if (not page_sizes.empty()) { page_sizes[ck_g.first_page] = page_g.max_data_size; } + if (page_grstats) { page_grstats[ck_g.first_page] = pagestats_g; } } num_pages = 1; } @@ -505,7 +598,12 @@ __global__ void __launch_bounds__(128) page_g.num_values = values_in_page; auto const def_level_size = max_RLE_page_size(col_g.num_def_level_bits(), values_in_page); auto const rep_level_size = max_RLE_page_size(col_g.num_rep_level_bits(), values_in_page); - auto const max_data_size = page_size + def_level_size + rep_level_size + rle_pad; + // get a different bound if using delta encoding + if (is_use_delta) { + page_size = + max(page_size, delta_data_len(physical_type, type_id, page_g.num_leaf_values)); + } + auto const max_data_size = page_size + def_level_size + rep_level_size + rle_pad; // page size must fit in 32-bit signed integer if (max_data_size > std::numeric_limits::max()) { CUDF_UNREACHABLE("page size exceeds maximum for i32"); @@ -525,7 +623,16 @@ __global__ void __launch_bounds__(128) } __syncwarp(); if (t == 0) { - if (not pages.empty()) { pages[ck_g.first_page + num_pages] = page_g; } + if (not pages.empty()) { + if (is_use_delta) { + page_g.kernel_mask = encode_kernel_mask::DELTA_BINARY; + } else if (ck_g.use_dictionary || physical_type == BOOLEAN) { + page_g.kernel_mask = encode_kernel_mask::DICTIONARY; + } else { + page_g.kernel_mask = encode_kernel_mask::PLAIN; + } + pages[ck_g.first_page + num_pages] = page_g; + } if (not page_sizes.empty()) { page_sizes[ck_g.first_page + num_pages] = page_g.max_data_size; } @@ -789,8 +896,12 @@ inline __device__ void PackLiterals( * @param[in] t thread id (0..127) */ static __device__ void RleEncode( - page_enc_state_s* s, uint32_t numvals, uint32_t nbits, uint32_t flush, uint32_t t) + rle_page_enc_state_s* s, uint32_t numvals, uint32_t nbits, uint32_t flush, uint32_t t) { + using cudf::detail::warp_size; + auto const lane_id = t % warp_size; + auto const warp_id = t / warp_size; + uint32_t rle_pos = s->rle_pos; uint32_t rle_run = s->rle_run; @@ -798,20 +909,20 @@ static __device__ void RleEncode( uint32_t pos = rle_pos + t; if (rle_run > 0 && !(rle_run & 1)) { // Currently in a long repeat run - uint32_t mask = ballot(pos < numvals && s->vals[pos & (rle_buffer_size - 1)] == s->run_val); + uint32_t mask = ballot(pos < numvals && s->vals[rolling_idx(pos)] == s->run_val); uint32_t rle_rpt_count, max_rpt_count; - if (!(t & 0x1f)) { s->rpt_map[t >> 5] = mask; } + if (lane_id == 0) { s->rpt_map[warp_id] = mask; } __syncthreads(); - if (t < 32) { + if (t < warp_size) { uint32_t c32 = ballot(t >= 4 || s->rpt_map[t] != 0xffff'ffffu); - if (!t) { + if (t == 0) { uint32_t last_idx = __ffs(c32) - 1; s->rle_rpt_count = - last_idx * 32 + ((last_idx < 4) ? __ffs(~s->rpt_map[last_idx]) - 1 : 0); + last_idx * warp_size + ((last_idx < 4) ? 
__ffs(~s->rpt_map[last_idx]) - 1 : 0); } } __syncthreads(); - max_rpt_count = min(numvals - rle_pos, 128); + max_rpt_count = min(numvals - rle_pos, encode_block_size); rle_rpt_count = s->rle_rpt_count; rle_run += rle_rpt_count << 1; rle_pos += rle_rpt_count; @@ -828,17 +939,17 @@ static __device__ void RleEncode( } } else { // New run or in a literal run - uint32_t v0 = s->vals[pos & (rle_buffer_size - 1)]; - uint32_t v1 = s->vals[(pos + 1) & (rle_buffer_size - 1)]; + uint32_t v0 = s->vals[rolling_idx(pos)]; + uint32_t v1 = s->vals[rolling_idx(pos + 1)]; uint32_t mask = ballot(pos + 1 < numvals && v0 == v1); - uint32_t maxvals = min(numvals - rle_pos, 128); + uint32_t maxvals = min(numvals - rle_pos, encode_block_size); uint32_t rle_lit_count, rle_rpt_count; - if (!(t & 0x1f)) { s->rpt_map[t >> 5] = mask; } + if (lane_id == 0) { s->rpt_map[warp_id] = mask; } __syncthreads(); - if (t < 32) { + if (t < warp_size) { // Repeat run can only start on a multiple of 8 values - uint32_t idx8 = (t * 8) >> 5; - uint32_t pos8 = (t * 8) & 0x1f; + uint32_t idx8 = (t * 8) / warp_size; + uint32_t pos8 = (t * 8) % warp_size; uint32_t m0 = (idx8 < 4) ? s->rpt_map[idx8] : 0; uint32_t m1 = (idx8 < 3) ? s->rpt_map[idx8 + 1] : 0; uint32_t needed_mask = kRleRunMask[nbits - 1]; @@ -847,8 +958,8 @@ static __device__ void RleEncode( uint32_t rle_run_start = (mask != 0) ? min((__ffs(mask) - 1) * 8, maxvals) : maxvals; uint32_t rpt_len = 0; if (rle_run_start < maxvals) { - uint32_t idx_cur = rle_run_start >> 5; - uint32_t idx_ofs = rle_run_start & 0x1f; + uint32_t idx_cur = rle_run_start / warp_size; + uint32_t idx_ofs = rle_run_start % warp_size; while (idx_cur < 4) { m0 = (idx_cur < 4) ? s->rpt_map[idx_cur] : 0; m1 = (idx_cur < 3) ? s->rpt_map[idx_cur + 1] : 0; @@ -857,7 +968,7 @@ static __device__ void RleEncode( rpt_len += __ffs(mask) - 1; break; } - rpt_len += 32; + rpt_len += warp_size; idx_cur++; } } @@ -928,7 +1039,7 @@ static __device__ void RleEncode( * @param[in] flush nonzero if last batch in block * @param[in] t thread id (0..127) */ -static __device__ void PlainBoolEncode(page_enc_state_s* s, +static __device__ void PlainBoolEncode(rle_page_enc_state_s* s, uint32_t numvals, uint32_t flush, uint32_t t) @@ -938,7 +1049,7 @@ static __device__ void PlainBoolEncode(page_enc_state_s* s, while (rle_pos < numvals) { uint32_t pos = rle_pos + t; - uint32_t v = (pos < numvals) ? s->vals[pos & (rle_buffer_size - 1)] : 0; + uint32_t v = (pos < numvals) ? s->vals[rolling_idx(pos)] : 0; uint32_t n = min(numvals - rle_pos, 128); uint32_t nbytes = (n + ((flush) ? 7 : 0)) >> 3; if (!nbytes) { break; } @@ -992,28 +1103,22 @@ __device__ auto julian_days_with_time(int64_t v) return std::make_pair(dur_time_of_day_nanos, julian_days); } +// this has been split out into its own kernel because of the amount of shared memory required +// for the state buffer. encode kernels that don't use the RLE buffer can get started while +// the level data is encoded. 
// blockDim(128, 1, 1) template -__global__ void __launch_bounds__(128, 8) - gpuEncodePages(device_span pages, - device_span> comp_in, - device_span> comp_out, - device_span comp_results, - bool write_v2_headers) +__global__ void __launch_bounds__(block_size, 8) gpuEncodePageLevels(device_span pages, + bool write_v2_headers, + encode_kernel_mask kernel_mask) { - __shared__ __align__(8) page_enc_state_s state_g; - using block_reduce = cub::BlockReduce; - using block_scan = cub::BlockScan; - __shared__ union { - typename block_reduce::TempStorage reduce_storage; - typename block_scan::TempStorage scan_storage; - } temp_storage; + __shared__ __align__(8) rle_page_enc_state_s state_g; - page_enc_state_s* const s = &state_g; - auto const t = threadIdx.x; + auto* const s = &state_g; + uint32_t const t = threadIdx.x; if (t == 0) { - state_g = page_enc_state_s{}; + state_g = rle_page_enc_state_s{}; s->page = pages[blockIdx.x]; s->ck = *s->page.chunk; s->col = *s->ck.col_desc; @@ -1026,6 +1131,8 @@ __global__ void __launch_bounds__(128, 8) } __syncthreads(); + if (BitAnd(s->page.kernel_mask, kernel_mask) == 0) { return; } + auto const is_v2 = s->page.page_type == PageType::DATA_PAGE_V2; // Encode Repetition and Definition levels @@ -1078,23 +1185,24 @@ __global__ void __launch_bounds__(128, 8) } while (is_col_struct); return def; }(); - s->vals[(rle_numvals + t) & (rle_buffer_size - 1)] = def_lvl; + s->vals[rolling_idx(rle_numvals + t)] = def_lvl; __syncthreads(); rle_numvals += nrows; RleEncode(s, rle_numvals, def_lvl_bits, (rle_numvals == s->page.num_rows), t); __syncthreads(); } if (t < 32) { - uint8_t* const cur = s->cur; - uint8_t* const rle_out = s->rle_out; - uint32_t const rle_bytes = static_cast(rle_out - cur) - (is_v2 ? 0 : 4); - if (is_v2 && t == 0) { + uint8_t* const cur = s->cur; + uint8_t* const rle_out = s->rle_out; + // V2 does not write the RLE length field + uint32_t const rle_bytes = + static_cast(rle_out - cur) - (is_v2 ? 0 : RLE_LENGTH_FIELD_LEN); + if (not is_v2 && t < RLE_LENGTH_FIELD_LEN) { cur[t] = rle_bytes >> (t * 8); } + __syncwarp(); + if (t == 0) { + s->cur = rle_out; s->page.def_lvl_bytes = rle_bytes; - } else if (not is_v2 && t < 4) { - cur[t] = rle_bytes >> (t * 8); } - __syncwarp(); - if (t == 0) { s->cur = rle_out; } } } } else if (s->page.page_type != PageType::DICTIONARY_PAGE && @@ -1121,29 +1229,121 @@ __global__ void __launch_bounds__(128, 8) uint32_t idx = page_first_val_idx + rle_numvals + t; uint32_t lvl_val = (rle_numvals + t < s->page.num_values && idx < col_last_val_idx) ? lvl_val_data[idx] : 0; - s->vals[(rle_numvals + t) & (rle_buffer_size - 1)] = lvl_val; + s->vals[rolling_idx(rle_numvals + t)] = lvl_val; __syncthreads(); rle_numvals += nvals; RleEncode(s, rle_numvals, nbits, (rle_numvals == s->page.num_values), t); __syncthreads(); } if (t < 32) { - uint8_t* const cur = s->cur; - uint8_t* const rle_out = s->rle_out; - uint32_t const rle_bytes = static_cast(rle_out - cur) - (is_v2 ? 0 : 4); - if (is_v2 && t == 0) { + uint8_t* const cur = s->cur; + uint8_t* const rle_out = s->rle_out; + // V2 does not write the RLE length field + uint32_t const rle_bytes = + static_cast(rle_out - cur) - (is_v2 ? 
0 : RLE_LENGTH_FIELD_LEN); + if (not is_v2 && t < RLE_LENGTH_FIELD_LEN) { cur[t] = rle_bytes >> (t * 8); } + __syncwarp(); + if (t == 0) { + s->cur = rle_out; lvl_bytes = rle_bytes; - } else if (not is_v2 && t < 4) { - cur[t] = rle_bytes >> (t * 8); } - __syncwarp(); - if (t == 0) { s->cur = rle_out; } } }; encode_levels(s->col.rep_values, s->col.num_rep_level_bits(), s->page.rep_lvl_bytes); __syncthreads(); encode_levels(s->col.def_values, s->col.num_def_level_bits(), s->page.def_lvl_bytes); } + + if (t == 0) { pages[blockIdx.x] = s->page; } +} + +template +__device__ void finish_page_encode(state_buf* s, + uint32_t valid_count, + uint8_t const* end_ptr, + device_span pages, + device_span> comp_in, + device_span> comp_out, + device_span comp_results, + bool write_v2_headers) +{ + auto const t = threadIdx.x; + + // V2 does not compress rep and def level data + size_t const skip_comp_size = + write_v2_headers ? s->page.def_lvl_bytes + s->page.rep_lvl_bytes : 0; + + if (t == 0) { + // only need num_nulls for v2 data page headers + if (write_v2_headers) { s->page.num_nulls = s->page.num_values - valid_count; } + uint8_t const* const base = s->page.page_data + s->page.max_hdr_size; + auto const actual_data_size = static_cast(end_ptr - base); + if (actual_data_size > s->page.max_data_size) { + CUDF_UNREACHABLE("detected possible page data corruption"); + } + s->page.max_data_size = actual_data_size; + if (not comp_in.empty()) { + comp_in[blockIdx.x] = {base + skip_comp_size, actual_data_size - skip_comp_size}; + comp_out[blockIdx.x] = {s->page.compressed_data + s->page.max_hdr_size + skip_comp_size, + 0}; // size is unused + } + pages[blockIdx.x] = s->page; + if (not comp_results.empty()) { + comp_results[blockIdx.x] = {0, compression_status::FAILURE}; + pages[blockIdx.x].comp_res = &comp_results[blockIdx.x]; + } + } + + // copy uncompressed bytes over + if (skip_comp_size != 0 && not comp_in.empty()) { + uint8_t* const src = s->page.page_data + s->page.max_hdr_size; + uint8_t* const dst = s->page.compressed_data + s->page.max_hdr_size; + for (int i = t; i < skip_comp_size; i += block_size) { + dst[i] = src[i]; + } + } +} + +// PLAIN page data encoder +// blockDim(128, 1, 1) +template +__global__ void __launch_bounds__(block_size, 8) + gpuEncodePages(device_span pages, + device_span> comp_in, + device_span> comp_out, + device_span comp_results, + bool write_v2_headers) +{ + __shared__ __align__(8) page_enc_state_s<0> state_g; + using block_reduce = cub::BlockReduce; + using block_scan = cub::BlockScan; + __shared__ union { + typename block_reduce::TempStorage reduce_storage; + typename block_scan::TempStorage scan_storage; + } temp_storage; + + auto* const s = &state_g; + uint32_t t = threadIdx.x; + + if (t == 0) { + state_g = page_enc_state_s<0>{}; + s->page = pages[blockIdx.x]; + s->ck = *s->page.chunk; + s->col = *s->ck.col_desc; + s->rle_len_pos = nullptr; + // get s->cur back to where it was at the end of encoding the rep and def level data + s->cur = + s->page.page_data + s->page.max_hdr_size + s->page.def_lvl_bytes + s->page.rep_lvl_bytes; + // if V1 data page, need space for the RLE length fields + if (s->page.page_type == PageType::DATA_PAGE) { + if (s->col.num_def_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } + if (s->col.num_rep_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } + } + } + __syncthreads(); + + if (BitAnd(s->page.kernel_mask, encode_kernel_mask::PLAIN) == 0) { return; } + // Encode data values __syncthreads(); auto const physical_type = 
s->col.physical_type; @@ -1155,10 +1355,6 @@ __global__ void __launch_bounds__(128, 8) return dtype_len_out; }(); - auto const dict_bits = (physical_type == BOOLEAN) ? 1 - : (s->ck.use_dictionary and s->page.page_type != PageType::DICTIONARY_PAGE) - ? s->ck.dict_rle_bits - : -1; if (t == 0) { uint8_t* dst = s->cur; s->rle_run = 0; @@ -1167,219 +1363,315 @@ __global__ void __launch_bounds__(128, 8) s->rle_out = dst; s->page.encoding = determine_encoding(s->page.page_type, physical_type, s->ck.use_dictionary, write_v2_headers); - if (dict_bits >= 0 && physical_type != BOOLEAN) { - dst[0] = dict_bits; - s->rle_out = dst + 1; - } else if (is_v2 && physical_type == BOOLEAN) { - // save space for RLE length. we don't know the total length yet. - s->rle_out = dst + RLE_LENGTH_FIELD_LEN; - s->rle_len_pos = dst; - } s->page_start_val = row_to_value_idx(s->page.start_row, s->col); s->chunk_start_val = row_to_value_idx(s->ck.start_row, s->col); } __syncthreads(); + uint32_t num_valid = 0; for (uint32_t cur_val_idx = 0; cur_val_idx < s->page.num_leaf_values;) { - uint32_t nvals = min(s->page.num_leaf_values - cur_val_idx, 128); + uint32_t nvals = min(s->page.num_leaf_values - cur_val_idx, block_size); uint32_t len, pos; auto [is_valid, val_idx] = [&]() { uint32_t val_idx; uint32_t is_valid; - size_type val_idx_in_block = cur_val_idx + t; + size_type const val_idx_in_block = cur_val_idx + t; if (s->page.page_type == PageType::DICTIONARY_PAGE) { val_idx = val_idx_in_block; is_valid = (val_idx < s->page.num_leaf_values); if (is_valid) { val_idx = s->ck.dict_data[val_idx]; } } else { - size_type val_idx_in_leaf_col = s->page_start_val + val_idx_in_block; + size_type const val_idx_in_leaf_col = s->page_start_val + val_idx_in_block; is_valid = (val_idx_in_leaf_col < s->col.leaf_column->size() && val_idx_in_block < s->page.num_leaf_values) ? s->col.leaf_column->is_valid(val_idx_in_leaf_col) : 0; - val_idx = - (s->ck.use_dictionary) ? 
val_idx_in_leaf_col - s->chunk_start_val : val_idx_in_leaf_col; + val_idx = val_idx_in_leaf_col; } return std::make_tuple(is_valid, val_idx); }(); - if (is_valid) num_valid++; - + if (is_valid) { num_valid++; } cur_val_idx += nvals; - if (dict_bits >= 0) { - // Dictionary encoding - if (dict_bits > 0) { - uint32_t rle_numvals; - uint32_t rle_numvals_in_block; - block_scan(temp_storage.scan_storage).ExclusiveSum(is_valid, pos, rle_numvals_in_block); - rle_numvals = s->rle_numvals; - if (is_valid) { - uint32_t v; - if (physical_type == BOOLEAN) { - v = s->col.leaf_column->element(val_idx); - } else { - v = s->ck.dict_index[val_idx]; - } - s->vals[(rle_numvals + pos) & (rle_buffer_size - 1)] = v; - } - rle_numvals += rle_numvals_in_block; - __syncthreads(); - if (!is_v2 && physical_type == BOOLEAN) { - PlainBoolEncode(s, rle_numvals, (cur_val_idx == s->page.num_leaf_values), t); - } else { - RleEncode(s, rle_numvals, dict_bits, (cur_val_idx == s->page.num_leaf_values), t); + + // Non-dictionary encoding + uint8_t* dst = s->cur; + + if (is_valid) { + len = dtype_len_out; + if (physical_type == BYTE_ARRAY) { + if (type_id == type_id::STRING) { + len += s->col.leaf_column->element(val_idx).size_bytes(); + } else if (s->col.output_as_byte_array && type_id == type_id::LIST) { + len += + get_element(*s->col.leaf_column, val_idx).size_bytes(); } - __syncthreads(); } - if (t == 0) { s->cur = s->rle_out; } - __syncthreads(); } else { - // Non-dictionary encoding - uint8_t* dst = s->cur; - - if (is_valid) { - len = dtype_len_out; - if (physical_type == BYTE_ARRAY) { - if (type_id == type_id::STRING) { - len += s->col.leaf_column->element(val_idx).size_bytes(); - } else if (s->col.output_as_byte_array && type_id == type_id::LIST) { - len += - get_element(*s->col.leaf_column, val_idx).size_bytes(); + len = 0; + } + uint32_t total_len = 0; + block_scan(temp_storage.scan_storage).ExclusiveSum(len, pos, total_len); + __syncthreads(); + if (t == 0) { s->cur = dst + total_len; } + if (is_valid) { + switch (physical_type) { + case INT32: [[fallthrough]]; + case FLOAT: { + auto const v = [dtype_len = dtype_len_in, + idx = val_idx, + col = s->col.leaf_column, + scale = s->col.ts_scale == 0 ? 
1 : s->col.ts_scale]() -> int32_t { + switch (dtype_len) { + case 8: return col->element(idx) * scale; + case 4: return col->element(idx) * scale; + case 2: return col->element(idx) * scale; + default: return col->element(idx) * scale; + } + }(); + + dst[pos + 0] = v; + dst[pos + 1] = v >> 8; + dst[pos + 2] = v >> 16; + dst[pos + 3] = v >> 24; + } break; + case INT64: { + int64_t v = s->col.leaf_column->element(val_idx); + int32_t ts_scale = s->col.ts_scale; + if (ts_scale != 0) { + if (ts_scale < 0) { + v /= -ts_scale; + } else { + v *= ts_scale; + } + } + dst[pos + 0] = v; + dst[pos + 1] = v >> 8; + dst[pos + 2] = v >> 16; + dst[pos + 3] = v >> 24; + dst[pos + 4] = v >> 32; + dst[pos + 5] = v >> 40; + dst[pos + 6] = v >> 48; + dst[pos + 7] = v >> 56; + } break; + case INT96: { + int64_t v = s->col.leaf_column->element(val_idx); + int32_t ts_scale = s->col.ts_scale; + if (ts_scale != 0) { + if (ts_scale < 0) { + v /= -ts_scale; + } else { + v *= ts_scale; + } } - } - } else { - len = 0; - } - uint32_t total_len = 0; - block_scan(temp_storage.scan_storage).ExclusiveSum(len, pos, total_len); - __syncthreads(); - if (t == 0) { s->cur = dst + total_len; } - if (is_valid) { - switch (physical_type) { - case INT32: [[fallthrough]]; - case FLOAT: { - auto const v = [dtype_len = dtype_len_in, - idx = val_idx, - col = s->col.leaf_column, - scale = s->col.ts_scale == 0 ? 1 : s->col.ts_scale]() -> int32_t { - switch (dtype_len) { - case 8: return col->element(idx) * scale; - case 4: return col->element(idx) * scale; - case 2: return col->element(idx) * scale; - default: return col->element(idx) * scale; - } - }(); - dst[pos + 0] = v; - dst[pos + 1] = v >> 8; - dst[pos + 2] = v >> 16; - dst[pos + 3] = v >> 24; - } break; - case INT64: { - int64_t v = s->col.leaf_column->element(val_idx); - int32_t ts_scale = s->col.ts_scale; - if (ts_scale != 0) { - if (ts_scale < 0) { - v /= -ts_scale; - } else { - v *= ts_scale; - } + auto const [last_day_nanos, julian_days] = [&] { + using namespace cuda::std::chrono; + switch (s->col.leaf_column->type().id()) { + case type_id::TIMESTAMP_SECONDS: + case type_id::TIMESTAMP_MILLISECONDS: { + return julian_days_with_time(v); + } break; + case type_id::TIMESTAMP_MICROSECONDS: + case type_id::TIMESTAMP_NANOSECONDS: { + return julian_days_with_time(v); + } break; } - dst[pos + 0] = v; - dst[pos + 1] = v >> 8; - dst[pos + 2] = v >> 16; - dst[pos + 3] = v >> 24; - dst[pos + 4] = v >> 32; - dst[pos + 5] = v >> 40; - dst[pos + 6] = v >> 48; - dst[pos + 7] = v >> 56; - } break; - case INT96: { - int64_t v = s->col.leaf_column->element(val_idx); - int32_t ts_scale = s->col.ts_scale; - if (ts_scale != 0) { - if (ts_scale < 0) { - v /= -ts_scale; - } else { - v *= ts_scale; - } + return julian_days_with_time(0); + }(); + + // the 12 bytes of fixed length data. 
+ v = last_day_nanos.count(); + dst[pos + 0] = v; + dst[pos + 1] = v >> 8; + dst[pos + 2] = v >> 16; + dst[pos + 3] = v >> 24; + dst[pos + 4] = v >> 32; + dst[pos + 5] = v >> 40; + dst[pos + 6] = v >> 48; + dst[pos + 7] = v >> 56; + uint32_t w = julian_days.count(); + dst[pos + 8] = w; + dst[pos + 9] = w >> 8; + dst[pos + 10] = w >> 16; + dst[pos + 11] = w >> 24; + } break; + + case DOUBLE: { + auto v = s->col.leaf_column->element(val_idx); + memcpy(dst + pos, &v, 8); + } break; + case BYTE_ARRAY: { + auto const bytes = [](cudf::type_id const type_id, + column_device_view const* leaf_column, + uint32_t const val_idx) -> void const* { + switch (type_id) { + case type_id::STRING: + return reinterpret_cast( + leaf_column->element(val_idx).data()); + case type_id::LIST: + return reinterpret_cast( + get_element(*(leaf_column), val_idx).data()); + default: CUDF_UNREACHABLE("invalid type id for byte array writing!"); } + }(type_id, s->col.leaf_column, val_idx); + uint32_t v = len - 4; // string length + dst[pos + 0] = v; + dst[pos + 1] = v >> 8; + dst[pos + 2] = v >> 16; + dst[pos + 3] = v >> 24; + if (v != 0) memcpy(dst + pos + 4, bytes, v); + } break; + case FIXED_LEN_BYTE_ARRAY: { + if (type_id == type_id::DECIMAL128) { + // When using FIXED_LEN_BYTE_ARRAY for decimals, the rep is encoded in big-endian + auto const v = s->col.leaf_column->element(val_idx).value(); + auto const v_char_ptr = reinterpret_cast(&v); + thrust::copy(thrust::seq, + thrust::make_reverse_iterator(v_char_ptr + sizeof(v)), + thrust::make_reverse_iterator(v_char_ptr), + dst + pos); + } + } break; + } + } + __syncthreads(); + } - auto const [last_day_nanos, julian_days] = [&] { - using namespace cuda::std::chrono; - switch (s->col.leaf_column->type().id()) { - case type_id::TIMESTAMP_SECONDS: - case type_id::TIMESTAMP_MILLISECONDS: { - return julian_days_with_time(v); - } break; - case type_id::TIMESTAMP_MICROSECONDS: - case type_id::TIMESTAMP_NANOSECONDS: { - return julian_days_with_time(v); - } break; - } - return julian_days_with_time(0); - }(); - - // the 12 bytes of fixed length data. 
- v = last_day_nanos.count(); - dst[pos + 0] = v; - dst[pos + 1] = v >> 8; - dst[pos + 2] = v >> 16; - dst[pos + 3] = v >> 24; - dst[pos + 4] = v >> 32; - dst[pos + 5] = v >> 40; - dst[pos + 6] = v >> 48; - dst[pos + 7] = v >> 56; - uint32_t w = julian_days.count(); - dst[pos + 8] = w; - dst[pos + 9] = w >> 8; - dst[pos + 10] = w >> 16; - dst[pos + 11] = w >> 24; - } break; + uint32_t const valid_count = block_reduce(temp_storage.reduce_storage).Sum(num_valid); - case DOUBLE: { - auto v = s->col.leaf_column->element(val_idx); - memcpy(dst + pos, &v, 8); - } break; - case BYTE_ARRAY: { - auto const bytes = [](cudf::type_id const type_id, - column_device_view const* leaf_column, - uint32_t const val_idx) -> void const* { - switch (type_id) { - case type_id::STRING: - return reinterpret_cast( - leaf_column->element(val_idx).data()); - case type_id::LIST: - return reinterpret_cast( - get_element(*(leaf_column), val_idx).data()); - default: CUDF_UNREACHABLE("invalid type id for byte array writing!"); - } - }(type_id, s->col.leaf_column, val_idx); - uint32_t v = len - 4; // string length - dst[pos + 0] = v; - dst[pos + 1] = v >> 8; - dst[pos + 2] = v >> 16; - dst[pos + 3] = v >> 24; - if (v != 0) memcpy(dst + pos + 4, bytes, v); - } break; - case FIXED_LEN_BYTE_ARRAY: { - if (type_id == type_id::DECIMAL128) { - // When using FIXED_LEN_BYTE_ARRAY for decimals, the rep is encoded in big-endian - auto const v = s->col.leaf_column->element(val_idx).value(); - auto const v_char_ptr = reinterpret_cast(&v); - thrust::copy(thrust::seq, - thrust::make_reverse_iterator(v_char_ptr + sizeof(v)), - thrust::make_reverse_iterator(v_char_ptr), - dst + pos); - } - } break; + finish_page_encode( + s, valid_count, s->cur, pages, comp_in, comp_out, comp_results, write_v2_headers); +} + +// DICTIONARY page data encoder +// blockDim(128, 1, 1) +template +__global__ void __launch_bounds__(block_size, 8) + gpuEncodeDictPages(device_span pages, + device_span> comp_in, + device_span> comp_out, + device_span comp_results, + bool write_v2_headers) +{ + __shared__ __align__(8) rle_page_enc_state_s state_g; + using block_reduce = cub::BlockReduce; + using block_scan = cub::BlockScan; + __shared__ union { + typename block_reduce::TempStorage reduce_storage; + typename block_scan::TempStorage scan_storage; + } temp_storage; + + auto* const s = &state_g; + uint32_t t = threadIdx.x; + + if (t == 0) { + state_g = rle_page_enc_state_s{}; + s->page = pages[blockIdx.x]; + s->ck = *s->page.chunk; + s->col = *s->ck.col_desc; + s->rle_len_pos = nullptr; + // get s->cur back to where it was at the end of encoding the rep and def level data + s->cur = + s->page.page_data + s->page.max_hdr_size + s->page.def_lvl_bytes + s->page.rep_lvl_bytes; + // if V1 data page, need space for the RLE length fields + if (s->page.page_type == PageType::DATA_PAGE) { + if (s->col.num_def_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } + if (s->col.num_rep_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } + } + } + __syncthreads(); + + if (BitAnd(s->page.kernel_mask, encode_kernel_mask::DICTIONARY) == 0) { return; } + + // Encode data values + __syncthreads(); + auto const physical_type = s->col.physical_type; + auto const type_id = s->col.leaf_column->type().id(); + auto const dtype_len_out = physical_type_len(physical_type, type_id); + auto const dtype_len_in = [&]() -> uint32_t { + if (physical_type == INT32) { return int32_logical_len(type_id); } + if (physical_type == INT96) { return sizeof(int64_t); } + return dtype_len_out; + }(); + + // 
TODO assert dict_bits >= 0 + auto const dict_bits = (physical_type == BOOLEAN) ? 1 + : (s->ck.use_dictionary and s->page.page_type != PageType::DICTIONARY_PAGE) + ? s->ck.dict_rle_bits + : -1; + if (t == 0) { + uint8_t* dst = s->cur; + s->rle_run = 0; + s->rle_pos = 0; + s->rle_numvals = 0; + s->rle_out = dst; + s->page.encoding = + determine_encoding(s->page.page_type, physical_type, s->ck.use_dictionary, write_v2_headers); + if (dict_bits >= 0 && physical_type != BOOLEAN) { + dst[0] = dict_bits; + s->rle_out = dst + 1; + } else if (write_v2_headers && physical_type == BOOLEAN) { + // save space for RLE length. we don't know the total length yet. + s->rle_out = dst + RLE_LENGTH_FIELD_LEN; + s->rle_len_pos = dst; + } + s->page_start_val = row_to_value_idx(s->page.start_row, s->col); + s->chunk_start_val = row_to_value_idx(s->ck.start_row, s->col); + } + __syncthreads(); + + uint32_t num_valid = 0; + for (uint32_t cur_val_idx = 0; cur_val_idx < s->page.num_leaf_values;) { + uint32_t nvals = min(s->page.num_leaf_values - cur_val_idx, block_size); + + auto [is_valid, val_idx] = [&]() { + size_type const val_idx_in_block = cur_val_idx + t; + size_type const val_idx_in_leaf_col = s->page_start_val + val_idx_in_block; + + uint32_t const is_valid = (val_idx_in_leaf_col < s->col.leaf_column->size() && + val_idx_in_block < s->page.num_leaf_values) + ? s->col.leaf_column->is_valid(val_idx_in_leaf_col) + : 0; + // need to test for use_dictionary because it might be boolean + uint32_t const val_idx = + (s->ck.use_dictionary) ? val_idx_in_leaf_col - s->chunk_start_val : val_idx_in_leaf_col; + return std::make_tuple(is_valid, val_idx); + }(); + + if (is_valid) { num_valid++; } + cur_val_idx += nvals; + + // Dictionary encoding + if (dict_bits > 0) { + uint32_t rle_numvals; + uint32_t rle_numvals_in_block; + uint32_t pos; + block_scan(temp_storage.scan_storage).ExclusiveSum(is_valid, pos, rle_numvals_in_block); + rle_numvals = s->rle_numvals; + if (is_valid) { + uint32_t v; + if (physical_type == BOOLEAN) { + v = s->col.leaf_column->element(val_idx); + } else { + v = s->ck.dict_index[val_idx]; } + s->vals[rolling_idx(rle_numvals + pos)] = v; + } + rle_numvals += rle_numvals_in_block; + __syncthreads(); + if ((!write_v2_headers) && (physical_type == BOOLEAN)) { + PlainBoolEncode(s, rle_numvals, (cur_val_idx == s->page.num_leaf_values), t); + } else { + RleEncode(s, rle_numvals, dict_bits, (cur_val_idx == s->page.num_leaf_values), t); } __syncthreads(); } + if (t == 0) { s->cur = s->rle_out; } + __syncthreads(); } uint32_t const valid_count = block_reduce(temp_storage.reduce_storage).Sum(num_valid); @@ -1392,37 +1684,137 @@ __global__ void __launch_bounds__(128, 8) __syncwarp(); } - // V2 does not compress rep and def level data - size_t const skip_comp_size = s->page.def_lvl_bytes + s->page.rep_lvl_bytes; + finish_page_encode( + s, valid_count, s->cur, pages, comp_in, comp_out, comp_results, write_v2_headers); +} + +// DELTA_BINARY_PACKED page data encoder +// blockDim(128, 1, 1) +template +__global__ void __launch_bounds__(block_size, 8) + gpuEncodeDeltaBinaryPages(device_span pages, + device_span> comp_in, + device_span> comp_out, + device_span comp_results) +{ + // block of shared memory for value storage and bit packing + __shared__ uleb128_t delta_shared[delta::buffer_size + delta::block_size]; + __shared__ __align__(8) page_enc_state_s<0> state_g; + using block_reduce = cub::BlockReduce; + __shared__ union { + typename block_reduce::TempStorage reduce_storage; + typename 
delta::index_scan::TempStorage delta_index_tmp; + typename delta::block_reduce::TempStorage delta_reduce_tmp; + typename delta::warp_reduce::TempStorage delta_warp_red_tmp[delta::num_mini_blocks]; + } temp_storage; + + auto* const s = &state_g; + uint32_t t = threadIdx.x; if (t == 0) { - s->page.num_nulls = s->page.num_values - valid_count; - uint8_t* const base = s->page.page_data + s->page.max_hdr_size; - auto const actual_data_size = static_cast(s->cur - base); - if (actual_data_size > s->page.max_data_size) { - CUDF_UNREACHABLE("detected possible page data corruption"); - } - s->page.max_data_size = actual_data_size; - if (not comp_in.empty()) { - comp_in[blockIdx.x] = {base + skip_comp_size, actual_data_size - skip_comp_size}; - comp_out[blockIdx.x] = {s->page.compressed_data + s->page.max_hdr_size + skip_comp_size, - 0}; // size is unused - } - pages[blockIdx.x] = s->page; - if (not comp_results.empty()) { - comp_results[blockIdx.x] = {0, compression_status::FAILURE}; - pages[blockIdx.x].comp_res = &comp_results[blockIdx.x]; + state_g = page_enc_state_s<0>{}; + s->page = pages[blockIdx.x]; + s->ck = *s->page.chunk; + s->col = *s->ck.col_desc; + s->rle_len_pos = nullptr; + // get s->cur back to where it was at the end of encoding the rep and def level data + s->cur = + s->page.page_data + s->page.max_hdr_size + s->page.def_lvl_bytes + s->page.rep_lvl_bytes; + } + __syncthreads(); + + if (BitAnd(s->page.kernel_mask, encode_kernel_mask::DELTA_BINARY) == 0) { return; } + + // Encode data values + __syncthreads(); + auto const physical_type = s->col.physical_type; + auto const type_id = s->col.leaf_column->type().id(); + auto const dtype_len_out = physical_type_len(physical_type, type_id); + auto const dtype_len_in = [&]() -> uint32_t { + if (physical_type == INT32) { return int32_logical_len(type_id); } + if (physical_type == INT96) { return sizeof(int64_t); } + return dtype_len_out; + }(); + + if (t == 0) { + uint8_t* dst = s->cur; + s->rle_run = 0; + s->rle_pos = 0; + s->rle_numvals = 0; + s->rle_out = dst; + s->page.encoding = Encoding::DELTA_BINARY_PACKED; + s->page_start_val = row_to_value_idx(s->page.start_row, s->col); + s->chunk_start_val = row_to_value_idx(s->ck.start_row, s->col); + } + __syncthreads(); + + // need to know the number of valid values for the null values calculation and to size + // the delta binary encoder. 
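+  // Non-nullable columns can take num_leaf_values as-is; nullable columns sum
+  // per-thread validity counts with a block-wide reduction below.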
+ uint32_t valid_count = 0; + if (not s->col.leaf_column->nullable()) { + valid_count = s->page.num_leaf_values; + } else { + uint32_t num_valid = 0; + for (uint32_t cur_val_idx = 0; cur_val_idx < s->page.num_leaf_values;) { + uint32_t const nvals = min(s->page.num_leaf_values - cur_val_idx, block_size); + size_type const val_idx_in_block = cur_val_idx + t; + size_type const val_idx_in_leaf_col = s->page_start_val + val_idx_in_block; + + if (val_idx_in_leaf_col < s->col.leaf_column->size() && + val_idx_in_block < s->page.num_leaf_values && + s->col.leaf_column->is_valid(val_idx_in_leaf_col)) { + num_valid++; + } + cur_val_idx += nvals; } + valid_count = block_reduce(temp_storage.reduce_storage).Sum(num_valid); } - // copy over uncompressed data - if (skip_comp_size != 0 && not comp_in.empty()) { - uint8_t const* const src = s->page.page_data + s->page.max_hdr_size; - uint8_t* const dst = s->page.compressed_data + s->page.max_hdr_size; - for (int i = t; i < skip_comp_size; i += block_size) { - dst[i] = src[i]; + uint8_t const* delta_ptr = nullptr; // this will be the end of delta block pointer + + if (physical_type == INT32) { + switch (dtype_len_in) { + case 8: { + // only DURATIONS map to 8 bytes, so safe to just use signed here? + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + break; + } + case 4: { + if (type_id == type_id::UINT32) { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } else { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } + break; + } + case 2: { + if (type_id == type_id::UINT16) { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } else { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } + break; + } + case 1: { + if (type_id == type_id::UINT8) { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } else { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } + break; + } + default: CUDF_UNREACHABLE("invalid dtype_len_in when encoding DELTA_BINARY_PACKED"); + } + } else { + if (type_id == type_id::UINT64) { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } else { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); } } + + finish_page_encode( + s, valid_count, delta_ptr, pages, comp_in, comp_out, comp_results, true); } constexpr int decide_compression_warps_in_block = 4; @@ -1457,7 +1849,8 @@ __global__ void __launch_bounds__(decide_compression_block_size) for (auto page_id = lane_id; page_id < num_pages; page_id += cudf::detail::warp_size) { auto const& curr_page = ck_g[warp_id].pages[page_id]; auto const page_data_size = curr_page.max_data_size; - auto const lvl_bytes = curr_page.def_lvl_bytes + curr_page.rep_lvl_bytes; + auto const is_v2 = curr_page.page_type == PageType::DATA_PAGE_V2; + auto const lvl_bytes = is_v2 ? curr_page.def_lvl_bytes + curr_page.rep_lvl_bytes : 0; uncompressed_data_size += page_data_size; if (auto comp_res = curr_page.comp_res; comp_res != nullptr) { compressed_data_size += comp_res->bytes_written + lvl_bytes; @@ -1920,7 +2313,8 @@ __global__ void __launch_bounds__(128) } uncompressed_page_size = page_g.max_data_size; if (ck_g.is_compressed) { - auto const lvl_bytes = page_g.def_lvl_bytes + page_g.rep_lvl_bytes; + auto const is_v2 = page_g.page_type == PageType::DATA_PAGE_V2; + auto const lvl_bytes = is_v2 ? 
page_g.def_lvl_bytes + page_g.rep_lvl_bytes : 0; hdr_start = page_g.compressed_data; compressed_page_size = static_cast(comp_results[blockIdx.x].bytes_written) + lvl_bytes; @@ -2155,6 +2549,10 @@ constexpr __device__ void* align8(void* ptr) return static_cast(ptr) - algn; } +struct mask_tform { + __device__ uint32_t operator()(EncPage const& p) { return static_cast(p.kernel_mask); } +}; + } // namespace // blockDim(1, 1, 1) @@ -2257,8 +2655,9 @@ void InitFragmentStatistics(device_span groups, rmm::cuda_stream_view stream) { int const num_fragments = fragments.size(); - int const dim = util::div_rounding_up_safe(num_fragments, 128 / cudf::detail::warp_size); - gpuInitFragmentStats<<>>(groups, fragments); + int const dim = + util::div_rounding_up_safe(num_fragments, encode_block_size / cudf::detail::warp_size); + gpuInitFragmentStats<<>>(groups, fragments); } void InitEncoderPages(device_2dspan chunks, @@ -2277,18 +2676,18 @@ void InitEncoderPages(device_2dspan chunks, { auto num_rowgroups = chunks.size().first; dim3 dim_grid(num_columns, num_rowgroups); // 1 threadblock per rowgroup - gpuInitPages<<>>(chunks, - pages, - page_sizes, - comp_page_sizes, - col_desc, - page_grstats, - chunk_grstats, - num_columns, - max_page_size_bytes, - max_page_size_rows, - page_align, - write_v2_headers); + gpuInitPages<<>>(chunks, + pages, + page_sizes, + comp_page_sizes, + col_desc, + page_grstats, + chunk_grstats, + num_columns, + max_page_size_bytes, + max_page_size_rows, + page_align, + write_v2_headers); } void EncodePages(device_span pages, @@ -2299,10 +2698,43 @@ void EncodePages(device_span pages, rmm::cuda_stream_view stream) { auto num_pages = pages.size(); + + // determine which kernels to invoke + auto mask_iter = thrust::make_transform_iterator(pages.begin(), mask_tform{}); + uint32_t kernel_mask = thrust::reduce( + rmm::exec_policy(stream), mask_iter, mask_iter + pages.size(), 0U, thrust::bit_or{}); + + // get the number of streams we need from the pool + int nkernels = std::bitset<32>(kernel_mask).count(); + auto streams = cudf::detail::fork_streams(stream, nkernels); + // A page is part of one column. This is launching 1 block per page. 1 block will exclusively // deal with one datatype. - gpuEncodePages<128><<>>( - pages, comp_in, comp_out, comp_results, write_v2_headers); + + int s_idx = 0; + if (BitAnd(kernel_mask, encode_kernel_mask::PLAIN) != 0) { + auto const strm = streams[s_idx++]; + gpuEncodePageLevels<<>>( + pages, write_v2_headers, encode_kernel_mask::PLAIN); + gpuEncodePages<<>>( + pages, comp_in, comp_out, comp_results, write_v2_headers); + } + if (BitAnd(kernel_mask, encode_kernel_mask::DELTA_BINARY) != 0) { + auto const strm = streams[s_idx++]; + gpuEncodePageLevels<<>>( + pages, write_v2_headers, encode_kernel_mask::DELTA_BINARY); + gpuEncodeDeltaBinaryPages + <<>>(pages, comp_in, comp_out, comp_results); + } + if (BitAnd(kernel_mask, encode_kernel_mask::DICTIONARY) != 0) { + auto const strm = streams[s_idx++]; + gpuEncodePageLevels<<>>( + pages, write_v2_headers, encode_kernel_mask::DICTIONARY); + gpuEncodeDictPages<<>>( + pages, comp_in, comp_out, comp_results, write_v2_headers); + } + + cudf::detail::join_streams(streams, stream); } void DecideCompression(device_span chunks, rmm::cuda_stream_view stream) @@ -2320,7 +2752,7 @@ void EncodePageHeaders(device_span pages, { // TODO: single thread task. No need for 128 threads/block. 
Earlier it used to employ rest of the
   // threads to coop load structs
-  gpuEncodePageHeaders<<<num_pages, 128, 0, stream.value()>>>(
+  gpuEncodePageHeaders<<<num_pages, encode_block_size, 0, stream.value()>>>(
     pages, comp_results, page_stats, chunk_stats);
 }
 
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index 6a93fec0c46..048f1a73a9c 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -88,6 +88,37 @@ struct input_column_info {
   auto nesting_depth() const { return nesting.size(); }
 };
 
+// The delta encodings use ULEB128 integers, but parquet only uses max 64 bits.
+using uleb128_t   = uint64_t;
+using zigzag128_t = int64_t;
+
+// this is in C++23
+#if !defined(__cpp_lib_is_scoped_enum)
+template <typename Enum, bool = std::is_enum_v<Enum>>
+struct is_scoped_enum {
+  static const bool value = not std::is_convertible_v<Enum, std::underlying_type_t<Enum>>;
+};
+
+template <typename Enum>
+struct is_scoped_enum<Enum, false> {
+  static const bool value = false;
+};
+#else
+using std::is_scoped_enum;
+#endif
+
+// helpers to do bit operations on scoped enums
+template <typename T1,
+          typename T2,
+          typename std::enable_if_t<(is_scoped_enum<T1>::value and std::is_same_v<T1, T2>) or
+                                    (is_scoped_enum<T1>::value and std::is_same_v<T2, uint32_t>) or
+                                    (is_scoped_enum<T2>::value and std::is_same_v<T1, uint32_t>)>* =
+            nullptr>
+constexpr uint32_t BitAnd(T1 a, T2 b)
+{
+  return static_cast<uint32_t>(a) & static_cast<uint32_t>(b);
+}
+
 /**
  * @brief Enums for the flags in the page header
  */
@@ -371,6 +402,17 @@ constexpr uint32_t encoding_to_mask(Encoding encoding)
   return 1 << static_cast<uint32_t>(encoding);
 }
 
+/**
+ * @brief Enum of mask bits for the EncPage kernel_mask
+ *
+ * Used to control which encode kernels to run.
+ */
+enum class encode_kernel_mask {
+  PLAIN        = (1 << 0),  // Run plain encoding kernel
+  DICTIONARY   = (1 << 1),  // Run dictionary encoding kernel
+  DELTA_BINARY = (1 << 2)   // Run DELTA_BINARY_PACKED encoding kernel
+};
+
 /**
  * @brief Struct describing an encoder column chunk
  */
@@ -429,10 +471,11 @@ struct EncPage {
   uint32_t num_leaf_values;  //!< Values in page. Different from num_rows in case of nested types
   uint32_t num_values;  //!< Number of def/rep level values in page. Includes null/empty elements in
                         //!< non-leaf levels
-  uint32_t def_lvl_bytes;        //!< Number of bytes of encoded definition level data (V2 only)
-  uint32_t rep_lvl_bytes;        //!< Number of bytes of encoded repetition level data (V2 only)
-  compression_result* comp_res;  //!< Ptr to compression result
-  uint32_t num_nulls;            //!< Number of null values (V2 only) (down here for alignment)
+  uint32_t def_lvl_bytes;          //!< Number of bytes of encoded definition level data (V2 only)
+  uint32_t rep_lvl_bytes;          //!< Number of bytes of encoded repetition level data (V2 only)
+  compression_result* comp_res;    //!< Ptr to compression result
+  uint32_t num_nulls;              //!< Number of null values (V2 only) (down here for alignment)
+  encode_kernel_mask kernel_mask;  //!< Mask used to control which encoding kernels to run
 };
 
 /**
diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp
index fa85e3a4a1d..2a654bd7e8c 100644
--- a/cpp/tests/io/parquet_test.cpp
+++ b/cpp/tests/io/parquet_test.cpp
@@ -353,6 +353,9 @@ struct ParquetWriterSchemaTest : public ParquetWriterTest {
 template <typename T>
 struct ParquetReaderSourceTest : public ParquetReaderTest {};
 
+template <typename T>
+struct ParquetWriterDeltaTest : public ParquetWriterTest {};
+
 // Declare typed test cases
 // TODO: Replace with `NumericTypes` when unsigned support is added.
Issue #5352 using SupportedTypes = cudf::test::Types; @@ -384,7 +387,6 @@ TYPED_TEST_SUITE(ParquetChunkedWriterNumericTypeTest, SupportedTypes); class ParquetSizedTest : public ::cudf::test::BaseFixtureWithParam {}; // test the allowed bit widths for dictionary encoding -// values chosen to trigger 1, 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, and 24 bit dictionaries INSTANTIATE_TEST_SUITE_P(ParquetDictionaryTest, ParquetSizedTest, testing::Range(1, 25), @@ -6698,7 +6700,7 @@ TEST_P(ParquetV2Test, CheckEncodings) // data should be PLAIN for v1, RLE for V2 auto col0_data = cudf::detail::make_counting_transform_iterator(0, [](auto i) -> bool { return i % 2 == 0; }); - // data should be PLAIN for both + // data should be PLAIN for v1, DELTA_BINARY_PACKED for v2 auto col1_data = random_values(num_rows); // data should be PLAIN_DICTIONARY for v1, PLAIN and RLE_DICTIONARY for v2 auto col2_data = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 1; }); @@ -6733,10 +6735,10 @@ TEST_P(ParquetV2Test, CheckEncodings) // col0 should have RLE for rep/def and data EXPECT_TRUE(chunk0_enc.size() == 1); EXPECT_TRUE(contains(chunk0_enc, Encoding::RLE)); - // col1 should have RLE for rep/def and PLAIN for data + // col1 should have RLE for rep/def and DELTA_BINARY_PACKED for data EXPECT_TRUE(chunk1_enc.size() == 2); EXPECT_TRUE(contains(chunk1_enc, Encoding::RLE)); - EXPECT_TRUE(contains(chunk1_enc, Encoding::PLAIN)); + EXPECT_TRUE(contains(chunk1_enc, Encoding::DELTA_BINARY_PACKED)); // col2 should have RLE for rep/def, PLAIN for dict, and RLE_DICTIONARY for data EXPECT_TRUE(chunk2_enc.size() == 3); EXPECT_TRUE(contains(chunk2_enc, Encoding::RLE)); @@ -6758,6 +6760,104 @@ TEST_P(ParquetV2Test, CheckEncodings) } } +// removing duration_D, duration_s, and timestamp_s as they don't appear to be supported properly. +// see definition of UnsupportedChronoTypes above. 
+using DeltaDecimalTypes = cudf::test::Types; +using DeltaBinaryTypes = + cudf::test::Concat; +using SupportedDeltaTestTypes = + cudf::test::RemoveIf, DeltaBinaryTypes>; +TYPED_TEST_SUITE(ParquetWriterDeltaTest, SupportedDeltaTestTypes); + +TYPED_TEST(ParquetWriterDeltaTest, SupportedDeltaTestTypes) +{ + using T = TypeParam; + auto col0 = testdata::ascending(); + auto col1 = testdata::unordered(); + + auto const expected = table_view{{col0, col1}}; + + auto const filepath = temp_env->get_temp_filepath("DeltaBinaryPacked.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .write_v2_headers(true) + .dictionary_policy(cudf::io::dictionary_policy::NEVER); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(in_opts); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); +} + +TYPED_TEST(ParquetWriterDeltaTest, SupportedDeltaTestTypesSliced) +{ + using T = TypeParam; + constexpr int num_rows = 4'000; + auto col0 = testdata::ascending(); + auto col1 = testdata::unordered(); + + auto const expected = table_view{{col0, col1}}; + auto expected_slice = cudf::slice(expected, {num_rows, 2 * num_rows}); + ASSERT_EQ(expected_slice[0].num_rows(), num_rows); + + auto const filepath = temp_env->get_temp_filepath("DeltaBinaryPackedSliced.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected_slice) + .write_v2_headers(true) + .dictionary_policy(cudf::io::dictionary_policy::NEVER); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(in_opts); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_slice, result.tbl->view()); +} + +TYPED_TEST(ParquetWriterDeltaTest, SupportedDeltaListSliced) +{ + using T = TypeParam; + + constexpr int num_slice = 4'000; + constexpr int num_rows = 32 * 1024; + + std::mt19937 gen(6542); + std::bernoulli_distribution bn(0.7f); + auto valids = + cudf::detail::make_counting_transform_iterator(0, [&](int index) { return bn(gen); }); + auto values = thrust::make_counting_iterator(0); + + // list + constexpr int vals_per_row = 4; + auto c1_offset_iter = cudf::detail::make_counting_transform_iterator( + 0, [vals_per_row](cudf::size_type idx) { return idx * vals_per_row; }); + cudf::test::fixed_width_column_wrapper c1_offsets(c1_offset_iter, + c1_offset_iter + num_rows + 1); + cudf::test::fixed_width_column_wrapper c1_vals( + values, values + (num_rows * vals_per_row), valids); + auto [null_mask, null_count] = cudf::test::detail::make_null_mask(valids, valids + num_rows); + + auto _c1 = cudf::make_lists_column( + num_rows, c1_offsets.release(), c1_vals.release(), null_count, std::move(null_mask)); + auto c1 = cudf::purge_nonempty_nulls(*_c1); + + auto const expected = table_view{{*c1}}; + auto expected_slice = cudf::slice(expected, {num_slice, 2 * num_slice}); + ASSERT_EQ(expected_slice[0].num_rows(), num_slice); + + auto const filepath = temp_env->get_temp_filepath("DeltaBinaryPackedListSliced.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected_slice) + .write_v2_headers(true) + .dictionary_policy(cudf::io::dictionary_policy::NEVER); + 
cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(in_opts); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_slice, result.tbl->view()); +} + TEST_F(ParquetWriterTest, EmptyMinStringStatistics) { char const* const min_val = ""; From 0341bb7cebfab1fb45d4a53cfc495265bb96ee3a Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 20 Oct 2023 10:46:02 -0700 Subject: [PATCH 036/118] Expose streams in public null mask APIs (#14263) Contributes to #925 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/14263 --- cpp/include/cudf/lists/detail/scatter.cuh | 6 +- cpp/include/cudf/null_mask.hpp | 24 +++++- cpp/src/binaryop/binaryop.cpp | 2 +- cpp/src/bitmask/null_mask.cu | 38 +++++++--- cpp/src/copying/concatenate.cu | 2 +- cpp/src/copying/scatter.cu | 5 +- cpp/src/groupby/hash/groupby.cu | 3 +- cpp/src/lists/contains.cu | 16 ++-- cpp/src/merge/merge.cu | 2 +- cpp/src/round/round.cu | 16 +++- cpp/src/search/contains_column.cu | 2 +- cpp/src/strings/replace/multi.cu | 2 +- cpp/src/strings/split/split_re.cu | 2 +- cpp/src/strings/split/split_record.cu | 6 +- cpp/src/unary/cast_ops.cu | 8 +- cpp/src/unary/math_ops.cu | 8 +- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/null_mask_test.cpp | 92 +++++++++++++++++++++++ 18 files changed, 191 insertions(+), 44 deletions(-) create mode 100644 cpp/tests/streams/null_mask_test.cpp diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh index f04b2fda2bf..ff148c59a23 100644 --- a/cpp/include/cudf/lists/detail/scatter.cuh +++ b/cpp/include/cudf/lists/detail/scatter.cuh @@ -20,9 +20,9 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -130,8 +130,8 @@ std::unique_ptr scatter_impl(rmm::device_uvector cons std::vector> children; children.emplace_back(std::move(offsets_column)); children.emplace_back(std::move(child_column)); - auto null_mask = - target.has_nulls() ? copy_bitmask(target, stream, mr) : rmm::device_buffer{0, stream, mr}; + auto null_mask = target.has_nulls() ? cudf::detail::copy_bitmask(target, stream, mr) + : rmm::device_buffer{0, stream, mr}; // The output column from this function only has null masks copied from the target columns. // That is still not a correct final null mask for the scatter result. 
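To illustrate the new overloads (a minimal sketch: `stream_example` and its
caller-provided `col` are hypothetical, not part of this patch), the
stream-accepting public null mask APIs below can be used as:

  #include <cudf/column/column_view.hpp>
  #include <cudf/null_mask.hpp>

  #include <rmm/cuda_stream.hpp>

  void stream_example(cudf::column_view const& col)
  {
    rmm::cuda_stream stream;  // caller-owned, non-default stream

    // allocate an all-valid mask sized for col on the caller's stream
    auto mask = cudf::create_null_mask(col.size(), cudf::mask_state::ALL_VALID, stream);

    // copy col's bitmask on the same stream (empty buffer if col is not nullable)
    auto copied = cudf::copy_bitmask(col, stream);

    if (col.nullable()) {
      // null_count synchronizes on `stream` to return the count to the host
      auto nulls = cudf::null_count(col.null_mask(), 0, col.size(), stream);
    }
  }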
diff --git a/cpp/include/cudf/null_mask.hpp b/cpp/include/cudf/null_mask.hpp index 672f479ad53..524296e60ca 100644 --- a/cpp/include/cudf/null_mask.hpp +++ b/cpp/include/cudf/null_mask.hpp @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include @@ -80,6 +81,7 @@ size_type num_bitmask_words(size_type number_of_bits); * * @param size The number of elements to be represented by the mask * @param state The desired state of the mask + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned device_buffer * @return A `device_buffer` for use as a null bitmask * satisfying the desired size and state @@ -87,6 +89,7 @@ size_type num_bitmask_words(size_type number_of_bits); rmm::device_buffer create_null_mask( size_type size, mask_state state, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -100,8 +103,13 @@ rmm::device_buffer create_null_mask( * @param begin_bit Index of the first bit to set (inclusive) * @param end_bit Index of the last bit to set (exclusive) * @param valid If true set all entries to valid; otherwise, set all to null + * @param stream CUDA stream used for device memory operations and kernel launches */ -void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit, bool valid); +void set_null_mask(bitmask_type* bitmask, + size_type begin_bit, + size_type end_bit, + bool valid, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Creates a `device_buffer` from a slice of bitmask defined by a range @@ -115,6 +123,7 @@ void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit * @param mask Bitmask residing in device memory whose bits will be copied * @param begin_bit Index of the first bit to be copied (inclusive) * @param end_bit Index of the last bit to be copied (exclusive) + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned device_buffer * @return A `device_buffer` containing the bits * `[begin_bit, end_bit)` from `mask`. @@ -123,6 +132,7 @@ rmm::device_buffer copy_bitmask( bitmask_type const* mask, size_type begin_bit, size_type end_bit, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -132,12 +142,14 @@ rmm::device_buffer copy_bitmask( * Returns empty `device_buffer` if the column is not nullable * * @param view Column view whose bitmask needs to be copied + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned device_buffer * @return A `device_buffer` containing the bits * `[view.offset(), view.offset() + view.size())` from `view`'s bitmask. */ rmm::device_buffer copy_bitmask( column_view const& view, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -148,11 +160,13 @@ rmm::device_buffer copy_bitmask( * If no column in the table is nullable, an empty bitmask is returned. 
* * @param view The table of columns + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned device_buffer * @return A pair of resulting bitmask and count of unset bits */ std::pair bitmask_and( table_view const& view, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -163,11 +177,13 @@ std::pair bitmask_and( * If no column in the table is nullable, an empty bitmask is returned. * * @param view The table of columns + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned device_buffer * @return A pair of resulting bitmask and count of unset bits */ std::pair bitmask_or( table_view const& view, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -183,8 +199,12 @@ std::pair bitmask_or( * @param bitmask Validity bitmask residing in device memory. * @param start Index of the first bit to count (inclusive). * @param stop Index of the last bit to count (exclusive). + * @param stream CUDA stream used for device memory operations and kernel launches * @return The number of null elements in the specified range. */ -cudf::size_type null_count(bitmask_type const* bitmask, size_type start, size_type stop); +cudf::size_type null_count(bitmask_type const* bitmask, + size_type start, + size_type stop, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index 6b413ab2be4..53b04c4ca80 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -366,7 +366,7 @@ std::unique_ptr binary_operation(column_view const& lhs, CUDF_EXPECTS((lhs.size() == rhs.size()), "Column sizes don't match"); - auto [new_mask, null_count] = bitmask_and(table_view({lhs, rhs}), stream, mr); + auto [new_mask, null_count] = cudf::detail::bitmask_and(table_view({lhs, rhs}), stream, mr); auto out = make_fixed_width_column(output_type, lhs.size(), std::move(new_mask), null_count, stream, mr); diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 5a0d3e4f120..3ff56eabe1e 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -157,16 +157,21 @@ void set_null_mask(bitmask_type* bitmask, // Create a device_buffer for a null mask rmm::device_buffer create_null_mask(size_type size, mask_state state, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return detail::create_null_mask(size, state, cudf::get_default_stream(), mr); + return detail::create_null_mask(size, state, stream, mr); } // Set pre-allocated null mask of given bit range [begin_bit, end_bit) to valid, if valid==true, // or null, otherwise; -void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit, bool valid) +void set_null_mask(bitmask_type* bitmask, + size_type begin_bit, + size_type end_bit, + bool valid, + rmm::cuda_stream_view stream) { - return detail::set_null_mask(bitmask, begin_bit, end_bit, valid, cudf::get_default_stream()); + return detail::set_null_mask(bitmask, begin_bit, end_bit, valid, stream); } namespace detail { @@ -511,33 +516,46 @@ std::pair bitmask_or(table_view const& view, rmm::device_buffer copy_bitmask(bitmask_type const* mask, size_type 
begin_bit, size_type end_bit, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return detail::copy_bitmask(mask, begin_bit, end_bit, cudf::get_default_stream(), mr); + CUDF_FUNC_RANGE(); + return detail::copy_bitmask(mask, begin_bit, end_bit, stream, mr); } // Create a bitmask from a column view -rmm::device_buffer copy_bitmask(column_view const& view, rmm::mr::device_memory_resource* mr) +rmm::device_buffer copy_bitmask(column_view const& view, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return detail::copy_bitmask(view, cudf::get_default_stream(), mr); + CUDF_FUNC_RANGE(); + return detail::copy_bitmask(view, stream, mr); } std::pair bitmask_and(table_view const& view, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return detail::bitmask_and(view, cudf::get_default_stream(), mr); + CUDF_FUNC_RANGE(); + return detail::bitmask_and(view, stream, mr); } std::pair bitmask_or(table_view const& view, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return detail::bitmask_or(view, cudf::get_default_stream(), mr); + CUDF_FUNC_RANGE(); + return detail::bitmask_or(view, stream, mr); } // Count non-zero bits in the specified range -cudf::size_type null_count(bitmask_type const* bitmask, size_type start, size_type stop) +cudf::size_type null_count(bitmask_type const* bitmask, + size_type start, + size_type stop, + rmm::cuda_stream_view stream) { - return detail::null_count(bitmask, start, stop, cudf::get_default_stream()); + CUDF_FUNC_RANGE(); + return detail::null_count(bitmask, start, stop, stream); } } // namespace cudf diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index d08c3025553..9b9e780965a 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -563,7 +563,7 @@ rmm::device_buffer concatenate_masks(host_span views, }); rmm::device_buffer null_mask = - create_null_mask(total_element_count, mask_state::UNINITIALIZED, mr); + cudf::detail::create_null_mask(total_element_count, mask_state::UNINITIALIZED, stream, mr); detail::concatenate_masks(views, static_cast(null_mask.data()), stream); diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index 11c27fc86e3..879ddb5048e 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -268,8 +268,9 @@ struct column_scalar_scatterer_impl { // Compute null mask rmm::device_buffer null_mask = - target.nullable() ? copy_bitmask(target, stream, mr) - : create_null_mask(target.size(), mask_state::UNALLOCATED, stream, mr); + target.nullable() + ? detail::copy_bitmask(target, stream, mr) + : detail::create_null_mask(target.size(), mask_state::UNALLOCATED, stream, mr); column null_mask_stub(data_type{type_id::STRUCT}, target.size(), rmm::device_buffer{}, diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 506832881a9..195c8924c9a 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -410,7 +410,8 @@ void sparse_to_dense_results(table_view const& keys, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto row_bitmask = bitmask_and(keys, stream, rmm::mr::get_current_device_resource()).first; + auto row_bitmask = + cudf::detail::bitmask_and(keys, stream, rmm::mr::get_current_device_resource()).first; bool skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; bitmask_type const* row_bitmask_ptr = skip_key_rows_with_nulls ? 
static_cast(row_bitmask.data()) : nullptr; diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu index 4733a5d63a8..cd2bc493bc7 100644 --- a/cpp/src/lists/contains.cu +++ b/cpp/src/lists/contains.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -274,12 +275,13 @@ std::unique_ptr index_of(lists_column_view const& lists, rmm::mr::device_memory_resource* mr) { if (!search_key.is_valid(stream)) { - return make_numeric_column(data_type{cudf::type_to_id()}, - lists.size(), - cudf::create_null_mask(lists.size(), mask_state::ALL_NULL, mr), - lists.size(), - stream, - mr); + return make_numeric_column( + data_type{cudf::type_to_id()}, + lists.size(), + cudf::detail::create_null_mask(lists.size(), mask_state::ALL_NULL, stream, mr), + lists.size(), + stream, + mr); } if (lists.size() == 0) { return make_numeric_column( @@ -337,7 +339,7 @@ std::unique_ptr contains_nulls(lists_column_view const& lists, auto const lists_cv = lists.parent(); auto output = make_numeric_column(data_type{type_to_id()}, lists.size(), - copy_bitmask(lists_cv, stream, mr), + cudf::detail::copy_bitmask(lists_cv, stream, mr), lists_cv.null_count(), stream, mr); diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu index c0765b48205..00a2f0bee8f 100644 --- a/cpp/src/merge/merge.cu +++ b/cpp/src/merge/merge.cu @@ -381,7 +381,7 @@ std::unique_ptr column_merger::operator()( // materialize the output buffer rmm::device_buffer validity = lcol.has_nulls() || rcol.has_nulls() - ? create_null_mask(merged_size, mask_state::UNINITIALIZED, stream, mr) + ? detail::create_null_mask(merged_size, mask_state::UNINITIALIZED, stream, mr) : rmm::device_buffer{}; if (lcol.has_nulls() || rcol.has_nulls()) { materialize_bitmask(lcol, diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu index 41cce57d55b..8a6367a1f87 100644 --- a/cpp/src/round/round.cu +++ b/cpp/src/round/round.cu @@ -219,8 +219,12 @@ std::unique_ptr round_with(column_view const& input, if (decimal_places >= 0 && std::is_integral_v) return std::make_unique(input, stream, mr); - auto result = cudf::make_fixed_width_column( - input.type(), input.size(), copy_bitmask(input, stream, mr), input.null_count(), stream, mr); + auto result = cudf::make_fixed_width_column(input.type(), + input.size(), + detail::copy_bitmask(input, stream, mr), + input.null_count(), + stream, + mr); auto out_view = result->mutable_view(); T const n = std::pow(10, std::abs(decimal_places)); @@ -256,8 +260,12 @@ std::unique_ptr round_with(column_view const& input, if (input.type().scale() > -decimal_places) return cudf::detail::cast(input, result_type, stream, mr); - auto result = cudf::make_fixed_width_column( - result_type, input.size(), copy_bitmask(input, stream, mr), input.null_count(), stream, mr); + auto result = cudf::make_fixed_width_column(result_type, + input.size(), + detail::copy_bitmask(input, stream, mr), + input.null_count(), + stream, + mr); auto out_view = result->mutable_view(); diff --git a/cpp/src/search/contains_column.cu b/cpp/src/search/contains_column.cu index 85971647434..b8c7d058535 100644 --- a/cpp/src/search/contains_column.cu +++ b/cpp/src/search/contains_column.cu @@ -42,7 +42,7 @@ struct contains_column_dispatch { stream, mr); return std::make_unique( - std::move(result_v), copy_bitmask(needles, stream, mr), needles.null_count()); + std::move(result_v), detail::copy_bitmask(needles, stream, mr), needles.null_count()); } }; diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index 
ee47932100a..f80ace57c69 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -383,7 +383,7 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in std::move(offsets), std::move(chars->release().children.back()), input.null_count(), - copy_bitmask(input.parent(), stream, mr)); + cudf::detail::copy_bitmask(input.parent(), stream, mr)); } /** diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index 913aec79758..045aac279e6 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -290,7 +290,7 @@ std::unique_ptr split_record_re(strings_column_view const& input, std::move(offsets), std::move(strings_output), input.null_count(), - copy_bitmask(input.parent(), stream, mr), + cudf::detail::copy_bitmask(input.parent(), stream, mr), stream, mr); } diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu index 52f27c68111..7a0cfb9ef41 100644 --- a/cpp/src/strings/split/split_record.cu +++ b/cpp/src/strings/split/split_record.cu @@ -57,7 +57,7 @@ std::unique_ptr split_record_fn(strings_column_view const& input, std::move(offsets), std::move(results), input.null_count(), - copy_bitmask(input.parent(), stream, mr), + cudf::detail::copy_bitmask(input.parent(), stream, mr), stream, mr); } @@ -72,7 +72,7 @@ std::unique_ptr split_record_fn(strings_column_view const& input, std::move(offsets), std::move(strings_child), input.null_count(), - copy_bitmask(input.parent(), stream, mr), + cudf::detail::copy_bitmask(input.parent(), stream, mr), stream, mr); } @@ -160,7 +160,7 @@ std::unique_ptr whitespace_split_record_fn(strings_column_view const& in std::move(offsets), std::move(strings_output), input.null_count(), - copy_bitmask(input.parent(), stream, mr), + cudf::detail::copy_bitmask(input.parent(), stream, mr), stream, mr); } diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu index 1c81f266200..6fa87b1f709 100644 --- a/cpp/src/unary/cast_ops.cu +++ b/cpp/src/unary/cast_ops.cu @@ -194,7 +194,7 @@ std::unique_ptr rescale(column_view input, auto const scalar = make_fixed_point_scalar(0, scale_type{scale}, stream); auto output_column = make_column_from_scalar(*scalar, input.size(), stream, mr); if (input.nullable()) { - auto const null_mask = copy_bitmask(input, stream, mr); + auto const null_mask = detail::copy_bitmask(input, stream, mr); output_column->set_null_mask(std::move(null_mask), input.null_count()); } return output_column; @@ -255,7 +255,7 @@ struct dispatch_unary_cast_to { std::make_unique(type, size, rmm::device_buffer{size * cudf::size_of(type), stream, mr}, - copy_bitmask(input, stream, mr), + detail::copy_bitmask(input, stream, mr), input.null_count()); mutable_column_view output_mutable = *output; @@ -285,7 +285,7 @@ struct dispatch_unary_cast_to { std::make_unique(type, size, rmm::device_buffer{size * cudf::size_of(type), stream, mr}, - copy_bitmask(input, stream, mr), + detail::copy_bitmask(input, stream, mr), input.null_count()); mutable_column_view output_mutable = *output; @@ -334,7 +334,7 @@ struct dispatch_unary_cast_to { auto output = std::make_unique(cudf::data_type{type.id(), input.type().scale()}, size, rmm::device_buffer{size * cudf::size_of(type), stream}, - copy_bitmask(input, stream, mr), + detail::copy_bitmask(input, stream, mr), input.null_count()); mutable_column_view output_mutable = *output; diff --git a/cpp/src/unary/math_ops.cu b/cpp/src/unary/math_ops.cu index d0cae81a9c8..d84e0171b49 100644 --- 
a/cpp/src/unary/math_ops.cu +++ b/cpp/src/unary/math_ops.cu @@ -291,8 +291,12 @@ std::unique_ptr unary_op_with(column_view const& input, std::is_same_v>)) return std::make_unique(input, stream, mr); - auto result = cudf::make_fixed_width_column( - input.type(), input.size(), copy_bitmask(input, stream, mr), input.null_count(), stream, mr); + auto result = cudf::make_fixed_width_column(input.type(), + input.size(), + detail::copy_bitmask(input, stream, mr), + input.null_count(), + stream, + mr); auto out_view = result->mutable_view(); diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 3e30db7abcb..16e7239ebd8 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -629,6 +629,7 @@ ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_INTEROP_TEST streams/interop_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_NULL_MASK_TEST streams/null_mask_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing) diff --git a/cpp/tests/streams/null_mask_test.cpp b/cpp/tests/streams/null_mask_test.cpp new file mode 100644 index 00000000000..7e59201c8cf --- /dev/null +++ b/cpp/tests/streams/null_mask_test.cpp @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include + +#include +#include +#include + +class NullMaskTest : public cudf::test::BaseFixture {}; + +TEST_F(NullMaskTest, CreateNullMask) +{ + cudf::create_null_mask(10, cudf::mask_state::ALL_VALID, cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, SetNullMask) +{ + cudf::test::fixed_width_column_wrapper col({0, 1, 0, 1, 1}, + {true, false, true, false, false}); + + cudf::set_null_mask(static_cast(col).null_mask(), + 0, + 3, + false, + cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, CopyBitmask) +{ + cudf::test::fixed_width_column_wrapper const col({0, 1, 0, 1, 1}, + {true, false, true, false, false}); + + cudf::copy_bitmask( + static_cast(col).null_mask(), 0, 3, cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, CopyBitmaskFromColumn) +{ + cudf::test::fixed_width_column_wrapper const col({0, 1, 0, 1, 1}, + {true, false, true, false, false}); + + cudf::copy_bitmask(col, cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, BitMaskAnd) +{ + cudf::test::fixed_width_column_wrapper const col1({0, 1, 0, 1, 1}, + {true, false, true, false, false}); + cudf::test::fixed_width_column_wrapper const col2({0, 1, 0, 1, 1}, + {true, true, false, false, true}); + + auto tbl = cudf::table_view{{col1, col2}}; + cudf::bitmask_and(tbl, cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, BitMaskOr) +{ + cudf::test::fixed_width_column_wrapper const col1({0, 1, 0, 1, 1}, + {true, false, true, false, false}); + cudf::test::fixed_width_column_wrapper const col2({0, 1, 0, 1, 1}, + {true, true, false, false, true}); + + auto tbl = cudf::table_view{{col1, col2}}; + cudf::bitmask_or(tbl, cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, NullCount) +{ + cudf::test::fixed_width_column_wrapper const col({0, 1, 0, 1, 1}, + {true, true, false, false, true}); + + cudf::null_count( + static_cast(col).null_mask(), 0, 4, cudf::test::get_default_stream()); +} From e7c6365a4976881dc3cf0bcbfa254eb664cfe877 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 20 Oct 2023 11:32:38 -0700 Subject: [PATCH 037/118] Detect and report errors in Parquet header parsing (#14237) Fixes #13656. Uses the error reporting introduced in #14167 to report errors in header parsing. Authors: - Ed Seidl (https://github.com/etseidl) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14237 --- cpp/src/io/parquet/error.hpp | 77 ++++++++++++++++++ cpp/src/io/parquet/page_data.cu | 5 +- cpp/src/io/parquet/page_delta_decode.cu | 5 +- cpp/src/io/parquet/page_hdr.cu | 58 ++++++------- cpp/src/io/parquet/page_string_decode.cu | 5 +- cpp/src/io/parquet/parquet_gpu.hpp | 32 +++++++- cpp/src/io/parquet/reader_impl.cpp | 11 ++- cpp/src/io/parquet/reader_impl_preprocess.cu | 44 ++++------ .../cudf/tests/data/parquet/bad_dict.parquet | Bin 0 -> 2850 bytes python/cudf/cudf/tests/test_parquet.py | 8 ++ 10 files changed, 170 insertions(+), 75 deletions(-) create mode 100644 cpp/src/io/parquet/error.hpp create mode 100644 python/cudf/cudf/tests/data/parquet/bad_dict.parquet diff --git a/cpp/src/io/parquet/error.hpp b/cpp/src/io/parquet/error.hpp new file mode 100644 index 00000000000..92b5eebe9fd --- /dev/null +++ b/cpp/src/io/parquet/error.hpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_scalar.hpp>
+
+#include <cstdint>
+#include <sstream>
+
+namespace cudf::io::parquet {
+
+/**
+ * @brief Wrapper around a `rmm::device_scalar` for use in reporting errors that occur in
+ * kernel calls.
+ *
+ * The `kernel_error` object is created with a `rmm::cuda_stream_view` which is used throughout
+ * the object's lifetime.
+ */
+class kernel_error {
+ private:
+  rmm::device_scalar<int32_t> _error_code;
+
+ public:
+  /**
+   * @brief Construct a new `kernel_error` with an initial value of 0.
+   *
+   * Note: the initial value is set asynchronously.
+   *
+   * @throws `rmm::bad_alloc` if allocating the device memory for the error code fails.
+   * @throws `rmm::cuda_error` if copying the initial value to device memory fails.
+   *
+   * @param stream CUDA stream to use
+   */
+  kernel_error(rmm::cuda_stream_view stream) : _error_code{0, stream} {}
+
+  /**
+   * @brief Return a pointer to the device memory for the error
+   */
+  [[nodiscard]] auto data() { return _error_code.data(); }
+
+  /**
+   * @brief Return the current value of the error
+   *
+   * This uses, and synchronizes on, the stream this object was instantiated with.
+   */
+  [[nodiscard]] auto value() const { return _error_code.value(_error_code.stream()); }
+
+  /**
+   * @brief Return a hexadecimal string representation of the current error code
+   *
+   * The returned string will have "0x" prepended.
+ */ + [[nodiscard]] std::string str() const + { + std::stringstream sstream; + sstream << std::hex << value(); + return "0x" + sstream.str(); + } +}; + +} // namespace cudf::io::parquet diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index cce3659b902..a783b489c02 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -599,10 +599,7 @@ __global__ void __launch_bounds__(decode_block_size) } __syncthreads(); } - if (t == 0 and s->error != 0) { - cuda::atomic_ref ref{*error_code}; - ref.fetch_or(s->error, cuda::std::memory_order_relaxed); - } + if (t == 0 and s->error != 0) { set_error(s->error, error_code); } } struct mask_tform { diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu index d25684a59f3..bb5e5066b69 100644 --- a/cpp/src/io/parquet/page_delta_decode.cu +++ b/cpp/src/io/parquet/page_delta_decode.cu @@ -151,10 +151,7 @@ __global__ void __launch_bounds__(96) __syncthreads(); } - if (t == 0 and s->error != 0) { - cuda::atomic_ref ref{*error_code}; - ref.fetch_or(s->error, cuda::std::memory_order_relaxed); - } + if (t == 0 and s->error != 0) { set_error(s->error, error_code); } } } // anonymous namespace diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index eae8e05e61e..22add2fffc6 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -16,6 +16,9 @@ #include "parquet_gpu.hpp" #include + +#include + #include #include @@ -25,23 +28,6 @@ namespace cudf::io::parquet::detail { // Minimal thrift implementation for parsing page headers // https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md -static const __device__ __constant__ uint8_t g_list2struct[16] = {0, - 1, - 2, - ST_FLD_BYTE, - ST_FLD_DOUBLE, - 5, - ST_FLD_I16, - 7, - ST_FLD_I32, - 9, - ST_FLD_I64, - ST_FLD_BINARY, - ST_FLD_STRUCT, - ST_FLD_MAP, - ST_FLD_SET, - ST_FLD_LIST}; - struct byte_stream_s { uint8_t const* cur{}; uint8_t const* end{}; @@ -140,12 +126,13 @@ __device__ void skip_struct_field(byte_stream_s* bs, int field_type) case ST_FLD_SET: { // NOTE: skipping a list of lists is not handled auto const c = getb(bs); int n = c >> 4; - if (n == 0xf) n = get_u32(bs); - field_type = g_list2struct[c & 0xf]; - if (field_type == ST_FLD_STRUCT) + if (n == 0xf) { n = get_u32(bs); } + field_type = c & 0xf; + if (field_type == ST_FLD_STRUCT) { struct_depth += n; - else + } else { rep_cnt = n; + } } break; case ST_FLD_STRUCT: struct_depth++; break; } @@ -356,16 +343,20 @@ struct gpuParsePageHeader { */ // blockDim {128,1,1} __global__ void __launch_bounds__(128) - gpuDecodePageHeaders(ColumnChunkDesc* chunks, int32_t num_chunks) + gpuDecodePageHeaders(ColumnChunkDesc* chunks, int32_t num_chunks, int32_t* error_code) { + using cudf::detail::warp_size; gpuParsePageHeader parse_page_header; __shared__ byte_stream_s bs_g[4]; - int lane_id = threadIdx.x % 32; - int chunk = (blockIdx.x * 4) + (threadIdx.x / 32); - byte_stream_s* const bs = &bs_g[threadIdx.x / 32]; + int32_t error[4] = {0}; + auto const lane_id = threadIdx.x % warp_size; + auto const warp_id = threadIdx.x / warp_size; + auto const chunk = (blockIdx.x * 4) + warp_id; + auto const bs = &bs_g[warp_id]; - if (chunk < num_chunks and lane_id == 0) bs->ck = chunks[chunk]; + if (chunk < num_chunks and lane_id == 0) { bs->ck = chunks[chunk]; } + if (lane_id == 0) { error[warp_id] = 0; } __syncthreads(); if (chunk < num_chunks) { @@ -376,7 +367,7 @@ __global__ void __launch_bounds__(128) 
int32_t num_dict_pages = bs->ck.num_dict_pages; PageInfo* page_info; - if (!lane_id) { + if (lane_id == 0) { bs->base = bs->cur = bs->ck.compressed_data; bs->end = bs->base + bs->ck.compressed_size; bs->page.chunk_idx = chunk; @@ -412,6 +403,9 @@ __global__ void __launch_bounds__(128) bs->page.lvl_bytes[level_type::DEFINITION] = 0; bs->page.lvl_bytes[level_type::REPETITION] = 0; if (parse_page_header(bs) && bs->page.compressed_page_size >= 0) { + if (not is_supported_encoding(bs->page.encoding)) { + error[warp_id] |= static_cast(decode_error::UNSUPPORTED_ENCODING); + } switch (bs->page_type) { case PageType::DATA_PAGE: index_out = num_dict_pages + data_page_count; @@ -440,20 +434,25 @@ __global__ void __launch_bounds__(128) } bs->page.page_data = const_cast(bs->cur); bs->cur += bs->page.compressed_page_size; + if (bs->cur > bs->end) { + error[warp_id] |= static_cast(decode_error::DATA_STREAM_OVERRUN); + } bs->page.kernel_mask = kernel_mask_for_page(bs->page, bs->ck); } else { bs->cur = bs->end; } } index_out = shuffle(index_out); - if (index_out >= 0 && index_out < max_num_pages && lane_id == 0) + if (index_out >= 0 && index_out < max_num_pages && lane_id == 0) { page_info[index_out] = bs->page; + } num_values = shuffle(num_values); __syncwarp(); } if (lane_id == 0) { chunks[chunk].num_data_pages = data_page_count; chunks[chunk].num_dict_pages = dictionary_page_count; + if (error[warp_id] != 0) { set_error(error[warp_id], error_code); } } } } @@ -509,11 +508,12 @@ __global__ void __launch_bounds__(128) void __host__ DecodePageHeaders(ColumnChunkDesc* chunks, int32_t num_chunks, + int32_t* error_code, rmm::cuda_stream_view stream) { dim3 dim_block(128, 1); dim3 dim_grid((num_chunks + 3) >> 2, 1); // 1 chunk per warp, 4 warps per block - gpuDecodePageHeaders<<>>(chunks, num_chunks); + gpuDecodePageHeaders<<>>(chunks, num_chunks, error_code); } void __host__ BuildStringDictionaryIndex(ColumnChunkDesc* chunks, diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 4d79770ec34..4c7d8e3c20a 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -745,10 +745,7 @@ __global__ void __launch_bounds__(decode_block_size) auto const offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out); block_excl_sum(offptr, value_count, s->page.str_offset); - if (t == 0 and s->error != 0) { - cuda::atomic_ref ref{*error_code}; - ref.fetch_or(s->error, cuda::std::memory_order_relaxed); - } + if (t == 0 and s->error != 0) { set_error(s->error, error_code); } } } // anonymous namespace diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 048f1a73a9c..164e2cea2ed 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -31,6 +31,8 @@ #include #include +#include + #include #include @@ -54,6 +56,30 @@ constexpr int rolling_index(int index) return index % rolling_size; } +// see setupLocalPageInfo() in page_decode.cuh for supported page encodings +constexpr bool is_supported_encoding(Encoding enc) +{ + switch (enc) { + case Encoding::PLAIN: + case Encoding::PLAIN_DICTIONARY: + case Encoding::RLE: + case Encoding::RLE_DICTIONARY: + case Encoding::DELTA_BINARY_PACKED: return true; + default: return false; + } +} + +/** + * @brief Atomically OR `error` into `error_code`. 
+ */ +constexpr void set_error(int32_t error, int32_t* error_code) +{ + if (error != 0) { + cuda::atomic_ref ref{*error_code}; + ref.fetch_or(error, cuda::std::memory_order_relaxed); + } +} + /** * @brief Enum for the different types of errors that can occur during decoding. * @@ -495,9 +521,13 @@ constexpr bool is_string_col(ColumnChunkDesc const& chunk) * * @param[in] chunks List of column chunks * @param[in] num_chunks Number of column chunks + * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ -void DecodePageHeaders(ColumnChunkDesc* chunks, int32_t num_chunks, rmm::cuda_stream_view stream); +void DecodePageHeaders(ColumnChunkDesc* chunks, + int32_t num_chunks, + int32_t* error_code, + rmm::cuda_stream_view stream); /** * @brief Launches kernel for building the dictionary index for the column diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index db81222157a..11c20d0e540 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -15,6 +15,7 @@ */ #include "reader_impl.hpp" +#include "error.hpp" #include #include @@ -163,7 +164,8 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) chunk_nested_valids.host_to_device_async(_stream); chunk_nested_data.host_to_device_async(_stream); - rmm::device_scalar error_code(0, _stream); + // create this before we fork streams + kernel_error error_code(_stream); // get the number of streams we need from the pool and tell them to wait on the H2D copies int const nkernels = std::bitset<32>(kernel_mask).count(); @@ -199,11 +201,8 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) page_nesting.device_to_host_async(_stream); page_nesting_decode.device_to_host_async(_stream); - auto const decode_error = error_code.value(_stream); - if (decode_error != 0) { - std::stringstream stream; - stream << std::hex << decode_error; - CUDF_FAIL("Parquet data decode failed with code(s) 0x" + stream.str()); + if (error_code.value() != 0) { + CUDF_FAIL("Parquet data decode failed with code(s) " + error_code.str()); } // for list columns, add the final offset to every offset buffer. diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index ce45f709ee1..8494dc72a1d 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -14,6 +14,7 @@ * limitations under the License. 
  */

+#include "error.hpp"
 #include "reader_impl.hpp"

 #include
@@ -263,10 +264,15 @@ void generate_depth_remappings(std::map<int, std::pair<std::vector<int>, std::ve
 {
   size_t total_pages = 0;

+  kernel_error error_code(stream);
   chunks.host_to_device_async(stream);
-  DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream);
+  DecodePageHeaders(chunks.device_ptr(), chunks.size(), error_code.data(), stream);
   chunks.device_to_host_sync(stream);

+  if (error_code.value() != 0) {
+    CUDF_FAIL("Parquet header parsing failed with code(s) " + error_code.str());
+  }
+
   for (size_t c = 0; c < chunks.size(); c++) {
     total_pages += chunks[c].num_data_pages + chunks[c].num_dict_pages;
   }
@@ -274,19 +280,6 @@
   return total_pages;
 }

-// see setupLocalPageInfo() in page_data.cu for supported page encodings
-constexpr bool is_supported_encoding(Encoding enc)
-{
-  switch (enc) {
-    case Encoding::PLAIN:
-    case Encoding::PLAIN_DICTIONARY:
-    case Encoding::RLE:
-    case Encoding::RLE_DICTIONARY:
-    case Encoding::DELTA_BINARY_PACKED: return true;
-    default: return false;
-  }
-}
-
 /**
  * @brief Decode the page information from the given column chunks.
  *
@@ -307,8 +300,14 @@ int decode_page_headers(cudf::detail::hostdevice_vector<ColumnChunkDesc>& chunks
     page_count += chunks[c].max_num_pages;
   }

+  kernel_error error_code(stream);
   chunks.host_to_device_async(stream);
-  DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream);
+  DecodePageHeaders(chunks.device_ptr(), chunks.size(), error_code.data(), stream);
+
+  if (error_code.value() != 0) {
+    // TODO(ets): if an unsupported encoding was detected, do extra work to figure out which one
+    CUDF_FAIL("Parquet header parsing failed with code(s) " + error_code.str());
+  }

   // compute max bytes needed for level data
   auto level_bit_size =
     cudf::detail::make_counting_transform_iterator(0, [chunks = chunks.d_begin()] __device__(int i) {
       auto c = chunks[i];
       return static_cast<int32_t>(
         max(c.level_bits[level_type::REPETITION], c.level_bits[level_type::DEFINITION]));
     });
   // max level data bit size.
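   // (descriptive note: the reduction below takes the maximum of those per-chunk
   //  bit widths; e.g. a widest level of 10 bits yields max(1, ceil(10 / 8)) = 2
   //  bytes per level value, which decode_page_headers returns as level_type_size)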
-  int const max_level_bits    = thrust::reduce(rmm::exec_policy(stream),
+  int const max_level_bits = thrust::reduce(rmm::exec_policy(stream),
                                            level_bit_size,
                                            level_bit_size + chunks.size(),
                                            0,
                                            thrust::maximum<int>());
-  auto const level_type_size = std::max(1, cudf::util::div_rounding_up_safe(max_level_bits, 8));
-
-  pages.device_to_host_sync(stream);
-  // validate page encodings
-  CUDF_EXPECTS(std::all_of(pages.begin(),
-                           pages.end(),
-                           [](auto const& page) { return is_supported_encoding(page.encoding); }),
-               "Unsupported page encoding detected");
-
-  return level_type_size;
+  return std::max(1, cudf::util::div_rounding_up_safe(max_level_bits, 8));
 }

 /**
@@ -771,6 +761,7 @@ void reader::impl::load_and_decompress_data()

   // decoding of column/page information
   _pass_itm_data->level_type_size = decode_page_headers(chunks, pages, _stream);
+  pages.device_to_host_sync(_stream);

   if (has_compressed_data) {
     decomp_page_data = decompress_page_data(chunks, pages, _stream);
     // Free compressed data
@@ -795,7 +786,6 @@ void reader::impl::load_and_decompress_data()
   // std::vector output_info = build_output_column_info();

   // the following two allocate functions modify the page data
-  pages.device_to_host_sync(_stream);
   {
     // nesting information (sizes, etc) stored -per page-
     // note : even for flat schemas, we allocate 1 level of "nesting" info
diff --git a/python/cudf/cudf/tests/data/parquet/bad_dict.parquet b/python/cudf/cudf/tests/data/parquet/bad_dict.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..5008ac0b22b622cbcf37d5e286c749324fc26535
GIT binary patch
literal 2850
[2850 bytes of base85-encoded binary data elided]

literal 0
HcmV?d00001

diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index b892cc62ac4..d2c08246518 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -2830,6 +2830,14 @@ def test_parquet_reader_unsupported_page_encoding(datadir):
         cudf.read_parquet(fname)


+def test_parquet_reader_detect_bad_dictionary(datadir):
+    fname = datadir / "bad_dict.parquet"
+
+    # expect a failure when reading the whole file
+    with pytest.raises(RuntimeError):
+        cudf.read_parquet(fname)
+
+
 @pytest.mark.parametrize("data", [{"a": [1, 2, 3, 4]}, {"b": [1, None, 2, 3]}])
 @pytest.mark.parametrize("force_nullable_schema", [True, False])
 def test_parquet_writer_schema_nullability(data, force_nullable_schema):

From 253f6a6d5b19387c05368e073954ff773b3d6a39 Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Fri, 20 Oct 2023 14:05:53 -0700
Subject: [PATCH 038/118] Refactor LogicalType for Parquet (#14264)

Continuation of #14097, this PR refactors the LogicalType struct to use the
new way of treating unions defined in the Parquet thrift (more enum-like than
struct-like).

Authors:
  - Ed Seidl (https://github.com/etseidl)
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/14264
---
 .../io/parquet/compact_protocol_reader.cpp   |  95 +++--------
 .../io/parquet/compact_protocol_writer.cpp   |  81 +++++----
 cpp/src/io/parquet/page_decode.cuh           |   3 +-
 cpp/src/io/parquet/parquet.hpp               | 156 +++++++++++-------
 cpp/src/io/parquet/parquet_gpu.hpp           |  30 ++--
 cpp/src/io/parquet/reader_impl_chunking.cu   |  13 +-
 cpp/src/io/parquet/reader_impl_helpers.cpp   | 104 ++++++------
 cpp/src/io/parquet/writer_impl.cu            | 107 +++++++-----
 cpp/tests/io/parquet_test.cpp                |   7 +-
 9 files changed, 293 insertions(+), 303 deletions(-)

diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp
index 1a345ee0750..5a2b8aa8f2a 100644
--- a/cpp/src/io/parquet/compact_protocol_reader.cpp
+++ b/cpp/src/io/parquet/compact_protocol_reader.cpp
@@ -339,61 +339,6 @@ struct parquet_field_struct_list : public parquet_field_list {
   }
 };

-// TODO(ets): replace current union handling (which mirrors thrift) to use std::optional fields
-// in a struct
-/**
- * @brief Functor to read a union member from CompactProtocolReader
- *
- * @tparam is_empty True if tparam `T` type is empty type, else false.
- *
- * @return True if field types mismatch or if the process of reading a
- * union member fails
- */
-template <typename T, bool is_empty = false>
-class ParquetFieldUnionFunctor : public parquet_field {
-  bool& is_set;
-  T& val;
-
- public:
-  ParquetFieldUnionFunctor(int f, bool& b, T& v) : parquet_field(f), is_set(b), val(v) {}
-
-  inline bool operator()(CompactProtocolReader* cpr, int field_type)
-  {
-    if (field_type != ST_FLD_STRUCT) {
-      return true;
-    } else {
-      is_set = true;
-      return !cpr->read(&val);
-    }
-  }
-};
-
-template <typename T>
-class ParquetFieldUnionFunctor<T, true> : public parquet_field {
-  bool& is_set;
-  T& val;
-
- public:
-  ParquetFieldUnionFunctor(int f, bool& b, T& v) : parquet_field(f), is_set(b), val(v) {}
-
-  inline bool operator()(CompactProtocolReader* cpr, int field_type)
-  {
-    if (field_type != ST_FLD_STRUCT) {
-      return true;
-    } else {
-      is_set = true;
-      cpr->skip_struct_field(field_type);
-      return false;
-    }
-  }
-};
-
-template <typename T>
-ParquetFieldUnionFunctor<T, std::is_empty_v<T>> ParquetFieldUnion(int f, bool& b, T& v)
-{
-  return ParquetFieldUnionFunctor<T, std::is_empty_v<T>>(f, b, v);
-}
-
 /**
  * @brief Functor to read a binary from CompactProtocolReader
  *
@@ -595,34 +540,38 @@ bool CompactProtocolReader::read(FileMetaData* f)

 bool CompactProtocolReader::read(SchemaElement* s)
 {
+  using optional_converted_type =
+    parquet_field_optional<ConvertedType, parquet_field_enum<ConvertedType>>;
+  using optional_logical_type =
+    parquet_field_optional<LogicalType, parquet_field_struct<LogicalType>>;
   auto op = std::make_tuple(parquet_field_enum<Type>(1, s->type),
                             parquet_field_int32(2, s->type_length),
                             parquet_field_enum<FieldRepetitionType>(3, s->repetition_type),
                             parquet_field_string(4, s->name),
                             parquet_field_int32(5, s->num_children),
-                            parquet_field_enum<ConvertedType>(6, s->converted_type),
+                            optional_converted_type(6, s->converted_type),
                             parquet_field_int32(7, s->decimal_scale),
                             parquet_field_int32(8, s->decimal_precision),
                             parquet_field_optional<int32_t>(9, s->field_id),
-                            parquet_field_struct<LogicalType>(10, s->logical_type));
+                            optional_logical_type(10, s->logical_type));
   return function_builder(this, op);
 }

 bool CompactProtocolReader::read(LogicalType* l)
 {
-  auto op =
-    std::make_tuple(ParquetFieldUnion(1, l->isset.STRING, l->STRING),
-                    ParquetFieldUnion(2, l->isset.MAP, l->MAP),
-                    ParquetFieldUnion(3, l->isset.LIST, l->LIST),
-                    ParquetFieldUnion(4, l->isset.ENUM, l->ENUM),
-                    ParquetFieldUnion(5, l->isset.DECIMAL, l->DECIMAL),  // read the struct
-                    ParquetFieldUnion(6, l->isset.DATE, l->DATE),
-                    ParquetFieldUnion(7, l->isset.TIME, l->TIME),  // read the struct
-                    ParquetFieldUnion(8, l->isset.TIMESTAMP, l->TIMESTAMP),  // read the struct
-                    ParquetFieldUnion(10, l->isset.INTEGER, l->INTEGER),  // read the struct
-                    ParquetFieldUnion(11, l->isset.UNKNOWN, l->UNKNOWN),
-                    ParquetFieldUnion(12, l->isset.JSON, l->JSON),
-                    ParquetFieldUnion(13, l->isset.BSON, l->BSON));
+  auto op = std::make_tuple(
+    parquet_field_union_enumerator(1, l->type),
+    parquet_field_union_enumerator(2, l->type),
+    parquet_field_union_enumerator(3, l->type),
+    parquet_field_union_enumerator(4, l->type),
+    parquet_field_union_struct<DecimalType>(5, l->type, l->decimal_type),
+    parquet_field_union_enumerator(6, l->type),
+    parquet_field_union_struct<TimeType>(7, l->type, l->time_type),
+    parquet_field_union_struct<TimestampType>(8, l->type, l->timestamp_type),
+    parquet_field_union_struct<IntType>(10, l->type, l->int_type),
+    parquet_field_union_enumerator(11, l->type),
+    parquet_field_union_enumerator(12, l->type),
+    parquet_field_union_enumerator(13, l->type));
   return function_builder(this, op);
 }
@@ -648,9 +597,9 @@ bool CompactProtocolReader::read(TimestampType* t)

 bool CompactProtocolReader::read(TimeUnit* u)
 {
-  auto op = std::make_tuple(ParquetFieldUnion(1, u->isset.MILLIS, u->MILLIS),
-                            ParquetFieldUnion(2, u->isset.MICROS, u->MICROS),
-                            ParquetFieldUnion(3, u->isset.NANOS, u->NANOS));
+  auto op = std::make_tuple(parquet_field_union_enumerator(1, u->type),
+                            parquet_field_union_enumerator(2, u->type),
+                            parquet_field_union_enumerator(3, u->type));
   return function_builder(this, op);
 }

diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp
index 00810269d3c..fbeda7f1099 100644
--- a/cpp/src/io/parquet/compact_protocol_writer.cpp
+++ b/cpp/src/io/parquet/compact_protocol_writer.cpp
@@ -16,6 +16,8 @@

 #include "compact_protocol_writer.hpp"

+#include
+
 namespace cudf::io::parquet::detail {

 /**
@@ -46,13 +48,11 @@ size_t CompactProtocolWriter::write(DecimalType const& decimal)
 size_t CompactProtocolWriter::write(TimeUnit const& time_unit)
 {
   CompactProtocolFieldWriter c(*this);
-  auto const isset = time_unit.isset;
-  if (isset.MILLIS) {
-    c.field_struct(1, time_unit.MILLIS);
-  } else if (isset.MICROS) {
-    c.field_struct(2, time_unit.MICROS);
-  } else if (isset.NANOS) {
-    c.field_struct(3, time_unit.NANOS);
+  switch (time_unit.type) {
+    case TimeUnit::MILLIS:
+    case TimeUnit::MICROS:
+    case TimeUnit::NANOS: c.field_empty_struct(time_unit.type); break;
+    default: CUDF_FAIL("Trying to write an invalid TimeUnit " + std::to_string(time_unit.type));
   }
   return c.value();
 }
@@ -84,31 +84,29 @@ size_t CompactProtocolWriter::write(IntType const& integer)
 size_t CompactProtocolWriter::write(LogicalType const& logical_type)
 {
   CompactProtocolFieldWriter c(*this);
-  auto const isset = logical_type.isset;
-  if (isset.STRING) {
-    c.field_struct(1, logical_type.STRING);
-  } else if (isset.MAP) {
-    c.field_struct(2, logical_type.MAP);
-  } else if (isset.LIST) {
-    c.field_struct(3, logical_type.LIST);
-  } else if (isset.ENUM) {
-    c.field_struct(4, logical_type.ENUM);
-  } else if (isset.DECIMAL) {
-    c.field_struct(5, logical_type.DECIMAL);
-  } else if (isset.DATE) {
-    c.field_struct(6, logical_type.DATE);
-  } else if (isset.TIME) {
-    c.field_struct(7, logical_type.TIME);
-  } else if (isset.TIMESTAMP) {
-    c.field_struct(8, logical_type.TIMESTAMP);
-  } else if (isset.INTEGER) {
-    c.field_struct(10, logical_type.INTEGER);
-  } else if (isset.UNKNOWN) {
-    c.field_struct(11, logical_type.UNKNOWN);
-  } else if (isset.JSON) {
-    c.field_struct(12, logical_type.JSON);
-  } else if (isset.BSON) {
-    c.field_struct(13, logical_type.BSON);
+  switch (logical_type.type) {
+    case LogicalType::STRING:
+    case LogicalType::MAP:
+    case LogicalType::LIST:
+    case LogicalType::ENUM:
+    case LogicalType::DATE:
+    case LogicalType::UNKNOWN:
+    case LogicalType::JSON:
+    case LogicalType::BSON: c.field_empty_struct(logical_type.type); break;
+    case LogicalType::DECIMAL:
+      c.field_struct(LogicalType::DECIMAL, logical_type.decimal_type.value());
+      break;
+    case LogicalType::TIME:
+      c.field_struct(LogicalType::TIME, logical_type.time_type.value());
+      break;
+    case LogicalType::TIMESTAMP:
+      c.field_struct(LogicalType::TIMESTAMP, logical_type.timestamp_type.value());
+      break;
+    case LogicalType::INTEGER:
+      c.field_struct(LogicalType::INTEGER, logical_type.int_type.value());
+      break;
+    default:
+      CUDF_FAIL("Trying to write an invalid LogicalType " + std::to_string(logical_type.type));
   }
   return c.value();
 }
@@ -124,20 +122,15 @@ size_t CompactProtocolWriter::write(SchemaElement const& s)

   c.field_string(4, s.name);
   if (s.type == UNDEFINED_TYPE) { c.field_int(5, s.num_children); }
-  if (s.converted_type != UNKNOWN) {
-    c.field_int(6, s.converted_type);
+  if (s.converted_type.has_value()) {
+    c.field_int(6, s.converted_type.value());
     if (s.converted_type == DECIMAL) {
       c.field_int(7, s.decimal_scale);
       c.field_int(8, s.decimal_precision);
     }
   }
-  if (s.field_id) { c.field_int(9, s.field_id.value()); }
-  auto const isset = s.logical_type.isset;
-  // TODO: add handling for all logical types
-  // if (isset.STRING or isset.MAP or isset.LIST or isset.ENUM or isset.DECIMAL or isset.DATE or
-  //     isset.TIME or isset.TIMESTAMP or isset.INTEGER or isset.UNKNOWN or isset.JSON or isset.BSON)
-  // {
-  if (isset.TIMESTAMP or isset.TIME) { c.field_struct(10, s.logical_type); }
+  if (s.field_id.has_value()) { c.field_int(9, s.field_id.value()); }
+  if (s.logical_type.has_value()) { c.field_struct(10, s.logical_type.value()); }

   return c.value();
 }
@@ -223,9 +216,9 @@ size_t CompactProtocolWriter::write(OffsetIndex const& s)
 size_t CompactProtocolWriter::write(ColumnOrder const& co)
 {
   CompactProtocolFieldWriter c(*this);
-  switch (co) {
-    case ColumnOrder::TYPE_ORDER: c.field_empty_struct(1); break;
-    default: break;
+  switch (co.type) {
+    case ColumnOrder::TYPE_ORDER: c.field_empty_struct(co.type); break;
+    default: CUDF_FAIL("Trying to write an invalid ColumnOrder " + std::to_string(co.type));
   }
   return c.value();
 }
diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh
index 7c866fd8b9e..ab1cc68923d 100644
--- a/cpp/src/io/parquet/page_decode.cuh
+++ b/cpp/src/io/parquet/page_decode.cuh
@@ -1143,7 +1143,8 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s,
         units = cudf::timestamp_ms::period::den;
       } else if (s->col.converted_type == TIMESTAMP_MICROS) {
         units = cudf::timestamp_us::period::den;
-      } else if (s->col.logical_type.TIMESTAMP.unit.isset.NANOS) {
+      } else if (s->col.logical_type.has_value() and
+                 s->col.logical_type->is_timestamp_nanos()) {
         units = cudf::timestamp_ns::period::den;
       }
       if (units and units != s->col.ts_clock_rate) {
diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp
index 1cd16ac6102..699cad89703 100644
--- a/cpp/src/io/parquet/parquet.hpp
+++ b/cpp/src/io/parquet/parquet.hpp
@@ -46,79 +46,98 @@ struct file_ender_s {
   uint32_t magic;
 };

-// thrift generated code simplified.
-struct StringType {};
-struct MapType {};
-struct ListType {};
-struct EnumType {};
+// thrift inspired code simplified.
 struct DecimalType {
   int32_t scale     = 0;
   int32_t precision = 0;
 };
-struct DateType {};
-
-struct MilliSeconds {};
-struct MicroSeconds {};
-struct NanoSeconds {};
-using TimeUnit_isset = struct TimeUnit_isset {
-  bool MILLIS{false};
-  bool MICROS{false};
-  bool NANOS{false};
-};
 struct TimeUnit {
-  TimeUnit_isset isset;
-  MilliSeconds MILLIS;
-  MicroSeconds MICROS;
-  NanoSeconds NANOS;
+  enum Type { UNDEFINED, MILLIS, MICROS, NANOS };
+  Type type;
 };

 struct TimeType {
   bool isAdjustedToUTC = false;
   TimeUnit unit;
 };
+
 struct TimestampType {
   bool isAdjustedToUTC = false;
   TimeUnit unit;
 };
+
 struct IntType {
   int8_t bitWidth = 0;
   bool isSigned   = false;
 };
-struct NullType {};
-struct JsonType {};
-struct BsonType {};
-
-// thrift generated code simplified.
-using LogicalType_isset = struct LogicalType_isset {
-  bool STRING{false};
-  bool MAP{false};
-  bool LIST{false};
-  bool ENUM{false};
-  bool DECIMAL{false};
-  bool DATE{false};
-  bool TIME{false};
-  bool TIMESTAMP{false};
-  bool INTEGER{false};
-  bool UNKNOWN{false};
-  bool JSON{false};
-  bool BSON{false};
-};
 struct LogicalType {
-  LogicalType_isset isset;
-  StringType STRING;
-  MapType MAP;
-  ListType LIST;
-  EnumType ENUM;
-  DecimalType DECIMAL;
-  DateType DATE;
-  TimeType TIME;
-  TimestampType TIMESTAMP;
-  IntType INTEGER;
-  NullType UNKNOWN;
-  JsonType JSON;
-  BsonType BSON;
+  enum Type {
+    UNDEFINED,
+    STRING,
+    MAP,
+    LIST,
+    ENUM,
+    DECIMAL,
+    DATE,
+    TIME,
+    TIMESTAMP,
+    // 9 is reserved
+    INTEGER = 10,
+    UNKNOWN,
+    JSON,
+    BSON
+  };
+  Type type;
+  thrust::optional<DecimalType> decimal_type;
+  thrust::optional<TimeType> time_type;
+  thrust::optional<TimestampType> timestamp_type;
+  thrust::optional<IntType> int_type;
+
+  LogicalType(Type tp = UNDEFINED) : type(tp) {}
+  LogicalType(DecimalType&& dt) : type(DECIMAL), decimal_type(dt) {}
+  LogicalType(TimeType&& tt) : type(TIME), time_type(tt) {}
+  LogicalType(TimestampType&& tst) : type(TIMESTAMP), timestamp_type(tst) {}
+  LogicalType(IntType&& it) : type(INTEGER), int_type(it) {}
+
+  constexpr bool is_time_millis() const
+  {
+    return type == TIME and time_type->unit.type == TimeUnit::MILLIS;
+  }
+
+  constexpr bool is_time_micros() const
+  {
+    return type == TIME and time_type->unit.type == TimeUnit::MICROS;
+  }
+
+  constexpr bool is_time_nanos() const
+  {
+    return type == TIME and time_type->unit.type == TimeUnit::NANOS;
+  }
+
+  constexpr bool is_timestamp_millis() const
+  {
+    return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::MILLIS;
+  }
+
+  constexpr bool is_timestamp_micros() const
+  {
+    return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::MICROS;
+  }
+
+  constexpr bool is_timestamp_nanos() const
+  {
+    return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::NANOS;
+  }
+
+  constexpr int8_t bit_width() const { return type == INTEGER ? int_type->bitWidth : -1; }
+
+  constexpr bool is_signed() const { return type == INTEGER and int_type->isSigned; }
+
+  constexpr int32_t scale() const { return type == DECIMAL ? decimal_type->scale : -1; }
+
+  constexpr int32_t precision() const { return type == DECIMAL ? decimal_type->precision : -1; }
 };

 /**
@@ -127,8 +146,6 @@ struct LogicalType {
 struct ColumnOrder {
   enum Type { UNDEFINED, TYPE_ORDER };
   Type type;
-
-  operator Type() const { return type; }
 };

 /**
@@ -138,18 +155,29 @@ struct ColumnOrder {
  * as a schema tree.
 */
 struct SchemaElement {
-  Type type = UNDEFINED_TYPE;
-  ConvertedType converted_type = UNKNOWN;
-  LogicalType logical_type;
-  int32_t type_length =
-    0;  // Byte length of FIXED_LENGTH_BYTE_ARRAY elements, or maximum bit length for other types
+  // 1: parquet physical type for output
+  Type type = UNDEFINED_TYPE;
+  // 2: byte length of FIXED_LENGTH_BYTE_ARRAY elements, or maximum bit length for other types
+  int32_t type_length = 0;
+  // 3: repetition of the field
   FieldRepetitionType repetition_type = REQUIRED;
-  std::string name = "";
-  int32_t num_children = 0;
-  int32_t decimal_scale = 0;
-  int32_t decimal_precision = 0;
-  thrust::optional<int32_t> field_id = thrust::nullopt;
-  bool output_as_byte_array = false;
+  // 4: name of the field
+  std::string name = "";
+  // 5: nested fields
+  int32_t num_children = 0;
+  // 6: DEPRECATED: record the original type before conversion to parquet type
+  thrust::optional<ConvertedType> converted_type;
+  // 7: DEPRECATED: record the scale for DECIMAL converted type
+  int32_t decimal_scale = 0;
+  // 8: DEPRECATED: record the precision for DECIMAL converted type
+  int32_t decimal_precision = 0;
+  // 9: save field_id from original schema
+  thrust::optional<int32_t> field_id;
+  // 10: replaces converted type
+  thrust::optional<LogicalType> logical_type;
+
+  // extra cudf specific fields
+  bool output_as_byte_array = false;

   // The following fields are filled in later during schema initialization
   int max_definition_level = 0;
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index 164e2cea2ed..68851e72663 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -313,7 +313,7 @@ struct ColumnChunkDesc {
                   uint8_t rep_level_bits_,
                   int8_t codec_,
                   int8_t converted_type_,
-                  LogicalType logical_type_,
+                  thrust::optional<LogicalType> logical_type_,
                   int8_t decimal_precision_,
                   int32_t ts_clock_rate_,
                   int32_t src_col_index_,
@@ -355,20 +355,20 @@ struct ColumnChunkDesc {
   uint16_t data_type{};  // basic column data type, ((type_length << 3) |
                          // parquet::Type)
   uint8_t
-    level_bits[level_type::NUM_LEVEL_TYPES]{};  // bits to encode max definition/repetition levels
-  int32_t num_data_pages{};             // number of data pages
-  int32_t num_dict_pages{};             // number of dictionary pages
-  int32_t max_num_pages{};              // size of page_info array
-  PageInfo* page_info{};                // output page info for up to num_dict_pages +
-                                        // num_data_pages (dictionary pages first)
-  string_index_pair* str_dict_index{};  // index for string dictionary
-  bitmask_type** valid_map_base{};      // base pointers of valid bit map for this column
-  void** column_data_base{};            // base pointers of column data
-  void** column_string_base{};          // base pointers of column string data
-  int8_t codec{};                       // compressed codec enum
-  int8_t converted_type{};              // converted type enum
-  LogicalType logical_type{};           // logical type
-  int8_t decimal_precision{};           // Decimal precision
+    level_bits[level_type::NUM_LEVEL_TYPES]{};   // bits to encode max definition/repetition levels
+  int32_t num_data_pages{};                      // number of data pages
+  int32_t num_dict_pages{};                      // number of dictionary pages
+  int32_t max_num_pages{};                       // size of page_info array
+  PageInfo* page_info{};                         // output page info for up to num_dict_pages +
+                                                 // num_data_pages (dictionary pages first)
+  string_index_pair* str_dict_index{};           // index for string dictionary
+  bitmask_type** valid_map_base{};               // base pointers of valid bit map for this column
+  void** column_data_base{};                     // base pointers of column data
+  void** column_string_base{};                   // base pointers of column string data
+  int8_t codec{};                                // compressed codec enum
+  int8_t converted_type{};                       // converted type enum
+  thrust::optional<LogicalType> logical_type{};  // logical type
+  int8_t decimal_precision{};                    // Decimal precision
   int32_t ts_clock_rate{};  // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns)

   int32_t src_col_index{};  // my input column index
diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu
index ad52a7dfcc1..213fc380a34 100644
--- a/cpp/src/io/parquet/reader_impl_chunking.cu
+++ b/cpp/src/io/parquet/reader_impl_chunking.cu
@@ -304,11 +304,12 @@ std::vector find_splits(std::vector const&
  *
  * @return A tuple of Parquet type width, Parquet clock rate and Parquet decimal type.
  */
-[[nodiscard]] std::tuple<int32_t, int32_t, int8_t> conversion_info(type_id column_type_id,
-                                                                   type_id timestamp_type_id,
-                                                                   Type physical,
-                                                                   int8_t converted,
-                                                                   int32_t length)
+[[nodiscard]] std::tuple<int32_t, int32_t, int8_t> conversion_info(
+  type_id column_type_id,
+  type_id timestamp_type_id,
+  Type physical,
+  thrust::optional<ConvertedType> converted,
+  int32_t length)
 {
   int32_t type_width = (physical == FIXED_LEN_BYTE_ARRAY) ? length : 0;
   int32_t clock_rate = 0;
@@ -322,7 +323,7 @@ std::vector find_splits(std::vector const&
     clock_rate = to_clockrate(timestamp_type_id);
   }

-  int8_t converted_type = converted;
+  int8_t converted_type = converted.value_or(UNKNOWN);
   if (converted_type == DECIMAL && column_type_id != type_id::FLOAT64 &&
       not cudf::is_fixed_point(data_type{column_type_id})) {
     converted_type = UNKNOWN;  // Not converting to float64 or decimal
diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp
index 040c6403f57..a9c84143e1a 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.cpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.cpp
@@ -25,44 +25,42 @@ namespace cudf::io::parquet::detail {

 namespace {

-ConvertedType logical_type_to_converted_type(LogicalType const& logical)
+ConvertedType logical_type_to_converted_type(thrust::optional<LogicalType> const& logical)
 {
-  if (logical.isset.STRING) {
-    return UTF8;
-  } else if (logical.isset.MAP) {
-    return MAP;
-  } else if (logical.isset.LIST) {
-    return LIST;
-  } else if (logical.isset.ENUM) {
-    return ENUM;
-  } else if (logical.isset.DECIMAL) {
-    return DECIMAL;  // TODO set decimal values
-  } else if (logical.isset.DATE) {
-    return DATE;
-  } else if (logical.isset.TIME) {
-    if (logical.TIME.unit.isset.MILLIS)
-      return TIME_MILLIS;
-    else if (logical.TIME.unit.isset.MICROS)
-      return TIME_MICROS;
-  } else if (logical.isset.TIMESTAMP) {
-    if (logical.TIMESTAMP.unit.isset.MILLIS)
-      return TIMESTAMP_MILLIS;
-    else if (logical.TIMESTAMP.unit.isset.MICROS)
-      return TIMESTAMP_MICROS;
-  } else if (logical.isset.INTEGER) {
-    switch (logical.INTEGER.bitWidth) {
-      case 8: return logical.INTEGER.isSigned ? INT_8 : UINT_8;
-      case 16: return logical.INTEGER.isSigned ? INT_16 : UINT_16;
-      case 32: return logical.INTEGER.isSigned ? INT_32 : UINT_32;
-      case 64: return logical.INTEGER.isSigned ? INT_64 : UINT_64;
-      default: break;
-    }
-  } else if (logical.isset.UNKNOWN) {
-    return NA;
-  } else if (logical.isset.JSON) {
-    return JSON;
-  } else if (logical.isset.BSON) {
-    return BSON;
+  if (not logical.has_value()) { return UNKNOWN; }
+  switch (logical->type) {
+    case LogicalType::STRING: return UTF8;
+    case LogicalType::MAP: return MAP;
+    case LogicalType::LIST: return LIST;
+    case LogicalType::ENUM: return ENUM;
+    case LogicalType::DECIMAL: return DECIMAL;  // TODO use decimal scale/precision
+    case LogicalType::DATE: return DATE;
+    case LogicalType::TIME:
+      if (logical->is_time_millis()) {
+        return TIME_MILLIS;
+      } else if (logical->is_time_micros()) {
+        return TIME_MICROS;
+      }
+      break;
+    case LogicalType::TIMESTAMP:
+      if (logical->is_timestamp_millis()) {
+        return TIMESTAMP_MILLIS;
+      } else if (logical->is_timestamp_micros()) {
+        return TIMESTAMP_MICROS;
+      }
+      break;
+    case LogicalType::INTEGER:
+      switch (logical->bit_width()) {
+        case 8: return logical->is_signed() ? INT_8 : UINT_8;
+        case 16: return logical->is_signed() ? INT_16 : UINT_16;
+        case 32: return logical->is_signed() ? INT_32 : UINT_32;
+        case 64: return logical->is_signed() ? INT_64 : UINT_64;
+        default: break;
+      }
+    case LogicalType::UNKNOWN: return NA;
+    case LogicalType::JSON: return JSON;
+    case LogicalType::BSON: return BSON;
+    default: break;
   }
   return UNKNOWN;
 }
@@ -76,20 +74,20 @@ type_id to_type_id(SchemaElement const& schema,
                    bool strings_to_categorical,
                    type_id timestamp_type_id)
 {
-  Type const physical            = schema.type;
-  LogicalType const logical_type = schema.logical_type;
-  ConvertedType converted_type   = schema.converted_type;
-  int32_t decimal_precision      = schema.decimal_precision;
+  auto const physical       = schema.type;
+  auto const logical_type   = schema.logical_type;
+  auto converted_type       = schema.converted_type;
+  int32_t decimal_precision = schema.decimal_precision;

+  // FIXME(ets): this should just use logical type to deduce the type_id. then fall back to
+  // converted_type if logical_type isn't set
   // Logical type used for actual data interpretation; the legacy converted type
   // is superseded by 'logical' type whenever available.
   auto const inferred_converted_type = logical_type_to_converted_type(logical_type);
   if (inferred_converted_type != UNKNOWN) { converted_type = inferred_converted_type; }
-  if (inferred_converted_type == DECIMAL) {
-    decimal_precision = schema.logical_type.DECIMAL.precision;
-  }
+  if (inferred_converted_type == DECIMAL) { decimal_precision = schema.logical_type->precision(); }

-  switch (converted_type) {
+  switch (converted_type.value_or(UNKNOWN)) {
     case UINT_8: return type_id::UINT8;
     case INT_8: return type_id::INT8;
     case UINT_16: return type_id::UINT16;
@@ -140,15 +138,13 @@ type_id to_type_id(SchemaElement const& schema,
     default: break;
   }

-  if (inferred_converted_type == UNKNOWN and physical == INT64 and
-      logical_type.TIMESTAMP.unit.isset.NANOS) {
-    return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id
-                                                 : type_id::TIMESTAMP_NANOSECONDS;
-  }
-
-  if (inferred_converted_type == UNKNOWN and physical == INT64 and
-      logical_type.TIME.unit.isset.NANOS) {
-    return type_id::DURATION_NANOSECONDS;
+  if (inferred_converted_type == UNKNOWN and physical == INT64 and logical_type.has_value()) {
+    if (logical_type->is_timestamp_nanos()) {
+      return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id
+                                                   : type_id::TIMESTAMP_NANOSECONDS;
+    } else if (logical_type->is_time_nanos()) {
+      return type_id::DURATION_NANOSECONDS;
+    }
   }

   // is it simply a struct?
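For illustration only (a minimal sketch, not part of this patch, using just the
types declared in parquet.hpp above): the old isset-struct query
`lt.isset.TIMESTAMP && lt.TIMESTAMP.unit.isset.NANOS` collapses into a single
predicate on the active union member under the new representation:

    // construct the TIMESTAMP(NANOS) member via the TimestampType&& constructor
    LogicalType lt{TimestampType{false, TimeUnit::NANOS}};
    // only the timestamp_type optional is engaged; the others stay disengaged
    bool const is_ns = lt.is_timestamp_nanos();   // true
    bool const is_us = lt.is_timestamp_micros();  // false
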
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 50589f23626..c06acc1690b 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -284,6 +284,7 @@ struct leaf_schema_fn {
   {
     col_schema.type        = Type::BOOLEAN;
     col_schema.stats_dtype = statistics_dtype::dtype_bool;
+    // BOOLEAN needs no converted or logical type
   }

   template <typename T>
   std::enable_if_t<std::is_same_v<T, int8_t>, void> operator()()
   {
     col_schema.type           = Type::INT32;
     col_schema.converted_type = ConvertedType::INT_8;
     col_schema.stats_dtype    = statistics_dtype::dtype_int8;
+    col_schema.logical_type   = LogicalType{IntType{8, true}};
   }

   template <typename T>
   std::enable_if_t<std::is_same_v<T, int16_t>, void> operator()()
   {
     col_schema.type           = Type::INT32;
     col_schema.converted_type = ConvertedType::INT_16;
     col_schema.stats_dtype    = statistics_dtype::dtype_int16;
+    col_schema.logical_type   = LogicalType{IntType{16, true}};
   }

   template <typename T>
   std::enable_if_t<std::is_same_v<T, int32_t>, void> operator()()
   {
     col_schema.type        = Type::INT32;
     col_schema.stats_dtype = statistics_dtype::dtype_int32;
+    // INT32 needs no converted or logical type
   }

   template <typename T>
   std::enable_if_t<std::is_same_v<T, int64_t>, void> operator()()
   {
     col_schema.type        = Type::INT64;
     col_schema.stats_dtype = statistics_dtype::dtype_int64;
+    // INT64 needs no converted or logical type
   }

   template <typename T>
   std::enable_if_t<std::is_same_v<T, uint8_t>, void> operator()()
   {
     col_schema.type           = Type::INT32;
     col_schema.converted_type = ConvertedType::UINT_8;
     col_schema.stats_dtype    = statistics_dtype::dtype_int8;
+    col_schema.logical_type   = LogicalType{IntType{8, false}};
   }

   template <typename T>
   std::enable_if_t<std::is_same_v<T, uint16_t>, void> operator()()
   {
     col_schema.type           = Type::INT32;
     col_schema.converted_type = ConvertedType::UINT_16;
     col_schema.stats_dtype    = statistics_dtype::dtype_int16;
+    col_schema.logical_type   = LogicalType{IntType{16, false}};
   }

   template <typename T>
   std::enable_if_t<std::is_same_v<T, uint32_t>, void> operator()()
   {
     col_schema.type           = Type::INT32;
     col_schema.converted_type = ConvertedType::UINT_32;
     col_schema.stats_dtype    = statistics_dtype::dtype_int32;
+    col_schema.logical_type   = LogicalType{IntType{32, false}};
   }

   template <typename T>
   std::enable_if_t<std::is_same_v<T, uint64_t>, void> operator()()
   {
     col_schema.type           = Type::INT64;
     col_schema.converted_type = ConvertedType::UINT_64;
     col_schema.stats_dtype    = statistics_dtype::dtype_int64;
+    col_schema.logical_type   = LogicalType{IntType{64, false}};
   }

   template <typename T>
   std::enable_if_t<std::is_same_v<T, float>, void> operator()()
   {
     col_schema.type        = Type::FLOAT;
     col_schema.stats_dtype = statistics_dtype::dtype_float32;
+    // FLOAT needs no converted or logical type
   }

   template <typename T>
   std::enable_if_t<std::is_same_v<T, double>, void> operator()()
   {
     col_schema.type        = Type::DOUBLE;
     col_schema.stats_dtype = statistics_dtype::dtype_float64;
+    // DOUBLE needs no converted or logical type
   }

   template <typename T>
   std::enable_if_t<std::is_same_v<T, cudf::string_view>, void> operator()()
   {
     col_schema.type = Type::BYTE_ARRAY;
     if (col_meta.is_enabled_output_as_binary()) {
-      col_schema.converted_type = ConvertedType::UNKNOWN;
-      col_schema.stats_dtype    = statistics_dtype::dtype_byte_array;
+      col_schema.stats_dtype = statistics_dtype::dtype_byte_array;
+      // BYTE_ARRAY needs no converted or logical type
     } else {
       col_schema.converted_type = ConvertedType::UTF8;
       col_schema.stats_dtype    = statistics_dtype::dtype_string;
+      col_schema.logical_type   = LogicalType{LogicalType::STRING};
     }
   }

   template <typename T>
   std::enable_if_t<std::is_same_v<T, cudf::timestamp_D>, void> operator()()
   {
     col_schema.type           = Type::INT32;
     col_schema.converted_type = ConvertedType::DATE;
     col_schema.stats_dtype    = statistics_dtype::dtype_int32;
+    col_schema.logical_type   = LogicalType{LogicalType::DATE};
   }

   template <typename T>
   std::enable_if_t<std::is_same_v<T, cudf::timestamp_s>, void> operator()()
   {
-    col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64;
-    col_schema.converted_type =
-      (timestamp_is_int96) ? ConvertedType::UNKNOWN : ConvertedType::TIMESTAMP_MILLIS;
+    col_schema.type        = (timestamp_is_int96) ? Type::INT96 : Type::INT64;
     col_schema.stats_dtype = statistics_dtype::dtype_timestamp64;
     col_schema.ts_scale    = 1000;
+    if (not timestamp_is_int96) {
+      col_schema.converted_type = ConvertedType::TIMESTAMP_MILLIS;
+      col_schema.logical_type   = LogicalType{TimestampType{false, TimeUnit::MILLIS}};
+    }
   }

   template <typename T>
   std::enable_if_t<std::is_same_v<T, cudf::timestamp_ms>, void> operator()()
   {
-    col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64;
-    col_schema.converted_type =
-      (timestamp_is_int96) ? ConvertedType::UNKNOWN : ConvertedType::TIMESTAMP_MILLIS;
+    col_schema.type        = (timestamp_is_int96) ? Type::INT96 : Type::INT64;
     col_schema.stats_dtype = statistics_dtype::dtype_timestamp64;
+    if (not timestamp_is_int96) {
+      col_schema.converted_type = ConvertedType::TIMESTAMP_MILLIS;
+      col_schema.logical_type   = LogicalType{TimestampType{false, TimeUnit::MILLIS}};
+    }
   }

   template <typename T>
   std::enable_if_t<std::is_same_v<T, cudf::timestamp_us>, void> operator()()
   {
-    col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64;
-    col_schema.converted_type =
-      (timestamp_is_int96) ? ConvertedType::UNKNOWN : ConvertedType::TIMESTAMP_MICROS;
+    col_schema.type        = (timestamp_is_int96) ? Type::INT96 : Type::INT64;
     col_schema.stats_dtype = statistics_dtype::dtype_timestamp64;
+    if (not timestamp_is_int96) {
+      col_schema.converted_type = ConvertedType::TIMESTAMP_MICROS;
+      col_schema.logical_type   = LogicalType{TimestampType{false, TimeUnit::MICROS}};
+    }
   }

   template <typename T>
   std::enable_if_t<std::is_same_v<T, cudf::timestamp_ns>, void> operator()()
   {
     col_schema.type           = (timestamp_is_int96) ? Type::INT96 : Type::INT64;
-    col_schema.converted_type = ConvertedType::UNKNOWN;
+    col_schema.converted_type = thrust::nullopt;
     col_schema.stats_dtype    = statistics_dtype::dtype_timestamp64;
     if (timestamp_is_int96) {
       col_schema.ts_scale = -1000;  // negative value indicates division by absolute value
     }
     // set logical type if it's not int96
     else {
-      col_schema.logical_type.isset.TIMESTAMP            = true;
-      col_schema.logical_type.TIMESTAMP.unit.isset.NANOS = true;
+      col_schema.logical_type = LogicalType{TimestampType{false, TimeUnit::NANOS}};
     }
   }

@@ -431,53 +449,48 @@ struct leaf_schema_fn {
   template <typename T>
   std::enable_if_t<std::is_same_v<T, cudf::duration_D>, void> operator()()
   {
-    col_schema.type                                = Type::INT32;
-    col_schema.converted_type                      = ConvertedType::TIME_MILLIS;
-    col_schema.stats_dtype                         = statistics_dtype::dtype_int32;
-    col_schema.ts_scale                            = 24 * 60 * 60 * 1000;
-    col_schema.logical_type.isset.TIME             = true;
-    col_schema.logical_type.TIME.unit.isset.MILLIS = true;
+    col_schema.type           = Type::INT32;
+    col_schema.converted_type = ConvertedType::TIME_MILLIS;
+    col_schema.stats_dtype    = statistics_dtype::dtype_int32;
+    col_schema.ts_scale       = 24 * 60 * 60 * 1000;
+    col_schema.logical_type   = LogicalType{TimeType{false, TimeUnit::MILLIS}};
   }

   template <typename T>
   std::enable_if_t<std::is_same_v<T, cudf::duration_s>, void> operator()()
   {
-    col_schema.type                                = Type::INT32;
-    col_schema.converted_type                      = ConvertedType::TIME_MILLIS;
-    col_schema.stats_dtype                         = statistics_dtype::dtype_int32;
-    col_schema.ts_scale                            = 1000;
-    col_schema.logical_type.isset.TIME             = true;
-    col_schema.logical_type.TIME.unit.isset.MILLIS = true;
+    col_schema.type           = Type::INT32;
+    col_schema.converted_type = ConvertedType::TIME_MILLIS;
+    col_schema.stats_dtype    = statistics_dtype::dtype_int32;
+    col_schema.ts_scale       = 1000;
+    col_schema.logical_type   = LogicalType{TimeType{false, TimeUnit::MILLIS}};
   }

   template <typename T>
   std::enable_if_t<std::is_same_v<T, cudf::duration_ms>, void> operator()()
   {
-    col_schema.type                                = Type::INT32;
-    col_schema.converted_type                      = ConvertedType::TIME_MILLIS;
-    col_schema.stats_dtype                         = statistics_dtype::dtype_int32;
-    col_schema.logical_type.isset.TIME             = true;
-    col_schema.logical_type.TIME.unit.isset.MILLIS = true;
+    col_schema.type           = Type::INT32;
+    col_schema.converted_type = ConvertedType::TIME_MILLIS;
+    col_schema.stats_dtype    = statistics_dtype::dtype_int32;
+    col_schema.logical_type   = LogicalType{TimeType{false, TimeUnit::MILLIS}};
   }

   template <typename T>
   std::enable_if_t<std::is_same_v<T, cudf::duration_us>, void> operator()()
   {
-    col_schema.type                                = Type::INT64;
-    col_schema.converted_type                      = ConvertedType::TIME_MICROS;
-    col_schema.stats_dtype                         = statistics_dtype::dtype_int64;
-    col_schema.logical_type.isset.TIME             = true;
-    col_schema.logical_type.TIME.unit.isset.MICROS = true;
+    col_schema.type           = Type::INT64;
+    col_schema.converted_type = ConvertedType::TIME_MICROS;
+    col_schema.stats_dtype    = statistics_dtype::dtype_int64;
+    col_schema.logical_type   = LogicalType{TimeType{false, TimeUnit::MICROS}};
   }

   // unsupported outside cudf for parquet 1.0.
   template <typename T>
   std::enable_if_t<std::is_same_v<T, cudf::duration_ns>, void> operator()()
   {
-    col_schema.type                               = Type::INT64;
-    col_schema.stats_dtype                        = statistics_dtype::dtype_int64;
-    col_schema.logical_type.isset.TIME            = true;
-    col_schema.logical_type.TIME.unit.isset.NANOS = true;
+    col_schema.type         = Type::INT64;
+    col_schema.stats_dtype  = statistics_dtype::dtype_int64;
+    col_schema.logical_type = LogicalType{TimeType{false, TimeUnit::NANOS}};
   }

   template <typename T>
   std::enable_if_t<cudf::is_fixed_point<T>(), void> operator()()
   {
@@ -487,27 +500,32 @@ struct leaf_schema_fn {
     if (std::is_same_v<T, numeric::decimal32>) {
       col_schema.type              = Type::INT32;
       col_schema.stats_dtype       = statistics_dtype::dtype_int32;
       col_schema.decimal_precision = MAX_DECIMAL32_PRECISION;
+      col_schema.logical_type      = LogicalType{DecimalType{0, MAX_DECIMAL32_PRECISION}};
     } else if (std::is_same_v<T, numeric::decimal64>) {
       col_schema.type              = Type::INT64;
       col_schema.stats_dtype       = statistics_dtype::dtype_decimal64;
       col_schema.decimal_precision = MAX_DECIMAL64_PRECISION;
+      col_schema.logical_type      = LogicalType{DecimalType{0, MAX_DECIMAL64_PRECISION}};
     } else if (std::is_same_v<T, numeric::decimal128>) {
       col_schema.type              = Type::FIXED_LEN_BYTE_ARRAY;
       col_schema.type_length       = sizeof(__int128_t);
       col_schema.stats_dtype       = statistics_dtype::dtype_decimal128;
       col_schema.decimal_precision = MAX_DECIMAL128_PRECISION;
+      col_schema.logical_type      = LogicalType{DecimalType{0, MAX_DECIMAL128_PRECISION}};
     } else {
       CUDF_FAIL("Unsupported fixed point type for parquet writer");
     }
     col_schema.converted_type = ConvertedType::DECIMAL;
     col_schema.decimal_scale = -col->type().scale();  // parquet and cudf disagree about scale signs
+    col_schema.logical_type->decimal_type->scale = -col->type().scale();
     if (col_meta.is_decimal_precision_set()) {
       CUDF_EXPECTS(col_meta.get_decimal_precision() >= col_schema.decimal_scale,
                    "Precision must be equal to or greater than scale!");
       if (col_schema.type == Type::INT64 and col_meta.get_decimal_precision() < 10) {
         CUDF_LOG_WARN("Parquet writer: writing a decimal column with precision < 10 as int64");
       }
-      col_schema.decimal_precision = col_meta.get_decimal_precision();
+      col_schema.decimal_precision                     = col_meta.get_decimal_precision();
+      col_schema.logical_type->decimal_type->precision = col_meta.get_decimal_precision();
     }
   }

@@ -593,7 +611,7 @@ std::vector<schema_tree_node> construct_schema_tree(

       schema_tree_node col_schema{};
       col_schema.type            = Type::BYTE_ARRAY;
-      col_schema.converted_type  = ConvertedType::UNKNOWN;
+      col_schema.converted_type  = thrust::nullopt;
       col_schema.stats_dtype     = statistics_dtype::dtype_byte_array;
       col_schema.repetition_type = col_nullable ? OPTIONAL : REQUIRED;
       col_schema.name = (schema[parent_idx].name == "list") ? "element" : col_meta.get_name();
@@ -762,7 +780,10 @@ struct parquet_column_view {
   [[nodiscard]] column_view cudf_column_view() const { return cudf_col; }
   [[nodiscard]] Type physical_type() const { return schema_node.type; }
-  [[nodiscard]] ConvertedType converted_type() const { return schema_node.converted_type; }
+  [[nodiscard]] ConvertedType converted_type() const
+  {
+    return schema_node.converted_type.value_or(UNKNOWN);
+  }

   std::vector<std::string> const& get_path_in_schema() { return path_in_schema; }

diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp
index 2a654bd7e8c..fece83f891b 100644
--- a/cpp/tests/io/parquet_test.cpp
+++ b/cpp/tests/io/parquet_test.cpp
@@ -4075,11 +4075,12 @@ int32_t compare(T& v1, T& v2)
 int32_t compare_binary(std::vector<uint8_t> const& v1,
                        std::vector<uint8_t> const& v2,
                        cudf::io::parquet::detail::Type ptype,
-                       cudf::io::parquet::detail::ConvertedType ctype)
+                       thrust::optional<cudf::io::parquet::detail::ConvertedType> const& ctype)
 {
+  auto ctype_val = ctype.value_or(cudf::io::parquet::detail::UNKNOWN);
   switch (ptype) {
     case cudf::io::parquet::detail::INT32:
-      switch (ctype) {
+      switch (ctype_val) {
         case cudf::io::parquet::detail::UINT_8:
         case cudf::io::parquet::detail::UINT_16:
         case cudf::io::parquet::detail::UINT_32:
@@ -4091,7 +4092,7 @@ int32_t compare_binary(std::vector<uint8_t> const& v1,
       }

     case cudf::io::parquet::detail::INT64:
-      if (ctype == cudf::io::parquet::detail::UINT_64) {
+      if (ctype_val == cudf::io::parquet::detail::UINT_64) {
        return compare(*(reinterpret_cast<uint64_t const*>(v1.data())),
                       *(reinterpret_cast<uint64_t const*>(v2.data())));
      }

From 8ae3aab79e14f8af733879e1e4b62b75b0f62368 Mon Sep 17 00:00:00 2001
From: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
Date: Mon, 23 Oct 2023 10:43:33 -0700
Subject: [PATCH 039/118] Extract `debug_utilities.hpp/cu` from
 `column_utilities.hpp/cu` (#13720)

This PR extracts the implementation of the debug utility function
`cudf::test::print()` from `column_utilities.hpp/cu` into separate
header/source files (`debug_utilities.hpp/cu`), which better organizes the
relevant code. The new header name is also more expressive and more relevant
to its purpose.

The changes in this PR only move code around; no new functionality or
implementation was added.

Closes https://github.com/rapidsai/cudf/issues/13450 (although this is not to
address that issue).
Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - AJ Schmidt (https://github.com/ajschmidt8)
  - Robert Maynard (https://github.com/robertmaynard)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/13720
---
 cpp/CMakeLists.txt                          |   1 +
 cpp/include/cudf_test/column_utilities.hpp  |  33 --
 cpp/include/cudf_test/debug_utilities.hpp   |  47 ++
 .../cudf_test/detail/column_utilities.hpp   |  85 ----
 cpp/tests/CMakeLists.txt                    |   1 +
 cpp/tests/groupby/structs_tests.cpp         |   1 +
 cpp/tests/utilities/column_utilities.cu     | 408 +--------------
 cpp/tests/utilities/debug_utilities.cu      | 480 ++++++++++++++++++
 .../utilities_tests/column_debug_tests.cpp  | 137 +++++
 .../column_utilities_tests.cpp              | 100 ----
 10 files changed, 674 insertions(+), 619 deletions(-)
 create mode 100644 cpp/include/cudf_test/debug_utilities.hpp
 delete mode 100644 cpp/include/cudf_test/detail/column_utilities.hpp
 create mode 100644 cpp/tests/utilities/debug_utilities.cu
 create mode 100644 cpp/tests/utilities_tests/column_debug_tests.cpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index f8b9762f1d4..472ee9d9fd4 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -835,6 +835,7 @@ if(CUDF_BUILD_TESTUTIL)
     tests/io/metadata_utilities.cpp
     tests/utilities/base_fixture.cpp
     tests/utilities/column_utilities.cu
+    tests/utilities/debug_utilities.cu
     tests/utilities/table_utilities.cu
     tests/utilities/tdigest_utilities.cu
   )
diff --git a/cpp/include/cudf_test/column_utilities.hpp b/cpp/include/cudf_test/column_utilities.hpp
index 059bd10eae1..f6872fcdd6d 100644
--- a/cpp/include/cudf_test/column_utilities.hpp
+++ b/cpp/include/cudf_test/column_utilities.hpp
@@ -140,39 +140,6 @@ void expect_equal_buffers(void const* lhs, void const* rhs, std::size_t size_byt
  */
 void expect_column_empty(cudf::column_view const& col);

-/**
- * @brief Formats a column view as a string
- *
- * @param col The column view
- * @param delimiter The delimiter to put between strings
- */
-std::string to_string(cudf::column_view const& col, std::string const& delimiter);
-
-/**
- * @brief Formats a null mask as a string
- *
- * @param null_mask The null mask buffer
- * @param null_mask_size Size of the null mask (in rows)
- */
-std::string to_string(std::vector<bitmask_type> const& null_mask, size_type null_mask_size);
-
-/**
- * @brief Convert column values to a host vector of strings
- *
- * @param col The column view
- */
-std::vector<std::string> to_strings(cudf::column_view const& col);
-
-/**
- * @brief Print a column view to an ostream
- *
- * @param os The output stream
- * @param col The column view
- */
-void print(cudf::column_view const& col,
-           std::ostream& os             = std::cout,
-           std::string const& delimiter = ",");
-
 /**
  * @brief Copy the null bitmask from a column view to a host vector
  *
diff --git a/cpp/include/cudf_test/debug_utilities.hpp b/cpp/include/cudf_test/debug_utilities.hpp
new file mode 100644
index 00000000000..a0881490b82
--- /dev/null
+++ b/cpp/include/cudf_test/debug_utilities.hpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+
+namespace cudf::test {
+
+/**
+ * @brief Formats a column view as a string
+ *
+ * @param col The input column view
+ * @param delimiter The delimiter to put between strings
+ */
+std::string to_string(cudf::column_view const& col, std::string const& delimiter);
+
+/**
+ * @brief Convert column values to a host vector of strings
+ *
+ * @param col The input column view
+ */
+std::vector<std::string> to_strings(cudf::column_view const& col);
+
+/**
+ * @brief Print a column view to an ostream
+ *
+ * @param col The input column view
+ * @param os The output stream
+ */
+void print(cudf::column_view const& col, std::ostream& os = std::cout);
+
+}  // namespace cudf::test
diff --git a/cpp/include/cudf_test/detail/column_utilities.hpp b/cpp/include/cudf_test/detail/column_utilities.hpp
deleted file mode 100644
index f8270f61f10..00000000000
--- a/cpp/include/cudf_test/detail/column_utilities.hpp
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include
-#include
-
-namespace cudf {
-namespace test {
-namespace detail {
-
-/**
- * @brief Formats a column view as a string
- *
- * @param col The column view
- * @param delimiter The delimiter to put between strings
- * @param indent Indentation for all output. See detail::to_strings for detailed
- * explanation.
- */
-std::string to_string(cudf::column_view const& col,
-                      std::string const& delimiter,
-                      std::string const& indent = "");
-
-/**
- * @brief Formats a null mask as a string
- *
- * @param null_mask The null mask buffer
- * @param null_mask_size Size of the null mask (in rows)
- * @param indent Indentation for all output. See detail::to_strings for detailed
- * explanation.
- */
-std::string to_string(std::vector<bitmask_type> const& null_mask,
-                      size_type null_mask_size,
-                      std::string const& indent = "");
-
-/**
- * @brief Convert column values to a host vector of strings
- *
- * Supports indentation of all output. For example, if the displayed output of your column
- * would be
- *
- * @code{.pseudo}
- * "1,2,3,4,5"
- * @endcode
- * and the `indent` parameter was "     ", that indentation would be prepended to
- * result in the output
- * @code{.pseudo}
- * "     1,2,3,4,5"
- * @endcode
- *
- * The can be useful for displaying complex types. An example use case would be for
- * displaying the nesting of a LIST type column (via recursion).
- * - * List>: - * Length : 3 - * Offsets : 0, 2, 5, 6 - * Children : - * List: - * Length : 6 - * Offsets : 0, 2, 4, 7, 8, 9, 11 - * Children : - * 1, 2, 3, 4, 5, 6, 7, 0, 8, 9, 10 - * - * @param col The column view - * @param indent Indentation for all output - */ -std::vector to_strings(cudf::column_view const& col, std::string const& indent = ""); - -} // namespace detail -} // namespace test -} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 16e7239ebd8..eb0585d8f3e 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -357,6 +357,7 @@ ConfigureTest( ConfigureTest( UTILITIES_TEST utilities_tests/type_list_tests.cpp + utilities_tests/column_debug_tests.cpp utilities_tests/column_utilities_tests.cpp utilities_tests/column_wrapper_tests.cpp utilities_tests/lists_column_wrapper_tests.cpp diff --git a/cpp/tests/groupby/structs_tests.cpp b/cpp/tests/groupby/structs_tests.cpp index f85fc6335f6..af6f613d344 100644 --- a/cpp/tests/groupby/structs_tests.cpp +++ b/cpp/tests/groupby/structs_tests.cpp @@ -18,6 +18,7 @@ #include #include +#include #include #include diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index 620e0bfe8de..f54ea28d9b2 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -14,28 +14,24 @@ * limitations under the License. */ +#include +#include +#include +#include +#include + #include #include #include #include #include -#include -#include -#include +#include #include -#include #include #include -#include #include #include -#include -#include -#include -#include -#include - #include #include @@ -928,396 +924,6 @@ std::vector bitmask_to_host(cudf::column_view const& c) } } -namespace { - -template >* = nullptr> -static auto numeric_to_string_precise(T value) -{ - return std::to_string(value); -} - -template >* = nullptr> -static auto numeric_to_string_precise(T value) -{ - std::ostringstream o; - o << std::setprecision(std::numeric_limits::max_digits10) << value; - return o.str(); -} - -static auto duration_suffix(cudf::duration_D) { return " days"; } - -static auto duration_suffix(cudf::duration_s) { return " seconds"; } - -static auto duration_suffix(cudf::duration_ms) { return " milliseconds"; } - -static auto duration_suffix(cudf::duration_us) { return " microseconds"; } - -static auto duration_suffix(cudf::duration_ns) { return " nanoseconds"; } - -std::string get_nested_type_str(cudf::column_view const& view) -{ - if (view.type().id() == cudf::type_id::LIST) { - lists_column_view lcv(view); - return cudf::type_to_name(view.type()) + "<" + (get_nested_type_str(lcv.child())) + ">"; - } - - if (view.type().id() == cudf::type_id::STRUCT) { - std::ostringstream out; - - out << cudf::type_to_name(view.type()) + "<"; - std::transform(view.child_begin(), - view.child_end(), - std::ostream_iterator(out, ","), - [&out](auto const col) { return get_nested_type_str(col); }); - out << ">"; - return out.str(); - } - - return cudf::type_to_name(view.type()); -} - -template -std::string nested_offsets_to_string(NestedColumnView const& c, std::string const& delimiter = ", ") -{ - column_view offsets = (c.parent()).child(NestedColumnView::offsets_column_index); - CUDF_EXPECTS(offsets.type().id() == type_id::INT32, - "Column does not appear to be an offsets column"); - CUDF_EXPECTS(offsets.offset() == 0, "Offsets column has an internal offset!"); - size_type output_size = c.size() + 1; - - // the first offset value to normalize everything 
against
-  size_type first =
-    cudf::detail::get_value<size_type>(offsets, c.offset(), cudf::test::get_default_stream());
-  rmm::device_uvector<size_type> shifted_offsets(output_size, cudf::test::get_default_stream());
-
-  // normalize the offset values for the column offset
-  size_type const* d_offsets = offsets.head<size_type>() + c.offset();
-  thrust::transform(
-    rmm::exec_policy(cudf::test::get_default_stream()),
-    d_offsets,
-    d_offsets + output_size,
-    shifted_offsets.begin(),
-    [first] __device__(int32_t offset) { return static_cast<size_type>(offset - first); });
-
-  auto const h_shifted_offsets =
-    cudf::detail::make_host_vector_sync(shifted_offsets, cudf::test::get_default_stream());
-  std::ostringstream buffer;
-  for (size_t idx = 0; idx < h_shifted_offsets.size(); idx++) {
-    buffer << h_shifted_offsets[idx];
-    if (idx < h_shifted_offsets.size() - 1) { buffer << delimiter; }
-  }
-  return buffer.str();
-}
-
-struct column_view_printer {
-  template <typename Element, std::enable_if_t<is_numeric<Element>()>* = nullptr>
-  void operator()(cudf::column_view const& col, std::vector<std::string>& out, std::string const&)
-  {
-    auto h_data = cudf::test::to_host<Element>(col);
-
-    out.resize(col.size());
-
-    if (col.nullable()) {
-      std::transform(thrust::make_counting_iterator(size_type{0}),
-                     thrust::make_counting_iterator(col.size()),
-                     out.begin(),
-                     [&h_data](auto idx) {
-                       return bit_is_set(h_data.second.data(), idx)
-                                ? numeric_to_string_precise(h_data.first[idx])
-                                : std::string("NULL");
-                     });
-
-    } else {
-      std::transform(h_data.first.begin(), h_data.first.end(), out.begin(), [](Element el) {
-        return numeric_to_string_precise(el);
-      });
-    }
-  }
-
-  template <typename Element, std::enable_if_t<is_timestamp<Element>()>* = nullptr>
-  void operator()(cudf::column_view const& col,
-                  std::vector<std::string>& out,
-                  std::string const& indent)
-  {
-    // For timestamps, convert timestamp column to column of strings, then
-    // call string version
-    std::string format = [&]() {
-      if constexpr (std::is_same_v<Element, cudf::timestamp_s>) {
-        return std::string{"%Y-%m-%dT%H:%M:%SZ"};
-      } else if constexpr (std::is_same_v<Element, cudf::timestamp_ms>) {
-        return std::string{"%Y-%m-%dT%H:%M:%S.%3fZ"};
-      } else if constexpr (std::is_same_v<Element, cudf::timestamp_us>) {
-        return std::string{"%Y-%m-%dT%H:%M:%S.%6fZ"};
-      } else if constexpr (std::is_same_v<Element, cudf::timestamp_ns>) {
-        return std::string{"%Y-%m-%dT%H:%M:%S.%9fZ"};
-      }
-      return std::string{"%Y-%m-%d"};
-    }();
-
-    auto col_as_strings = cudf::strings::from_timestamps(col, format);
-    if (col_as_strings->size() == 0) { return; }
-
-    this->template operator()<cudf::string_view>(*col_as_strings, out, indent);
-  }
-
-  template <typename Element, std::enable_if_t<cudf::is_fixed_point<Element>()>* = nullptr>
-  void operator()(cudf::column_view const& col, std::vector<std::string>& out, std::string const&)
-  {
-    auto const h_data = cudf::test::to_host<Element>(col);
-    if (col.nullable()) {
-      std::transform(thrust::make_counting_iterator(size_type{0}),
-                     thrust::make_counting_iterator(col.size()),
-                     std::back_inserter(out),
-                     [&h_data](auto idx) {
-                       return h_data.second.empty() || bit_is_set(h_data.second.data(), idx)
-                                ? static_cast<std::string>(h_data.first[idx])
-                                : std::string("NULL");
-                     });
-    } else {
-      std::transform(std::cbegin(h_data.first),
-                     std::cend(h_data.first),
-                     std::back_inserter(out),
-                     [col](auto const& fp) { return static_cast<std::string>(fp); });
-    }
-  }
-
-  template <typename Element,
-            std::enable_if_t<std::is_same_v<Element, cudf::string_view>>* = nullptr>
-  void operator()(cudf::column_view const& col, std::vector<std::string>& out, std::string const&)
-  {
-    //
-    // Implementation for strings, call special to_host variant
-    //
-    if (col.is_empty()) return;
-    auto h_data = cudf::test::to_host<std::string>(col);
-
-    // explicitly replace some special whitespace characters with their literal equivalents
-    auto cleaned = [](std::string_view in) {
-      std::string out(in);
-      auto replace_char = [](std::string& out, char c, std::string_view repl) {
-        for (std::string::size_type pos{}; out.npos != (pos = out.find(c, pos)); pos++) {
-          out.replace(pos, 1, repl);
-        }
-      };
-      replace_char(out, '\a', "\\a");
-      replace_char(out, '\b', "\\b");
-      replace_char(out, '\f', "\\f");
-      replace_char(out, '\r', "\\r");
-      replace_char(out, '\t', "\\t");
-      replace_char(out, '\n', "\\n");
-      replace_char(out, '\v', "\\v");
-      return out;
-    };
-
-    out.resize(col.size());
-    std::transform(thrust::make_counting_iterator(size_type{0}),
-                   thrust::make_counting_iterator(col.size()),
-                   out.begin(),
-                   [&](auto idx) {
-                     return h_data.second.empty() || bit_is_set(h_data.second.data(), idx)
-                              ? cleaned(h_data.first[idx])
-                              : std::string("NULL");
-                   });
-  }
-
-  template <typename Element,
-            std::enable_if_t<std::is_same_v<Element, cudf::dictionary32>>* = nullptr>
-  void operator()(cudf::column_view const& col, std::vector<std::string>& out, std::string const&)
-  {
-    cudf::dictionary_column_view dictionary(col);
-    if (col.is_empty()) return;
-    std::vector<std::string> keys    = to_strings(dictionary.keys());
-    std::vector<std::string> indices = to_strings({dictionary.indices().type(),
-                                                   dictionary.size(),
-                                                   dictionary.indices().head(),
-                                                   dictionary.null_mask(),
-                                                   dictionary.null_count(),
-                                                   dictionary.offset()});
-    out.insert(out.end(), keys.begin(), keys.end());
-    if (!indices.empty()) {
-      std::string first = "\x08 : " + indices.front();  // use : as delimiter
-      out.push_back(first);                             // between keys and indices
-      out.insert(out.end(), indices.begin() + 1, indices.end());
-    }
-  }
-
-  // Print the tick counts with the units
-  template <typename Element, std::enable_if_t<is_duration<Element>()>* = nullptr>
-  void operator()(cudf::column_view const& col, std::vector<std::string>& out, std::string const&)
-  {
-    auto h_data = cudf::test::to_host<Element>(col);
-
-    out.resize(col.size());
-
-    if (col.nullable()) {
-      std::transform(thrust::make_counting_iterator(size_type{0}),
-                     thrust::make_counting_iterator(col.size()),
-                     out.begin(),
-                     [&h_data](auto idx) {
-                       return bit_is_set(h_data.second.data(), idx)
-                                ? numeric_to_string_precise(h_data.first[idx].count()) +
-                                    duration_suffix(h_data.first[idx])
-                                : std::string("NULL");
-                     });
-
-    } else {
-      std::transform(h_data.first.begin(), h_data.first.end(), out.begin(), [](Element el) {
-        return numeric_to_string_precise(el.count()) + duration_suffix(el);
-      });
-    }
-  }
-
-  template <typename Element, std::enable_if_t<std::is_same_v<Element, cudf::list_view>>* = nullptr>
-  void operator()(cudf::column_view const& col,
-                  std::vector<std::string>& out,
-                  std::string const& indent)
-  {
-    lists_column_view lcv(col);
-
-    // propagate slicing to the child if necessary
-    column_view child    = lcv.get_sliced_child(cudf::test::get_default_stream());
-    bool const is_sliced = lcv.offset() > 0 || child.offset() > 0;
-
-    std::string tmp =
-      get_nested_type_str(col) + (is_sliced ? "(sliced)" : "") + ":\n" + indent +
-      "Length : " + std::to_string(lcv.size()) + "\n" + indent +
-      "Offsets : " + (lcv.size() > 0 ? nested_offsets_to_string(lcv) : "") + "\n" +
-      (lcv.parent().nullable()
-         ? indent + "Null count: " + std::to_string(lcv.null_count()) + "\n" +
-             detail::to_string(bitmask_to_host(col), col.size(), indent) + "\n"
-         : "") +
-      // non-nested types don't typically display their null masks, so do it here for convenience.
-      (!is_nested(child.type()) && child.nullable()
-         ? "   " + detail::to_string(bitmask_to_host(child), child.size(), indent) + "\n"
-         : "") +
-      (detail::to_string(child, ", ", indent + "   ")) + "\n";
-
-    out.push_back(tmp);
-  }
-
-  template <typename Element,
-            std::enable_if_t<std::is_same_v<Element, cudf::struct_view>>* = nullptr>
-  void operator()(cudf::column_view const& col,
-                  std::vector<std::string>& out,
-                  std::string const& indent)
-  {
-    structs_column_view view{col};
-
-    std::ostringstream out_stream;
-
-    out_stream << get_nested_type_str(col) << ":\n"
-               << indent << "Length : " << view.size() << ":\n";
-    if (view.nullable()) {
-      out_stream << indent << "Null count: " << view.null_count() << "\n"
-                 << detail::to_string(bitmask_to_host(col), col.size(), indent) << "\n";
-    }
-
-    auto iter = thrust::make_counting_iterator(0);
-    std::transform(
-      iter,
-      iter + view.num_children(),
-      std::ostream_iterator<std::string>(out_stream, "\n"),
-      [&](size_type index) {
-        auto child = view.get_sliced_child(index, cudf::test::get_default_stream());
-
-        // non-nested types don't typically display their null masks, so do it here for convenience.
-        return (!is_nested(child.type()) && child.nullable()
-                  ? "   " + detail::to_string(bitmask_to_host(child), child.size(), indent) + "\n"
-                  : "") +
-               detail::to_string(child, ", ", indent + "   ");
-      });
-
-    out.push_back(out_stream.str());
-  }
-};
-
-}  // namespace
-
-namespace detail {
-
-/**
- * @copydoc cudf::test::detail::to_strings
- */
-std::vector<std::string> to_strings(cudf::column_view const& col, std::string const& indent)
-{
-  std::vector<std::string> reply;
-  cudf::type_dispatcher(col.type(), column_view_printer{}, col, reply, indent);
-  return reply;
-}
-
-/**
- * @copydoc cudf::test::detail::to_string(cudf::column_view, std::string, std::string)
- *
- * @param indent Indentation for all output
- */
-std::string to_string(cudf::column_view const& col,
-                      std::string const& delimiter,
-                      std::string const& indent)
-{
-  std::ostringstream buffer;
-  std::vector<std::string> h_data = to_strings(col, indent);
-
-  buffer << indent;
-  std::copy(h_data.begin(),
-            h_data.end() - (!h_data.empty()),
-            std::ostream_iterator<std::string>(buffer, delimiter.c_str()));
-  if (!h_data.empty()) buffer << h_data.back();
-
-  return buffer.str();
-}
-
-/**
- * @copydoc cudf::test::detail::to_string(std::vector<bitmask_type>, size_type, std::string)
- *
- * @param indent Indentation for all output. See comment in `to_strings` for
- * a detailed description.
- */
-std::string to_string(std::vector<bitmask_type> const& null_mask,
-                      size_type null_mask_size,
-                      std::string const& indent)
-{
-  std::ostringstream buffer;
-  buffer << indent;
-  for (int idx = null_mask_size - 1; idx >= 0; idx--) {
"1" : "0"); - } - return buffer.str(); -} - -} // namespace detail - -/** - * @copydoc cudf::test::to_strings - */ -std::vector to_strings(cudf::column_view const& col) -{ - return detail::to_strings(col); -} - -/** - * @copydoc cudf::test::to_string(cudf::column_view, std::string) - */ -std::string to_string(cudf::column_view const& col, std::string const& delimiter) -{ - return detail::to_string(col, delimiter); -} - -/** - * @copydoc cudf::test::to_string(std::vector, size_type) - */ -std::string to_string(std::vector const& null_mask, size_type null_mask_size) -{ - return detail::to_string(null_mask, null_mask_size); -} - -/** - * @copydoc cudf::test::print - */ -void print(cudf::column_view const& col, std::ostream& os, std::string const& delimiter) -{ - os << to_string(col, delimiter) << std::endl; -} - /** * @copydoc cudf::test::validate_host_masks */ diff --git a/cpp/tests/utilities/debug_utilities.cu b/cpp/tests/utilities/debug_utilities.cu new file mode 100644 index 00000000000..a8a43ffb4ca --- /dev/null +++ b/cpp/tests/utilities/debug_utilities.cu @@ -0,0 +1,480 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include + +namespace cudf::test { + +// Forward declaration. +namespace detail { + +/** + * @brief Formats a column view as a string + * + * @param col The column view + * @param delimiter The delimiter to put between strings + * @param indent Indentation for all output + */ +std::string to_string(cudf::column_view const& col, + std::string const& delimiter, + std::string const& indent = ""); + +/** + * @brief Formats a null mask as a string + * + * @param null_mask The null mask buffer + * @param null_mask_size Size of the null mask (in rows) + * @param indent Indentation for all output + */ +std::string to_string(std::vector const& null_mask, + size_type null_mask_size, + std::string const& indent = ""); + +/** + * @brief Convert column values to a host vector of strings + * + * Supports indentation of all output. For example, if the displayed output of your column + * would be + * + * @code{.pseudo} + * "1,2,3,4,5" + * @endcode + * and the `indent` parameter was " ", that indentation would be prepended to + * result in the output + * @code{.pseudo} + * " 1,2,3,4,5" + * @endcode + * + * The can be useful for displaying complex types. An example use case would be for + * displaying the nesting of a LIST type column (via recursion). 
+ * + * List>: + * Length : 3 + * Offsets : 0, 2, 5, 6 + * Children : + * List: + * Length : 6 + * Offsets : 0, 2, 4, 7, 8, 9, 11 + * Children : + * 1, 2, 3, 4, 5, 6, 7, 0, 8, 9, 10 + * + * @param col The column view + * @param indent Indentation for all output + */ +std::vector to_strings(cudf::column_view const& col, std::string const& indent = ""); + +} // namespace detail + +namespace { + +template >* = nullptr> +static auto numeric_to_string_precise(T value) +{ + return std::to_string(value); +} + +template >* = nullptr> +static auto numeric_to_string_precise(T value) +{ + std::ostringstream o; + o << std::setprecision(std::numeric_limits::max_digits10) << value; + return o.str(); +} + +static auto duration_suffix(cudf::duration_D) { return " days"; } + +static auto duration_suffix(cudf::duration_s) { return " seconds"; } + +static auto duration_suffix(cudf::duration_ms) { return " milliseconds"; } + +static auto duration_suffix(cudf::duration_us) { return " microseconds"; } + +static auto duration_suffix(cudf::duration_ns) { return " nanoseconds"; } + +std::string get_nested_type_str(cudf::column_view const& view) +{ + if (view.type().id() == cudf::type_id::LIST) { + lists_column_view lcv(view); + return cudf::type_to_name(view.type()) + "<" + (get_nested_type_str(lcv.child())) + ">"; + } + + if (view.type().id() == cudf::type_id::STRUCT) { + std::ostringstream out; + + out << cudf::type_to_name(view.type()) + "<"; + std::transform(view.child_begin(), + view.child_end(), + std::ostream_iterator(out, ","), + [&out](auto const col) { return get_nested_type_str(col); }); + out << ">"; + return out.str(); + } + + return cudf::type_to_name(view.type()); +} + +template +std::string nested_offsets_to_string(NestedColumnView const& c, std::string const& delimiter = ", ") +{ + column_view offsets = (c.parent()).child(NestedColumnView::offsets_column_index); + CUDF_EXPECTS(offsets.type().id() == type_id::INT32, + "Column does not appear to be an offsets column"); + CUDF_EXPECTS(offsets.offset() == 0, "Offsets column has an internal offset!"); + size_type output_size = c.size() + 1; + + // the first offset value to normalize everything against + size_type first = + cudf::detail::get_value(offsets, c.offset(), cudf::get_default_stream()); + rmm::device_uvector shifted_offsets(output_size, cudf::get_default_stream()); + + // normalize the offset values for the column offset + size_type const* d_offsets = offsets.head() + c.offset(); + thrust::transform( + rmm::exec_policy(cudf::get_default_stream()), + d_offsets, + d_offsets + output_size, + shifted_offsets.begin(), + [first] __device__(int32_t offset) { return static_cast(offset - first); }); + + auto const h_shifted_offsets = + cudf::detail::make_host_vector_sync(shifted_offsets, cudf::get_default_stream()); + std::ostringstream buffer; + for (size_t idx = 0; idx < h_shifted_offsets.size(); idx++) { + buffer << h_shifted_offsets[idx]; + if (idx < h_shifted_offsets.size() - 1) { buffer << delimiter; } + } + return buffer.str(); +} + +struct column_view_printer { + template ()>* = nullptr> + void operator()(cudf::column_view const& col, std::vector& out, std::string const&) + { + auto h_data = cudf::test::to_host(col); + + out.resize(col.size()); + + if (col.nullable()) { + std::transform(thrust::make_counting_iterator(size_type{0}), + thrust::make_counting_iterator(col.size()), + out.begin(), + [&h_data](auto idx) { + return bit_is_set(h_data.second.data(), idx) + ? 
numeric_to_string_precise(h_data.first[idx]) + : std::string("NULL"); + }); + + } else { + std::transform(h_data.first.begin(), h_data.first.end(), out.begin(), [](Element el) { + return numeric_to_string_precise(el); + }); + } + } + + template ()>* = nullptr> + void operator()(cudf::column_view const& col, + std::vector& out, + std::string const& indent) + { + // For timestamps, convert timestamp column to column of strings, then + // call string version + std::string format = [&]() { + if constexpr (std::is_same_v) { + return std::string{"%Y-%m-%dT%H:%M:%SZ"}; + } else if constexpr (std::is_same_v) { + return std::string{"%Y-%m-%dT%H:%M:%S.%3fZ"}; + } else if constexpr (std::is_same_v) { + return std::string{"%Y-%m-%dT%H:%M:%S.%6fZ"}; + } else if constexpr (std::is_same_v) { + return std::string{"%Y-%m-%dT%H:%M:%S.%9fZ"}; + } + return std::string{"%Y-%m-%d"}; + }(); + + auto col_as_strings = cudf::strings::from_timestamps(col, format); + if (col_as_strings->size() == 0) { return; } + + this->template operator()(*col_as_strings, out, indent); + } + + template ()>* = nullptr> + void operator()(cudf::column_view const& col, std::vector& out, std::string const&) + { + auto const h_data = cudf::test::to_host(col); + if (col.nullable()) { + std::transform(thrust::make_counting_iterator(size_type{0}), + thrust::make_counting_iterator(col.size()), + std::back_inserter(out), + [&h_data](auto idx) { + return h_data.second.empty() || bit_is_set(h_data.second.data(), idx) + ? static_cast(h_data.first[idx]) + : std::string("NULL"); + }); + } else { + std::transform(std::cbegin(h_data.first), + std::cend(h_data.first), + std::back_inserter(out), + [col](auto const& fp) { return static_cast(fp); }); + } + } + + template >* = nullptr> + void operator()(cudf::column_view const& col, std::vector& out, std::string const&) + { + // + // Implementation for strings, call special to_host variant + // + if (col.is_empty()) return; + auto h_data = cudf::test::to_host(col); + + // explicitly replace some special whitespace characters with their literal equivalents + auto cleaned = [](std::string_view in) { + std::string out(in); + auto replace_char = [](std::string& out, char c, std::string_view repl) { + for (std::string::size_type pos{}; out.npos != (pos = out.find(c, pos)); pos++) { + out.replace(pos, 1, repl); + } + }; + replace_char(out, '\a', "\\a"); + replace_char(out, '\b', "\\b"); + replace_char(out, '\f', "\\f"); + replace_char(out, '\r', "\\r"); + replace_char(out, '\t', "\\t"); + replace_char(out, '\n', "\\n"); + replace_char(out, '\v', "\\v"); + return out; + }; + + out.resize(col.size()); + std::transform(thrust::make_counting_iterator(size_type{0}), + thrust::make_counting_iterator(col.size()), + out.begin(), + [&](auto idx) { + return h_data.second.empty() || bit_is_set(h_data.second.data(), idx) + ? 
cleaned(h_data.first[idx]) + : std::string("NULL"); + }); + } + + template >* = nullptr> + void operator()(cudf::column_view const& col, std::vector& out, std::string const&) + { + cudf::dictionary_column_view dictionary(col); + if (col.is_empty()) return; + std::vector keys = to_strings(dictionary.keys()); + std::vector indices = to_strings({dictionary.indices().type(), + dictionary.size(), + dictionary.indices().head(), + dictionary.null_mask(), + dictionary.null_count(), + dictionary.offset()}); + out.insert(out.end(), keys.begin(), keys.end()); + if (!indices.empty()) { + std::string first = "\x08 : " + indices.front(); // use : as delimiter + out.push_back(first); // between keys and indices + out.insert(out.end(), indices.begin() + 1, indices.end()); + } + } + + // Print the tick counts with the units + template ()>* = nullptr> + void operator()(cudf::column_view const& col, std::vector& out, std::string const&) + { + auto h_data = cudf::test::to_host(col); + + out.resize(col.size()); + + if (col.nullable()) { + std::transform(thrust::make_counting_iterator(size_type{0}), + thrust::make_counting_iterator(col.size()), + out.begin(), + [&h_data](auto idx) { + return bit_is_set(h_data.second.data(), idx) + ? numeric_to_string_precise(h_data.first[idx].count()) + + duration_suffix(h_data.first[idx]) + : std::string("NULL"); + }); + + } else { + std::transform(h_data.first.begin(), h_data.first.end(), out.begin(), [](Element el) { + return numeric_to_string_precise(el.count()) + duration_suffix(el); + }); + } + } + + template >* = nullptr> + void operator()(cudf::column_view const& col, + std::vector& out, + std::string const& indent) + { + lists_column_view lcv(col); + + // propagate slicing to the child if necessary + column_view child = lcv.get_sliced_child(cudf::get_default_stream()); + bool const is_sliced = lcv.offset() > 0 || child.offset() > 0; + + std::string tmp = + get_nested_type_str(col) + (is_sliced ? "(sliced)" : "") + ":\n" + indent + + "Length : " + std::to_string(lcv.size()) + "\n" + indent + + "Offsets : " + (lcv.size() > 0 ? nested_offsets_to_string(lcv) : "") + "\n" + + (lcv.parent().nullable() + ? indent + "Null count: " + std::to_string(lcv.null_count()) + "\n" + + detail::to_string(cudf::test::bitmask_to_host(col), col.size(), indent) + "\n" + : "") + + // non-nested types don't typically display their null masks, so do it here for convenience. + (!is_nested(child.type()) && child.nullable() + ? " " + detail::to_string(cudf::test::bitmask_to_host(child), child.size(), indent) + + "\n" + : "") + + (detail::to_string(child, ", ", indent + " ")) + "\n"; + + out.push_back(tmp); + } + + template >* = nullptr> + void operator()(cudf::column_view const& col, + std::vector& out, + std::string const& indent) + { + structs_column_view view{col}; + + std::ostringstream out_stream; + + out_stream << get_nested_type_str(col) << ":\n" + << indent << "Length : " << view.size() << ":\n"; + if (view.nullable()) { + out_stream << indent << "Null count: " << view.null_count() << "\n" + << detail::to_string(cudf::test::bitmask_to_host(col), col.size(), indent) << "\n"; + } + + auto iter = thrust::make_counting_iterator(0); + std::transform( + iter, + iter + view.num_children(), + std::ostream_iterator(out_stream, "\n"), + [&](size_type index) { + auto child = view.get_sliced_child(index, cudf::get_default_stream()); + + // non-nested types don't typically display their null masks, so do it here for convenience. + return (!is_nested(child.type()) && child.nullable() + ? 
" " + + detail::to_string(cudf::test::bitmask_to_host(child), child.size(), indent) + + "\n" + : "") + + detail::to_string(child, ", ", indent + " "); + }); + + out.push_back(out_stream.str()); + } +}; + +} // namespace + +namespace detail { + +/** + * @copydoc cudf::test::detail::to_strings + */ +std::vector to_strings(cudf::column_view const& col, std::string const& indent) +{ + std::vector reply; + cudf::type_dispatcher(col.type(), column_view_printer{}, col, reply, indent); + return reply; +} + +/** + * @copydoc cudf::test::detail::to_string(cudf::column_view, std::string, std::string) + * + * @param indent Indentation for all output + */ +std::string to_string(cudf::column_view const& col, + std::string const& delimiter, + std::string const& indent) +{ + std::ostringstream buffer; + std::vector h_data = to_strings(col, indent); + + buffer << indent; + std::copy(h_data.begin(), + h_data.end() - (!h_data.empty()), + std::ostream_iterator(buffer, delimiter.c_str())); + if (!h_data.empty()) buffer << h_data.back(); + + return buffer.str(); +} + +/** + * @copydoc cudf::test::detail::to_string(std::vector, size_type, std::string) + * + * @param indent Indentation for all output. See comment in `to_strings` for + * a detailed description. + */ +std::string to_string(std::vector const& null_mask, + size_type null_mask_size, + std::string const& indent) +{ + std::ostringstream buffer; + buffer << indent; + for (int idx = null_mask_size - 1; idx >= 0; idx--) { + buffer << (cudf::bit_is_set(null_mask.data(), idx) ? "1" : "0"); + } + return buffer.str(); +} + +} // namespace detail + +std::vector to_strings(cudf::column_view const& col) +{ + return detail::to_strings(col); +} + +std::string to_string(cudf::column_view const& col, std::string const& delimiter) +{ + return detail::to_string(col, delimiter); +} + +std::string to_string(std::vector const& null_mask, size_type null_mask_size) +{ + return detail::to_string(null_mask, null_mask_size); +} + +void print(cudf::column_view const& col, std::ostream& os) +{ + os << to_string(col, ",") << std::endl; +} + +} // namespace cudf::test diff --git a/cpp/tests/utilities_tests/column_debug_tests.cpp b/cpp/tests/utilities_tests/column_debug_tests.cpp new file mode 100644 index 00000000000..0dae407ad21 --- /dev/null +++ b/cpp/tests/utilities_tests/column_debug_tests.cpp @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
diff --git a/cpp/tests/utilities_tests/column_debug_tests.cpp b/cpp/tests/utilities_tests/column_debug_tests.cpp
new file mode 100644
index 00000000000..0dae407ad21
--- /dev/null
+++ b/cpp/tests/utilities_tests/column_debug_tests.cpp
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/debug_utilities.hpp>
+#include <cudf_test/iterator_utilities.hpp>
+#include <cudf_test/testing_main.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <thrust/iterator/transform_iterator.h>
+
+#include <sstream>
+
+template <typename T>
+struct ColumnDebugTestIntegral : public cudf::test::BaseFixture {};
+template <typename T>
+struct ColumnDebugTestFloatingPoint : public cudf::test::BaseFixture {};
+
+TYPED_TEST_SUITE(ColumnDebugTestIntegral, cudf::test::IntegralTypes);
+TYPED_TEST_SUITE(ColumnDebugTestFloatingPoint, cudf::test::FloatingPointTypes);
+
+TYPED_TEST(ColumnDebugTestIntegral, PrintColumnNumeric)
+{
+  char const* delimiter = ",";
+
+  cudf::test::fixed_width_column_wrapper<TypeParam> cudf_col({1, 2, 3, 4, 5});
+  auto std_col = cudf::test::make_type_param_vector<TypeParam>({1, 2, 3, 4, 5});
+
+  std::stringstream tmp;
+  auto string_iter =
+    thrust::make_transform_iterator(std::begin(std_col), [](auto e) { return std::to_string(e); });
+
+  std::copy(string_iter,
+            string_iter + std_col.size() - 1,
+            std::ostream_iterator<std::string>(tmp, delimiter));
+
+  tmp << std::to_string(std_col.back());
+
+  EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), tmp.str());
+}
+
+TYPED_TEST(ColumnDebugTestIntegral, PrintColumnWithInvalids)
+{
+  char const* delimiter = ",";
+
+  cudf::test::fixed_width_column_wrapper<TypeParam> cudf_col{{1, 2, 3, 4, 5}, {1, 0, 1, 0, 1}};
+  auto std_col = cudf::test::make_type_param_vector<TypeParam>({1, 2, 3, 4, 5});
+
+  std::ostringstream tmp;
+  tmp << std::to_string(std_col[0]) << delimiter << "NULL" << delimiter
+      << std::to_string(std_col[2]) << delimiter << "NULL" << delimiter
+      << std::to_string(std_col[4]);
+
+  EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), tmp.str());
+}
+
+TYPED_TEST(ColumnDebugTestFloatingPoint, PrintColumnNumeric)
+{
+  char const* delimiter = ",";
+
+  cudf::test::fixed_width_column_wrapper<TypeParam> cudf_col(
+    {10001523.25, 2.0, 3.75, 0.000000034, 5.3});
+
+  auto expected = std::is_same_v<TypeParam, double>
+                    ? "10001523.25,2,3.75,3.4e-08,5.2999999999999998"
+                    : "10001523,2,3.75,3.39999993e-08,5.30000019";
+
+  EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), expected);
+}
+
+TYPED_TEST(ColumnDebugTestFloatingPoint, PrintColumnWithInvalids)
+{
+  char const* delimiter = ",";
+
+  cudf::test::fixed_width_column_wrapper<TypeParam> cudf_col(
+    {10001523.25, 2.0, 3.75, 0.000000034, 5.3}, {1, 0, 1, 0, 1});
+
+  auto expected = std::is_same_v<TypeParam, double>
+                    ? "10001523.25,NULL,3.75,NULL,5.2999999999999998"
+                    : "10001523,NULL,3.75,NULL,5.30000019";
+
+  EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), expected);
+}
+
+struct ColumnDebugStringsTest : public cudf::test::BaseFixture {};
+
+TEST_F(ColumnDebugStringsTest, PrintColumnDuration)
+{
+  char const* delimiter = ",";
+
+  cudf::test::fixed_width_column_wrapper<cudf::duration_s, int32_t> cudf_col({100, 0, 7, 140000});
+
+  auto expected = "100 seconds,0 seconds,7 seconds,140000 seconds";
+
+  EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), expected);
+}
+
+TEST_F(ColumnDebugStringsTest, StringsToString)
+{
+  char const* delimiter = ",";
+
+  std::vector<char const*> h_strings{"eee", "bb", nullptr, "", "aa", "bbb", "ééé"};
+  cudf::test::strings_column_wrapper strings(
+    h_strings.begin(),
+    h_strings.end(),
+    thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
+
+  std::ostringstream tmp;
+  tmp << h_strings[0] << delimiter << h_strings[1] << delimiter << "NULL" << delimiter
+      << h_strings[3] << delimiter << h_strings[4] << delimiter << h_strings[5] << delimiter
+      << h_strings[6];
+
+  EXPECT_EQ(cudf::test::to_string(strings, delimiter), tmp.str());
+}
+
+TEST_F(ColumnDebugStringsTest, PrintEscapeStrings)
+{
+  char const* delimiter = ",";
+  cudf::test::strings_column_wrapper input({"e\te\ne", "é\bé\ré", "e\vé\fé\abell"});
+  std::string expected{"e\\te\\ne,é\\bé\\ré,e\\vé\\fé\\abell"};
+  EXPECT_EQ(cudf::test::to_string(input, delimiter), expected);
+}
diff --git a/cpp/tests/utilities_tests/column_utilities_tests.cpp b/cpp/tests/utilities_tests/column_utilities_tests.cpp
index 90a7270cb29..07d2bea2b28 100644
--- a/cpp/tests/utilities_tests/column_utilities_tests.cpp
+++ b/cpp/tests/utilities_tests/column_utilities_tests.cpp
@@ -182,106 +182,6 @@ TEST_F(ColumnUtilitiesStringsTest, StringsToHostAllNulls)
   EXPECT_TRUE(std::all_of(results.begin(), results.end(), [](auto s) { return s.empty(); }));
 }
 
-TEST_F(ColumnUtilitiesStringsTest, PrintColumnDuration)
-{
-  char const* delimiter = ",";
-
-  cudf::test::fixed_width_column_wrapper<cudf::duration_s, int32_t> cudf_col({100, 0, 7, 140000});
-
-  auto expected = "100 seconds,0 seconds,7 seconds,140000 seconds";
-
-  EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), expected);
-}
-
-TYPED_TEST(ColumnUtilitiesTestIntegral, PrintColumnNumeric)
-{
-  char const* delimiter = ",";
-
-  cudf::test::fixed_width_column_wrapper<TypeParam> cudf_col({1, 2, 3, 4, 5});
-  auto std_col = cudf::test::make_type_param_vector<TypeParam>({1, 2, 3, 4, 5});
-
-  std::stringstream tmp;
-  auto string_iter =
-    thrust::make_transform_iterator(std::begin(std_col), [](auto e) { return std::to_string(e); });
-
-  std::copy(string_iter,
-            string_iter + std_col.size() - 1,
-            std::ostream_iterator<std::string>(tmp, delimiter));
-
-  tmp << std::to_string(std_col.back());
-
-  EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), tmp.str());
-}
-
-TYPED_TEST(ColumnUtilitiesTestIntegral, PrintColumnWithInvalids)
-{
-  char const* delimiter = ",";
-
-  cudf::test::fixed_width_column_wrapper<TypeParam> cudf_col{{1, 2, 3, 4, 5}, {1, 0, 1, 0, 1}};
-  auto std_col = cudf::test::make_type_param_vector<TypeParam>({1, 2, 3, 4, 5});
-
-  std::ostringstream tmp;
-  tmp << std::to_string(std_col[0]) << delimiter << "NULL" << delimiter
-      << std::to_string(std_col[2]) << delimiter << "NULL" << delimiter
-      << std::to_string(std_col[4]);
-
-  EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), tmp.str());
-}
-
-TYPED_TEST(ColumnUtilitiesTestFloatingPoint, PrintColumnNumeric)
-{
-  char const* delimiter = ",";
-
-  cudf::test::fixed_width_column_wrapper<TypeParam> cudf_col(
-    {10001523.25, 2.0, 3.75, 0.000000034, 5.3});
-
-  auto expected = std::is_same_v<TypeParam, double>
-                    ? "10001523.25,2,3.75,3.4e-08,5.2999999999999998"
-                    : "10001523,2,3.75,3.39999993e-08,5.30000019";
-
-  EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), expected);
-}
-
-TYPED_TEST(ColumnUtilitiesTestFloatingPoint, PrintColumnWithInvalids)
-{
-  char const* delimiter = ",";
-
-  cudf::test::fixed_width_column_wrapper<TypeParam> cudf_col(
-    {10001523.25, 2.0, 3.75, 0.000000034, 5.3}, {1, 0, 1, 0, 1});
-
-  auto expected = std::is_same_v<TypeParam, double>
-                    ? "10001523.25,NULL,3.75,NULL,5.2999999999999998"
-                    : "10001523,NULL,3.75,NULL,5.30000019";
-
-  EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), expected);
-}
-
-TEST_F(ColumnUtilitiesStringsTest, StringsToString)
-{
-  char const* delimiter = ",";
-
-  std::vector<char const*> h_strings{"eee", "bb", nullptr, "", "aa", "bbb", "ééé"};
-  cudf::test::strings_column_wrapper strings(
-    h_strings.begin(),
-    h_strings.end(),
-    thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  std::ostringstream tmp;
-  tmp << h_strings[0] << delimiter << h_strings[1] << delimiter << "NULL" << delimiter
-      << h_strings[3] << delimiter << h_strings[4] << delimiter << h_strings[5] << delimiter
-      << h_strings[6];
-
-  EXPECT_EQ(cudf::test::to_string(strings, delimiter), tmp.str());
-}
-
-TEST_F(ColumnUtilitiesStringsTest, PrintEscapeStrings)
-{
-  char const* delimiter = ",";
-  cudf::test::strings_column_wrapper input({"e\te\ne", "é\bé\ré", "e\vé\fé\abell"});
-  std::string expected{"e\\te\\ne,é\\bé\\ré,e\\vé\\fé\\abell"};
-  EXPECT_EQ(cudf::test::to_string(input, delimiter), expected);
-}
-
 TYPED_TEST(ColumnUtilitiesTestFixedPoint, NonNullableToHost)
 {
   using namespace numeric;
From e8cf0ebd517a9c81e294771a40c74ff5c4fb42da Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 23 Oct 2023 15:05:26 -0400
Subject: [PATCH 040/118] Expose stream parameter in public strings combine
 APIs (#14281)

Add stream parameter to public APIs:
- `cudf::strings::concatenate()` (x2)
- `cudf::strings::join_strings()`
- `cudf::strings::join_list_elements()` (x2)
- `cudf::strings::repeat_string()`
- `cudf::strings::repeat_strings()` (x2)

Also added stream gtests and fixed up some doxygen comments.

Reference #13744

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: https://github.com/rapidsai/cudf/pull/14281
---
 cpp/include/cudf/strings/combine.hpp          | 88 ++++++++++--------
 cpp/include/cudf/strings/repeat_strings.hpp   |  8 +-
 cpp/src/strings/combine/concatenate.cu        | 14 +--
 cpp/src/strings/combine/join.cu               |  3 +-
 cpp/src/strings/combine/join_list_elements.cu | 13 +--
 cpp/src/strings/repeat_strings.cu             | 11 ++-
 cpp/tests/CMakeLists.txt                      |  1 +
 cpp/tests/streams/strings/combine_test.cpp    | 93 +++++++++++++++++++
 8 files changed, 168 insertions(+), 63 deletions(-)
 create mode 100644 cpp/tests/streams/strings/combine_test.cpp
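
Note: a sketch (not part of the patch) of what the new stream parameter enables for a caller; the column contents and stream below are illustrative.

    #include <cudf/column/column.hpp>
    #include <cudf/scalar/scalar.hpp>
    #include <cudf/strings/combine.hpp>
    #include <cudf/strings/strings_column_view.hpp>

    #include <rmm/cuda_stream.hpp>

    std::unique_ptr<cudf::column> join_on_stream(cudf::strings_column_view const& input)
    {
      rmm::cuda_stream stream;  // caller-owned, non-default stream

      auto const separator = cudf::string_scalar(",", true, stream.view());
      auto const narep     = cudf::string_scalar("", false, stream.view());

      // the new trailing stream parameter; the memory resource keeps its default
      return cudf::strings::join_strings(input, separator, narep, stream.view());
    }
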
diff --git a/cpp/include/cudf/strings/combine.hpp b/cpp/include/cudf/strings/combine.hpp
index 71f65ac9080..568e8ac50ec 100644
--- a/cpp/include/cudf/strings/combine.hpp
+++ b/cpp/include/cudf/strings/combine.hpp
@@ -66,18 +66,20 @@ enum class output_if_empty_list {
  *
  * @throw cudf::logic_error if separator is not valid.
  *
- * @param strings Strings for this operation.
+ * @param input Strings for this operation
  * @param separator String that should be inserted between each string.
  *        Default is an empty string.
- * @param narep String that should represent any null strings found.
+ * @param narep String to replace any null strings found.
  *        Default of invalid-scalar will ignore any null entries.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory.
  * @return New column containing one string.
  */
 std::unique_ptr<column> join_strings(
-  strings_column_view const& strings,
+  strings_column_view const& input,
   string_scalar const& separator      = string_scalar(""),
   string_scalar const& narep          = string_scalar("", false),
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -127,18 +129,17 @@ std::unique_ptr<column> join_strings(
  * @throw cudf::logic_error if the number of rows from @p separators and @p strings_columns
  * do not match
  *
- * @param strings_columns List of strings columns to concatenate.
+ * @param strings_columns List of strings columns to concatenate
  * @param separators Strings column that provides the separator for a given row
- * @param separator_narep String that should be used in place of a null separator for a given
- *        row. Default of invalid-scalar means no row separator value replacements.
- *        Default is an invalid string.
- * @param col_narep String that should be used in place of any null strings
- *        found in any column. Default of invalid-scalar means no null column value replacements.
- *        Default is an invalid string.
+ * @param separator_narep String to replace a null separator for a given row.
+ *        Default of invalid-scalar means no row separator value replacements.
+ * @param col_narep String that should be used in place of any null strings found in any column.
+ *        Default of invalid-scalar means no null column value replacements.
  * @param separate_nulls If YES, then the separator is included for null rows
  *        if `col_narep` is valid.
- * @param mr Resource for allocating device memory.
- * @return New column with concatenated results.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Resource for allocating device memory
+ * @return New column with concatenated results
  */
 std::unique_ptr<column> concatenate(
   table_view const& strings_columns,
@@ -146,6 +147,7 @@
   string_scalar const& separator_narep = string_scalar("", false),
   string_scalar const& col_narep       = string_scalar("", false),
   separator_on_nulls separate_nulls    = separator_on_nulls::YES,
+  rmm::cuda_stream_view stream         = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr  = rmm::mr::get_current_device_resource());
 
 /**
@@ -184,21 +186,23 @@ std::unique_ptr<column> concatenate(
  * @throw cudf::logic_error if separator is not valid.
  * @throw cudf::logic_error if only one column is specified
  *
- * @param strings_columns List of string columns to concatenate.
+ * @param strings_columns List of string columns to concatenate
  * @param separator String that should be inserted between each string from each row.
  *        Default is an empty string.
- * @param narep String that should be used in place of any null strings
- *        found in any column. Default of invalid-scalar means any null entry in any column will
+ * @param narep String to replace any null strings found in any column.
+ *        Default of invalid-scalar means any null entry in any column will
  *        produce a null result for that row.
- * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New column with concatenated results.
+ * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New column with concatenated results
  */
 std::unique_ptr<column> concatenate(
   table_view const& strings_columns,
   string_scalar const& separator      = string_scalar(""),
   string_scalar const& narep          = string_scalar("", false),
   separator_on_nulls separate_nulls   = separator_on_nulls::YES,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -243,19 +247,20 @@ std::unique_ptr<column> concatenate(
  * @throw cudf::logic_error if the number of rows from `separators` and `lists_strings_column` do
  * not match
  *
- * @param lists_strings_column Column containing lists of strings to concatenate.
- * @param separators Strings column that provides separators for concatenation.
- * @param separator_narep String that should be used to replace null separator, default is an
- *        invalid-scalar denoting that rows containing null separator will result in null string in
- *        the corresponding output rows.
- * @param string_narep String that should be used to replace null strings in any non-null list row,
- *        default is an invalid-scalar denoting that list rows containing null strings will result
- *        in null string in the corresponding output rows.
- * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid.
- * @param empty_list_policy if set to EMPTY_STRING, any input row that is an empty list will
+ * @param lists_strings_column Column containing lists of strings to concatenate
+ * @param separators Strings column that provides separators for concatenation
+ * @param separator_narep String that should be used to replace a null separator.
+ *        Default is an invalid-scalar denoting that rows containing null separator will result in
+ *        a null string in the corresponding output rows.
+ * @param string_narep String to replace null strings in any non-null list row.
+ *        Default is an invalid-scalar denoting that list rows containing null strings will result
+ *        in a null string in the corresponding output rows.
+ * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid
+ * @param empty_list_policy If set to EMPTY_STRING, any input row that is an empty list will
  *        result in an empty string. Otherwise, it will result in a null.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New strings column with concatenated results.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New strings column with concatenated results
  */
 std::unique_ptr<column> join_list_elements(
   lists_column_view const& lists_strings_column,
@@ -264,6 +269,7 @@
   string_scalar const& string_narep        = string_scalar("", false),
   separator_on_nulls separate_nulls        = separator_on_nulls::YES,
   output_if_empty_list empty_list_policy   = output_if_empty_list::EMPTY_STRING,
+  rmm::cuda_stream_view stream             = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr      = rmm::mr::get_current_device_resource());
 
 /**
@@ -303,17 +309,18 @@ std::unique_ptr<column> join_list_elements(
  * @throw cudf::logic_error if input column is not lists of strings column.
  * @throw cudf::logic_error if separator is not valid.
  *
- * @param lists_strings_column Column containing lists of strings to concatenate.
- * @param separator String that should be inserted between strings of each list row, default is an
- *        empty string.
- * @param narep String that should be used to replace null strings in any non-null list row, default
- *        is an invalid-scalar denoting that list rows containing null strings will result in null
- *        string in the corresponding output rows.
- * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid.
- * @param empty_list_policy if set to EMPTY_STRING, any input row that is an empty list will result
+ * @param lists_strings_column Column containing lists of strings to concatenate
+ * @param separator String to insert between strings of each list row.
+ *        Default is an empty string.
+ * @param narep String to replace null strings in any non-null list row.
+ *        Default is an invalid-scalar denoting that list rows containing null strings will result
+ *        in a null string in the corresponding output rows.
+ * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid
+ * @param empty_list_policy If set to EMPTY_STRING, any input row that is an empty list will result
  *        in an empty string. Otherwise, it will result in a null.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New strings column with concatenated results.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New strings column with concatenated results
  */
 std::unique_ptr<column> join_list_elements(
   lists_column_view const& lists_strings_column,
@@ -321,6 +328,7 @@
   string_scalar const& narep             = string_scalar("", false),
   separator_on_nulls separate_nulls      = separator_on_nulls::YES,
   output_if_empty_list empty_list_policy = output_if_empty_list::EMPTY_STRING,
+  rmm::cuda_stream_view stream           = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr    = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
diff --git a/cpp/include/cudf/strings/repeat_strings.hpp b/cpp/include/cudf/strings/repeat_strings.hpp
index 2b6575f80d0..7dc9c33f579 100644
--- a/cpp/include/cudf/strings/repeat_strings.hpp
+++ b/cpp/include/cudf/strings/repeat_strings.hpp
@@ -52,12 +52,14 @@ namespace strings {
  *
  * @param input The scalar containing the string to repeat
  * @param repeat_times The number of times the input string is repeated
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned string scalar
  * @return New string scalar in which the input string is repeated
  */
 std::unique_ptr<string_scalar> repeat_string(
   string_scalar const& input,
   size_type repeat_times,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -81,12 +83,14 @@ std::unique_ptr<string_scalar> repeat_string(
  *
  * @param input The column containing strings to repeat
  * @param repeat_times The number of times each input string is repeated
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned strings column
  * @return New column containing the repeated strings
 */
 std::unique_ptr<column> repeat_strings(
   strings_column_view const& input,
   size_type repeat_times,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -115,13 +119,15 @@ std::unique_ptr<column> repeat_strings(
  *
  * @param input The column containing strings to repeat
  * @param repeat_times The column containing numbers of times that the corresponding input strings
- *        are repeated
+ *        for each row are repeated
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned strings column
  * @return New column containing the repeated strings.
 */
 std::unique_ptr<column> repeat_strings(
   strings_column_view const& input,
   column_view const& repeat_times,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
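
Note: a sketch (not part of the patch) of the same pattern for the repeat APIs, which gain an identical trailing stream parameter; the inputs are illustrative.

    #include <cudf/column/column.hpp>
    #include <cudf/strings/repeat_strings.hpp>
    #include <cudf/strings/strings_column_view.hpp>

    #include <rmm/cuda_stream.hpp>

    std::unique_ptr<cudf::column> repeat_twice(cudf::strings_column_view const& input)
    {
      rmm::cuda_stream stream;
      // repeat every row twice, scheduled on the caller's stream
      return cudf::strings::repeat_strings(input, 2, stream.view());
    }
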
diff --git a/cpp/src/strings/combine/concatenate.cu b/cpp/src/strings/combine/concatenate.cu
index ba8acd23467..0a11b6dc460 100644
--- a/cpp/src/strings/combine/concatenate.cu
+++ b/cpp/src/strings/combine/concatenate.cu
@@ -267,11 +267,11 @@ std::unique_ptr<column> concatenate(table_view const& strings_columns,
                                     string_scalar const& separator,
                                     string_scalar const& narep,
                                     separator_on_nulls separate_nulls,
+                                    rmm::cuda_stream_view stream,
                                     rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::concatenate(
-    strings_columns, separator, narep, separate_nulls, cudf::get_default_stream(), mr);
+  return detail::concatenate(strings_columns, separator, narep, separate_nulls, stream, mr);
 }
 
 std::unique_ptr<column> concatenate(table_view const& strings_columns,
@@ -279,16 +279,12 @@ std::unique_ptr<column> concatenate(table_view const& strings_columns,
                                     string_scalar const& separator_narep,
                                     string_scalar const& col_narep,
                                     separator_on_nulls separate_nulls,
+                                    rmm::cuda_stream_view stream,
                                     rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::concatenate(strings_columns,
-                             separators,
-                             separator_narep,
-                             col_narep,
-                             separate_nulls,
-                             cudf::get_default_stream(),
-                             mr);
+  return detail::concatenate(
+    strings_columns, separators, separator_narep, col_narep, separate_nulls, stream, mr);
 }
 
 }  // namespace strings
diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu
index faf1be6a26f..9ab527feaf8 100644
--- a/cpp/src/strings/combine/join.cu
+++ b/cpp/src/strings/combine/join.cu
@@ -180,10 +180,11 @@ std::unique_ptr<column> join_strings(strings_column_view const& input,
 std::unique_ptr<column> join_strings(strings_column_view const& strings,
                                      string_scalar const& separator,
                                      string_scalar const& narep,
+                                     rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::join_strings(strings, separator, narep, cudf::get_default_stream(), mr);
+  return detail::join_strings(strings, separator, narep, stream, mr);
 }
 
 }  // namespace strings
diff --git a/cpp/src/strings/combine/join_list_elements.cu b/cpp/src/strings/combine/join_list_elements.cu
index eee59e37478..372b49fb0ee 100644
--- a/cpp/src/strings/combine/join_list_elements.cu
+++ b/cpp/src/strings/combine/join_list_elements.cu
@@ -301,16 +301,12 @@ std::unique_ptr<column> join_list_elements(lists_column_view const& lists_string
                                            string_scalar const& narep,
                                            separator_on_nulls separate_nulls,
                                            output_if_empty_list empty_list_policy,
+                                           rmm::cuda_stream_view stream,
                                            rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::join_list_elements(lists_strings_column,
-                                    separator,
-                                    narep,
-                                    separate_nulls,
-                                    empty_list_policy,
-                                    cudf::get_default_stream(),
-                                    mr);
+  return detail::join_list_elements(
+    lists_strings_column, separator, narep, separate_nulls, empty_list_policy, stream, mr);
 }
 
 std::unique_ptr<column> join_list_elements(lists_column_view const& lists_strings_column,
@@ -319,6 +315,7 @@
                                            string_scalar const& string_narep,
                                            separator_on_nulls separate_nulls,
                                            output_if_empty_list empty_list_policy,
+                                           rmm::cuda_stream_view stream,
                                            rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
@@ -328,7 +325,7 @@ std::unique_ptr<column> join_list_elements(lists_column_view const& lists_string
                                     string_narep,
                                     separate_nulls,
                                     empty_list_policy,
-                                    cudf::get_default_stream(),
+                                    stream,
                                     mr);
 }
 
diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu
index 396e1e6a2ac..847a64f5602 100644
--- a/cpp/src/strings/repeat_strings.cu
+++ b/cpp/src/strings/repeat_strings.cu
@@ -67,7 +67,7 @@ std::unique_ptr<string_scalar> repeat_string(string_scalar const& input,
     return in_ptr[idx % str_size];
   });
 
-  return std::make_unique<string_scalar>(std::move(buff));
+  return std::make_unique<string_scalar>(std::move(buff), true, stream, mr);
 }
 
 namespace {
@@ -260,26 +260,29 @@ std::unique_ptr<column> repeat_strings(strings_column_view const& input,
 
 std::unique_ptr<string_scalar> repeat_string(string_scalar const& input,
                                              size_type repeat_times,
+                                             rmm::cuda_stream_view stream,
                                              rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::repeat_string(input, repeat_times, cudf::get_default_stream(), mr);
+  return detail::repeat_string(input, repeat_times, stream, mr);
 }
 
 std::unique_ptr<column> repeat_strings(strings_column_view const& input,
                                        size_type repeat_times,
+                                       rmm::cuda_stream_view stream,
                                        rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::repeat_strings(input, repeat_times, cudf::get_default_stream(), mr);
+  return detail::repeat_strings(input, repeat_times, stream, mr);
 }
 
 std::unique_ptr<column> repeat_strings(strings_column_view const& input,
                                        column_view const& repeat_times,
+                                       rmm::cuda_stream_view stream,
                                        rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::repeat_strings(input, repeat_times, cudf::get_default_stream(), mr);
+  return detail::repeat_strings(input, repeat_times, stream, mr);
 }
 
 }  // namespace strings
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index eb0585d8f3e..e7f4914fe05 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -637,6 +637,7 @@ ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE tes
 ConfigureTest(
   STREAM_STRINGS_TEST
   streams/strings/case_test.cpp
+  streams/strings/combine_test.cpp
   streams/strings/convert_test.cpp
   streams/strings/find_test.cpp
   streams/strings/replace_test.cpp
diff --git a/cpp/tests/streams/strings/combine_test.cpp b/cpp/tests/streams/strings/combine_test.cpp
new file mode 100644
index 00000000000..9562634957a
--- /dev/null
+++ b/cpp/tests/streams/strings/combine_test.cpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+
+#include <cudf/strings/combine.hpp>
+#include <cudf/strings/repeat_strings.hpp>
+
+#include <string>
+
+class StringsCombineTest : public cudf::test::BaseFixture {};
+
+TEST_F(StringsCombineTest, Concatenate)
+{
+  auto input = cudf::test::strings_column_wrapper({"Héllo", "thesé", "tést"});
+  auto view  = cudf::table_view({input, input});
+
+  auto separators      = cudf::test::strings_column_wrapper({"_", ".", " "});
+  auto separators_view = cudf::strings_column_view(separators);
+  auto sep_on_null     = cudf::strings::separator_on_nulls::YES;
+
+  auto const separator = cudf::string_scalar(" ", true, cudf::test::get_default_stream());
+  auto const narep     = cudf::string_scalar("n/a", true, cudf::test::get_default_stream());
+  cudf::strings::concatenate(view, separator, narep, sep_on_null, cudf::test::get_default_stream());
+  cudf::strings::concatenate(
+    view, separators_view, narep, narep, sep_on_null, cudf::test::get_default_stream());
+}
+
+TEST_F(StringsCombineTest, Join)
+{
+  auto input = cudf::test::strings_column_wrapper({"Héllo", "thesé", "tést"});
+  auto view  = cudf::strings_column_view(input);
+
+  auto const separator = cudf::string_scalar(" ", true, cudf::test::get_default_stream());
+  auto const narep     = cudf::string_scalar("n/a", true, cudf::test::get_default_stream());
+  cudf::strings::join_strings(view, separator, narep, cudf::test::get_default_stream());
+}
+
+TEST_F(StringsCombineTest, JoinLists)
+{
+  using STR_LISTS  = cudf::test::lists_column_wrapper<cudf::string_view>;
+  auto const input = STR_LISTS{
+    STR_LISTS{"a", "bb", "ccc"}, STR_LISTS{"ddd", "efgh", "ijk"}, STR_LISTS{"zzz", "xxxxx"}};
+  auto view = cudf::lists_column_view(input);
+
+  auto separators      = cudf::test::strings_column_wrapper({"_", ".", " "});
+  auto separators_view = cudf::strings_column_view(separators);
+  auto sep_on_null     = cudf::strings::separator_on_nulls::YES;
+  auto if_empty        = cudf::strings::output_if_empty_list::EMPTY_STRING;
+
+  auto const separator = cudf::string_scalar(" ", true, cudf::test::get_default_stream());
+  auto const narep     = cudf::string_scalar("n/a", true, cudf::test::get_default_stream());
+  cudf::strings::join_list_elements(
+    view, separator, narep, sep_on_null, if_empty, cudf::test::get_default_stream());
+  cudf::strings::join_list_elements(
+    view, separators_view, narep, narep, sep_on_null, if_empty, cudf::test::get_default_stream());
+}
+
+TEST_F(StringsCombineTest, Repeat)
+{
+  auto input = cudf::test::strings_column_wrapper({"Héllo", "thesé", "tést"});
+  auto view  = cudf::strings_column_view(input);
+  cudf::strings::repeat_strings(view, 0, cudf::test::get_default_stream());
+  cudf::strings::repeat_strings(view, 1, cudf::test::get_default_stream());
+  cudf::strings::repeat_strings(view, 10, cudf::test::get_default_stream());
+
+  auto counts = cudf::test::fixed_width_column_wrapper<int32_t>({9, 8, 7});
+  cudf::strings::repeat_strings(view, counts, cudf::test::get_default_stream());
+  cudf::strings::repeat_strings(view, counts, cudf::test::get_default_stream());
+
+  auto const str = cudf::string_scalar("X", true, cudf::test::get_default_stream());
+  cudf::strings::repeat_string(str, 0, cudf::test::get_default_stream());
+  cudf::strings::repeat_string(str, 1, cudf::test::get_default_stream());
+  cudf::strings::repeat_string(str, 10, cudf::test::get_default_stream());
+
+  auto const invalid = cudf::string_scalar("", false, cudf::test::get_default_stream());
+  cudf::strings::repeat_string(invalid, 10, cudf::test::get_default_stream());
+}
From 630982a370185112b2ea9a6f47284d98e12a36e8 Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Mon, 23 Oct 2023 16:45:02 -0700
Subject: [PATCH 041/118] test is_valid before reading column data (#14318)

Fixes #14310.

Authors:
  - Ed Seidl (https://github.com/etseidl)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/14318
---
 cpp/src/io/parquet/page_enc.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu
index 1e4f061d2e0..9acafd50585 100644
--- a/cpp/src/io/parquet/page_enc.cu
+++ b/cpp/src/io/parquet/page_enc.cu
@@ -283,7 +283,7 @@ __device__ uint8_t const* delta_encode(page_enc_state_s<0>* s,
 
       cur_val_idx += nvals;
 
-      output_type v = s->col.leaf_column->element<output_type>(val_idx);
+      output_type v = is_valid ? s->col.leaf_column->element<output_type>(val_idx) : 0;
       if (scale < 0) {
         v /= -scale;
       } else {
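
Note: a sketch (not part of the patch) of the general rule behind the one-line fix. For a nullable column, element storage should never be loaded for a row whose validity bit is unset, even if the value would be discarded, since the backing bytes may be uninitialized or out of bounds; the guarded load in plain C++ looks like this, with illustrative names.

    #include <cstdint>
    #include <vector>

    // substitute a neutral value instead of touching storage for null rows
    int32_t guarded_read(std::vector<int32_t> const& data,
                         std::vector<bool> const& valid,
                         std::size_t row)
    {
      return valid[row] ? data[row] : 0;
    }
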
From b390bca5055aaf91ef0e6e9f8eb0f6f25cce94d0 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Tue, 24 Oct 2023 09:46:56 -0700
Subject: [PATCH 042/118] Remove aws-sdk-pinning and revert to arrow 12.0.1
 (#14319)

The aws-sdk-pinning is proving to be far too problematic to maintain. It
causes conflicts in many environments due to its common usage across
many other packages in the conda-forge ecosystem that have since
updated their pinning to require newer versions than the 1.10.* that
we've pinned to. This reversion will unblock most of RAPIDS CI. We will
search for alternative fixes to the dask-cuda/distributed issues that
we're observing (in particular, resolution of the underlying issues
https://github.com/apache/arrow/issues/38364 and
https://github.com/aws/aws-sdk-cpp/issues/2681).

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Ray Douglass (https://github.com/raydouglass)
  - Bradley Dice (https://github.com/bdice)
  - Lawrence Mitchell (https://github.com/wence-)
  - Peter Andreas Entschev (https://github.com/pentschev)

URL: https://github.com/rapidsai/cudf/pull/14319
---
 conda/environments/all_cuda-118_arch-x86_64.yaml |  5 ++---
 conda/environments/all_cuda-120_arch-x86_64.yaml |  5 ++---
 conda/recipes/cudf/meta.yaml                     |  2 +-
 conda/recipes/libcudf/conda_build_config.yaml    |  7 ++-----
 conda/recipes/libcudf/meta.yaml                  |  6 ++----
 cpp/cmake/thirdparty/get_arrow.cmake             |  2 +-
 dependencies.yaml                                | 12 ++++++------
 python/cudf/pyproject.toml                       |  4 ++--
 python/cudf_kafka/pyproject.toml                 |  2 +-
 9 files changed, 19 insertions(+), 26 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 2c79cbb6b6c..b5782800946 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -9,7 +9,6 @@ channels:
 - nvidia
 dependencies:
 - aiobotocore>=2.2.0
-- aws-sdk-cpp<1.11
 - benchmark==1.8.0
 - boto3>=1.21.21
 - botocore>=1.24.21
@@ -40,7 +39,7 @@ dependencies:
 - hypothesis
 - identify>=2.5.20
 - ipython
-- libarrow==12.0.0.*
+- libarrow==12.0.1.*
 - libcufile-dev=1.4.0.31
 - libcufile=1.4.0.31
 - libcurand-dev=10.3.0.86
@@ -69,7 +68,7 @@ dependencies:
 - pre-commit
 - protobuf>=4.21,<5
 - ptxcompiler
-- pyarrow==12.0.0.*
+- pyarrow==12.0.1.*
 - pydata-sphinx-theme
 - pyorc
 - pytest
diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml
index c96b7428882..473b9d07d88 100644
--- a/conda/environments/all_cuda-120_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -9,7 +9,6 @@ channels:
 - nvidia
 dependencies:
 - aiobotocore>=2.2.0
-- aws-sdk-cpp<1.11
 - benchmark==1.8.0
 - boto3>=1.21.21
 - botocore>=1.24.21
@@ -42,7 +41,7 @@ dependencies:
 - hypothesis
 - identify>=2.5.20
 - ipython
-- libarrow==12.0.0.*
+- libarrow==12.0.1.*
 - libcufile-dev
 - libcurand-dev
 - libkvikio==23.12.*
@@ -67,7 +66,7 @@ dependencies:
 - pip
 - pre-commit
 - protobuf>=4.21,<5
-- pyarrow==12.0.0.*
+- pyarrow==12.0.1.*
 - pydata-sphinx-theme
 - pyorc
 - pytest
diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index 16b064a262e..7405ae2dfb5 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -61,7 +61,7 @@ requirements:
     - scikit-build >=0.13.1
    - setuptools
     - dlpack >=0.5,<0.6.0a0
-    - pyarrow =12.0.0
+    - pyarrow ==12.0.1.*
     - libcudf ={{ version }}
     - rmm ={{ minor_version }}
 {% if cuda_major == "11" %}
diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml
index 4d33bb89220..63688a641de 100644
--- a/conda/recipes/libcudf/conda_build_config.yaml
+++ b/conda/recipes/libcudf/conda_build_config.yaml
@@ -22,11 +22,8 @@ gbench_version:
 gtest_version:
   - ">=1.13.0"
 
-aws_sdk_cpp_version:
  - "<1.11"
-
-libarrow:
-  - "==12.0.0"
+libarrow_version:
+  - "==12.0.1"
 
 dlpack_version:
   - ">=0.5,<0.6.0a0"
diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index b9aff2a9c82..627065817ba 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -65,7 +65,7 @@ requirements:
 {% endif %}
     - cuda-version ={{ cuda_version }}
     - nvcomp {{ nvcomp_version }}
-    - libarrow {{ libarrow }}
+    - libarrow {{ libarrow_version }}
     - dlpack {{ dlpack_version }}
     - librdkafka {{ librdkafka_version }}
     - fmt {{ fmt_version }}
@@ -74,7 +74,6 @@ requirements:
     - gtest {{ gtest_version }}
    - gmock {{ gtest_version }}
     - zlib {{ zlib_version }}
-    - aws-sdk-cpp {{ aws_sdk_cpp_version }}
 
 outputs:
   - name: libcudf
@@ -104,11 +103,10 @@ outputs:
         - nvcomp {{ nvcomp_version }}
        - librmm ={{ minor_version }}
        - libkvikio ={{ minor_version }}
-        - libarrow {{ libarrow }}
+        - libarrow {{ libarrow_version }}
        - dlpack {{ dlpack_version }}
        - gtest {{ gtest_version }}
        - gmock {{ gtest_version }}
-        - aws-sdk-cpp {{ aws_sdk_cpp_version }}
     test:
       commands:
         - test -f $PREFIX/lib/libcudf.so
diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake
index c2d5cfbaf78..10d3145a36f 100644
--- a/cpp/cmake/thirdparty/get_arrow.cmake
+++ b/cpp/cmake/thirdparty/get_arrow.cmake
@@ -411,7 +411,7 @@ if(NOT DEFINED CUDF_VERSION_Arrow)
   set(CUDF_VERSION_Arrow
       # This version must be kept in sync with the libarrow version pinned for builds in
      # dependencies.yaml.
-      12.0.0
+      12.0.1
      CACHE STRING "The version of Arrow to find (or build)"
  )
 endif()
diff --git a/dependencies.yaml b/dependencies.yaml
index e8114fa5615..c3223e4394d 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -62,7 +62,6 @@ files:
     includes:
       - cudatoolkit
       - docs
-      - libarrow_run
       - py_version
   py_build_cudf:
     output: pyproject
@@ -219,14 +218,13 @@ dependencies:
           - libkvikio==23.12.*
       - output_types: conda
        packages:
-          - aws-sdk-cpp<1.11
           - fmt>=9.1.0,<10
          - &gbench benchmark==1.8.0
          - &gtest gtest>=1.13.0
          - &gmock gmock>=1.13.0
           # Hard pin the patch version used during the build. This must be kept
           # in sync with the version pinned in get_arrow.cmake.
-          - &libarrow libarrow==12.0.0.*
+          - libarrow==12.0.1.*
           - librdkafka>=1.9.0,<1.10.0a0
           # Align nvcomp version with rapids-cmake
           - nvcomp==2.6.1
@@ -244,7 +242,7 @@ dependencies:
           - cython>=3.0.0
           # Hard pin the patch version used during the build. This must be kept
           # in sync with the version pinned in get_arrow.cmake.
-          - &pyarrow pyarrow==12.0.0.*
+          - pyarrow==12.0.1.*
           # TODO: Pin to numpy<1.25 until cudf requires pandas 2
           - &numpy numpy>=1.21,<1.25
   build_python:
@@ -263,12 +261,14 @@ dependencies:
     common:
       - output_types: conda
        packages:
-          - *libarrow
+          # Allow runtime version to float up to minor version
+          - libarrow==12.*
   pyarrow_run:
     common:
       - output_types: [conda, requirements, pyproject]
        packages:
-          - *pyarrow
+          # Allow runtime version to float up to minor version
+          - pyarrow==12.*
   cudatoolkit:
     specific:
       - output_types: conda
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index ccb5d5d4416..39a8dca0267 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -8,7 +8,7 @@ requires = [
     "ninja",
     "numpy>=1.21,<1.25",
     "protoc-wheel",
-    "pyarrow==12.0.0.*",
+    "pyarrow==12.0.1.*",
     "rmm==23.12.*",
     "scikit-build>=0.13.1",
     "setuptools",
@@ -38,7 +38,7 @@ dependencies = [
     "pandas>=1.3,<1.6.0dev0",
     "protobuf>=4.21,<5",
     "ptxcompiler",
-    "pyarrow==12.0.0.*",
+    "pyarrow==12.*",
     "rmm==23.12.*",
     "typing_extensions>=4.0.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml
index ff475e5a72e..78a7a83ac3a 100644
--- a/python/cudf_kafka/pyproject.toml
+++ b/python/cudf_kafka/pyproject.toml
@@ -5,7 +5,7 @@
 requires = [
     "cython>=3.0.0",
     "numpy>=1.21,<1.25",
-    "pyarrow==12.0.0.*",
+    "pyarrow==12.0.1.*",
     "setuptools",
     "wheel",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
From 19d791cea7abb8ccbcf3f0cd8037644f3176166f Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Tue, 24 Oct 2023 15:45:41 -0500 Subject: [PATCH 043/118] Avoid `pyarrow.fs` import for local storage (#14321) This is not a resolution, but may help mitigate problems from https://github.com/aws/aws-sdk-cpp/issues/2681 Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) - Lawrence Mitchell (https://github.com/wence-) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14321 --- python/cudf/cudf/io/orc.py | 5 ++++- python/cudf/cudf/io/parquet.py | 11 +++++++++-- python/cudf/cudf/tests/test_s3.py | 14 ++++++++++++++ python/cudf/cudf/utils/ioutils.py | 10 +++++++++- 4 files changed, 36 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index f51952d23bf..d135a31438e 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -5,7 +5,6 @@ import pyarrow as pa from fsspec.utils import stringify_path -from pyarrow import orc as orc import cudf from cudf._lib import orc as liborc @@ -17,6 +16,8 @@ def _make_empty_df(filepath_or_buffer, columns): + from pyarrow import orc + orc_file = orc.ORCFile(filepath_or_buffer) schema = orc_file.schema col_names = schema.names if columns is None else columns @@ -150,6 +151,7 @@ def _parse_column_statistics(cs, column_statistics_blob): @ioutils.doc_read_orc_metadata() def read_orc_metadata(path): """{docstring}""" + from pyarrow import orc orc_file = orc.ORCFile(path) @@ -380,6 +382,7 @@ def read_orc( ) ) else: + from pyarrow import orc def read_orc_stripe(orc_file, stripe, columns): pa_table = orc_file.read_stripe(stripe, columns) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index d84aff66d7b..1f346578d70 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -15,7 +15,7 @@ import numpy as np import pandas as pd -from pyarrow import dataset as ds, parquet as pq +from pyarrow import dataset as ds import cudf from cudf._lib import parquet as libparquet @@ -266,6 +266,7 @@ def write_to_dataset( @_cudf_nvtx_annotate def read_parquet_metadata(path): """{docstring}""" + import pyarrow.parquet as pq pq_file = pq.ParquetFile(path) @@ -303,7 +304,9 @@ def _process_dataset( # Convert filters to ds.Expression if filters is not None: - filters = pq.filters_to_expression(filters) + from pyarrow.parquet import filters_to_expression + + filters = filters_to_expression(filters) # Initialize ds.FilesystemDataset # TODO: Remove the if len(paths) workaround after following bug is fixed: @@ -825,6 +828,8 @@ def _read_parquet( use_pandas_metadata=use_pandas_metadata, ) else: + import pyarrow.parquet as pq + return cudf.DataFrame.from_arrow( pq.ParquetDataset(filepaths_or_buffers).read_pandas( columns=columns, *args, **kwargs @@ -930,6 +935,8 @@ def to_parquet( ) else: + import pyarrow.parquet as pq + if partition_offsets is not None: warnings.warn( "partition_offsets will be ignored when engine is not cudf" diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index d54a2eabf22..d16cbd2377a 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -533,3 +533,17 @@ def test_write_chunked_parquet(s3_base, s3so): actual.sort_values(["b"]).reset_index(drop=True), cudf.concat([df1, df2]).sort_values(["b"]).reset_index(drop=True), ) + + +def 
test_no_s3fs_on_cudf_import(): + import subprocess + import sys + + output = subprocess.check_output( + [ + sys.executable, + "-c", + "import cudf; import sys; print('pyarrow._s3fs' in sys.modules)", + ] + ) + assert output.strip() == b"False" diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 91925bf3c0c..d2739b35049 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -13,7 +13,6 @@ import pandas as pd from fsspec.core import get_fs_token_paths from pyarrow import PythonFile as ArrowPythonFile -from pyarrow.fs import FSSpecHandler, PyFileSystem from pyarrow.lib import NativeFile from cudf.utils.docutils import docfmt_partial @@ -1630,6 +1629,15 @@ def _open_remote_files( for path, rgs in zip(paths, row_groups) ] + # Avoid top-level pyarrow.fs import. + # Importing pyarrow.fs initializes a S3 SDK with a finalizer + # that runs atexit. In some circumstances it appears this + # runs a call into a logging system that is already shutdown. + # To avoid this, we only import this subsystem if it is + # really needed. + # See https://github.com/aws/aws-sdk-cpp/issues/2681 + from pyarrow.fs import FSSpecHandler, PyFileSystem + # Default open - Use pyarrow filesystem API pa_fs = PyFileSystem(FSSpecHandler(fs)) return [ From bc4d38dc370a3ebe743bcc5c17581aa5cd73de6b Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Tue, 24 Oct 2023 18:13:57 -0400 Subject: [PATCH 044/118] fixing thread index overflow issue (#14290) Ran across this issue during the review of https://github.com/NVIDIA/spark-rapids-jni/pull/1502 and since the code was modeled after this code, I am pushing the fix here as well. Authors: - Mike Wilson (https://github.com/hyperbolic2346) Approvers: - David Wendt (https://github.com/davidwendt) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14290 --- cpp/src/strings/convert/convert_urls.cu | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu index 9e847131be2..511acc38d75 100644 --- a/cpp/src/strings/convert/convert_urls.cu +++ b/cpp/src/strings/convert/convert_urls.cu @@ -212,7 +212,8 @@ __global__ void url_decode_char_counter(column_device_view const in_strings, char* in_chars_shared = temporary_buffer[local_warp_id]; // Loop through strings, and assign each string to a warp. 
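  // Illustrative note (not part of the original change): with a 32-bit
  // size_type induction variable, `row_idx += nwarps` can overflow int32
  // on very large grids before the `row_idx < in_strings.size()` check
  // runs; iterating with the 64-bit thread_index_type and casting back
  // to size_type per row keeps the index arithmetic in 64 bits.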
- for (size_type row_idx = global_warp_id; row_idx < in_strings.size(); row_idx += nwarps) { + for (thread_index_type tidx = global_warp_id; tidx < in_strings.size(); tidx += nwarps) { + auto const row_idx = static_cast(tidx); if (in_strings.is_null(row_idx)) { out_counts[row_idx] = 0; continue; @@ -296,7 +297,8 @@ __global__ void url_decode_char_replacer(column_device_view const in_strings, char* in_chars_shared = temporary_buffer[local_warp_id]; // Loop through strings, and assign each string to a warp - for (size_type row_idx = global_warp_id; row_idx < in_strings.size(); row_idx += nwarps) { + for (thread_index_type tidx = global_warp_id; tidx < in_strings.size(); tidx += nwarps) { + auto const row_idx = static_cast(tidx); if (in_strings.is_null(row_idx)) continue; auto const in_string = in_strings.element(row_idx); From 91aeec88deac8168f75a7a9d740eceba61df45bd Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 25 Oct 2023 14:37:21 -0500 Subject: [PATCH 045/118] Drop `pyorc` dependency and use `pandas`/`pyarrow` instead (#14323) This PR removes dependency on `pyorc` in `cudf` altogether by using drop-in replacements found in `pandas` & `pyarrow`. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Jake Awe (https://github.com/AyodeAwe) - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14323 --- .../all_cuda-118_arch-x86_64.yaml | 1 - .../all_cuda-120_arch-x86_64.yaml | 1 - cpp/tests/io/orc_test.cpp | 14 +- dependencies.yaml | 1 - docs/cudf/source/conf.py | 1 + docs/dask_cudf/source/conf.py | 1 + python/cudf/cudf/_fuzz_testing/orc.py | 18 +- python/cudf/cudf/_fuzz_testing/utils.py | 160 +------ python/cudf/cudf/tests/test_orc.py | 402 ++++++++---------- python/cudf/pyproject.toml | 1 - 10 files changed, 205 insertions(+), 395 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index b5782800946..8b6b32bc026 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -70,7 +70,6 @@ dependencies: - ptxcompiler - pyarrow==12.0.1.* - pydata-sphinx-theme -- pyorc - pytest - pytest-benchmark - pytest-cases diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 473b9d07d88..ae15a6e97ab 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -68,7 +68,6 @@ dependencies: - protobuf>=4.21,<5 - pyarrow==12.0.1.* - pydata-sphinx-theme -- pyorc - pytest - pytest-benchmark - pytest-cases diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 890ef914713..3457c5675ad 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1299,20 +1299,16 @@ TEST_F(OrcStatisticsTest, Overflow) TEST_F(OrcStatisticsTest, HasNull) { - // This test can now be implemented with libcudf; keeping the pyorc version to keep the test + // This test can now be implemented with libcudf; keeping the pandas version to keep the test // inputs diversified // Method to create file: - // >>> import pyorc - // >>> output = open("./temp.orc", "wb") - // >>> writer = pyorc.Writer(output, pyorc.Struct(a=pyorc.BigInt(), b=pyorc.BigInt())) - // >>> writer.write((1, 3)) - // >>> writer.write((2, 4)) - // >>> writer.write((None, 5)) - // >>> 
writer.close() + // >>> import pandas as pd + // >>> df = pd.DataFrame({'a':pd.Series([1, 2, None], dtype="Int64"), 'b':[3, 4, 5]}) + // >>> df.to_orc("temp.orc") // // Contents of file: // >>> import pyarrow.orc as po - // >>> po.ORCFile('new.orc').read() + // >>> po.ORCFile('temp.orc').read() // pyarrow.Table // a: int64 // b: int64 diff --git a/dependencies.yaml b/dependencies.yaml index c3223e4394d..a7716a15360 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -548,7 +548,6 @@ dependencies: - fastavro>=0.22.9 - hypothesis - mimesis>=4.1.0 - - pyorc - pytest-benchmark - pytest-cases - python-snappy>=0.6.0 diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index acb2a5d17f3..28e305b71cb 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -106,6 +106,7 @@ "twitter_url": "https://twitter.com/rapidsai", "show_toc_level": 1, "navbar_align": "right", + "navigation_with_keys": True, } include_pandas_compat = True diff --git a/docs/dask_cudf/source/conf.py b/docs/dask_cudf/source/conf.py index 6861a9b90f6..00568a57431 100644 --- a/docs/dask_cudf/source/conf.py +++ b/docs/dask_cudf/source/conf.py @@ -57,6 +57,7 @@ "twitter_url": "https://twitter.com/rapidsai", "show_toc_level": 1, "navbar_align": "right", + "navigation_with_keys": True, } include_pandas_compat = True diff --git a/python/cudf/cudf/_fuzz_testing/orc.py b/python/cudf/cudf/_fuzz_testing/orc.py index 65d2e09988f..ecddc72fa85 100644 --- a/python/cudf/cudf/_fuzz_testing/orc.py +++ b/python/cudf/cudf/_fuzz_testing/orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. import copy import io @@ -6,14 +6,13 @@ import random import numpy as np -import pyorc +import pyarrow as pa import cudf from cudf._fuzz_testing.io import IOFuzz from cudf._fuzz_testing.utils import ( ALL_POSSIBLE_VALUES, _generate_rand_meta, - pandas_to_orc, pyarrow_to_pandas, ) from cudf.testing import dataset_generator as dg @@ -82,12 +81,7 @@ def generate_input(self): logging.info(f"Shape of DataFrame generated: {table.shape}") self._df = df file_obj = io.BytesIO() - pandas_to_orc( - df, - file_io_obj=file_obj, - stripe_size=self._rand(len(df)), - arrow_table_schema=table.schema, - ) + pa.orc.write_table(table, file_obj, stripe_size=self._rand(len(df))) file_obj.seek(0) buf = file_obj.read() self._current_buffer = copy.copy(buf) @@ -109,8 +103,8 @@ def set_rand_params(self, params): ) elif param == "stripes": f = io.BytesIO(self._current_buffer) - reader = pyorc.Reader(f) - stripes = [i for i in range(reader.num_of_stripes)] + orcFile = pa.orc.ORCFile(f) + stripes = list(range(orcFile.nstripes)) params_dict[param] = np.random.choice( [ None, @@ -119,7 +113,7 @@ def set_rand_params(self, params): int, np.unique( np.random.choice( - stripes, reader.num_of_stripes + stripes, orcFile.nstripes ) ), ) diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index 03418e00cde..0c88c1aeacd 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -1,13 +1,11 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. import random -from collections import OrderedDict import fastavro import numpy as np import pandas as pd import pyarrow as pa -import pyorc import cudf from cudf.testing._utils import assert_eq @@ -41,40 +39,6 @@ cudf.dtype(" can result in incorrect dtype by pandas. 
- df = df.astype(dtypes) + orc_file = pa.orc.ORCFile(f) + records = [orc_file.read_stripe(i) for i in stripes] + pa_table = pa.Table.from_batches(records) + df = pa_table.to_pandas() return df diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 07aa5430f4f..7407da9c4ac 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -10,8 +10,6 @@ import numpy as np import pandas as pd import pyarrow as pa -import pyarrow.orc -import pyorc import pytest import cudf @@ -150,9 +148,11 @@ def test_orc_reader_trailing_nulls(datadir): ["TestOrcFile.testDate1900.orc", "TestOrcFile.testDate2038.orc"], ) def test_orc_reader_datetimestamp(datadir, inputfile, use_index): + from pyarrow import orc + path = datadir / inputfile try: - orcfile = pa.orc.ORCFile(path) + orcfile = orc.ORCFile(path) except pa.ArrowIOError as e: pytest.skip(".orc file is not found: %s" % e) @@ -295,28 +295,29 @@ def test_orc_read_rows(datadir, skiprows, num_rows): def test_orc_read_skiprows(): buff = BytesIO() - data = [ - True, - False, - True, - False, - None, - True, - True, - True, - False, - None, - False, - False, - True, - True, - True, - True, - ] - writer = pyorc.Writer(buff, pyorc.Struct(a=pyorc.Boolean())) - writer.writerows([(d,) for d in data]) - writer.close() - + df = pd.DataFrame( + { + "a": [ + True, + False, + True, + False, + None, + True, + True, + True, + False, + None, + False, + False, + True, + True, + True, + True, + ] + } + ) + df.to_orc(buff) # testing 10 skiprows due to a boolean specific bug fix that didn't # repro for other sizes of data skiprows = 10 @@ -605,6 +606,8 @@ def normalized_equals(value1, value2): @pytest.mark.parametrize("stats_freq", ["STRIPE", "ROWGROUP"]) @pytest.mark.parametrize("nrows", [1, 100, 6000000]) def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): + from pyarrow import orc + supported_stat_types = supported_numpy_dtypes + ["str"] # Can't write random bool columns until issue #6763 is fixed if nrows == 6000000: @@ -623,7 +626,7 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): gdf.to_orc(fname.strpath, statistics=stats_freq) # Read back written ORC's statistics - orc_file = pa.orc.ORCFile(fname) + orc_file = orc.ORCFile(fname) ( file_stats, stripes_stats, @@ -677,6 +680,8 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): @pytest.mark.parametrize("stats_freq", ["STRIPE", "ROWGROUP"]) @pytest.mark.parametrize("nrows", [2, 100, 6000000]) def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): + from pyarrow import orc + np.random.seed(0) supported_stat_types = supported_numpy_dtypes + ["str"] # Can't write random bool columns until issue #6763 is fixed @@ -729,7 +734,7 @@ def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): expect = cudf.DataFrame(pd.concat([pdf1, pdf2]).reset_index(drop=True)) # Read back written ORC's statistics - orc_file = pa.orc.ORCFile(gdf_fname) + orc_file = orc.ORCFile(gdf_fname) ( file_stats, stripes_stats, @@ -782,6 +787,8 @@ def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): @pytest.mark.parametrize("nrows", [1, 100, 6000000]) def test_orc_write_bool_statistics(tmpdir, datadir, nrows): + from pyarrow import orc + # Make a dataframe gdf = cudf.DataFrame({"col_bool": gen_rand_series("bool", nrows)}) fname = tmpdir.join("gdf.orc") @@ -790,7 +797,7 @@ def test_orc_write_bool_statistics(tmpdir, datadir, nrows): gdf.to_orc(fname.strpath) # Read back written ORC's 
statistics - orc_file = pa.orc.ORCFile(fname) + orc_file = orc.ORCFile(fname) ( file_stats, stripes_stats, @@ -978,44 +985,12 @@ def test_orc_string_stream_offset_issue(): assert_eq(df, cudf.read_orc(buffer)) -# Data is generated using pyorc module def generate_list_struct_buff(size=100_000): rd = random.Random(1) np.random.seed(seed=1) buff = BytesIO() - schema = { - "lvl3_list": pyorc.Array(pyorc.Array(pyorc.Array(pyorc.BigInt()))), - "lvl1_list": pyorc.Array(pyorc.BigInt()), - "lvl1_struct": pyorc.Struct( - **{"a": pyorc.BigInt(), "b": pyorc.BigInt()} - ), - "lvl2_struct": pyorc.Struct( - **{ - "a": pyorc.BigInt(), - "lvl1_struct": pyorc.Struct( - **{"c": pyorc.BigInt(), "d": pyorc.BigInt()} - ), - } - ), - "list_nests_struct": pyorc.Array( - pyorc.Array( - pyorc.Struct(**{"a": pyorc.BigInt(), "b": pyorc.BigInt()}) - ) - ), - "struct_nests_list": pyorc.Struct( - **{ - "struct": pyorc.Struct( - **{"a": pyorc.BigInt(), "b": pyorc.BigInt()} - ), - "list": pyorc.Array(pyorc.BigInt()), - } - ), - } - - schema = pyorc.Struct(**schema) - lvl3_list = [ rd.choice( [ @@ -1024,50 +999,57 @@ def generate_list_struct_buff(size=100_000): [ [ rd.choice([None, np.random.randint(1, 3)]) - for z in range(np.random.randint(1, 3)) + for _ in range(np.random.randint(1, 3)) ] - for z in range(np.random.randint(0, 3)) + for _ in range(np.random.randint(0, 3)) ] - for y in range(np.random.randint(0, 3)) + for _ in range(np.random.randint(0, 3)) ], ] ) - for x in range(size) + for _ in range(size) ] lvl1_list = [ [ rd.choice([None, np.random.randint(0, 3)]) - for y in range(np.random.randint(1, 4)) + for _ in range(np.random.randint(1, 4)) ] - for x in range(size) + for _ in range(size) ] lvl1_struct = [ - rd.choice([None, (np.random.randint(0, 3), np.random.randint(0, 3))]) - for x in range(size) + rd.choice( + [ + None, + {"a": np.random.randint(0, 3), "b": np.random.randint(0, 3)}, + ] + ) + for _ in range(size) ] lvl2_struct = [ rd.choice( [ None, - ( - rd.choice([None, np.random.randint(0, 3)]), - ( - rd.choice([None, np.random.randint(0, 3)]), - np.random.randint(0, 3), - ), - ), + {"a": rd.choice([None, np.random.randint(0, 3)])}, + { + "lvl1_struct": { + "c": rd.choice([None, np.random.randint(0, 3)]), + "d": np.random.randint(0, 3), + }, + }, ] ) - for x in range(size) + for _ in range(size) ] list_nests_struct = [ [ - [rd.choice(lvl1_struct), rd.choice(lvl1_struct)] - for y in range(np.random.randint(1, 4)) + {"a": rd.choice(lvl1_struct), "b": rd.choice(lvl1_struct)} + for _ in range(np.random.randint(1, 4)) ] - for x in range(size) + for _ in range(size) + ] + struct_nests_list = [ + {"struct": lvl1_struct[x], "list": lvl1_list[x]} for x in range(size) ] - struct_nests_list = [(lvl1_struct[x], lvl1_list[x]) for x in range(size)] df = pd.DataFrame( { @@ -1080,15 +1062,7 @@ def generate_list_struct_buff(size=100_000): } ) - writer = pyorc.Writer(buff, schema, stripe_size=1024) - tuples = list( - map( - lambda x: (None,) if x[0] is pd.NA else x, - list(df.itertuples(index=False, name=None)), - ) - ) - writer.writerows(tuples) - writer.close() + df.to_orc(buff, engine="pyarrow", engine_kwargs={"stripe_size": 1024}) return buff @@ -1109,6 +1083,8 @@ def list_struct_buff(): @pytest.mark.parametrize("num_rows", [0, 15, 1005, 10561, 100_000]) @pytest.mark.parametrize("use_index", [True, False]) def test_lists_struct_nests(columns, num_rows, use_index, list_struct_buff): + from pyarrow import orc + gdf = cudf.read_orc( list_struct_buff, columns=columns, @@ -1116,7 +1092,7 @@ def 
test_lists_struct_nests(columns, num_rows, use_index, list_struct_buff): use_index=use_index, ) - pyarrow_tbl = pyarrow.orc.ORCFile(list_struct_buff).read() + pyarrow_tbl = orc.ORCFile(list_struct_buff).read() pyarrow_tbl = ( pyarrow_tbl[:num_rows] @@ -1155,111 +1131,96 @@ def test_pyspark_struct(datadir): def gen_map_buff(size=10000): from string import ascii_letters as al + from pyarrow import orc + rd = random.Random(1) np.random.seed(seed=1) buff = BytesIO() - schema = { - "lvl1_map": pyorc.Map(key=pyorc.String(), value=pyorc.BigInt()), - "lvl2_map": pyorc.Map( - key=pyorc.String(), value=pyorc.Array(pyorc.BigInt()) - ), - "lvl2_struct_map": pyorc.Map( - key=pyorc.String(), - value=pyorc.Struct(**{"a": pyorc.BigInt(), "b": pyorc.BigInt()}), - ), - } - - schema = pyorc.Struct(**schema) - - lvl1_map = [ - rd.choice( - [ - None, - [ - ( - rd.choice(al), - rd.choice([None, np.random.randint(1, 1500)]), - ) - for y in range(2) - ], - ] - ) - for x in range(size) - ] - lvl2_map = [ - rd.choice( - [ - None, + lvl1_map = pa.array( + [ + rd.choice( [ - ( - rd.choice(al), - rd.choice( - [ - None, - [ - rd.choice( - [None, np.random.randint(1, 1500)] - ) - for z in range(5) - ], - ] + None, + { + rd.choice(al): rd.choice( + [None, np.random.randint(1, 1500)] ), - ) - for y in range(2) - ], - ] - ) - for x in range(size) - ] - lvl2_struct_map = [ - rd.choice( - [ - None, + }, + ] + ) + for _ in range(size) + ], + type=pa.map_(pa.string(), pa.int64()), + ) + lvl2_map = pa.array( + [ + rd.choice( [ - ( - rd.choice(al), - rd.choice( - [ - None, - ( - rd.choice( - [None, np.random.randint(1, 1500)] - ), - rd.choice( - [None, np.random.randint(1, 1500)] - ), - ), - ] - ), - ) - for y in range(2) - ], - ] - ) - for x in range(size) - ] - - pdf = pd.DataFrame( - { - "lvl1_map": lvl1_map, - "lvl2_map": lvl2_map, - "lvl2_struct_map": lvl2_struct_map, - } + None, + *( + { + rd.choice(al): rd.choice( + [ + None, + [ + rd.choice( + [None, np.random.randint(1, 1500)] + ) + for _ in range(5) + ], + ] + ) + } + for _ in range(2) + ), + ] + ) + for _ in range(size) + ], + type=pa.map_(pa.string(), pa.list_(pa.int64())), ) - writer = pyorc.Writer( - buff, schema, stripe_size=1024, compression=pyorc.CompressionKind.NONE + lvl2_struct_map = pa.array( + [ + rd.choice( + [ + None, + *( + { + rd.choice(al): rd.choice( + [ + None, + { + "a": rd.choice( + [None, np.random.randint(1, 1500)] + ), + "b": rd.choice( + [None, np.random.randint(1, 1500)] + ), + }, + ] + ) + } + for _ in range(2) + ), + ] + ) + for _ in range(size) + ], + type=pa.map_( + pa.string(), pa.struct({"a": pa.int64(), "b": pa.int64()}) + ), ) - tuples = list( - map( - lambda x: (None,) if x[0] is pd.NA else x, - list(pdf.itertuples(index=False, name=None)), - ) + + pa_table = pa.Table.from_arrays( + [lvl1_map, lvl2_map, lvl2_struct_map], + ["lvl1_map", "lvl2_map", "lvl2_struct_map"], ) - writer.writerows(tuples) - writer.close() + orc.write_table( + pa_table, buff, stripe_size=1024, compression="UNCOMPRESSED" + ) return buff @@ -1274,7 +1235,9 @@ def gen_map_buff(size=10000): @pytest.mark.parametrize("num_rows", [0, 15, 1005, 10561, 100000]) @pytest.mark.parametrize("use_index", [True, False]) def test_map_type_read(columns, num_rows, use_index): - tbl = pa.orc.ORCFile(map_buff).read() + from pyarrow import orc + + tbl = orc.read_table(map_buff) lvl1_map = ( tbl["lvl1_map"] @@ -1460,18 +1423,22 @@ def test_writer_timestamp_stream_size(datadir, tmpdir): ], ) def test_no_row_group_index_orc_read(datadir, fname): + from pyarrow import orc + fpath = 
datadir / fname - expect = pa.orc.ORCFile(fpath).read() + expect = orc.ORCFile(fpath).read() got = cudf.read_orc(fpath) assert expect.equals(got.to_arrow()) def test_names_in_struct_dtype_nesting(datadir): + from pyarrow import orc + fname = datadir / "TestOrcFile.NestedStructDataFrame.orc" - expect = pa.orc.ORCFile(fname).read() + expect = orc.ORCFile(fname).read() got = cudf.read_orc(fname) # test dataframes @@ -1483,12 +1450,14 @@ def test_names_in_struct_dtype_nesting(datadir): def test_writer_lists_structs(list_struct_buff): + from pyarrow import orc + df_in = cudf.read_orc(list_struct_buff) buff = BytesIO() df_in.to_orc(buff) - pyarrow_tbl = pyarrow.orc.ORCFile(buff).read() + pyarrow_tbl = orc.ORCFile(buff).read() assert pyarrow_tbl.equals(df_in.to_arrow()) @@ -1527,12 +1496,10 @@ def test_statistics_sum_overflow(): minint64 = np.iinfo(np.int64).min buff = BytesIO() - with pyorc.Writer( - buff, - pyorc.Struct(a=pyorc.BigInt(), b=pyorc.BigInt(), c=pyorc.BigInt()), - ) as writer: - writer.write((maxint64, minint64, minint64)) - writer.write((1, -1, 1)) + df = pd.DataFrame( + {"a": [maxint64, 1], "b": [minint64, -1], "c": [minint64, 1]} + ) + df.to_orc(buff) file_stats, stripe_stats = cudf.io.orc.read_orc_statistics([buff]) assert file_stats[0]["a"].get("sum") is None @@ -1545,22 +1512,24 @@ def test_statistics_sum_overflow(): def test_empty_statistics(): + from pyarrow import orc + buff = BytesIO() - orc_schema = pyorc.Struct( - a=pyorc.BigInt(), - b=pyorc.Double(), - c=pyorc.String(), - d=pyorc.Decimal(11, 2), - e=pyorc.Date(), - f=pyorc.Timestamp(), - g=pyorc.Boolean(), - h=pyorc.Binary(), - i=pyorc.BigInt(), - # One column with non null value, else cudf/pyorc readers crash + pa_table = pa.Table.from_arrays( + [ + pa.array([None], type=pa.int64()), + pa.array([None], type=pa.float64()), + pa.array([None], type=pa.string()), + pa.array([None], type=pa.decimal128(11, 2)), + pa.array([None], type=pa.timestamp("ns")), + pa.array([None], type=pa.date64()), + pa.array([None], type=pa.bool_()), + pa.array([None], type=pa.binary()), + pa.array([1], type=pa.int64()), + ], + ["a", "b", "c", "d", "e", "f", "g", "h", "i"], ) - data = tuple([None] * (len(orc_schema.fields) - 1) + [1]) - with pyorc.Writer(buff, orc_schema) as writer: - writer.write(data) + orc.write_table(pa_table, buff) got = cudf.io.orc.read_orc_statistics([buff]) @@ -1615,6 +1584,8 @@ def test_select_nested(list_struct_buff, equivalent_columns): def test_orc_writer_rle_stream_size(datadir, tmpdir): + from pyarrow import orc + original = datadir / "TestOrcFile.int16.rle.size.orc" reencoded = tmpdir.join("int16_map.orc") @@ -1622,7 +1593,7 @@ def test_orc_writer_rle_stream_size(datadir, tmpdir): df.to_orc(reencoded) # Segfaults when RLE stream sizes don't account for varint length - pa_out = pa.orc.ORCFile(reencoded).read() + pa_out = orc.ORCFile(reencoded).read() assert df.to_arrow().equals(pa_out) @@ -1642,11 +1613,13 @@ def test_empty_columns(): def test_orc_reader_zstd_compression(list_struct_buff): + from pyarrow import orc + expected = cudf.read_orc(list_struct_buff) # save with ZSTD compression buffer = BytesIO() - pyarrow_tbl = pyarrow.orc.ORCFile(list_struct_buff).read() - writer = pyarrow.orc.ORCWriter(buffer, compression="zstd") + pyarrow_tbl = orc.ORCFile(list_struct_buff).read() + writer = orc.ORCWriter(buffer, compression="zstd") writer.write(pyarrow_tbl) writer.close() try: @@ -1845,10 +1818,7 @@ def negative_timestamp_df(): @pytest.mark.parametrize("engine", ["cudf", "pyarrow"]) def 
test_orc_reader_negative_timestamp(negative_timestamp_df, engine): buffer = BytesIO() - pyorc_table = pa.Table.from_pandas( - negative_timestamp_df.to_pandas(), preserve_index=False - ) - pyarrow.orc.write_table(pyorc_table, buffer) + negative_timestamp_df.to_orc(buffer) # We warn the user that this function will fall back to the CPU for reading # when the engine is pyarrow. @@ -1859,11 +1829,13 @@ def test_orc_reader_negative_timestamp(negative_timestamp_df, engine): def test_orc_writer_negative_timestamp(negative_timestamp_df): + from pyarrow import orc + buffer = BytesIO() negative_timestamp_df.to_orc(buffer) assert_eq(negative_timestamp_df, pd.read_orc(buffer)) - assert_eq(negative_timestamp_df, pyarrow.orc.ORCFile(buffer).read()) + assert_eq(negative_timestamp_df, orc.ORCFile(buffer).read()) def test_orc_reader_apache_negative_timestamp(datadir): diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 39a8dca0267..90759074750 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -58,7 +58,6 @@ test = [ "hypothesis", "mimesis>=4.1.0", "msgpack", - "pyorc", "pytest", "pytest-benchmark", "pytest-cases", From 865c21e4262aff1d6f99fdb00b892e7521087ffa Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 25 Oct 2023 15:52:12 -0400 Subject: [PATCH 046/118] Expose stream parameter in public strings contains APIs (#14280) Add stream parameter to public APIs: - `cudf::strings::contains_re()` - `cudf::strings::matches_re()` - `cudf::strings::count_re()` - `cudf::strings::like()` (x2) - `cudf::strings::extract()` - `cudf::strings::extract_all_record()` Also cleaned up some of the doxygen comments. Reference #13744 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Mark Harris (https://github.com/harrism) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14280 --- cpp/include/cudf/strings/contains.hpp | 32 ++++++++----- cpp/include/cudf/strings/extract.hpp | 12 +++-- cpp/src/strings/contains.cu | 15 +++--- cpp/src/strings/extract/extract.cu | 5 +- cpp/src/strings/extract/extract_all.cu | 5 +- cpp/src/strings/like.cu | 6 ++- cpp/tests/CMakeLists.txt | 2 + cpp/tests/streams/strings/contains_test.cpp | 52 +++++++++++++++++++++ cpp/tests/streams/strings/extract_test.cpp | 37 +++++++++++++++ 9 files changed, 139 insertions(+), 27 deletions(-) create mode 100644 cpp/tests/streams/strings/contains_test.cpp create mode 100644 cpp/tests/streams/strings/extract_test.cpp diff --git a/cpp/include/cudf/strings/contains.hpp b/cpp/include/cudf/strings/contains.hpp index 23c77cb60da..341c146df92 100644 --- a/cpp/include/cudf/strings/contains.hpp +++ b/cpp/include/cudf/strings/contains.hpp @@ -31,7 +31,7 @@ struct regex_program; * @addtogroup strings_contains * @{ * @file strings/contains.hpp - * @brief Strings APIs for regex contains, count, matches + * @brief Strings APIs for regex contains, count, matches, like */ /** @@ -50,14 +50,16 @@ struct regex_program; * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. 
* - * @param strings Strings instance for this operation + * @param input Strings instance for this operation * @param prog Regex program instance + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column of boolean results for each string */ std::unique_ptr contains_re( - strings_column_view const& strings, + strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -76,14 +78,16 @@ std::unique_ptr contains_re( * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * - * @param strings Strings instance for this operation + * @param input Strings instance for this operation * @param prog Regex program instance + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column of boolean results for each string */ std::unique_ptr matches_re( - strings_column_view const& strings, + strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -102,14 +106,16 @@ std::unique_ptr matches_re( * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * - * @param strings Strings instance for this operation + * @param input Strings instance for this operation * @param prog Regex program instance + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column of match counts for each string */ std::unique_ptr count_re( - strings_column_view const& strings, + strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -146,8 +152,9 @@ std::unique_ptr count_re( * * @param input Strings instance for this operation * @param pattern Like pattern to match within each string - * @param escape_character Optional character specifies the escape prefix; - * default is no escape character + * @param escape_character Optional character specifies the escape prefix. + * Default is no escape character. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New boolean column */ @@ -155,6 +162,7 @@ std::unique_ptr like( strings_column_view const& input, string_scalar const& pattern, string_scalar const& escape_character = string_scalar(""), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -185,8 +193,9 @@ std::unique_ptr like( * * @param input Strings instance for this operation * @param patterns Like patterns to match within each corresponding string - * @param escape_character Optional character specifies the escape prefix; - * default is no escape character + * @param escape_character Optional character specifies the escape prefix. + * Default is no escape character. 
+ * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New boolean column */ @@ -194,6 +203,7 @@ std::unique_ptr like( strings_column_view const& input, strings_column_view const& patterns, string_scalar const& escape_character = string_scalar(""), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/extract.hpp b/cpp/include/cudf/strings/extract.hpp index 586cb1f3f26..a4db1ac46da 100644 --- a/cpp/include/cudf/strings/extract.hpp +++ b/cpp/include/cudf/strings/extract.hpp @@ -53,14 +53,16 @@ struct regex_program; * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * - * @param strings Strings instance for this operation + * @param input Strings instance for this operation * @param prog Regex program instance + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return Columns of strings extracted from the input column */ std::unique_ptr
<table>
extract( - strings_column_view const& strings, + strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -87,14 +89,16 @@ std::unique_ptr
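<table>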
extract( * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * - * @param strings Strings instance for this operation + * @param input Strings instance for this operation * @param prog Regex program instance + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate any returned device memory * @return Lists column containing strings extracted from the input column */ std::unique_ptr extract_all_record( - strings_column_view const& strings, + strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/src/strings/contains.cu b/cpp/src/strings/contains.cu index 22534870409..4383f358a33 100644 --- a/cpp/src/strings/contains.cu +++ b/cpp/src/strings/contains.cu @@ -123,28 +123,31 @@ std::unique_ptr count_re(strings_column_view const& input, // external APIs -std::unique_ptr contains_re(strings_column_view const& strings, +std::unique_ptr contains_re(strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains_re(strings, prog, cudf::get_default_stream(), mr); + return detail::contains_re(input, prog, stream, mr); } -std::unique_ptr matches_re(strings_column_view const& strings, +std::unique_ptr matches_re(strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::matches_re(strings, prog, cudf::get_default_stream(), mr); + return detail::matches_re(input, prog, stream, mr); } -std::unique_ptr count_re(strings_column_view const& strings, +std::unique_ptr count_re(strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::count_re(strings, prog, cudf::get_default_stream(), mr); + return detail::count_re(input, prog, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/extract/extract.cu b/cpp/src/strings/extract/extract.cu index 532053e750e..8edcd167e5c 100644 --- a/cpp/src/strings/extract/extract.cu +++ b/cpp/src/strings/extract/extract.cu @@ -131,12 +131,13 @@ std::unique_ptr
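<table>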
extract(strings_column_view const& input, // external API -std::unique_ptr
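<table>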
extract(strings_column_view const& strings,
+std::unique_ptr<table>
extract(strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract(strings, prog, cudf::get_default_stream(), mr); + return detail::extract(input, prog, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/extract/extract_all.cu b/cpp/src/strings/extract/extract_all.cu index 8a2f8f0cbfc..0c0d4ae4fbf 100644 --- a/cpp/src/strings/extract/extract_all.cu +++ b/cpp/src/strings/extract/extract_all.cu @@ -164,12 +164,13 @@ std::unique_ptr extract_all_record(strings_column_view const& input, // external API -std::unique_ptr extract_all_record(strings_column_view const& strings, +std::unique_ptr extract_all_record(strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract_all_record(strings, prog, cudf::get_default_stream(), mr); + return detail::extract_all_record(input, prog, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/like.cu b/cpp/src/strings/like.cu index 5b91f295efb..93e00592ef2 100644 --- a/cpp/src/strings/like.cu +++ b/cpp/src/strings/like.cu @@ -185,19 +185,21 @@ std::unique_ptr like(strings_column_view const& input, std::unique_ptr like(strings_column_view const& input, string_scalar const& pattern, string_scalar const& escape_character, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::like(input, pattern, escape_character, cudf::get_default_stream(), mr); + return detail::like(input, pattern, escape_character, stream, mr); } std::unique_ptr like(strings_column_view const& input, strings_column_view const& patterns, string_scalar const& escape_character, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::like(input, patterns, escape_character, cudf::get_default_stream(), mr); + return detail::like(input, patterns, escape_character, stream, mr); } } // namespace strings diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index e7f4914fe05..95411668284 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -638,7 +638,9 @@ ConfigureTest( STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/combine_test.cpp + streams/strings/contains_test.cpp streams/strings/convert_test.cpp + streams/strings/extract_test.cpp streams/strings/find_test.cpp streams/strings/replace_test.cpp streams/strings/split_test.cpp diff --git a/cpp/tests/streams/strings/contains_test.cpp b/cpp/tests/streams/strings/contains_test.cpp new file mode 100644 index 00000000000..383d48abe1e --- /dev/null +++ b/cpp/tests/streams/strings/contains_test.cpp @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include + +#include + +class StringsContainsTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsContainsTest, Contains) +{ + auto input = cudf::test::strings_column_wrapper({"Héllo", "thesé", "tést strings", ""}); + auto view = cudf::strings_column_view(input); + + auto const pattern = std::string("[a-z]"); + auto const prog = cudf::strings::regex_program::create(pattern); + cudf::strings::contains_re(view, *prog, cudf::test::get_default_stream()); + cudf::strings::matches_re(view, *prog, cudf::test::get_default_stream()); + cudf::strings::count_re(view, *prog, cudf::test::get_default_stream()); +} + +TEST_F(StringsContainsTest, Like) +{ + auto input = cudf::test::strings_column_wrapper({"Héllo", "thesés", "tést", ""}); + auto view = cudf::strings_column_view(input); + + auto const pattern = cudf::string_scalar("%és", true, cudf::test::get_default_stream()); + auto const escape = cudf::string_scalar("%", true, cudf::test::get_default_stream()); + cudf::strings::like(view, pattern, escape, cudf::test::get_default_stream()); + + auto const patterns = cudf::test::strings_column_wrapper({"H%", "t%s", "t", ""}); + cudf::strings::like( + view, cudf::strings_column_view(patterns), escape, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/strings/extract_test.cpp b/cpp/tests/streams/strings/extract_test.cpp new file mode 100644 index 00000000000..06570fc5b38 --- /dev/null +++ b/cpp/tests/streams/strings/extract_test.cpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include + +#include + +class StringsExtractTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsExtractTest, Extract) +{ + auto input = cudf::test::strings_column_wrapper({"Joe Schmoe", "John Smith", "Jane Smith"}); + auto view = cudf::strings_column_view(input); + + auto const pattern = std::string("([A-Z][a-z]+)"); + auto const prog = cudf::strings::regex_program::create(pattern); + cudf::strings::extract(view, *prog, cudf::test::get_default_stream()); + cudf::strings::extract_all_record(view, *prog, cudf::test::get_default_stream()); +} From 76bdb82a12440beda0d256bc92d12d59e668e939 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Wed, 25 Oct 2023 16:32:10 -0500 Subject: [PATCH 047/118] Run IO tests for Dask-cuDF (#14327) We are not currently running any IO tests for `dask_cudf` in CI. This PR should correct this. 
It also modifies a test that *would* be failing due to https://github.com/rapidsai/cudf/issues/14326 Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/14327 --- ci/test_wheel_dask_cudf.sh | 3 ++- python/dask_cudf/dask_cudf/io/tests/test_parquet.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index 01019bb7598..8c4ab696249 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -16,4 +16,5 @@ python -m pip install git+https://github.com/dask/dask.git@2023.9.2 git+https:// # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install $(echo ./dist/dask_cudf*.whl)[test] -python -m pytest -n 8 ./python/dask_cudf/dask_cudf/tests +# Run tests in dask_cudf/tests and dask_cudf/io/tests +python -m pytest -n 8 ./python/dask_cudf/dask_cudf/ diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 85ec36cf2c5..234b8fc5212 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -481,7 +481,9 @@ def test_create_metadata_file_inconsistent_schema(tmpdir): # call `compute` on `ddf1`, because the dtype of # the inconsistent column ("a") may be "object" # before computing, and "int" after - dd.assert_eq(ddf1.compute(), ddf2) + # TODO: Uncomment after cudf#14326 is closed + # (See: https://github.com/rapidsai/cudf/issues/14326) + # dd.assert_eq(ddf1.compute(), ddf2) dd.assert_eq(ddf1.compute(), ddf2.compute()) From ce93a004766a14384f132a39c95266182a19c07a Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 25 Oct 2023 15:27:25 -0700 Subject: [PATCH 048/118] Minor improvements in `source_info` (#14308) Apply a common parameter passing pattern (pass by value + move) to avoid an unnecessary `source_info` copy when passing a temporary object. Also changed the default `io_type` in `source_info` to VOID so that the default-constructed objects are safe to use. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Divye Gala (https://github.com/divyegala) - https://github.com/shrshi URL: https://github.com/rapidsai/cudf/pull/14308 --- cpp/include/cudf/io/avro.hpp | 8 ++++---- cpp/include/cudf/io/csv.hpp | 6 +++--- cpp/include/cudf/io/json.hpp | 6 +++--- cpp/include/cudf/io/orc.hpp | 6 +++--- cpp/include/cudf/io/parquet.hpp | 6 +++--- cpp/include/cudf/io/types.hpp | 12 +++++++++--- cpp/src/io/functions.cpp | 20 ++++++++++---------- 7 files changed, 35 insertions(+), 29 deletions(-) diff --git a/cpp/include/cudf/io/avro.hpp b/cpp/include/cudf/io/avro.hpp index 17c168f38d4..89207302850 100644 --- a/cpp/include/cudf/io/avro.hpp +++ b/cpp/include/cudf/io/avro.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -56,7 +56,7 @@ class avro_reader_options { * * @param src source information used to read avro file */ - explicit avro_reader_options(source_info const& src) : _source(src) {} + explicit avro_reader_options(source_info src) : _source{std::move(src)} {} friend avro_reader_options_builder; @@ -123,7 +123,7 @@ class avro_reader_options { * @param src source information used to read avro file * @returns builder to build reader options */ - static avro_reader_options_builder builder(source_info const& src); + static avro_reader_options_builder builder(source_info src); }; /** @@ -145,7 +145,7 @@ class avro_reader_options_builder { * * @param src The source information used to read avro file */ - explicit avro_reader_options_builder(source_info const& src) : options(src) {} + explicit avro_reader_options_builder(source_info src) : options{std::move(src)} {} /** * @brief Set names of the column to be read. diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index b49a13a8ea9..ac885c54356 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -138,7 +138,7 @@ class csv_reader_options { * * @param src source information used to read csv file */ - explicit csv_reader_options(source_info const& src) : _source(src) {} + explicit csv_reader_options(source_info src) : _source{std::move(src)} {} friend csv_reader_options_builder; @@ -156,7 +156,7 @@ class csv_reader_options { * @param src Source information to read csv file * @return Builder to build reader options */ - static csv_reader_options_builder builder(source_info const& src); + static csv_reader_options_builder builder(source_info src); /** * @brief Returns source info. @@ -835,7 +835,7 @@ class csv_reader_options_builder { * * @param src The source information used to read csv file */ - csv_reader_options_builder(source_info const& src) : options(src) {} + csv_reader_options_builder(source_info src) : options{std::move(src)} {} /** * @brief Sets compression format of the source. diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index d408d249a7f..55aa534ac6c 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -121,7 +121,7 @@ class json_reader_options { * * @param src source information used to read parquet file */ - explicit json_reader_options(source_info const& src) : _source(src) {} + explicit json_reader_options(source_info src) : _source{std::move(src)} {} friend json_reader_options_builder; @@ -139,7 +139,7 @@ class json_reader_options { * @param src source information used to read json file * @returns builder to build the options */ - static json_reader_options_builder builder(source_info const& src); + static json_reader_options_builder builder(source_info src); /** * @brief Returns source info. @@ -351,7 +351,7 @@ class json_reader_options_builder { * * @param src The source information used to read avro file */ - explicit json_reader_options_builder(source_info const& src) : options(src) {} + explicit json_reader_options_builder(source_info src) : options{std::move(src)} {} /** * @brief Set data types for columns to be read. 
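The same sink-parameter idiom is applied to every options and builder class in this patch. A self-contained sketch of the pattern (illustrative only; `options` stands in for the real reader-options types):

#include <string>
#include <utility>

struct options {
  std::string source;
  // Sink parameter: take by value, then move into the member. An lvalue
  // argument costs exactly one copy (into `src`); a temporary argument
  // is moved through and never copied.
  explicit options(std::string src) : source{std::move(src)} {}
};

int main()
{
  std::string path = "input.orc";
  options from_lvalue{path};              // one copy of `path`
  options from_temporary{std::string{}};  // no copy at all
  return 0;
}

By contrast, the old `source_info const&` signatures copied the argument inside the constructor even when the caller passed a temporary.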
diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 024f4f23b94..5801d2c1008 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -80,7 +80,7 @@ class orc_reader_options { * * @param src source information used to read orc file */ - explicit orc_reader_options(source_info const& src) : _source(src) {} + explicit orc_reader_options(source_info src) : _source{std::move(src)} {} public: /** @@ -96,7 +96,7 @@ class orc_reader_options { * @param src Source information to read orc file * @return Builder to build reader options */ - static orc_reader_options_builder builder(source_info const& src); + static orc_reader_options_builder builder(source_info src); /** * @brief Returns source info. @@ -269,7 +269,7 @@ class orc_reader_options_builder { * * @param src The source information used to read orc file */ - explicit orc_reader_options_builder(source_info const& src) : options{src} {}; + explicit orc_reader_options_builder(source_info src) : options{std::move(src)} {}; /** * @brief Sets names of the column to read. diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 6283099e700..354bf839632 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -80,7 +80,7 @@ class parquet_reader_options { * * @param src source information used to read parquet file */ - explicit parquet_reader_options(source_info const& src) : _source(src) {} + explicit parquet_reader_options(source_info src) : _source{std::move(src)} {} friend parquet_reader_options_builder; @@ -98,7 +98,7 @@ class parquet_reader_options { * @param src Source information to read parquet file * @return Builder to build reader options */ - static parquet_reader_options_builder builder(source_info const& src); + static parquet_reader_options_builder builder(source_info src); /** * @brief Returns source info. @@ -265,7 +265,7 @@ class parquet_reader_options_builder { * * @param src The source information used to read parquet file */ - explicit parquet_reader_options_builder(source_info const& src) : options(src) {} + explicit parquet_reader_options_builder(source_info src) : options{std::move(src)} {} /** * @brief Sets names of the columns to be read. 
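A call-site sketch of what the change buys (hypothetical usage, not part of the patch; the file name is illustrative). The braced `source_info` is a temporary, so the by-value builder parameter moves it instead of copying:

#include <cudf/io/orc.hpp>

void read_example()
{
  // The temporary source_info is moved, not deep-copied, into the options.
  auto options =
    cudf::io::orc_reader_options::builder(cudf::io::source_info{"input.orc"}).build();
  auto result = cudf::io::read_orc(options);  // table_with_metadata
  (void)result;
}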
diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index a97f81182ac..abf400da102 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -293,14 +293,20 @@ struct source_info { * * @param file_paths Input files paths */ - explicit source_info(std::vector const& file_paths) : _filepaths(file_paths) {} + explicit source_info(std::vector const& file_paths) + : _type(io_type::FILEPATH), _filepaths(file_paths) + { + } /** * @brief Construct a new source info object for a single file * * @param file_path Single input file */ - explicit source_info(std::string const& file_path) : _filepaths({file_path}) {} + explicit source_info(std::string const& file_path) + : _type(io_type::FILEPATH), _filepaths({file_path}) + { + } /** * @brief Construct a new source info object for multiple buffers in host memory @@ -444,7 +450,7 @@ struct source_info { [[nodiscard]] auto const& user_sources() const { return _user_sources; } private: - io_type _type = io_type::FILEPATH; + io_type _type = io_type::VOID; std::vector _filepaths; std::vector> _host_buffers; std::vector> _device_buffers; diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 726442d752e..29ebb1ddbde 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -41,9 +41,9 @@ namespace cudf { namespace io { // Returns builder for csv_reader_options -csv_reader_options_builder csv_reader_options::builder(source_info const& src) +csv_reader_options_builder csv_reader_options::builder(source_info src) { - return csv_reader_options_builder{src}; + return csv_reader_options_builder{std::move(src)}; } // Returns builder for csv_writer_options @@ -54,9 +54,9 @@ csv_writer_options_builder csv_writer_options::builder(sink_info const& sink, } // Returns builder for orc_reader_options -orc_reader_options_builder orc_reader_options::builder(source_info const& src) +orc_reader_options_builder orc_reader_options::builder(source_info src) { - return orc_reader_options_builder{src}; + return orc_reader_options_builder{std::move(src)}; } // Returns builder for orc_writer_options @@ -73,15 +73,15 @@ chunked_orc_writer_options_builder chunked_orc_writer_options::builder(sink_info } // Returns builder for avro_reader_options -avro_reader_options_builder avro_reader_options::builder(source_info const& src) +avro_reader_options_builder avro_reader_options::builder(source_info src) { - return avro_reader_options_builder(src); + return avro_reader_options_builder(std::move(src)); } // Returns builder for json_reader_options -json_reader_options_builder json_reader_options::builder(source_info const& src) +json_reader_options_builder json_reader_options::builder(source_info src) { - return json_reader_options_builder(src); + return json_reader_options_builder(std::move(src)); } // Returns builder for orc_writer_options @@ -92,9 +92,9 @@ json_writer_options_builder json_writer_options::builder(sink_info const& sink, } // Returns builder for parquet_reader_options -parquet_reader_options_builder parquet_reader_options::builder(source_info const& src) +parquet_reader_options_builder parquet_reader_options::builder(source_info src) { - return parquet_reader_options_builder{src}; + return parquet_reader_options_builder{std::move(src)}; } // Returns builder for parquet_writer_options From 203f1dff4295e088db5b6e53c5dbc78924e5458a Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 26 Oct 2023 05:29:12 -0700 Subject: [PATCH 049/118] Temporarily avoid the current build of pydata-sphinx-theme 
(#14332) There appears to be a bug in the latest release of pydata-sphinx theme where a warning that was intended to be thrown conditionally is appearing unconditionally in our builds, triggering a doc build failure because we build with `-W`. See https://github.com/pydata/pydata-sphinx-theme/issues/1539 for more information. We should be OK to simply avoid the current version for now. If the next release is a minor release then the warning will be removed and we can automatically upgrade. If they have another patch release then we can reevaluate the pinning, but the current pinning seems like the most likely to require no additional work going forward. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/14332 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-120_arch-x86_64.yaml | 2 +- dependencies.yaml | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 8b6b32bc026..d847690a48b 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -69,7 +69,7 @@ dependencies: - protobuf>=4.21,<5 - ptxcompiler - pyarrow==12.0.1.* -- pydata-sphinx-theme +- pydata-sphinx-theme!=0.14.2 - pytest - pytest-benchmark - pytest-cases diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index ae15a6e97ab..163e82b1325 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -67,7 +67,7 @@ dependencies: - pre-commit - protobuf>=4.21,<5 - pyarrow==12.0.1.* -- pydata-sphinx-theme +- pydata-sphinx-theme!=0.14.2 - pytest - pytest-benchmark - pytest-cases diff --git a/dependencies.yaml b/dependencies.yaml index a7716a15360..da3ba0e5108 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -379,7 +379,8 @@ dependencies: - nbsphinx - numpydoc - pandoc - - pydata-sphinx-theme + # https://github.com/pydata/pydata-sphinx-theme/issues/1539 + - pydata-sphinx-theme!=0.14.2 - scipy - sphinx - sphinx-autobuild From 2e85a3f090ab3293d151fd130cd24c89983917f3 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 26 Oct 2023 08:16:17 -0500 Subject: [PATCH 050/118] Unpin dask and distributed for 23.12 development (#14320) This PR relaxes `dask` and `distributed` versions pinnings for `23.12` development. 
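The practical effect of moving from `==` to `>=` pins can be illustrated with the third-party `packaging` library (a sketch, not part of this change):

from packaging.specifiers import SpecifierSet

pinned = SpecifierSet("==2023.9.2")
relaxed = SpecifierSet(">=2023.9.2")

# The exact pin rejects any newer dask/distributed release;
# the relaxed pin accepts them as they are published.
print("2023.10.0" in pinned)   # False
print("2023.10.0" in relaxed)  # True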
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - https://github.com/jakirkham - Peter Andreas Entschev (https://github.com/pentschev) - Ray Douglass (https://github.com/raydouglass) --- ci/test_wheel_dask_cudf.sh | 2 +- .../all_cuda-118_arch-x86_64.yaml | 6 +- .../all_cuda-120_arch-x86_64.yaml | 6 +- conda/recipes/custreamz/meta.yaml | 6 +- conda/recipes/dask-cudf/meta.yaml | 12 +-- conda/recipes/dask-cudf/run_test.sh | 2 +- dependencies.yaml | 6 +- python/dask_cudf/dask_cudf/backends.py | 80 ++++++++----------- python/dask_cudf/pyproject.toml | 4 +- 9 files changed, 57 insertions(+), 67 deletions(-) diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index 8c4ab696249..f89aa43c20a 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -11,7 +11,7 @@ RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from python -m pip install --no-deps ./local-cudf-dep/cudf*.whl # Always install latest dask for testing -python -m pip install git+https://github.com/dask/dask.git@2023.9.2 git+https://github.com/dask/distributed.git@2023.9.2 git+https://github.com/rapidsai/dask-cuda.git@branch-23.12 +python -m pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.12 # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install $(echo ./dist/dask_cudf*.whl)[test] diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index d847690a48b..623f79b7c34 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -24,10 +24,10 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-core==2023.9.2 +- dask-core>=2023.9.2 - dask-cuda==23.12.* -- dask==2023.9.2 -- distributed==2023.9.2 +- dask>=2023.9.2 +- distributed>=2023.9.2 - dlpack>=0.5,<0.6.0a0 - doxygen=1.9.1 - fastavro>=0.22.9 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 163e82b1325..770095ad088 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -26,10 +26,10 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-core==2023.9.2 +- dask-core>=2023.9.2 - dask-cuda==23.12.* -- dask==2023.9.2 -- distributed==2023.9.2 +- dask>=2023.9.2 +- distributed>=2023.9.2 - dlpack>=0.5,<0.6.0a0 - doxygen=1.9.1 - fastavro>=0.22.9 diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index 233d51baf31..c5d14f1c884 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -45,9 +45,9 @@ requirements: - streamz - cudf ={{ version }} - cudf_kafka ={{ version }} - - dask ==2023.9.2 - - dask-core ==2023.9.2 - - distributed ==2023.9.2 + - dask >=2023.9.2 + - dask-core >=2023.9.2 + - distributed >=2023.9.2 - python-confluent-kafka >=1.9.0,<1.10.0a0 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index 4c8af071074..444a9850c74 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -38,16 +38,16 @@ requirements: host: - python - cudf ={{ version }} - - dask ==2023.9.2 - - dask-core ==2023.9.2 - - distributed 
==2023.9.2 + - dask >=2023.9.2 + - dask-core >=2023.9.2 + - distributed >=2023.9.2 - cuda-version ={{ cuda_version }} run: - python - cudf ={{ version }} - - dask ==2023.9.2 - - dask-core ==2023.9.2 - - distributed ==2023.9.2 + - dask >=2023.9.2 + - dask-core >=2023.9.2 + - distributed >=2023.9.2 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} test: diff --git a/conda/recipes/dask-cudf/run_test.sh b/conda/recipes/dask-cudf/run_test.sh index c79c014a89a..e7238d00f2b 100644 --- a/conda/recipes/dask-cudf/run_test.sh +++ b/conda/recipes/dask-cudf/run_test.sh @@ -18,7 +18,7 @@ if [ "${ARCH}" = "aarch64" ]; then fi # Dask & Distributed option to install main(nightly) or `conda-forge` packages. -export INSTALL_DASK_MAIN=0 +export INSTALL_DASK_MAIN=1 # Dask version to install when `INSTALL_DASK_MAIN=0` export DASK_STABLE_VERSION="2023.9.2" diff --git a/dependencies.yaml b/dependencies.yaml index da3ba0e5108..59755c31e92 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -478,12 +478,12 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - dask==2023.9.2 - - distributed==2023.9.2 + - dask>=2023.9.2 + - distributed>=2023.9.2 - output_types: conda packages: - cupy>=12.0.0 - - dask-core==2023.9.2 # dask-core in conda is the actual package & dask is the meta package + - dask-core>=2023.9.2 # dask-core in conda is the actual package & dask is the meta package - output_types: pyproject packages: - &cudf cudf==23.12.* diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 7b35c71ff09..65d9c438fba 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -12,6 +12,7 @@ import dask.dataframe as dd from dask import config +from dask.array.dispatch import percentile_lookup from dask.dataframe.backends import ( DataFrameBackendEntrypoint, PandasBackendEntrypoint, @@ -320,56 +321,45 @@ def get_grouper_cudf(obj): return cudf.core.groupby.Grouper -try: - try: - from dask.array.dispatch import percentile_lookup - except ImportError: - from dask.dataframe.dispatch import ( - percentile_dispatch as percentile_lookup, - ) - - @percentile_lookup.register((cudf.Series, cp.ndarray, cudf.BaseIndex)) - @_dask_cudf_nvtx_annotate - def percentile_cudf(a, q, interpolation="linear"): - # Cudf dispatch to the equivalent of `np.percentile`: - # https://numpy.org/doc/stable/reference/generated/numpy.percentile.html - a = cudf.Series(a) - # a is series. - n = len(a) - if not len(a): - return None, n - if isinstance(q, Iterator): - q = list(q) - - if cudf.api.types.is_categorical_dtype(a.dtype): - result = cp.percentile(a.cat.codes, q, interpolation=interpolation) - - return ( - pd.Categorical.from_codes( - result, a.dtype.categories, a.dtype.ordered - ), - n, - ) - if np.issubdtype(a.dtype, np.datetime64): - result = a.quantile( - [i / 100.0 for i in q], interpolation=interpolation - ) +@percentile_lookup.register((cudf.Series, cp.ndarray, cudf.BaseIndex)) +@_dask_cudf_nvtx_annotate +def percentile_cudf(a, q, interpolation="linear"): + # Cudf dispatch to the equivalent of `np.percentile`: + # https://numpy.org/doc/stable/reference/generated/numpy.percentile.html + a = cudf.Series(a) + # a is series. 
+ n = len(a) + if not len(a): + return None, n + if isinstance(q, Iterator): + q = list(q) + + if cudf.api.types.is_categorical_dtype(a.dtype): + result = cp.percentile(a.cat.codes, q, interpolation=interpolation) - if q[0] == 0: - # https://github.com/dask/dask/issues/6864 - result[0] = min(result[0], a.min()) - return result.to_pandas(), n - if not np.issubdtype(a.dtype, np.number): - interpolation = "nearest" return ( - a.quantile( - [i / 100.0 for i in q], interpolation=interpolation - ).to_pandas(), + pd.Categorical.from_codes( + result, a.dtype.categories, a.dtype.ordered + ), n, ) + if np.issubdtype(a.dtype, np.datetime64): + result = a.quantile( + [i / 100.0 for i in q], interpolation=interpolation + ) - if q[0] == 0: - # https://github.com/dask/dask/issues/6864 - result[0] = min(result[0], a.min()) - return result.to_pandas(), n - if not np.issubdtype(a.dtype, np.number): - interpolation = "nearest" + return ( + a.quantile( + [i / 100.0 for i in q], interpolation=interpolation + ).to_pandas(), + n, + ) @pyarrow_schema_dispatch.register((cudf.DataFrame,)) diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 0a6e776e0f9..8461c51c573 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -20,8 +20,8 @@ requires-python = ">=3.9" dependencies = [ "cudf==23.12.*", "cupy-cuda11x>=12.0.0", - "dask==2023.9.2", - "distributed==2023.9.2", + "dask>=2023.9.2", + "distributed>=2023.9.2", "fsspec>=0.6.0", "numpy>=1.21,<1.25", "pandas>=1.3,<1.6.0dev0", From a2abdb1a9e4d6737bfcab85874589057afdbae6e Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 26 Oct 2023 13:47:42 -0400 Subject: [PATCH 051/118] Fix gtest validity setting for TextTokenizeTest.Vocabulary (#14312) Fixes the `TextTokenizeTest.Vocabulary` gtest, which incorrectly marked a non-empty row as null in both the input and the expected output. This was found while working on optimizing the `nvtext::tokenize_with_vocabulary` API. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/14312 --- cpp/tests/text/tokenize_tests.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/text/tokenize_tests.cpp b/cpp/tests/text/tokenize_tests.cpp index d78f2dfbdf3..fbc706ea290 100644 --- a/cpp/tests/text/tokenize_tests.cpp +++ b/cpp/tests/text/tokenize_tests.cpp @@ -208,7 +208,7 @@ TEST_F(TextTokenizeTest, Vocabulary) {"ate", "chased", "cheese", "dog", "fox", "jumped", "mouse", "mousé", "over", "the"}); auto vocab = nvtext::load_vocabulary(cudf::strings_column_view(vocabulary)); - auto validity = cudf::test::iterators::null_at(1); + auto validity = cudf::test::iterators::null_at(4); cudf::test::strings_column_wrapper input({"the fox jumped over the dog", "the dog chased the cat", "the cat chased the mouse", From d8f079030b9d91f143d1c93958fa344be18eaf73 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 26 Oct 2023 17:28:47 -0400 Subject: [PATCH 052/118] Move and rename byte-pair-encoding source files (#14284) Moves and renames the byte-pair-encoding source files. The source files are moved from `text/subword` to `text/bpe`, and `tokenize` has been removed from the filenames since these functions only do encoding. No function names have been changed. The `nvtext::load_merge_pairs_file` API has been deprecated. Callers must load the pairs into a strings column (using the CSV or text readers in cuio) and call the new `nvtext::load_merge_pairs` API instead. Follow-on PRs will address functionality and performance issues.
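As a rough sketch of the resulting workflow (illustration only, not part of this commit; the function `encode_bpe` and the pair values are hypothetical, and only the API shapes shown in the header diff below are assumed):

    #include <cudf/strings/strings_column_view.hpp>
    #include <nvtext/byte_pair_encoding.hpp>

    #include <memory>

    // Encode a strings column against a table of ranked merge pairs.
    std::unique_ptr<cudf::column> encode_bpe(cudf::strings_column_view const& input,
                                             cudf::strings_column_view const& pairs)
    {
      // Each row of `pairs` holds one ranked pair, e.g. {"e n", "i t", "en t", ...},
      // loaded once (e.g. from merges.txt via the cuio text/CSV readers) and reused.
      auto merge_pairs = nvtext::load_merge_pairs(pairs);
      // With the doxygen example pairs, "test sentence" encodes to "test sent ence".
      return nvtext::byte_pair_encoding(input, *merge_pairs);
    }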
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/14284 --- cpp/CMakeLists.txt | 4 +- ...pe_tokenize.hpp => byte_pair_encoding.hpp} | 41 +++++++++++++++++-- .../byte_pair_encoding.cu} | 4 +- .../byte_pair_encoding.cuh} | 2 +- .../load_merge_pairs.cu} | 4 +- cpp/tests/text/bpe_tests.cpp | 12 +++--- 6 files changed, 50 insertions(+), 17 deletions(-) rename cpp/include/nvtext/{bpe_tokenize.hpp => byte_pair_encoding.hpp} (73%) rename cpp/src/text/{subword/bpe_tokenizer.cu => bpe/byte_pair_encoding.cu} (99%) rename cpp/src/text/{subword/bpe_tokenizer.cuh => bpe/byte_pair_encoding.cuh} (99%) rename cpp/src/text/{subword/load_merges_file.cu => bpe/load_merge_pairs.cu} (98%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 472ee9d9fd4..f7662006cac 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -614,10 +614,10 @@ add_library( src/text/normalize.cu src/text/replace.cu src/text/stemmer.cu - src/text/subword/bpe_tokenizer.cu + src/text/bpe/byte_pair_encoding.cu + src/text/bpe/load_merge_pairs.cu src/text/subword/data_normalizer.cu src/text/subword/load_hash_file.cu - src/text/subword/load_merges_file.cu src/text/subword/subword_tokenize.cu src/text/subword/wordpiece_tokenizer.cu src/text/tokenize.cu diff --git a/cpp/include/nvtext/bpe_tokenize.hpp b/cpp/include/nvtext/byte_pair_encoding.hpp similarity index 73% rename from cpp/include/nvtext/bpe_tokenize.hpp rename to cpp/include/nvtext/byte_pair_encoding.hpp index c67f4bd8b1c..1f4851d7057 100644 --- a/cpp/include/nvtext/bpe_tokenize.hpp +++ b/cpp/include/nvtext/byte_pair_encoding.hpp @@ -32,7 +32,7 @@ namespace nvtext { /** * @brief The table of merge pairs for the BPE encoder. * - * To create an instance, call @ref nvtext::load_merge_pairs_file + * To create an instance, call @ref nvtext::load_merge_pairs */ struct bpe_merge_pairs { struct bpe_merge_pairs_impl; @@ -66,6 +66,8 @@ struct bpe_merge_pairs { /** * @brief Create a nvtext::bpe_merge_pairs from an input file. * + * @deprecated Since 23.12 + * * The file should contain a pair of strings per line separated by * a single space. * @@ -94,10 +96,40 @@ struct bpe_merge_pairs { * @param mr Memory resource to allocate any returned objects. * @return A nvtext::bpe_merge_pairs object */ -std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file( +[[deprecated]] std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file( std::string const& filename_merges, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Create a nvtext::bpe_merge_pairs from a strings column + * + * The input column should contain a unique pair of strings per line separated by + * a single space. An incorrect format or non-unique entries will result in + * undefined behavior. 
+ * + * Example: + * @code{.pseudo} + * merge_pairs = ["e n", "i t", "i s", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"] + * mps = load_merge_pairs(merge_pairs) + * // the mps object can be passed to the byte_pair_encoding API + * @endcode + * + * The pairs are expected to be ordered in the file by their rank + * relative to each other. A pair earlier in the file has priority over + * any pairs below it. + * + * @throw cudf::logic_error if `merge_pairs` is empty or contains nulls + * + * @param merge_pairs Column containing the unique merge pairs + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Memory resource to allocate any returned objects + * @return A nvtext::bpe_merge_pairs object + */ +std::unique_ptr<bpe_merge_pairs> load_merge_pairs( + cudf::strings_column_view const& merge_pairs, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Byte pair encode the input strings. * @@ -110,7 +142,8 @@ std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file( * pairs before the result is joined to make the output string. * * @code{.pseudo} - * mps = load_merges_file("merges.txt") // see doxygen for example contents + * merge_pairs = ["e n", "i t", "i s", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"] + * mps = load_merge_pairs(merge_pairs) * input = ["test sentence", "thisis test"] * result = byte_pair_encoding(input, mps) * result is now ["test sent ence", "this is test"] @@ -120,7 +153,7 @@ std::unique_ptr load_merge_pairs_file( * @throw cudf::logic_error if `separator` is invalid * * @param input Strings to encode. - * @param merges_pairs Created by a call to @ref nvtext::load_merge_pairs_file. + * @param merges_pairs Created by a call to @ref nvtext::load_merge_pairs. * @param separator String used to build the output after encoding. * Default is a space. * @param mr Memory resource to allocate any returned objects. diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/bpe/byte_pair_encoding.cu similarity index 99% rename from cpp/src/text/subword/bpe_tokenizer.cu rename to cpp/src/text/bpe/byte_pair_encoding.cu index 13c744ac6bd..42cd9bcbcbe 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ b/cpp/src/text/bpe/byte_pair_encoding.cu @@ -14,9 +14,9 @@ * limitations under the License. */ -#include +#include -#include +#include #include #include diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/bpe/byte_pair_encoding.cuh similarity index 99% rename from cpp/src/text/subword/bpe_tokenizer.cuh rename to cpp/src/text/bpe/byte_pair_encoding.cuh index 2fa879ea734..cefd32e8f60 100644 --- a/cpp/src/text/subword/bpe_tokenizer.cuh +++ b/cpp/src/text/bpe/byte_pair_encoding.cuh @@ -16,7 +16,7 @@ #pragma once -#include <nvtext/bpe_tokenize.hpp> +#include <nvtext/byte_pair_encoding.hpp> #include #include diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/bpe/load_merge_pairs.cu similarity index 98% rename from cpp/src/text/subword/load_merges_file.cu rename to cpp/src/text/bpe/load_merge_pairs.cu index db6ad2e2dd2..77f0ebba43f 100644 --- a/cpp/src/text/subword/load_merges_file.cu +++ b/cpp/src/text/bpe/load_merge_pairs.cu @@ -14,9 +14,9 @@ * limitations under the License. 
*/ -#include +#include #include #include @@ -24,9 +24,9 @@ #include #include -struct TextBPETokenize : public cudf::test::BaseFixture {}; +struct TextBytePairEncoding : public cudf::test::BaseFixture {}; -TEST_F(TextBPETokenize, BytePairEncoding) +TEST_F(TextBytePairEncoding, BytePairEncoding) { // partial table based on values from https://huggingface.co/gpt2/raw/main/merges.txt auto mpt = cudf::test::strings_column_wrapper({ @@ -74,7 +74,7 @@ TEST_F(TextBPETokenize, BytePairEncoding) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), sliced_expected); } -TEST_F(TextBPETokenize, BytePairEncodingSeparator) +TEST_F(TextBytePairEncoding, BytePairEncodingSeparator) { auto mpt = cudf::test::strings_column_wrapper( {"e n", "i t", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"}); @@ -91,7 +91,7 @@ TEST_F(TextBPETokenize, BytePairEncodingSeparator) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } -TEST_F(TextBPETokenize, BPE_Empty) +TEST_F(TextBytePairEncoding, BPE_Empty) { auto mpt = cudf::test::strings_column_wrapper({"i s", "i t"}); nvtext::bpe_merge_pairs merge_pairs{mpt.release()}; @@ -100,7 +100,7 @@ TEST_F(TextBPETokenize, BPE_Empty) EXPECT_EQ(0, results->size()); } -TEST_F(TextBPETokenize, BPE_Error) +TEST_F(TextBytePairEncoding, BPE_Error) { auto empty = cudf::make_empty_column(cudf::type_id::STRING); nvtext::bpe_merge_pairs merge_pairs{std::move(empty)}; From f6099ca49592eedafd64022815757193fc8ac402 Mon Sep 17 00:00:00 2001 From: Suraj Aralihalli Date: Thu, 26 Oct 2023 22:51:10 -0700 Subject: [PATCH 053/118] Add stream parameter to Set Operations (Public List APIs) (#14305) This PR marks the conclusion of the List API series. The PR introduces the stream parameter to the Set operations (Public List Comparison and Intersection APIs). Comparison and Intersection (`set_operations.hpp`) ``` have_overlap intersect_distinct union_distinct difference_distinct ``` Reference [13744](https://github.com/rapidsai/cudf/issues/13744) Authors: - Suraj Aralihalli (https://github.com/SurajAralihalli) - Nghia Truong (https://github.com/ttnghia) Approvers: - Nghia Truong (https://github.com/ttnghia) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/14305 --- cpp/benchmarks/lists/set_operations.cpp | 1 + cpp/include/cudf/lists/set_operations.hpp | 10 ++++- cpp/src/lists/set_operations.cu | 14 ++++--- cpp/tests/streams/lists_test.cpp | 45 +++++++++++++++++++++++ 4 files changed, 63 insertions(+), 7 deletions(-) diff --git a/cpp/benchmarks/lists/set_operations.cpp b/cpp/benchmarks/lists/set_operations.cpp index 5b240923358..6bed33d2570 100644 --- a/cpp/benchmarks/lists/set_operations.cpp +++ b/cpp/benchmarks/lists/set_operations.cpp @@ -54,6 +54,7 @@ void nvbench_set_op(nvbench::state& state, BenchFuncPtr bfunc) cudf::lists_column_view{*rhs}, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL, + cudf::get_default_stream(), rmm::mr::get_current_device_resource()); }); } diff --git a/cpp/include/cudf/lists/set_operations.hpp b/cpp/include/cudf/lists/set_operations.hpp index 9d58d0f5b98..6fb8989f0bb 100644 --- a/cpp/include/cudf/lists/set_operations.hpp +++ b/cpp/include/cudf/lists/set_operations.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -53,6 +53,7 @@ namespace cudf::lists { * to be `UNEQUAL` which means only non-null elements are checked for overlapping * @param nans_equal Flag to specify whether floating-point NaNs should be considered as equal * @param mr Device memory resource used to allocate the returned object + * @param stream CUDA stream used for device memory operations and kernel launches * @return A column of type BOOL containing the check results */ std::unique_ptr<column> have_overlap( lists_column_view const& lhs, lists_column_view const& rhs, null_equality nulls_equal = null_equality::EQUAL, nan_equality nans_equal = nan_equality::ALL_EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -87,6 +89,7 @@ std::unique_ptr<column> have_overlap( * @param rhs The input lists column for the other side * @param nulls_equal Flag to specify whether null elements should be considered as equal * @param nans_equal Flag to specify whether floating-point NaNs should be considered as equal + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned object * @return A lists column containing the intersection results */ std::unique_ptr<column> intersect_distinct( lists_column_view const& lhs, lists_column_view const& rhs, null_equality nulls_equal = null_equality::EQUAL, nan_equality nans_equal = nan_equality::ALL_EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -122,6 +126,7 @@ std::unique_ptr<column> intersect_distinct( * @param rhs The input lists column for the other side * @param nulls_equal Flag to specify whether null elements should be considered as equal * @param nans_equal Flag to specify whether floating-point NaNs should be considered as equal + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned object * @return A lists column containing the union results */ std::unique_ptr<column> union_distinct( lists_column_view const& lhs, lists_column_view const& rhs, null_equality nulls_equal = null_equality::EQUAL, nan_equality nans_equal = nan_equality::ALL_EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -157,6 +163,7 @@ std::unique_ptr<column> union_distinct( * @param rhs The input lists column of elements to exclude * @param nulls_equal Flag to specify whether null elements should be considered as equal * @param nans_equal Flag to specify whether floating-point NaNs should be considered as equal + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned object * @return A lists column containing the difference results */ std::unique_ptr<column> difference_distinct( lists_column_view const& lhs, lists_column_view const& rhs, null_equality nulls_equal = null_equality::EQUAL, nan_equality nans_equal = nan_equality::ALL_EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/src/lists/set_operations.cu b/cpp/src/lists/set_operations.cu index 5687a491363..5647b503cf7 100644 --- a/cpp/src/lists/set_operations.cu +++ b/cpp/src/lists/set_operations.cu @@ -278,42 +278,44 @@ 
std::unique_ptr have_overlap(lists_column_view const& lhs, lists_column_view const& rhs, null_equality nulls_equal, nan_equality nans_equal, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::have_overlap(lhs, rhs, nulls_equal, nans_equal, cudf::get_default_stream(), mr); + return detail::have_overlap(lhs, rhs, nulls_equal, nans_equal, stream, mr); } std::unique_ptr intersect_distinct(lists_column_view const& lhs, lists_column_view const& rhs, null_equality nulls_equal, nan_equality nans_equal, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::intersect_distinct( - lhs, rhs, nulls_equal, nans_equal, cudf::get_default_stream(), mr); + return detail::intersect_distinct(lhs, rhs, nulls_equal, nans_equal, stream, mr); } std::unique_ptr union_distinct(lists_column_view const& lhs, lists_column_view const& rhs, null_equality nulls_equal, nan_equality nans_equal, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::union_distinct(lhs, rhs, nulls_equal, nans_equal, cudf::get_default_stream(), mr); + return detail::union_distinct(lhs, rhs, nulls_equal, nans_equal, stream, mr); } std::unique_ptr difference_distinct(lists_column_view const& lhs, lists_column_view const& rhs, null_equality nulls_equal, nan_equality nans_equal, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::difference_distinct( - lhs, rhs, nulls_equal, nans_equal, cudf::get_default_stream(), mr); + return detail::difference_distinct(lhs, rhs, nulls_equal, nans_equal, stream, mr); } } // namespace cudf::lists diff --git a/cpp/tests/streams/lists_test.cpp b/cpp/tests/streams/lists_test.cpp index 82a4cb8aa4a..74e0e8837f7 100644 --- a/cpp/tests/streams/lists_test.cpp +++ b/cpp/tests/streams/lists_test.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -166,3 +167,47 @@ TEST_F(ListTest, Distinct) cudf::nan_equality::ALL_EQUAL, cudf::test::get_default_stream()); } + +TEST_F(ListTest, DifferenceDistinct) +{ + cudf::test::lists_column_wrapper list_col_a{{0, 1}, {2, 3, 7, 8}, {4, 5}}; + cudf::test::lists_column_wrapper list_col_b{{0, 1}, {1, 3, 6, 8}, {5}}; + cudf::lists::difference_distinct(list_col_a, + list_col_b, + cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL, + cudf::test::get_default_stream()); +} + +TEST_F(ListTest, IntersectDistinct) +{ + cudf::test::lists_column_wrapper list_col_a{{0, 1}, {2, 3, 7, 8}, {4, 5}}; + cudf::test::lists_column_wrapper list_col_b{{0, 1}, {1, 3, 6, 8}, {5}}; + cudf::lists::intersect_distinct(list_col_a, + list_col_b, + cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL, + cudf::test::get_default_stream()); +} + +TEST_F(ListTest, UnionDistinct) +{ + cudf::test::lists_column_wrapper list_col_a{{0, 1}, {2, 3, 7, 8}, {4, 5}}; + cudf::test::lists_column_wrapper list_col_b{{0, 1}, {1, 3, 6, 8}, {5}}; + cudf::lists::union_distinct(list_col_a, + list_col_b, + cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL, + cudf::test::get_default_stream()); +} + +TEST_F(ListTest, HaveOverlap) +{ + cudf::test::lists_column_wrapper list_col_a{{0, 1}, {2, 3, 7, 8}, {4, 5}}; + cudf::test::lists_column_wrapper list_col_b{{0, 1}, {1, 3, 6, 8}, {5}}; + cudf::lists::have_overlap(list_col_a, + list_col_b, + cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL, + cudf::test::get_default_stream()); +} From 7d6c377ccba923ee21922d0cc9d13682cfac3e72 
Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 27 Oct 2023 05:19:35 -0500 Subject: [PATCH 054/118] Upgrade `arrow` to `13` (#14330) This PR upgrades `arrow` to `13`. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) - https://github.com/jakirkham Approvers: - Bradley Dice (https://github.com/bdice) - https://github.com/jakirkham - Ray Douglass (https://github.com/raydouglass) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14330 --- .../all_cuda-118_arch-x86_64.yaml | 4 ++-- .../all_cuda-120_arch-x86_64.yaml | 4 ++-- conda/recipes/cudf/meta.yaml | 6 ++--- conda/recipes/libcudf/conda_build_config.yaml | 2 +- conda/recipes/libcudf/meta.yaml | 3 ++- cpp/cmake/thirdparty/get_arrow.cmake | 22 ++++++++++++++----- dependencies.yaml | 21 +++++++++++++----- python/cudf/cudf/tests/test_decimal.py | 11 +++++++++- 8 files changed, 52 insertions(+), 21 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 623f79b7c34..cfcbde71b01 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -39,7 +39,7 @@ dependencies: - hypothesis - identify>=2.5.20 - ipython -- libarrow==12.0.1.* +- libarrow==13.0.0.* - libcufile-dev=1.4.0.31 - libcufile=1.4.0.31 - libcurand-dev=10.3.0.86 @@ -68,7 +68,7 @@ dependencies: - pre-commit - protobuf>=4.21,<5 - ptxcompiler -- pyarrow==12.0.1.* +- pyarrow==13.0.0.* - pydata-sphinx-theme!=0.14.2 - pytest - pytest-benchmark diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 770095ad088..db19d658b0d 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -41,7 +41,7 @@ dependencies: - hypothesis - identify>=2.5.20 - ipython -- libarrow==12.0.1.* +- libarrow==13.0.0.* - libcufile-dev - libcurand-dev - libkvikio==23.12.* @@ -66,7 +66,7 @@ dependencies: - pip - pre-commit - protobuf>=4.21,<5 -- pyarrow==12.0.1.* +- pyarrow==13.0.0.* - pydata-sphinx-theme!=0.14.2 - pytest - pytest-benchmark diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 7405ae2dfb5..619df00087c 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -55,13 +55,13 @@ requirements: - cuda-version ={{ cuda_version }} - sysroot_{{ target_platform }} {{ sysroot_version }} host: - - protobuf ==4.21.* + - protobuf ==4.23.* - python - cython >=3.0.0 - scikit-build >=0.13.1 - setuptools - dlpack >=0.5,<0.6.0a0 - - pyarrow ==12.0.1.* + - pyarrow ==13.0.0.* - libcudf ={{ version }} - rmm ={{ minor_version }} {% if cuda_major == "11" %} @@ -82,7 +82,7 @@ requirements: - numba >=0.57,<0.58 # TODO: Pin to numpy<1.25 until cudf requires pandas 2 - numpy >=1.21,<1.25 - - {{ pin_compatible('pyarrow', max_pin='x.x.x') }} + - {{ pin_compatible('pyarrow', max_pin='x') }} - libcudf ={{ version }} - {{ pin_compatible('rmm', max_pin='x.x') }} - fsspec >=0.6.0 diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index 63688a641de..fe692614b8e 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -23,7 +23,7 @@ gtest_version: - ">=1.13.0" libarrow_version: - - "==12.0.1" + - "==13.0.0" dlpack_version: - ">=0.5,<0.6.0a0" diff --git 
a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 627065817ba..ecd777bf91f 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -91,6 +91,8 @@ outputs: requirements: build: - cmake {{ cmake_version }} + host: + - libarrow {{ libarrow_version }} run: {% if cuda_major == "11" %} - cudatoolkit @@ -103,7 +105,6 @@ outputs: - nvcomp {{ nvcomp_version }} - librmm ={{ minor_version }} - libkvikio ={{ minor_version }} - - libarrow {{ libarrow_version }} - dlpack {{ dlpack_version }} - gtest {{ gtest_version }} - gmock {{ gtest_version }} diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 10d3145a36f..3b2cbc57d1c 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -408,12 +408,22 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB endfunction() if(NOT DEFINED CUDF_VERSION_Arrow) - set(CUDF_VERSION_Arrow - # This version must be kept in sync with the libarrow version pinned for builds in - # dependencies.yaml. - 12.0.1 - CACHE STRING "The version of Arrow to find (or build)" - ) + # Temporarily use Arrow 12.0.1 in wheels and Arrow 13.0.0 otherwise + if(USE_LIBARROW_FROM_PYARROW) + set(CUDF_VERSION_Arrow + # This version must be kept in sync with the libarrow version pinned for builds in + # dependencies.yaml. + 12.0.1 + CACHE STRING "The version of Arrow to find (or build)" + ) + else() + set(CUDF_VERSION_Arrow + # This version must be kept in sync with the libarrow version pinned for builds in + # dependencies.yaml. + 13.0.0 + CACHE STRING "The version of Arrow to find (or build)" + ) + endif() endif() find_and_configure_arrow( diff --git a/dependencies.yaml b/dependencies.yaml index 59755c31e92..1f2b42c49c4 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -224,7 +224,7 @@ dependencies: - &gmock gmock>=1.13.0 # Hard pin the patch version used during the build. This must be kept # in sync with the version pinned in get_arrow.cmake. - - libarrow==12.0.1.* + - libarrow==13.0.0.* - librdkafka>=1.9.0,<1.10.0a0 # Align nvcomp version with rapids-cmake - nvcomp==2.6.1 @@ -240,11 +240,18 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - cython>=3.0.0 + # TODO: Pin to numpy<1.25 until cudf requires pandas 2 + - &numpy numpy>=1.21,<1.25 + - output_types: [conda] + packages: + # Hard pin the patch version used during the build. This must be kept + # in sync with the version pinned in get_arrow.cmake. + - pyarrow==13.0.0.* + - output_types: [requirements, pyproject] + packages: # Hard pin the patch version used during the build. This must be kept # in sync with the version pinned in get_arrow.cmake. 
- pyarrow==12.0.1.* - # TODO: Pin to numpy<1.25 until cudf requires pandas 2 - - &numpy numpy>=1.21,<1.25 build_python: common: - output_types: [conda, requirements, pyproject] @@ -262,10 +269,14 @@ dependencies: - output_types: conda packages: # Allow runtime version to float up to minor version - - libarrow==12.* + - libarrow==13.* pyarrow_run: common: - - output_types: [conda, requirements, pyproject] + - output_types: [conda] + packages: + # Allow runtime version to float up to minor version + - pyarrow==13.* + - output_types: [requirements, pyproject] packages: # Allow runtime version to float up to minor version - pyarrow==12.* diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py index e4b2af90448..0745e5aba48 100644 --- a/python/cudf/cudf/tests/test_decimal.py +++ b/python/cudf/cudf/tests/test_decimal.py @@ -6,6 +6,7 @@ import numpy as np import pyarrow as pa import pytest +from packaging import version import cudf from cudf.core.column import Decimal32Column, Decimal64Column, NumericalColumn @@ -91,7 +92,15 @@ def test_from_arrow_max_precision_decimal32(): "to_dtype", [Decimal64Dtype(7, 2), Decimal64Dtype(11, 4), Decimal64Dtype(18, 9)], ) -def test_typecast_from_float_to_decimal(data, from_dtype, to_dtype): +def test_typecast_from_float_to_decimal(request, data, from_dtype, to_dtype): + request.applymarker( + pytest.mark.xfail( + condition=version.parse(pa.__version__) >= version.parse("13.0.0") + and from_dtype == np.dtype("float32") + and to_dtype.precision > 7, + reason="https://github.com/rapidsai/cudf/issues/14169", + ) + ) got = data.astype(from_dtype) pa_arr = got.to_arrow().cast( From 52f7d5c7d5d340c3c9011beaa075babc98c1cc0b Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 27 Oct 2023 11:59:46 -0400 Subject: [PATCH 055/118] Expose stream parameter in public strings filter APIs (#14293) Add stream parameter to public APIs: - `cudf::strings::translate()` - `cudf::strings::filter_characters()` - `cudf::strings::filter_characters_of_type()` - `cudf::strings::all_characters_of_type()` - `cudf::strings::reverse()` Also cleaned up some of the doxygen comments. Reference #13744 Authors: - David Wendt (https://github.com/davidwendt) - https://github.com/shrshi Approvers: - Bradley Dice (https://github.com/bdice) - https://github.com/shrshi URL: https://github.com/rapidsai/cudf/pull/14293 --- .../cudf/strings/char_types/char_types.hpp | 26 ++++--- cpp/include/cudf/strings/reverse.hpp | 4 +- cpp/include/cudf/strings/translate.hpp | 30 ++++---- cpp/src/strings/char_types/char_types.cu | 11 +-- cpp/src/strings/filter_chars.cu | 5 +- cpp/src/strings/reverse.cu | 3 +- cpp/src/strings/translate.cu | 5 +- cpp/tests/CMakeLists.txt | 2 + cpp/tests/streams/strings/filter_test.cpp | 77 +++++++++++++++++++ cpp/tests/streams/strings/reverse_test.cpp | 34 ++++++++ 10 files changed, 162 insertions(+), 35 deletions(-) create mode 100644 cpp/tests/streams/strings/filter_test.cpp create mode 100644 cpp/tests/streams/strings/reverse_test.cpp diff --git a/cpp/include/cudf/strings/char_types/char_types.hpp b/cpp/include/cudf/strings/char_types/char_types.hpp index 8b6c434719a..c6db5dab08a 100644 --- a/cpp/include/cudf/strings/char_types/char_types.hpp +++ b/cpp/include/cudf/strings/char_types/char_types.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -53,18 +53,20 @@ namespace strings { * * Any null row results in a null entry for that row in the output column. * - * @param strings Strings instance for this operation. - * @param types The character types to check in each string. + * @param input Strings instance for this operation + * @param types The character types to check in each string * @param verify_types Only verify against these character types. * Default `ALL_TYPES` means return `true` * iff all characters match `types`. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of boolean results for each string */ std::unique_ptr all_characters_of_type( - strings_column_view const& strings, + strings_column_view const& input, string_character_types types, string_character_types verify_types = string_character_types::ALL_TYPES, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -96,20 +98,22 @@ std::unique_ptr all_characters_of_type( * @throw cudf::logic_error if neither or both `types_to_remove` and * `types_to_keep` are set to `ALL_TYPES`. * - * @param strings Strings instance for this operation. + * @param input Strings instance for this operation * @param types_to_remove The character types to check in each string. * Use `ALL_TYPES` here to specify `types_to_keep` instead. - * @param replacement The replacement character to use when removing characters. + * @param replacement The replacement character to use when removing characters * @param types_to_keep Default `ALL_TYPES` means all characters of * `types_to_remove` will be filtered. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. + * @param mr Device memory resource used to allocate the returned column's device memory + * @param stream CUDA stream used for device memory operations and kernel launches + * @return New column of boolean results for each string */ std::unique_ptr filter_characters_of_type( - strings_column_view const& strings, + strings_column_view const& input, string_character_types types_to_remove, string_scalar const& replacement = string_scalar(""), string_character_types types_to_keep = string_character_types::ALL_TYPES, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/reverse.hpp b/cpp/include/cudf/strings/reverse.hpp index 26fb36a540e..4fc8fbf67c2 100644 --- a/cpp/include/cudf/strings/reverse.hpp +++ b/cpp/include/cudf/strings/reverse.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -42,10 +42,12 @@ namespace strings { * * @param input Strings column for this operation * @param mr Device memory resource used to allocate the returned column's device memory + * @param stream CUDA stream used for device memory operations and kernel launches * @return New strings column */ std::unique_ptr reverse( strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/translate.hpp b/cpp/include/cudf/strings/translate.hpp index 0cbf6b22029..4bd09352b09 100644 --- a/cpp/include/cudf/strings/translate.hpp +++ b/cpp/include/cudf/strings/translate.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -47,14 +47,16 @@ namespace strings { * r is now ["AA", "", "cccc", "AcQ"] * @endcode * - * @param strings Strings instance for this operation. - * @param chars_table Table of UTF-8 character mappings. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column with padded strings. + * @param input Strings instance for this operation + * @param chars_table Table of UTF-8 character mappings + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column with padded strings */ std::unique_ptr translate( - strings_column_view const& strings, + strings_column_view const& input, std::vector> const& chars_table, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -87,19 +89,21 @@ enum class filter_type : bool { * * @throw cudf::logic_error if `replacement` is invalid * - * @param strings Strings instance for this operation. - * @param characters_to_filter Table of character ranges to filter on. + * @param input Strings instance for this operation + * @param characters_to_filter Table of character ranges to filter on * @param keep_characters If true, the `characters_to_filter` are retained and all other characters - * are removed. - * @param replacement Optional replacement string for each character removed. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column with filtered strings. 
+ * are removed + * @param replacement Optional replacement string for each character removed + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column with filtered strings */ std::unique_ptr filter_characters( - strings_column_view const& strings, + strings_column_view const& input, std::vector> characters_to_filter, filter_type keep_characters = filter_type::KEEP, string_scalar const& replacement = string_scalar(""), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu index 0c0ad0ad29e..35b0c0a2690 100644 --- a/cpp/src/strings/char_types/char_types.cu +++ b/cpp/src/strings/char_types/char_types.cu @@ -214,25 +214,26 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str // external API -std::unique_ptr all_characters_of_type(strings_column_view const& strings, +std::unique_ptr all_characters_of_type(strings_column_view const& input, string_character_types types, string_character_types verify_types, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::all_characters_of_type( - strings, types, verify_types, cudf::get_default_stream(), mr); + return detail::all_characters_of_type(input, types, verify_types, stream, mr); } -std::unique_ptr filter_characters_of_type(strings_column_view const& strings, +std::unique_ptr filter_characters_of_type(strings_column_view const& input, string_character_types types_to_remove, string_scalar const& replacement, string_character_types types_to_keep, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::filter_characters_of_type( - strings, types_to_remove, replacement, types_to_keep, cudf::get_default_stream(), mr); + input, types_to_remove, replacement, types_to_keep, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/filter_chars.cu b/cpp/src/strings/filter_chars.cu index 3e38b5fa775..9f95fedfe0b 100644 --- a/cpp/src/strings/filter_chars.cu +++ b/cpp/src/strings/filter_chars.cu @@ -154,15 +154,16 @@ std::unique_ptr filter_characters( * @copydoc cudf::strings::filter_characters */ std::unique_ptr filter_characters( - strings_column_view const& strings, + strings_column_view const& input, std::vector> characters_to_filter, filter_type keep_characters, string_scalar const& replacement, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::filter_characters( - strings, characters_to_filter, keep_characters, replacement, cudf::get_default_stream(), mr); + input, characters_to_filter, keep_characters, replacement, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/reverse.cu b/cpp/src/strings/reverse.cu index 090705ac25d..2855bdbb827 100644 --- a/cpp/src/strings/reverse.cu +++ b/cpp/src/strings/reverse.cu @@ -79,10 +79,11 @@ std::unique_ptr reverse(strings_column_view const& input, } // namespace detail std::unique_ptr reverse(strings_column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::reverse(input, cudf::get_default_stream(), mr); + return detail::reverse(input, stream, mr); } } // namespace strings diff --git 
a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu index e7b637c52f3..0ca5e103d3d 100644 --- a/cpp/src/strings/translate.cu +++ b/cpp/src/strings/translate.cu @@ -124,12 +124,13 @@ std::unique_ptr translate(strings_column_view const& strings, // external APIs -std::unique_ptr translate(strings_column_view const& strings, +std::unique_ptr translate(strings_column_view const& input, std::vector> const& chars_table, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::translate(strings, chars_table, cudf::get_default_stream(), mr); + return detail::translate(input, chars_table, stream, mr); } } // namespace strings diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 95411668284..1259594dbc0 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -641,8 +641,10 @@ ConfigureTest( streams/strings/contains_test.cpp streams/strings/convert_test.cpp streams/strings/extract_test.cpp + streams/strings/filter_test.cpp streams/strings/find_test.cpp streams/strings/replace_test.cpp + streams/strings/reverse_test.cpp streams/strings/split_test.cpp streams/strings/strings_tests.cpp STREAM_MODE diff --git a/cpp/tests/streams/strings/filter_test.cpp b/cpp/tests/streams/strings/filter_test.cpp new file mode 100644 index 00000000000..3c44eb81380 --- /dev/null +++ b/cpp/tests/streams/strings/filter_test.cpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include + +#include +#include + +class StringsFilterTest : public cudf::test::BaseFixture {}; + +static std::pair make_entry(char const* from, char const* to) +{ + cudf::char_utf8 in = 0; + cudf::char_utf8 out = 0; + cudf::strings::detail::to_char_utf8(from, in); + if (to) cudf::strings::detail::to_char_utf8(to, out); + return std::pair(in, out); +} + +TEST_F(StringsFilterTest, Translate) +{ + auto input = cudf::test::strings_column_wrapper({" aBc ", " ", "aaaa ", "\tb"}); + auto view = cudf::strings_column_view(input); + + std::vector> translate_table{ + make_entry("b", 0), make_entry("a", "A"), make_entry(" ", "_")}; + cudf::strings::translate(view, translate_table, cudf::test::get_default_stream()); +} + +TEST_F(StringsFilterTest, Filter) +{ + auto input = cudf::test::strings_column_wrapper({" aBc ", " ", "aaaa ", "\tb"}); + auto view = cudf::strings_column_view(input); + + std::vector> filter_table{ + make_entry("b", 0), make_entry("a", "A"), make_entry(" ", "_")}; + + auto const repl = cudf::string_scalar("X", true, cudf::test::get_default_stream()); + auto const keep = cudf::strings::filter_type::KEEP; + cudf::strings::filter_characters( + view, filter_table, keep, repl, cudf::test::get_default_stream()); +} + +TEST_F(StringsFilterTest, FilterTypes) +{ + auto input = cudf::test::strings_column_wrapper({" aBc ", " ", "aaaa ", "\tb"}); + auto view = cudf::strings_column_view(input); + + auto const verify_types = + cudf::strings::string_character_types::LOWER | cudf::strings::string_character_types::UPPER; + auto const all_types = cudf::strings::string_character_types::ALL_TYPES; + cudf::strings::all_characters_of_type( + view, verify_types, all_types, cudf::test::get_default_stream()); + + auto const repl = cudf::string_scalar("X", true, cudf::test::get_default_stream()); + auto const space_types = cudf::strings::string_character_types::SPACE; + cudf::strings::filter_characters_of_type( + view, all_types, repl, space_types, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/strings/reverse_test.cpp b/cpp/tests/streams/strings/reverse_test.cpp new file mode 100644 index 00000000000..83dcf24594e --- /dev/null +++ b/cpp/tests/streams/strings/reverse_test.cpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include + +#include +#include + +class StringsReverseTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsReverseTest, Reverse) +{ + auto input = cudf::test::strings_column_wrapper({"aBcdef", " ", "12345"}); + auto view = cudf::strings_column_view(input); + + cudf::strings::reverse(view, cudf::test::get_default_stream()); +} From 83746a408381f45eccd15971f8a901149dce743e Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 27 Oct 2023 13:23:40 -0400 Subject: [PATCH 056/118] Expose stream parameter in public nvtext tokenize APIs (#14317) Add stream parameter to public APIs: - `nvtext::tokenize()` (x2) - `nvtext::count_tokens()` (x2) - `nvtext::character_tokenize()` - `nvtext::detokenize()` Also cleaned up some of the doxygen comments and added stream gtests. Reference #13744 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - Bradley Dice (https://github.com/bdice) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/14317 --- cpp/include/nvtext/tokenize.hpp | 60 ++++++++++++++---------- cpp/src/text/detokenize.cu | 7 +-- cpp/src/text/tokenize.cu | 25 ++++++---- cpp/tests/CMakeLists.txt | 4 +- cpp/tests/streams/text/tokenize_test.cpp | 53 +++++++++++++++++++++ 5 files changed, 111 insertions(+), 38 deletions(-) create mode 100644 cpp/tests/streams/text/tokenize_test.cpp diff --git a/cpp/include/nvtext/tokenize.hpp b/cpp/include/nvtext/tokenize.hpp index 44f8f44557c..107fefcc3bf 100644 --- a/cpp/include/nvtext/tokenize.hpp +++ b/cpp/include/nvtext/tokenize.hpp @@ -49,15 +49,17 @@ namespace nvtext { * * All null row entries are ignored and the output contains all valid rows. * - * @param strings Strings column tokenize. + * @param input Strings column to tokenize * @param delimiter UTF-8 characters used to separate each string into tokens. * The default of empty string will separate tokens using whitespace. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of tokens. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings columns of tokens */ std::unique_ptr tokenize( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::string_scalar const& delimiter = cudf::string_scalar{""}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -84,14 +86,16 @@ std::unique_ptr tokenize( * * @throw cudf::logic_error if the delimiters column is empty or contains nulls. * - * @param strings Strings column to tokenize. - * @param delimiters Strings used to separate individual strings into tokens. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of tokens. 
+ * @param input Strings column to tokenize + * @param delimiters Strings used to separate individual strings into tokens + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings columns of tokens */ std::unique_ptr tokenize( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::strings_column_view const& delimiters, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -112,15 +116,17 @@ std::unique_ptr tokenize( * All null row entries are ignored and the output contains all valid rows. * The number of tokens for a null element is set to 0 in the output column. * - * @param strings Strings column to use for this operation - * @param delimiter Strings used to separate each string into tokens; + * @param input Strings column to count tokens + * @param delimiter Strings used to separate each string into tokens. * The default of empty string will separate tokens using whitespace. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column of token counts */ std::unique_ptr count_tokens( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::string_scalar const& delimiter = cudf::string_scalar{""}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -143,14 +149,16 @@ std::unique_ptr count_tokens( * * @throw cudf::logic_error if the delimiters column is empty or contains nulls * - * @param strings Strings column to use for this operation + * @param input Strings column to count tokens * @param delimiters Strings used to separate each string into tokens + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column of token counts */ std::unique_ptr count_tokens( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::strings_column_view const& delimiters, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -168,12 +176,14 @@ std::unique_ptr count_tokens( * * All null row entries are ignored and the output contains all valid rows. * - * @param strings Strings column to tokenize. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of tokens. 
+ * @param input Strings column to tokenize + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings columns of tokens */ std::unique_ptr character_tokenize( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -203,16 +213,18 @@ std::unique_ptr character_tokenize( * @throw cudf::logic_error if `row_indices.size() != strings.size()` * @throw cudf::logic_error if `row_indices` contains nulls * - * @param strings Strings column to detokenize. - * @param row_indices The relative output row index assigned for each token in the input column. - * @param separator String to append after concatenating each token to the proper output row. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of tokens. + * @param input Strings column to detokenize + * @param row_indices The relative output row index assigned for each token in the input column + * @param separator String to append after concatenating each token to the proper output row + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings columns of tokens */ std::unique_ptr detokenize( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::column_view const& row_indices, cudf::string_scalar const& separator = cudf::string_scalar(" "), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/src/text/detokenize.cu b/cpp/src/text/detokenize.cu index a17583cf649..38cb7dd6753 100644 --- a/cpp/src/text/detokenize.cu +++ b/cpp/src/text/detokenize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -169,13 +169,14 @@ std::unique_ptr detokenize(cudf::strings_column_view const& string } // namespace detail -std::unique_ptr detokenize(cudf::strings_column_view const& strings, +std::unique_ptr detokenize(cudf::strings_column_view const& input, cudf::column_view const& row_indices, cudf::string_scalar const& separator, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::detokenize(strings, row_indices, separator, cudf::get_default_stream(), mr); + return detail::detokenize(input, row_indices, separator, stream, mr); } } // namespace nvtext diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu index 16b9f25b802..87f6a61a533 100644 --- a/cpp/src/text/tokenize.cu +++ b/cpp/src/text/tokenize.cu @@ -232,43 +232,48 @@ std::unique_ptr character_tokenize(cudf::strings_column_view const // external APIs -std::unique_ptr tokenize(cudf::strings_column_view const& strings, +std::unique_ptr tokenize(cudf::strings_column_view const& input, cudf::string_scalar const& delimiter, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::tokenize(strings, delimiter, cudf::get_default_stream(), mr); + return detail::tokenize(input, delimiter, stream, mr); } -std::unique_ptr tokenize(cudf::strings_column_view const& strings, +std::unique_ptr tokenize(cudf::strings_column_view const& input, cudf::strings_column_view const& delimiters, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::tokenize(strings, delimiters, cudf::get_default_stream(), mr); + return detail::tokenize(input, delimiters, stream, mr); } -std::unique_ptr count_tokens(cudf::strings_column_view const& strings, +std::unique_ptr count_tokens(cudf::strings_column_view const& input, cudf::string_scalar const& delimiter, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::count_tokens(strings, delimiter, cudf::get_default_stream(), mr); + return detail::count_tokens(input, delimiter, stream, mr); } -std::unique_ptr count_tokens(cudf::strings_column_view const& strings, +std::unique_ptr count_tokens(cudf::strings_column_view const& input, cudf::strings_column_view const& delimiters, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::count_tokens(strings, delimiters, cudf::get_default_stream(), mr); + return detail::count_tokens(input, delimiters, stream, mr); } -std::unique_ptr character_tokenize(cudf::strings_column_view const& strings, +std::unique_ptr character_tokenize(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::character_tokenize(strings, cudf::get_default_stream(), mr); + return detail::character_tokenize(input, stream, mr); } } // namespace nvtext diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 1259594dbc0..10937212bc1 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -651,7 +651,9 @@ ConfigureTest( testing ) ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing) -ConfigureTest(STREAM_TEXT_TEST streams/text/ngrams_test.cpp STREAM_MODE testing) +ConfigureTest( + STREAM_TEXT_TEST streams/text/ngrams_test.cpp streams/text/tokenize_test.cpp STREAM_MODE testing +) ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing) # 
################################################################################################## diff --git a/cpp/tests/streams/text/tokenize_test.cpp b/cpp/tests/streams/text/tokenize_test.cpp new file mode 100644 index 00000000000..b281fbc2c0c --- /dev/null +++ b/cpp/tests/streams/text/tokenize_test.cpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +class TextTokenizeTest : public cudf::test::BaseFixture {}; + +TEST_F(TextTokenizeTest, Tokenize) +{ + auto const input = cudf::test::strings_column_wrapper({"the fox jumped", "over thé dog"}); + auto const view = cudf::strings_column_view(input); + auto const delimiter = cudf::string_scalar{" ", true, cudf::test::get_default_stream()}; + nvtext::tokenize(view, delimiter, cudf::test::get_default_stream()); + nvtext::count_tokens(view, delimiter, cudf::test::get_default_stream()); + auto const delimiters = cudf::test::strings_column_wrapper({" ", "o", "é"}); + nvtext::tokenize(view, cudf::strings_column_view(delimiters), cudf::test::get_default_stream()); + nvtext::count_tokens( + view, cudf::strings_column_view(delimiters), cudf::test::get_default_stream()); +} + +TEST_F(TextTokenizeTest, CharacterTokenize) +{ + auto const input = + cudf::test::strings_column_wrapper({"the", "fox", "jumped", "over", "thé", "dog"}); + nvtext::character_tokenize(cudf::strings_column_view(input), cudf::test::get_default_stream()); +} + +TEST_F(TextTokenizeTest, Detokenize) +{ + auto const input = + cudf::test::strings_column_wrapper({"the", "fox", "jumped", "over", "thé", "dog"}); + auto const view = cudf::strings_column_view(input); + auto const indices = cudf::test::fixed_width_column_wrapper({0, 0, 0, 1, 1, 1}); + auto const separator = cudf::string_scalar{" ", true, cudf::test::get_default_stream()}; + nvtext::detokenize(view, indices, separator, cudf::test::get_default_stream()); +} From 87d9ba4026f66474f45d5f36a977f2261d6bbda8 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 27 Oct 2023 10:51:24 -0700 Subject: [PATCH 057/118] Optimize ORC writer for decimal columns (#14190) Use thread blocks to scan over decimal element sizes in row groups instead of using a single CUDA thread. Up to 60% performance improvement with decimal columns. 
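To make the approach concrete, here is a minimal hedged sketch of the block-wide scan pattern the new kernel is built on; the kernel and parameter names below are illustrative stand-ins, not the exact code in this diff:

```cpp
// A thread block converts a segment of element sizes to offsets in-place,
// scanning one block_size tile at a time and carrying the running total forward.
#include <cub/block/block_scan.cuh>
#include <cstdint>

template <int block_size>
__global__ void sizes_to_offsets_sketch(uint32_t* sizes, int64_t num_elements)
{
  using block_scan = cub::BlockScan<uint32_t, block_size>;
  __shared__ typename block_scan::TempStorage scan_storage;

  uint32_t running_total = 0;  // carried across tiles within this block
  for (int64_t pos = 0; pos < num_elements; pos += block_size) {
    auto const tidx   = pos + threadIdx.x;
    uint32_t val      = tidx < num_elements ? sizes[tidx] : 0u;
    uint32_t tile_sum = 0;
    block_scan(scan_storage).InclusiveSum(val, val, tile_sum);
    if (tidx < num_elements) { sizes[tidx] = val + running_total; }
    running_total += tile_sum;
    __syncthreads();  // scan_storage is reused on the next iteration
  }
}
```

Compared to the removed per-rowgroup `thrust::inclusive_scan(thrust::seq, ...)` (see the writer_impl.cu hunk below), this keeps every thread of a block busy on each segment instead of serializing the scan on one thread.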
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Nghia Truong (https://github.com/ttnghia) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/14190 --- cpp/src/io/orc/orc_gpu.hpp | 14 +++++++++ cpp/src/io/orc/stripe_enc.cu | 57 +++++++++++++++++++++++++++++++++++ cpp/src/io/orc/writer_impl.cu | 19 +++--------- 3 files changed, 75 insertions(+), 15 deletions(-) diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp index dba7a9ffda5..5669a20907d 100644 --- a/cpp/src/io/orc/orc_gpu.hpp +++ b/cpp/src/io/orc/orc_gpu.hpp @@ -424,6 +424,20 @@ void rowgroup_char_counts(device_2dspan counts, device_span str_col_indexes, rmm::cuda_stream_view stream); +/** + * @brief Converts sizes of decimal elements to offsets within the rowgroup. + * + * @note The conversion is done in-place. After the conversion, the device vectors in \p elem_sizes + * hold the offsets. + * + * @param rg_bounds Ranges of rows in each rowgroup [rowgroup][column] + * @param elem_sizes Map between column indexes and decimal element sizes + * @param stream CUDA stream used for device memory operations and kernel launches + */ +void decimal_sizes_to_offsets(device_2dspan rg_bounds, + std::map>& elem_sizes, + rmm::cuda_stream_view stream); + /** * @brief Launches kernels to initialize statistics collection * diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 4841fb1141a..5c75ba22159 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -24,6 +24,7 @@ #include #include +#include #include #include @@ -1260,6 +1261,38 @@ __global__ void __launch_bounds__(1024) } } +// Holds a non-owning view of a decimal column's element sizes +struct decimal_column_element_sizes { + uint32_t col_idx; + device_span sizes; +}; + +// Converts sizes of individual decimal elements to offsets within each row group +// Conversion is done in-place +template +__global__ void decimal_sizes_to_offsets_kernel(device_2dspan rg_bounds, + device_span sizes) +{ + using block_scan = cub::BlockScan; + __shared__ typename block_scan::TempStorage scan_storage; + int const t = threadIdx.x; + + auto const& col_elem_sizes = sizes[blockIdx.x]; + auto const& row_group = rg_bounds[blockIdx.y][col_elem_sizes.col_idx]; + auto const elem_sizes = col_elem_sizes.sizes.data() + row_group.begin; + + uint32_t initial_value = 0; + // Do a series of block sums, storing results in the array as we go + for (int64_t pos = 0; pos < row_group.size(); pos += block_size) { + auto const tidx = pos + t; + auto tval = tidx < row_group.size() ? 
elem_sizes[tidx] : 0u; + uint32_t block_sum = 0; + block_scan(scan_storage).InclusiveSum(tval, tval, block_sum); + if (tidx < row_group.size()) { elem_sizes[tidx] = tval + initial_value; } + initial_value += block_sum; + } +} + void EncodeOrcColumnData(device_2dspan chunks, device_2dspan streams, rmm::cuda_stream_view stream) @@ -1368,6 +1401,30 @@ std::optional CompressOrcDataStreams( } } +void decimal_sizes_to_offsets(device_2dspan rg_bounds, + std::map>& elem_sizes, + rmm::cuda_stream_view stream) +{ + if (rg_bounds.count() == 0) return; + + // Convert map to a vector of views of the `elem_sizes` device buffers + std::vector h_sizes; + h_sizes.reserve(elem_sizes.size()); + std::transform(elem_sizes.begin(), elem_sizes.end(), std::back_inserter(h_sizes), [](auto& p) { + return decimal_column_element_sizes{p.first, p.second}; + }); + + // Copy the vector of views to the device so that we can pass it to the kernel + auto d_sizes = cudf::detail::make_device_uvector_async( + h_sizes, stream, rmm::mr::get_current_device_resource()); + + constexpr int block_size = 256; + dim3 const grid_size{static_cast(elem_sizes.size()), // num decimal columns + static_cast(rg_bounds.size().first)}; // num rowgroups + decimal_sizes_to_offsets_kernel + <<>>(rg_bounds, d_sizes); +} + } // namespace gpu } // namespace orc } // namespace io diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 3d8bdb4ec97..81629e03a82 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -1882,7 +1882,7 @@ encoder_decimal_info decimal_chunk_sizes(orc_table_view& orc_table, auto& current_sizes = elem_sizes.insert({orc_col.index(), rmm::device_uvector(orc_col.size(), stream)}) .first->second; - thrust::tabulate(rmm::exec_policy(stream), + thrust::tabulate(rmm::exec_policy_nosync(stream), current_sizes.begin(), current_sizes.end(), [d_cols = device_span{orc_table.d_columns}, @@ -1908,25 +1908,14 @@ encoder_decimal_info decimal_chunk_sizes(orc_table_view& orc_table, return varint_size(zigzaged_value); }); - // Compute element offsets within each row group - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0ul), - segmentation.num_rowgroups(), - [sizes = device_span{current_sizes}, - rg_bounds = device_2dspan{segmentation.rowgroups}, - col_idx = orc_col.index()] __device__(auto rg_idx) { - auto const& range = rg_bounds[rg_idx][col_idx]; - thrust::inclusive_scan(thrust::seq, - sizes.begin() + range.begin, - sizes.begin() + range.end, - sizes.begin() + range.begin); - }); - orc_col.attach_decimal_offsets(current_sizes.data()); } } if (elem_sizes.empty()) return {}; + // Compute element offsets within each row group + gpu::decimal_sizes_to_offsets(segmentation.rowgroups, elem_sizes, stream); + // Gather the row group sizes and copy to host auto d_tmp_rowgroup_sizes = rmm::device_uvector(segmentation.num_rowgroups(), stream); std::map> rg_sizes; From 9354fb5de704c7771e92058d4cdbc6a2ae7e20f1 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 27 Oct 2023 11:11:08 -0700 Subject: [PATCH 058/118] Fix host buffer access from device function in the Parquet reader (#14328) Closes https://github.com/rapidsai/cudf/issues/14311 The host access becomes an issue when pageable memory is used for host side of `hostdevice_vector`. This PR fixes the device lambda to use the device pointer instead. 
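In a minimal hedged sketch of the bug class (`hd_vec` stands in for a host/device-mirrored container such as `hostdevice_vector`, whose `begin()` returns the host pointer and `d_begin()` the device pointer):

```cpp
// Wrong: the __device__ lambda captures and dereferences a host pointer.
// This only appeared to work while the host side was a pinned allocation.
auto bad = cudf::detail::make_counting_transform_iterator(
  0, [p = hd_vec.begin()] __device__(int i) { return p[i]; });

// Right: capture the device-side pointer for code that runs on the GPU.
auto good = cudf::detail::make_counting_transform_iterator(
  0, [p = hd_vec.d_begin()] __device__(int i) { return p[i]; });
```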
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - https://github.com/shrshi - Vyas Ramasubramani (https://github.com/vyasr) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14328 --- cpp/src/io/parquet/reader_impl_preprocess.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 8494dc72a1d..80a4d00a5a2 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -310,8 +310,8 @@ int decode_page_headers(cudf::detail::hostdevice_vector& chunks } // compute max bytes needed for level data - auto level_bit_size = - cudf::detail::make_counting_transform_iterator(0, [chunks = chunks.begin()] __device__(int i) { + auto level_bit_size = cudf::detail::make_counting_transform_iterator( + 0, [chunks = chunks.d_begin()] __device__(int i) { auto c = chunks[i]; return static_cast( max(c.level_bits[level_type::REPETITION], c.level_bits[level_type::DEFINITION])); From 751370e111416c9e1561c6bcc741284ae46c9427 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 27 Oct 2023 16:41:32 -0500 Subject: [PATCH 059/118] Enable `dask_cudf/io` pytests in CI (#14338) @rjzamora recently identified and fixed (https://github.com/rapidsai/cudf/pull/14327/) an issue where the `dask_cudf/io` pytests weren't being run in CI; however, that fix only covered the wheel pytests. This PR fixes the same issue for the `conda` pytests. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/14338 --- ci/test_python_other.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/test_python_other.sh b/ci/test_python_other.sh index ab36fbbb5ff..25c1d681029 100755 --- a/ci/test_python_other.sh +++ b/ci/test_python_other.sh @@ -27,7 +27,7 @@ pytest \ --cov=dask_cudf \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/dask-cudf-coverage.xml" \ --cov-report=term \ - tests + . popd rapids-logger "pytest custreamz" From 2bc454aa6c6511d29f60839279eb8734cb5f24f5 Mon Sep 17 00:00:00 2001 From: Divye Gala Date: Fri, 27 Oct 2023 20:31:24 -0400 Subject: [PATCH 060/118] Reimplement `cudf::merge` for nested types without using comparators (#14250) Part of #11844 This PR also uses the new experimental comparators for non-nested types by introducing a new device constructor for `cudf::experimental::row::lexicographic::device_row_comparator`. In the case of non-nested types, preprocessing can be skipped, so comparators can be created on the fly. This solution helps us avoid creating three comparator types, because `thrust::merge` can call the operator with indices from either side of the table. Furthermore, the PR reworks `cudf/detail/merge.cuh` by removing all CUDA headers/components, exposing a true detail API as `cudf/detail/merge.hpp`. [Benchmark comparison for non-nested types](https://github.com/rapidsai/cudf/pull/14250#issuecomment-1747620300) Compilation time increases from ~6 mins to ~7 mins.
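For reference, a minimal usage sketch (inside a test body) of the newly supported case, merging two pre-sorted tables keyed on a LIST column; it mirrors the gtests added below and assumes the cudf test utilities:

```cpp
#include <cudf/merge.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf_test/column_wrapper.hpp>

using lcw = cudf::test::lists_column_wrapper<int32_t>;

auto left  = lcw{lcw{1}, lcw{3}, lcw{5}};  // each input must already be sorted
auto right = lcw{lcw{2}, lcw{4}, lcw{6}};
auto merged = cudf::merge({cudf::table_view{{left}}, cudf::table_view{{right}}},
                          {0},                        // key column indices
                          {cudf::order::ASCENDING});  // per-key sort order
// merged now holds [[1], [2], [3], [4], [5], [6]]
```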
Authors: - Divye Gala (https://github.com/divyegala) Approvers: - Bradley Dice (https://github.com/bdice) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/14250 --- cpp/benchmarks/CMakeLists.txt | 1 + .../generate_nested_types.hpp} | 2 +- cpp/benchmarks/merge/merge_lists.cpp | 54 ++++++ cpp/benchmarks/merge/merge_structs.cpp | 54 ++++++ cpp/benchmarks/sort/rank_lists.cpp | 3 +- cpp/benchmarks/sort/rank_structs.cpp | 2 +- cpp/benchmarks/sort/sort_lists.cpp | 2 +- cpp/benchmarks/sort/sort_structs.cpp | 2 +- cpp/include/cudf/detail/merge.cuh | 166 ----------------- cpp/include/cudf/detail/merge.hpp | 60 +++++++ cpp/include/cudf/dictionary/detail/merge.hpp | 4 +- cpp/include/cudf/merge.hpp | 7 +- cpp/include/cudf/strings/detail/merge.cuh | 2 +- .../cudf/table/experimental/row_operators.cuh | 67 +++++++ cpp/src/merge/merge.cu | 170 +++++++++++++++++- .../quantiles/tdigest/tdigest_aggregation.cu | 2 +- cpp/tests/merge/merge_test.cpp | 113 ++++++++++++ 17 files changed, 525 insertions(+), 186 deletions(-) rename cpp/benchmarks/{sort/nested_types_common.hpp => common/generate_nested_types.hpp} (98%) create mode 100644 cpp/benchmarks/merge/merge_lists.cpp create mode 100644 cpp/benchmarks/merge/merge_structs.cpp delete mode 100644 cpp/include/cudf/detail/merge.cuh create mode 100644 cpp/include/cudf/detail/merge.hpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index cd6b3cfdc03..a3e2b4ed6db 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -230,6 +230,7 @@ ConfigureNVBench(HASHING_NVBENCH hashing/hash.cpp) # ################################################################################################## # * merge benchmark ------------------------------------------------------------------------------- ConfigureBench(MERGE_BENCH merge/merge.cpp) +ConfigureNVBench(MERGE_NVBENCH merge/merge_structs.cpp merge/merge_lists.cpp) # ################################################################################################## # * null_mask benchmark --------------------------------------------------------------------------- diff --git a/cpp/benchmarks/sort/nested_types_common.hpp b/cpp/benchmarks/common/generate_nested_types.hpp similarity index 98% rename from cpp/benchmarks/sort/nested_types_common.hpp rename to cpp/benchmarks/common/generate_nested_types.hpp index 93853ba5768..ee9e3ca9de3 100644 --- a/cpp/benchmarks/sort/nested_types_common.hpp +++ b/cpp/benchmarks/common/generate_nested_types.hpp @@ -16,7 +16,7 @@ #pragma once -#include +#include "generate_input.hpp" #include diff --git a/cpp/benchmarks/merge/merge_lists.cpp b/cpp/benchmarks/merge/merge_lists.cpp new file mode 100644 index 00000000000..bcb9f10ac83 --- /dev/null +++ b/cpp/benchmarks/merge/merge_lists.cpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include + +#include + +void nvbench_merge_list(nvbench::state& state) +{ + rmm::cuda_stream_view stream; + + auto const input1 = create_lists_data(state); + auto const sorted_input1 = + cudf::detail::sort(*input1, {}, {}, stream, rmm::mr::get_current_device_resource()); + + auto const input2 = create_lists_data(state); + auto const sorted_input2 = + cudf::detail::sort(*input2, {}, {}, stream, rmm::mr::get_current_device_resource()); + + stream.synchronize(); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + + cudf::detail::merge({*sorted_input1, *sorted_input2}, + {0}, + {cudf::order::ASCENDING}, + {}, + stream_view, + rmm::mr::get_current_device_resource()); + }); +} + +NVBENCH_BENCH(nvbench_merge_list) + .set_name("merge_lists") + .add_int64_power_of_two_axis("size_bytes", {10, 18, 24, 28}) + .add_int64_axis("depth", {1, 4}) + .add_float64_axis("null_frequency", {0, 0.2}); diff --git a/cpp/benchmarks/merge/merge_structs.cpp b/cpp/benchmarks/merge/merge_structs.cpp new file mode 100644 index 00000000000..9c56b44b623 --- /dev/null +++ b/cpp/benchmarks/merge/merge_structs.cpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include + +void nvbench_merge_struct(nvbench::state& state) +{ + rmm::cuda_stream_view stream; + + auto const input1 = create_structs_data(state); + auto const sorted_input1 = + cudf::detail::sort(*input1, {}, {}, stream, rmm::mr::get_current_device_resource()); + + auto const input2 = create_structs_data(state); + auto const sorted_input2 = + cudf::detail::sort(*input2, {}, {}, stream, rmm::mr::get_current_device_resource()); + + stream.synchronize(); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + + cudf::detail::merge({*sorted_input1, *sorted_input2}, + {0}, + {cudf::order::ASCENDING}, + {}, + stream_view, + rmm::mr::get_current_device_resource()); + }); +} + +NVBENCH_BENCH(nvbench_merge_struct) + .set_name("merge_struct") + .add_int64_power_of_two_axis("NumRows", {10, 18, 26}) + .add_int64_axis("Depth", {0, 1, 8}) + .add_int64_axis("Nulls", {0, 1}); diff --git a/cpp/benchmarks/sort/rank_lists.cpp b/cpp/benchmarks/sort/rank_lists.cpp index 49dc409ebfc..c23f3c891f0 100644 --- a/cpp/benchmarks/sort/rank_lists.cpp +++ b/cpp/benchmarks/sort/rank_lists.cpp @@ -14,9 +14,10 @@ * limitations under the License. */ -#include "nested_types_common.hpp" #include "rank_types_common.hpp" +#include + #include #include diff --git a/cpp/benchmarks/sort/rank_structs.cpp b/cpp/benchmarks/sort/rank_structs.cpp index 85427e2128f..271b883e62a 100644 --- a/cpp/benchmarks/sort/rank_structs.cpp +++ b/cpp/benchmarks/sort/rank_structs.cpp @@ -14,8 +14,8 @@ * limitations under the License. 
*/ -#include "nested_types_common.hpp" #include "rank_types_common.hpp" +#include #include diff --git a/cpp/benchmarks/sort/sort_lists.cpp b/cpp/benchmarks/sort/sort_lists.cpp index 4b04323a99f..2052de3688c 100644 --- a/cpp/benchmarks/sort/sort_lists.cpp +++ b/cpp/benchmarks/sort/sort_lists.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "nested_types_common.hpp" +#include #include diff --git a/cpp/benchmarks/sort/sort_structs.cpp b/cpp/benchmarks/sort/sort_structs.cpp index 1d54fa42f6f..3a3d1080ba0 100644 --- a/cpp/benchmarks/sort/sort_structs.cpp +++ b/cpp/benchmarks/sort/sort_structs.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "nested_types_common.hpp" +#include #include diff --git a/cpp/include/cudf/detail/merge.cuh b/cpp/include/cudf/detail/merge.cuh deleted file mode 100644 index e8e9b080a92..00000000000 --- a/cpp/include/cudf/detail/merge.cuh +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -#include -#include -#include - -namespace cudf { -namespace detail { -/** - * @brief Source table identifier to copy data from. - */ -enum class side : bool { LEFT, RIGHT }; - -/** - * @brief Tagged index type: `thrust::get<0>` indicates left/right side, - * `thrust::get<1>` indicates the row index - */ -using index_type = thrust::pair; - -/** - * @brief Vector of `index_type` values. - */ -using index_vector = rmm::device_uvector; - -/** - * @brief tagged_element_relational_comparator uses element_relational_comparator to provide - * "tagged-index" comparison logic. - * - * Special treatment is necessary in several thrust algorithms (e.g., merge()) where - * the index affinity to the side is not guaranteed; i.e., the algorithms rely on - * binary functors (predicates) where the operands may transparently switch sides. - * - * For example, - * thrust::merge(left_container, - * right_container, - * predicate(lhs, rhs){...}); - * can create 4 different use-cases, inside predicate(...): - * - * 1. lhs refers to the left container; rhs to the right container; - * 2. vice-versa; - * 3. both lhs and rhs actually refer to the left container; - * 4. both lhs and rhs actually refer to the right container; - * - * Because of that, one cannot rely on the predicate having *fixed* references to the containers. - * Each invocation may land in a different situation (among the 4 above) than any other invocation. 
- * Also, one cannot just manipulate lhs, rhs (indices) alone; because, if predicate always applies - * one index to one container and the other index to the other container, - * switching the indices alone won't suffice in the cases (3) or (4), - * where the also the containers must be changed (to just one instead of two) - * independently of indices; - * - * As a result, a special comparison logic is necessary whereby the index is "tagged" with side - * information and consequently comparator functors (predicates) must operate on these tagged - * indices rather than on raw indices. - */ -template -struct tagged_element_relational_comparator { - __host__ __device__ tagged_element_relational_comparator(column_device_view lhs, - column_device_view rhs, - null_order null_precedence) - : lhs{lhs}, rhs{rhs}, null_precedence{null_precedence} - { - } - - [[nodiscard]] __device__ weak_ordering compare(index_type lhs_tagged_index, - index_type rhs_tagged_index) const noexcept - { - auto const [l_side, l_indx] = lhs_tagged_index; - auto const [r_side, r_indx] = rhs_tagged_index; - - column_device_view const* ptr_left_dview{l_side == side::LEFT ? &lhs : &rhs}; - column_device_view const* ptr_right_dview{r_side == side::LEFT ? &lhs : &rhs}; - - auto erl_comparator = element_relational_comparator( - nullate::DYNAMIC{has_nulls}, *ptr_left_dview, *ptr_right_dview, null_precedence); - - return cudf::type_dispatcher(lhs.type(), erl_comparator, l_indx, r_indx); - } - - private: - column_device_view lhs; - column_device_view rhs; - null_order null_precedence; -}; - -/** - * @brief The equivalent of `row_lexicographic_comparator` for tagged indices. - */ -template -struct row_lexicographic_tagged_comparator { - row_lexicographic_tagged_comparator(table_device_view lhs, - table_device_view rhs, - order const* column_order = nullptr, - null_order const* null_precedence = nullptr) - : _lhs{lhs}, _rhs{rhs}, _column_order{column_order}, _null_precedence{null_precedence} - { - // Add check for types to be the same. - CUDF_EXPECTS(_lhs.num_columns() == _rhs.num_columns(), "Mismatched number of columns."); - } - - __device__ bool operator()(index_type lhs_tagged_index, - index_type rhs_tagged_index) const noexcept - { - for (size_type i = 0; i < _lhs.num_columns(); ++i) { - bool ascending = (_column_order == nullptr) or (_column_order[i] == order::ASCENDING); - - null_order null_precedence = - _null_precedence == nullptr ? null_order::BEFORE : _null_precedence[i]; - - auto comparator = tagged_element_relational_comparator{ - _lhs.column(i), _rhs.column(i), null_precedence}; - - weak_ordering state = comparator.compare(lhs_tagged_index, rhs_tagged_index); - - if (state == weak_ordering::EQUIVALENT) { continue; } - - return state == (ascending ? 
weak_ordering::LESS : weak_ordering::GREATER); - } - return false; - } - - private: - table_device_view _lhs; - table_device_view _rhs; - null_order const* _null_precedence{}; - order const* _column_order{}; -}; - -/** - * @copydoc std::unique_ptr merge( - * std::vector const& tables_to_merge, - * std::vector const& key_cols, - * std::vector const& column_order, - * std::vector const& null_precedence, - * rmm::mr::device_memory_resource* mr) - * - * @param stream CUDA stream used for device memory operations and kernel launches - */ -std::unique_ptr merge(std::vector const& tables_to_merge, - std::vector const& key_cols, - std::vector const& column_order, - std::vector const& null_precedence, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - -} // namespace detail -} // namespace cudf diff --git a/cpp/include/cudf/detail/merge.hpp b/cpp/include/cudf/detail/merge.hpp new file mode 100644 index 00000000000..2167a484214 --- /dev/null +++ b/cpp/include/cudf/detail/merge.hpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace cudf { +namespace detail { + +/** + * @brief Source table identifier to copy data from. + */ +enum class side : bool { LEFT, RIGHT }; + +/** + * @brief Tagged index type: `thrust::get<0>` indicates left/right side, + * `thrust::get<1>` indicates the row index + */ +using index_type = thrust::pair; + +/** + * @brief Vector of `index_type` values. + */ +using index_vector = rmm::device_uvector; + +/** + * @copydoc std::unique_ptr merge( + * std::vector const& tables_to_merge, + * std::vector const& key_cols, + * std::vector const& column_order, + * std::vector const& null_precedence, + * rmm::mr::device_memory_resource* mr) + * + * @param stream CUDA stream used for device memory operations and kernel launches + */ +std::unique_ptr merge(std::vector const& tables_to_merge, + std::vector const& key_cols, + std::vector const& column_order, + std::vector const& null_precedence, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +} // namespace detail +} // namespace cudf diff --git a/cpp/include/cudf/dictionary/detail/merge.hpp b/cpp/include/cudf/dictionary/detail/merge.hpp index e7ea53c740a..cad495d0097 100644 --- a/cpp/include/cudf/dictionary/detail/merge.hpp +++ b/cpp/include/cudf/dictionary/detail/merge.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ #pragma once #include -#include +#include #include #include diff --git a/cpp/include/cudf/merge.hpp b/cpp/include/cudf/merge.hpp index 3d09550209d..8886ec24bfe 100644 --- a/cpp/include/cudf/merge.hpp +++ b/cpp/include/cudf/merge.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. 
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,7 +34,10 @@ namespace cudf { * @brief Merge a set of sorted tables. * * Merges sorted tables into one sorted table - * containing data from all tables. + * containing data from all tables. The key columns + * of each table must be sorted according to the + * parameters (cudf::column_order and cudf::null_order) + * specified for that column. * * ``` * Example 1: diff --git a/cpp/include/cudf/strings/detail/merge.cuh b/cpp/include/cudf/strings/detail/merge.cuh index 965e89cc862..5f50faa158e 100644 --- a/cpp/include/cudf/strings/detail/merge.cuh +++ b/cpp/include/cudf/strings/detail/merge.cuh @@ -18,8 +18,8 @@ #include #include #include +#include #include -#include #include #include #include diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index 6b024d902a9..6946ccdb213 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -52,6 +52,7 @@ #include #include #include +#include #include namespace cudf { @@ -264,6 +265,7 @@ template class device_row_comparator { + public: friend class self_comparator; ///< Allow self_comparator to access private members friend class two_table_comparator; ///< Allow two_table_comparator to access private members @@ -274,6 +276,8 @@ class device_row_comparator { * @param check_nulls Indicates if any input column contains nulls. * @param lhs The first table * @param rhs The second table (may be the same table as `lhs`) + * @param l_dremel_device_views lhs table dremel device view for list type + * @param r_dremel_device_views rhs table dremel device view for list type * @param depth Optional, device array the same length as a row that contains starting depths of * columns if they're nested, and 0 otherwise. * @param column_order Optional, device array the same length as a row that indicates the desired @@ -305,6 +309,44 @@ class device_row_comparator { { } + /** + * @brief Construct a function object for performing a lexicographic + * comparison between the rows of two tables. + * This is a special overload to allow device-side construction of the + * comparator for cases where no preprocessing is needed, i.e. tables with + * non-nested type columns. + * + * @param check_nulls Indicates if any input column contains nulls. + * @param lhs The first table + * @param rhs The second table (may be the same table as `lhs`) + * @param column_order Optional, device array the same length as a row that indicates the desired + * ascending/descending order of each column in a row. If `nullopt`, it is assumed all columns are + * sorted in ascending order. + * @param null_precedence Optional, device array the same length as a row and indicates how null + * values compare to all other for every column. If `nullopt`, then null precedence would be + * `null_order::BEFORE` for all columns. + * @param comparator Physical element relational comparison functor. 
+ */ + template + __device__ device_row_comparator( + Nullate check_nulls, + table_device_view lhs, + table_device_view rhs, + std::optional> column_order = std::nullopt, + std::optional> null_precedence = std::nullopt, + PhysicalElementComparator comparator = {}) noexcept + : _lhs{lhs}, + _rhs{rhs}, + _l_dremel{}, + _r_dremel{}, + _check_nulls{check_nulls}, + _depth{}, + _column_order{column_order}, + _null_precedence{null_precedence}, + _comparator{comparator} + { + } + /** * @brief Performs a relational comparison between two elements in two columns. */ @@ -323,6 +365,8 @@ class device_row_comparator { * @param depth The depth of the column if part of a nested column @see * preprocessed_table::depths * @param comparator Physical element relational comparison functor. + * @param l_dremel_device_view <> + * @param r_dremel_device_view <> */ __device__ element_comparator(Nullate check_nulls, column_device_view lhs, @@ -370,6 +414,13 @@ class device_row_comparator { std::numeric_limits::max()); } + /** + * @brief Throws run-time error when columns types cannot be compared + * or if this class is instantiated with `has_nested_columns = false` but + * passed tables with nested columns + * + * @return Ordering + */ template () and (not has_nested_columns or not cudf::is_nested()))> @@ -379,6 +430,14 @@ class device_row_comparator { CUDF_UNREACHABLE("Attempted to compare elements of uncomparable types."); } + /** + * @brief Compares two struct-type columns + * + * @param lhs_element_index The index of the first element + * @param rhs_element_index The index of the second element + * @return Indicates the relationship between the elements in the `lhs` and `rhs` columns, along + * with the depth at which a null value was encountered. + */ template )> __device__ cuda::std::pair operator()( @@ -413,6 +472,14 @@ class device_row_comparator { rhs_element_index); } + /** + * @brief Compares two list-type columns + * + * @param lhs_element_index The index of the first element + * @param rhs_element_index The index of the second element + * @return Indicates the relationship between the elements in the `lhs` and `rhs` columns, along + * with the depth at which a null value was encountered. + */ template )> __device__ cuda::std::pair operator()(size_type lhs_element_index, diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu index 00a2f0bee8f..e47abd6ede4 100644 --- a/cpp/src/merge/merge.cu +++ b/cpp/src/merge/merge.cu @@ -13,30 +13,40 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + #include #include +#include #include -#include +#include #include +#include #include #include #include #include +#include +#include #include #include +#include #include #include #include #include +#include +#include +#include #include -#include #include +#include #include #include #include #include +#include #include #include @@ -45,8 +55,47 @@ namespace cudf { namespace detail { + namespace { +template +struct row_lexicographic_tagged_comparator { + row_lexicographic_tagged_comparator(table_device_view const lhs, + table_device_view const rhs, + device_span const column_order, + device_span const null_precedence) + : _lhs{lhs}, _rhs{rhs}, _column_order{column_order}, _null_precedence{null_precedence} + { + } + + __device__ bool operator()(index_type lhs_tagged_index, + index_type rhs_tagged_index) const noexcept + { + auto const [l_side, l_indx] = lhs_tagged_index; + auto const [r_side, r_indx] = rhs_tagged_index; + + table_device_view const* ptr_left_dview{l_side == side::LEFT ? &_lhs : &_rhs}; + table_device_view const* ptr_right_dview{r_side == side::LEFT ? &_lhs : &_rhs}; + auto const comparator = [&]() { + if constexpr (has_nulls) { + return cudf::experimental::row::lexicographic::device_row_comparator{ + has_nulls, *ptr_left_dview, *ptr_right_dview, _column_order, _null_precedence}; + } else { + return cudf::experimental::row::lexicographic::device_row_comparator{ + has_nulls, *ptr_left_dview, *ptr_right_dview, _column_order}; + } + }(); + + return comparator(l_indx, r_indx) == weak_ordering::LESS; + } + + private: + table_device_view const _lhs; + table_device_view const _rhs; + device_span const _null_precedence; + device_span const _column_order; +}; + using detail::side; using index_type = detail::index_type; @@ -187,18 +236,31 @@ index_vector generate_merged_indices(table_view const& left_table, index_vector merged_indices(total_size, stream); + auto const has_nulls = + nullate::DYNAMIC{cudf::has_nulls(left_table) or cudf::has_nulls(right_table)}; + auto lhs_device_view = table_device_view::create(left_table, stream); auto rhs_device_view = table_device_view::create(right_table, stream); auto d_column_order = cudf::detail::make_device_uvector_async( column_order, stream, rmm::mr::get_current_device_resource()); - if (nullable) { + if (has_nulls) { + auto const new_null_precedence = [&]() { + if (null_precedence.size() > 0) { + CUDF_EXPECTS(static_cast(null_precedence.size()) == left_table.num_columns(), + "Null precedence vector size mismatched"); + return null_precedence; + } else { + return std::vector(left_table.num_columns(), null_order::BEFORE); + } + }(); + auto d_null_precedence = cudf::detail::make_device_uvector_async( - null_precedence, stream, rmm::mr::get_current_device_resource()); + new_null_precedence, stream, rmm::mr::get_current_device_resource()); auto ineq_op = detail::row_lexicographic_tagged_comparator( - *lhs_device_view, *rhs_device_view, d_column_order.data(), d_null_precedence.data()); + *lhs_device_view, *rhs_device_view, d_column_order, d_null_precedence); thrust::merge(rmm::exec_policy(stream), left_begin, left_begin + left_size, @@ -208,7 +270,7 @@ index_vector generate_merged_indices(table_view const& left_table, ineq_op); } else { auto ineq_op = detail::row_lexicographic_tagged_comparator( - *lhs_device_view, *rhs_device_view, d_column_order.data()); + *lhs_device_view, *rhs_device_view, d_column_order, {}); thrust::merge(rmm::exec_policy(stream), left_begin, left_begin + left_size, @@ -223,6 +285,56 @@ index_vector 
generate_merged_indices(table_view const& left_table, return merged_indices; } +index_vector generate_merged_indices_nested(table_view const& left_table, + table_view const& right_table, + std::vector const& column_order, + std::vector const& null_precedence, + bool nullable, + rmm::cuda_stream_view stream) +{ + size_type const left_size = left_table.num_rows(); + size_type const right_size = right_table.num_rows(); + size_type const total_size = left_size + right_size; + + index_vector merged_indices(total_size, stream); + + auto const left_indices_col = cudf::detail::lower_bound(right_table, + left_table, + column_order, + null_precedence, + stream, + rmm::mr::get_current_device_resource()); + auto const left_indices = left_indices_col->view(); + auto left_indices_mutable = left_indices_col->mutable_view(); + auto const left_indices_begin = left_indices.begin(); + auto const left_indices_end = left_indices.end(); + auto left_indices_mutable_begin = left_indices_mutable.begin(); + + auto const total_counter = thrust::make_counting_iterator(0); + thrust::for_each( + rmm::exec_policy_nosync(stream), + total_counter, + total_counter + total_size, + [merged = merged_indices.data(), left = left_indices_begin, left_size, right_size] __device__( + auto const idx) { + // We split threads into two groups, so only one kernel is needed. + // Threads in [0, right_size) will insert right indices in sorted order. + // Threads in [right_size, total_size) will insert left indices in sorted order. + if (idx < right_size) { + // this tells us between which segments of left elements a right element + // would fall + auto const r_bound = thrust::upper_bound(thrust::seq, left, left + left_size, idx); + auto const r_segment = thrust::distance(left, r_bound); + merged[r_segment + idx] = thrust::make_pair(side::RIGHT, idx); + } else { + auto const left_idx = idx - right_size; + merged[left[left_idx] + left_idx] = thrust::make_pair(side::LEFT, left_idx); + } + }); + + return merged_indices; +} + /** * @brief Generate merged column given row-order of merged tables * (ordered according to indices of key_cols) and the 2 columns to merge. @@ -353,6 +465,32 @@ std::unique_ptr column_merger::operator()( return result; } +// specialization for lists +template <> +std::unique_ptr column_merger::operator()( + column_view const& lcol, + column_view const& rcol, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const +{ + std::vector columns{lcol, rcol}; + auto concatenated_list = cudf::lists::detail::concatenate(columns, stream, mr); + + auto const iter_gather = cudf::detail::make_counting_transform_iterator( + 0, [row_order = row_order_.data(), lsize = lcol.size()] __device__(auto const idx) { + auto const [side, index] = row_order[idx]; + return side == side::LEFT ? 
index : lsize + index; + }); + + auto result = cudf::detail::gather(table_view{{concatenated_list->view()}}, + iter_gather, + iter_gather + concatenated_list->size(), + out_of_bounds_policy::DONT_CHECK, + stream, + mr); + return std::move(result->release()[0]); +} + // specialization for structs template <> std::unique_ptr column_merger::operator()( @@ -418,9 +556,16 @@ table_ptr_type merge(cudf::table_view const& left_table, // extract merged row order according to indices: // - auto const merged_indices = generate_merged_indices( - index_left_view, index_right_view, column_order, null_precedence, nullable, stream); - + auto const merged_indices = [&]() { + if (cudf::detail::has_nested_columns(left_table) or + cudf::detail::has_nested_columns(right_table)) { + return generate_merged_indices_nested( + index_left_view, index_right_view, column_order, null_precedence, nullable, stream); + } else { + return generate_merged_indices( + index_left_view, index_right_view, column_order, null_precedence, nullable, stream); + } + }(); // create merged table: // auto const n_cols = left_table.num_columns(); @@ -493,6 +638,13 @@ table_ptr_type merge(std::vector const& tables_to_merge, CUDF_EXPECTS(key_cols.size() == column_order.size(), "Mismatched size between key_cols and column_order"); + CUDF_EXPECTS(std::accumulate(tables_to_merge.cbegin(), + tables_to_merge.cend(), + cudf::size_type{0}, + [](auto const& running_sum, auto const& tbl) { + return running_sum + tbl.num_rows(); + }) <= std::numeric_limits::max(), + "Total number of merged rows exceeds row limit"); // This utility will ensure all corresponding dictionary columns have matching keys. // It will return any new dictionary columns created as well as updated table_views. diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index 9e8b75ae3b6..44a13c450ab 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/cpp/tests/merge/merge_test.cpp b/cpp/tests/merge/merge_test.cpp index 3a61c0768a6..3558e5676dd 100644 --- a/cpp/tests/merge/merge_test.cpp +++ b/cpp/tests/merge/merge_test.cpp @@ -27,7 +27,9 @@ #include #include #include +#include #include +#include #include #include @@ -874,6 +876,117 @@ TEST_F(MergeTest, StructsNestedWithNulls) // clang-format on } +using lcw = cudf::test::lists_column_wrapper; +using cudf::test::iterators::null_at; +using cudf::test::iterators::nulls_at; + +TEST_F(MergeTest, Lists) +{ + auto col1 = lcw{lcw{1}, lcw{3}, lcw{5}, lcw{7}}; + auto col2 = lcw{lcw{2}, lcw{4}, lcw{6}, lcw{8}}; + + auto tbl1 = cudf::table_view{{col1}}; + auto tbl2 = cudf::table_view{{col2}}; + + auto result = cudf::merge({tbl1, tbl2}, {0}, {cudf::order::ASCENDING}); + + auto expected_col = lcw{lcw{1}, lcw{2}, lcw{3}, lcw{4}, lcw{5}, lcw{6}, lcw{7}, lcw{8}}; + auto expected_tbl = cudf::table_view{{expected_col}}; + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_tbl, *result); +} + +TEST_F(MergeTest, NestedListsWithNulls) +{ + auto col1 = lcw{{lcw{lcw{1}}, lcw{lcw{3}}, lcw{lcw{5}}, lcw{lcw{7}}}, null_at(3)}; + auto col2 = lcw{{lcw{lcw{2}}, lcw{lcw{4}}, lcw{lcw{6}}, lcw{lcw{8}}}, null_at(3)}; + + auto tbl1 = cudf::table_view{{col1}}; + auto tbl2 = cudf::table_view{{col2}}; + + auto result = cudf::merge({tbl1, tbl2}, {0}, {cudf::order::ASCENDING}, {cudf::null_order::AFTER}); + + auto expected_col = lcw{{lcw{lcw{1}}, + 
lcw{lcw{2}}, + lcw{lcw{3}}, + lcw{lcw{4}}, + lcw{lcw{5}}, + lcw{lcw{6}}, + lcw{lcw{7}}, + lcw{lcw{8}}}, + nulls_at({6, 7})}; + auto expected_tbl = cudf::table_view{{expected_col}}; + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_tbl, *result); +} + +TEST_F(MergeTest, NestedListsofStructs) +{ + // [ {1}, {2}, {3} ] + // [ {5} ] + // [ {7}, {8} ] + // [ {10} ] + auto const col1 = [] { + auto const get_structs = [] { + auto child0 = cudf::test::fixed_width_column_wrapper{1, 2, 3, 5, 7, 8, 10}; + return cudf::test::structs_column_wrapper{{child0}}; + }; + return cudf::make_lists_column( + 4, + cudf::test::fixed_width_column_wrapper{0, 3, 4, 6, 7}.release(), + get_structs().release(), + 0, + {}); + }(); + + // [ {4} ] + // [ {6} ] + // [ {9} ] + // [ {11} ] + auto const col2 = [] { + auto const get_structs = [] { + auto child0 = cudf::test::fixed_width_column_wrapper{4, 6, 9, 11}; + return cudf::test::structs_column_wrapper{{child0}}; + }; + return cudf::make_lists_column( + 4, + cudf::test::fixed_width_column_wrapper{0, 1, 2, 3, 4}.release(), + get_structs().release(), + 0, + {}); + }(); + + auto tbl1 = cudf::table_view{{*col1}}; + auto tbl2 = cudf::table_view{{*col2}}; + + auto result = cudf::merge({tbl1, tbl2}, {0}, {cudf::order::ASCENDING}, {cudf::null_order::AFTER}); + + // [ {1}, {2}, {3} ] + // [ {4} ] + // [ {5} ] + // [ {6} ] + // [ {7}, {8} ] + // [ {9} ] + // [ {10} ] + // [ {11} ] + auto const expected_col = [] { + auto const get_structs = [] { + auto child0 = + cudf::test::fixed_width_column_wrapper{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + return cudf::test::structs_column_wrapper{{child0}}; + }; + return cudf::make_lists_column( + 8, + cudf::test::fixed_width_column_wrapper{0, 3, 4, 5, 6, 8, 9, 10, 11}.release(), + get_structs().release(), + 0, + {}); + }(); + auto expected_tbl = cudf::table_view{{*expected_col}}; + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_tbl, *result); +} + template struct FixedPointTestAllReps : public cudf::test::BaseFixture {}; From 2a923dfff84a8ce3422a126e9d0035282a0bea2f Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Fri, 27 Oct 2023 19:11:01 -0700 Subject: [PATCH 061/118] Fix the precision when converting a decimal128 column to an arrow array (#14230) This PR fixes https://github.com/rapidsai/cudf/issues/13749. As discussed in the linked issue, the precision is unnecessarily being limited to 18 when converting decimal128 to arrow. Implementation-wise, I wasn't sure where the best place is to define the max precision for decimal types. Given that the decimal types don't store the precision in libcudf, I thought it would be better to not expose the max precision outside of the to-arrow conversion. However, this led to replicating the definition of the max precision across multiple places. I'd appreciate any suggestions. Finally, it was suggested in https://github.com/rapidsai/cudf/issues/13749#issuecomment-1686406749 to add some tests for round-tripping. The existing tests look sufficient to me as round-trip tests, so I just modified them instead of adding new ones; please let me know if additional tests are needed. I'm also not sure whether any documentation should be updated.
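To illustrate the user-visible effect, a hedged sketch (`dec128_col` is a hypothetical decimal128 column, and the resulting Arrow type is my reading of this change, not verbatim library output):

```cpp
#include <cudf/interop.hpp>

// Previously a decimal128 column mapped to arrow::decimal(18, scale), silently
// narrowing the precision; with this change it maps to arrow::decimal(38, scale),
// the widest precision the 128-bit type supports.
auto arrow_tbl = cudf::to_arrow(cudf::table_view{{dec128_col}},
                                {cudf::column_metadata{"dec"}});
```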
Authors: - Jihoon Son (https://github.com/jihoonson) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14230 --- cpp/include/cudf/detail/interop.hpp | 13 +++++++++++++ cpp/include/cudf/interop.hpp | 12 ++++++++++++ cpp/src/interop/to_arrow.cu | 13 +++++++++---- cpp/tests/interop/arrow_utils.hpp | 2 +- cpp/tests/interop/to_arrow_test.cpp | 4 +++- 5 files changed, 38 insertions(+), 6 deletions(-) diff --git a/cpp/include/cudf/detail/interop.hpp b/cpp/include/cudf/detail/interop.hpp index 44024333239..8124471982d 100644 --- a/cpp/include/cudf/detail/interop.hpp +++ b/cpp/include/cudf/detail/interop.hpp @@ -194,5 +194,18 @@ std::unique_ptr
from_arrow(arrow::Table const& input_table, std::unique_ptr from_arrow(arrow::Scalar const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); + +/** + * @brief Return a maximum precision for a given type. + * + * @tparam T the type to get the maximum precision for + */ +template +constexpr std::size_t max_precision() +{ + auto constexpr num_bits = sizeof(T) * 8; + return std::floor(num_bits * std::log(2) / std::log(10)); +} + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index 865cc004107..2ee6f19614d 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -129,6 +129,12 @@ struct column_metadata { * @param stream CUDA stream used for device memory operations and kernel launches * @param ar_mr arrow memory pool to allocate memory for arrow Table * @return arrow Table generated from `input` + * + * @note For decimals, since the precision is not stored for them in libcudf, + * it will be converted to an Arrow decimal128 that has the widest-precision the cudf decimal type + * supports. For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision + * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be + * converted to Arrow decimal128 of the precision 38. */ std::shared_ptr to_arrow(table_view input, std::vector const& metadata = {}, @@ -145,6 +151,12 @@ std::shared_ptr to_arrow(table_view input, * @param stream CUDA stream used for device memory operations and kernel launches * @param ar_mr arrow memory pool to allocate memory for arrow Scalar * @return arrow Scalar generated from `input` + * + * @note For decimals, since the precision is not stored for them in libcudf, + * it will be converted to an Arrow decimal128 that has the widest-precision the cudf decimal type + * supports. For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision + * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be + * converted to Arrow decimal128 of the precision 38. 
*/ std::shared_ptr to_arrow(cudf::scalar const& input, column_metadata const& metadata = {}, diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 0cd750bc947..28230cf8e74 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -197,7 +197,9 @@ std::shared_ptr dispatch_to_arrow::operator()( arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { - return unsupported_decimals_to_arrow(input, 9, ar_mr, stream); + using DeviceType = int32_t; + return unsupported_decimals_to_arrow( + input, cudf::detail::max_precision(), ar_mr, stream); } template <> @@ -208,7 +210,9 @@ std::shared_ptr dispatch_to_arrow::operator()( arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { - return unsupported_decimals_to_arrow(input, 18, ar_mr, stream); + using DeviceType = int64_t; + return unsupported_decimals_to_arrow( + input, cudf::detail::max_precision(), ar_mr, stream); } template <> @@ -219,7 +223,8 @@ std::shared_ptr dispatch_to_arrow::operator() arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { - using DeviceType = __int128_t; + using DeviceType = __int128_t; + auto const max_precision = cudf::detail::max_precision(); rmm::device_uvector buf(input.size(), stream); @@ -234,7 +239,7 @@ std::shared_ptr dispatch_to_arrow::operator() CUDF_CUDA_TRY(cudaMemcpyAsync( data_buffer->mutable_data(), buf.data(), buf_size_in_bytes, cudaMemcpyDefault, stream.value())); - auto type = arrow::decimal(18, -input.type().scale()); + auto type = arrow::decimal(max_precision, -input.type().scale()); auto mask = fetch_mask_buffer(input, ar_mr, stream); auto buffers = std::vector>{mask, std::move(data_buffer)}; auto data = std::make_shared(type, input.size(), buffers); diff --git a/cpp/tests/interop/arrow_utils.hpp b/cpp/tests/interop/arrow_utils.hpp index fc8f5b37f7e..2c5f7458ce5 100644 --- a/cpp/tests/interop/arrow_utils.hpp +++ b/cpp/tests/interop/arrow_utils.hpp @@ -186,7 +186,7 @@ template auto constexpr BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(T); std::shared_ptr arr; - arrow::Decimal128Builder decimal_builder(arrow::decimal(18, -scale), + arrow::Decimal128Builder decimal_builder(arrow::decimal(cudf::detail::max_precision(), -scale), arrow::default_memory_pool()); for (T i = 0; i < static_cast(data.size() / BIT_WIDTH_RATIO); ++i) { diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index 6bb4cdfd747..d6762e70d80 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -604,7 +604,9 @@ struct ToArrowDecimalScalarTest : public cudf::test::BaseFixture {}; TEST_F(ToArrowDecimalScalarTest, Basic) { auto const value{42}; - auto const precision{18}; // cudf will convert to the widest-precision Arrow scalar of the type + auto const precision = + cudf::detail::max_precision<__int128_t>(); // cudf will convert to the widest-precision Arrow + // scalar of the type int32_t const scale{4}; auto const cudf_scalar = From 2548509a56fad536c8ec80332dd7bb06b59a8cb8 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 30 Oct 2023 09:17:01 -0700 Subject: [PATCH 062/118] Add cramjam (#14344) As of fastavro 1.9.0 (released on October 27) [the recommended method for snappy decompression is via the cramjam package](https://github.com/fastavro/fastavro/blob/62ea68fe22e11ff5ff5dc86dfe942c7d51d7c157/ChangeLog#L3). We need to install it [to avoid getting deprecation warnings involving python-snappy usage](https://github.com/fastavro/fastavro/issues/722).
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Ray Douglass (https://github.com/raydouglass) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/14344 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 1 + conda/environments/all_cuda-120_arch-x86_64.yaml | 1 + dependencies.yaml | 1 + python/cudf/pyproject.toml | 1 + 4 files changed, 4 insertions(+) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index cfcbde71b01..3d3d4f15d05 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -15,6 +15,7 @@ dependencies: - c-compiler - cachetools - cmake>=3.26.4 +- cramjam - cubinlinker - cuda-nvtx=11.8 - cuda-python>=11.7.1,<12.0a0 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index db19d658b0d..4f39424bbc6 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -15,6 +15,7 @@ dependencies: - c-compiler - cachetools - cmake>=3.26.4 +- cramjam - cuda-cudart-dev - cuda-gdb - cuda-nvcc diff --git a/dependencies.yaml b/dependencies.yaml index 1f2b42c49c4..cb9ef7a468c 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -557,6 +557,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: + - cramjam - fastavro>=0.22.9 - hypothesis - mimesis>=4.1.0 diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 90759074750..cc8a67c389f 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -54,6 +54,7 @@ classifiers = [ [project.optional-dependencies] test = [ + "cramjam", "fastavro>=0.22.9", "hypothesis", "mimesis>=4.1.0", From abc0d41d1d9033d581948ae19384e0aa0f33da77 Mon Sep 17 00:00:00 2001 From: shrshi Date: Mon, 30 Oct 2023 10:32:47 -0700 Subject: [PATCH 063/118] Added streams to JSON reader and writer api (#14313) This PR contributes to [#13744](https://github.com/rapidsai/cudf/issues/13744). 
- Added stream parameters to public APIs `cudf::io::read_json` `cudf::io::write_json` - Added stream gtests - Added copy constructor to internal JSON struct that was breaking for non-default streams Authors: - https://github.com/shrshi Approvers: - Bradley Dice (https://github.com/bdice) - Vukasin Milovanovic (https://github.com/vuule) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/14313 --- cpp/include/cudf/io/detail/json.hpp | 1 - cpp/include/cudf/io/json.hpp | 4 ++ cpp/src/io/functions.cpp | 12 ++++-- cpp/src/io/json/nested_json.hpp | 1 - cpp/src/io/json/write_json.cu | 65 ++++++++++++++++------------ cpp/tests/CMakeLists.txt | 1 + cpp/tests/io/json_test.cpp | 5 ++- cpp/tests/io/json_writer.cpp | 65 +++++++++++++++++++--------- cpp/tests/streams/io/json_test.cpp | 66 +++++++++++++++++++++++++++++ 9 files changed, 166 insertions(+), 54 deletions(-) create mode 100644 cpp/tests/streams/io/json_test.cpp diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 6930a4fdb25..d0a9543397d 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -17,7 +17,6 @@ #pragma once #include -#include #include diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 55aa534ac6c..472d42b1db5 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -512,6 +512,7 @@ class json_reader_options_builder { * @endcode * * @param options Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the table in the returned * table_with_metadata. 
* @@ -519,6 +520,7 @@ class json_reader_options_builder { */ table_with_metadata read_json( json_reader_options options, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group @@ -861,9 +863,11 @@ class json_writer_options_builder { * @endcode * * @param options Settings for controlling writing behavior + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ void write_json(json_writer_options const& options, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 29ebb1ddbde..00d56008611 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -200,7 +200,9 @@ compression_type infer_compression_type(compression_type compression, source_inf return compression_type::NONE; } -table_with_metadata read_json(json_reader_options options, rmm::mr::device_memory_resource* mr) +table_with_metadata read_json(json_reader_options options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); @@ -210,10 +212,12 @@ table_with_metadata read_json(json_reader_options options, rmm::mr::device_memor options.get_byte_range_offset(), options.get_byte_range_size_with_padding()); - return json::detail::read_json(datasources, options, cudf::get_default_stream(), mr); + return json::detail::read_json(datasources, options, stream, mr); } -void write_json(json_writer_options const& options, rmm::mr::device_memory_resource* mr) +void write_json(json_writer_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto sinks = make_datasinks(options.get_sink()); CUDF_EXPECTS(sinks.size() == 1, "Multiple sinks not supported for JSON writing"); @@ -222,7 +226,7 @@ void write_json(json_writer_options const& options, rmm::mr::device_memory_resou sinks[0].get(), options.get_table(), options, - cudf::get_default_stream(), + stream, mr); } diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 3bbfc4b5f83..8d89f4ff927 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -20,7 +20,6 @@ #include #include #include -#include #include #include diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index 2d363c51fce..c211d17f13a 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -504,6 +504,12 @@ struct column_to_strings_fn { { } + ~column_to_strings_fn() = default; + column_to_strings_fn(column_to_strings_fn const&) = delete; + column_to_strings_fn& operator=(column_to_strings_fn const&) = delete; + column_to_strings_fn(column_to_strings_fn&&) = delete; + column_to_strings_fn& operator=(column_to_strings_fn&&) = delete; + // unsupported type of column: template std::enable_if_t(), std::unique_ptr> operator()( @@ -614,17 +620,18 @@ struct column_to_strings_fn { auto child_string_with_null = [&]() { if (child_view.type().id() == type_id::STRUCT) { - return (*this).template operator()( - child_view, - children_names.size() > child_index ? 
children_names[child_index].children - : std::vector{}); - } else if (child_view.type().id() == type_id::LIST) { - return (*this).template operator()(child_view, + return this->template operator()(child_view, children_names.size() > child_index ? children_names[child_index].children : std::vector{}); + } else if (child_view.type().id() == type_id::LIST) { + return this->template operator()(child_view, + children_names.size() > child_index + ? children_names[child_index].children + : std::vector{}); } else { - return cudf::type_dispatcher(child_view.type(), *this, child_view); + return cudf::type_dispatcher( + child_view.type(), *this, child_view); } }; auto new_offsets = cudf::lists::detail::get_normalized_offsets( @@ -679,27 +686,29 @@ struct column_to_strings_fn { // auto i_col_begin = thrust::make_zip_iterator(thrust::counting_iterator(0), column_begin); - std::transform(i_col_begin, - i_col_begin + num_columns, - std::back_inserter(str_column_vec), - [this, &children_names](auto const& i_current_col) { - auto const i = thrust::get<0>(i_current_col); - auto const& current_col = thrust::get<1>(i_current_col); - // Struct needs children's column names - if (current_col.type().id() == type_id::STRUCT) { - return (*this).template operator()( - current_col, - children_names.size() > i ? children_names[i].children - : std::vector{}); - } else if (current_col.type().id() == type_id::LIST) { - return (*this).template operator()( - current_col, - children_names.size() > i ? children_names[i].children - : std::vector{}); - } else { - return cudf::type_dispatcher(current_col.type(), *this, current_col); - } - }); + std::transform( + i_col_begin, + i_col_begin + num_columns, + std::back_inserter(str_column_vec), + [this, &children_names](auto const& i_current_col) { + auto const i = thrust::get<0>(i_current_col); + auto const& current_col = thrust::get<1>(i_current_col); + // Struct needs children's column names + if (current_col.type().id() == type_id::STRUCT) { + return this->template operator()(current_col, + children_names.size() > i + ? children_names[i].children + : std::vector{}); + } else if (current_col.type().id() == type_id::LIST) { + return this->template operator()(current_col, + children_names.size() > i + ? 
children_names[i].children + : std::vector{}); + } else { + return cudf::type_dispatcher( + current_col.type(), *this, current_col); + } + }); // create string table view from str_column_vec: // diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 10937212bc1..f856d106d03 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -655,6 +655,7 @@ ConfigureTest( STREAM_TEXT_TEST streams/text/ngrams_test.cpp streams/text/tokenize_test.cpp STREAM_MODE testing ) ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_JSONIO_TEST streams/io/json_test.cpp STREAM_MODE testing) # ################################################################################################## # Install tests #################################################################################### diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 0149a467c32..a2db2d69984 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -1422,7 +1423,9 @@ TEST_F(JsonReaderTest, JsonLongString) .lines(true) .na_rep("null"); - cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), + cudf::test::get_default_stream(), + rmm::mr::get_current_device_resource()); cudf::table_view const expected = tbl_view; std::map types; diff --git a/cpp/tests/io/json_writer.cpp b/cpp/tests/io/json_writer.cpp index 3a4074c02ad..a85a696565b 100644 --- a/cpp/tests/io/json_writer.cpp +++ b/cpp/tests/io/json_writer.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -49,14 +50,16 @@ TEST_F(JsonWriterTest, EmptyInput) .build(); // Empty columns in table - cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()); + cudf::io::write_json( + out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); std::string const expected = R"([])"; EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size())); // Empty columns in table - JSON Lines out_buffer.clear(); out_options.enable_lines(true); - cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()); + cudf::io::write_json( + out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); std::string const expected_lines = "\n"; EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size())); @@ -64,7 +67,8 @@ TEST_F(JsonWriterTest, EmptyInput) cudf::table_view tbl_view2{}; out_options.set_table(tbl_view2); out_buffer.clear(); - cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()); + cudf::io::write_json( + out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size())); } @@ -89,17 +93,22 @@ TEST_F(JsonWriterTest, ErrorCases) .build(); // not enough column names - EXPECT_THROW(cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()), - cudf::logic_error); + EXPECT_THROW( + cudf::io::write_json( + out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), + cudf::logic_error); mt.schema_info.emplace_back("int16"); out_options.set_metadata(mt); - EXPECT_NO_THROW(cudf::io::write_json(out_options, rmm::mr::get_current_device_resource())); + EXPECT_NO_THROW(cudf::io::write_json( + out_options, cudf::test::get_default_stream(), 
rmm::mr::get_current_device_resource())); // chunk_rows must be at least 8 out_options.set_rows_per_chunk(0); - EXPECT_THROW(cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()), - cudf::logic_error); + EXPECT_THROW( + cudf::io::write_json( + out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), + cudf::logic_error); } TEST_F(JsonWriterTest, PlainTable) @@ -121,7 +130,9 @@ TEST_F(JsonWriterTest, PlainTable) .lines(false) .na_rep("null"); - cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), + cudf::test::get_default_stream(), + rmm::mr::get_current_device_resource()); std::string const expected = R"([{"col1":"a","col2":"d","int":1,"float":1.5,"int16":null},{"col1":"b","col2":"e","int":2,"float":2.5,"int16":2},{"col1":"c","col2":"f","int":3,"float":3.5,"int16":null}])"; @@ -151,7 +162,9 @@ TEST_F(JsonWriterTest, SimpleNested) .lines(true) .na_rep("null"); - cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), + cudf::test::get_default_stream(), + rmm::mr::get_current_device_resource()); std::string const expected = R"({"a":1,"b":2,"c":{"d":3},"f":5.5,"g":[1]} {"a":6,"b":7,"c":{"d":8},"f":10.5} {"a":1,"b":2,"c":{"e":4},"f":5.5,"g":[2,null]} @@ -183,7 +196,9 @@ TEST_F(JsonWriterTest, MixedNested) .lines(false) .na_rep("null"); - cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), + cudf::test::get_default_stream(), + rmm::mr::get_current_device_resource()); std::string const expected = R"([{"a":1,"b":2,"c":{"d":[3]},"f":5.5,"g":[{"h":1}]},)" R"({"a":6,"b":7,"c":{"d":[8]},"f":10.5},)" @@ -216,7 +231,8 @@ TEST_F(JsonWriterTest, WriteReadNested) .na_rep("null") .build(); - cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()); + cudf::io::write_json( + out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); std::string const expected = R"({"a":1,"b":2,"c":{"d":3},"f":5.5,"g":[1]} {"a":6,"b":7,"c":{"d":8},"f":10.5} {"a":1,"b":2,"c":{"e":4},"f":5.5,"g":[2,null]} @@ -291,7 +307,8 @@ TEST_F(JsonWriterTest, WriteReadNested) mt.schema_info[2].children.clear(); out_options.set_metadata(mt); out_buffer.clear(); - cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()); + cudf::io::write_json( + out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); in_options = cudf::io::json_reader_options::builder( cudf::io::source_info{out_buffer.data(), out_buffer.size()}) @@ -314,7 +331,8 @@ TEST_F(JsonWriterTest, WriteReadNested) // without column names out_options.set_metadata(cudf::io::table_metadata{}); out_buffer.clear(); - cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()); + cudf::io::write_json( + out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); in_options = cudf::io::json_reader_options::builder( cudf::io::source_info{out_buffer.data(), out_buffer.size()}) .lines(true) @@ -352,7 +370,8 @@ TEST_F(JsonWriterTest, SpecialChars) .na_rep("null") .build(); - cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()); + cudf::io::write_json( + out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); std::string const expected = R"({"\"a\"":1,"'b'":"abcd"} {"\"a\"":6,"'b'":"b\b\f\n\r\t"} 
{"\"a\"":1,"'b'":"\"c\""} @@ -385,7 +404,9 @@ TEST_F(JsonWriterTest, NullList) .lines(true) .na_rep("null"); - cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), + cudf::test::get_default_stream(), + rmm::mr::get_current_device_resource()); std::string const expected = R"({"a":[null],"b":[[1,2,3],[null],[null,null,null],[4,null,5]]} {"a":[2,null,null,3],"b":null} {"a":[null,null,4],"b":[[2,null],null]} @@ -424,7 +445,9 @@ TEST_F(JsonWriterTest, ChunkedNested) .na_rep("null") .rows_per_chunk(8); - cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), + cudf::test::get_default_stream(), + rmm::mr::get_current_device_resource()); std::string const expected = R"({"a":1,"b":-2,"c":{},"e":[{"f":1}]} {"a":2,"b":-2,"c":{}} @@ -480,7 +503,9 @@ TEST_F(JsonWriterTest, StructAllNullCombinations) .lines(true) .na_rep("null"); - cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), + cudf::test::get_default_stream(), + rmm::mr::get_current_device_resource()); std::string const expected = R"({} {"e":1} {"d":1} @@ -542,7 +567,9 @@ TEST_F(JsonWriterTest, Unicode) .lines(true) .na_rep("null"); - cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), + cudf::test::get_default_stream(), + rmm::mr::get_current_device_resource()); std::string const expected = R"({"col1":"\"\\\/\b\f\n\r\t","col2":"C\u10ae\u226a\u31f3\u434f\u51f9\u6ca6\u738b\u8fbf\u9fb8\ua057\ubbdc\uc2a4\ud3f6\ue4fe\ufd20","int16":null} diff --git a/cpp/tests/streams/io/json_test.cpp b/cpp/tests/streams/io/json_test.cpp new file mode 100644 index 00000000000..80619d4d58c --- /dev/null +++ b/cpp/tests/streams/io/json_test.cpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+#include <cudf_test/iterator_utilities.hpp>
+#include <cudf_test/table_utilities.hpp>
+
+#include <cudf/io/json.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/table/table_view.hpp>
+#include <cudf/types.hpp>
+
+#include <string>
+#include <vector>
+
+class JSONTest : public cudf::test::BaseFixture {};
+
+TEST_F(JSONTest, JSONreader)
+{
+  std::string data = "[1, 1.1]\n[2, 2.2]\n[3, 3.3]\n";
+  cudf::io::json_reader_options in_options =
+    cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()})
+      .dtypes(std::vector<cudf::data_type>{cudf::data_type{cudf::type_id::INT32},
+                                           cudf::data_type{cudf::type_id::FLOAT64}})
+      .lines(true)
+      .legacy(true);
+  cudf::io::table_with_metadata result =
+    cudf::io::read_json(in_options, cudf::test::get_default_stream());
+}
+
+TEST_F(JSONTest, JSONwriter)
+{
+  cudf::test::strings_column_wrapper col1{"a", "b", "c"};
+  cudf::test::strings_column_wrapper col2{"d", "e", "f"};
+  cudf::test::fixed_width_column_wrapper<int> col3{1, 2, 3};
+  cudf::test::fixed_width_column_wrapper<float> col4{1.5, 2.5, 3.5};
+  cudf::test::fixed_width_column_wrapper<int16_t> col5{{1, 2, 3},
+                                                       cudf::test::iterators::nulls_at({0, 2})};
+  cudf::table_view tbl_view{{col1, col2, col3, col4, col5}};
+  cudf::io::table_metadata mt{{{"col1"}, {"col2"}, {"int"}, {"float"}, {"int16"}}};
+
+  std::vector<char> out_buffer;
+  auto destination     = cudf::io::sink_info(&out_buffer);
+  auto options_builder = cudf::io::json_writer_options_builder(destination, tbl_view)
+                           .include_nulls(true)
+                           .metadata(mt)
+                           .lines(false)
+                           .na_rep("null");
+
+  cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream());
+}

From 2abf9a6995dec331047849727ee51d471efc13bd Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Tue, 31 Oct 2023 08:09:50 +0100
Subject: [PATCH 064/118] Fixes stack context for json lines format that recovers from invalid JSON lines (#14309)

Addresses https://github.com/rapidsai/cudf/issues/14282.

For the JSON lines format that recovers after an invalid JSON line, we had
two issues when generating the stack context that is used downstream in the
full JSON pushdown transducer. For that format, we need to make sure that we
"reset" the stack context after each JSON line. That is,

1. We need to reset the stack to the empty stack after each JSON line, as the
stack may not be empty after an erroneous JSON line. E.g.
`{"this opening brace is never closed":123\n{"<=this brace should be on the empty stack":...}`
2. We need to reset the in-string state, so that parsing resumes outside of a
string:
`{"no matching end-quote on this line\n{"<=this quote is the beginning of a field name, not the end of the previous line's field name"`

This PR addresses the above requirements as follows:

1. The reset was already implemented - but with an inappropriate scan operator
that is not associative:
```
StackLevelT new_level = (symbol_to_stack_op_type(rhs.value) == stack_op_type::RESET)
                          ? 0
                          : (lhs.stack_level + rhs.stack_level);
```
E.g., the operand sequences `{,\n,{`, `},\n,{`, `{,\n,}`, and `},\n,}` all
fail the associativity test. This was replaced with a `ScanByKey` that starts
with a "fresh" stack level in each new key segment.
2. This was addressed by changing the transition table of the finite-state
transducer that filters out brackets and braces enclosed in quotes, so that it
goes back to the `OOS` (`outside-of-string`) state after every newline. This
behaviour requires that _every_ newline character is treated as a delimiter of
a JSON line. Spark Rapids, the targeted user of the recovery option, confirmed
this to be the case.
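
To make the associativity problem concrete, consider a minimal host-side
sketch. The `op` struct and `combine` function below are simplified stand-ins
for libcudf's `StackOp` and the old scan operator; they are illustrative only,
not code from this patch:

```
#include <iostream>

// Stand-in for StackOp: a relative stack level plus the symbol that produced it.
struct op {
  int level;
  char sym;
};

// The old combine step: a RESET ('\n') on the right-hand side zeroes the level.
op combine(op lhs, op rhs)
{
  if (rhs.sym == '\n') { return {0, '\n'}; }
  return {lhs.level + rhs.level, rhs.sym};
}

int main()
{
  op const a{+1, '{'};  // push
  op const b{0, '\n'};  // reset
  op const c{+1, '{'};  // push

  auto const left  = combine(combine(a, b), c);  // group (a, b) first -> level 1
  auto const right = combine(a, combine(b, c));  // group (b, c) first -> level 2
  std::cout << left.level << " != " << right.level << "\n";  // prints "1 != 2"
  return 0;
}
```

A device scan may combine partial results under either grouping, so the two
must agree; segmenting the scan at every reset (the `ScanByKey` approach)
removes the reset from the operator itself and restores associativity.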
Authors: - Elias Stehle (https://github.com/elstehle) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/14309 --- cpp/src/io/fst/logical_stack.cuh | 143 +++++++++++++++++++++---- cpp/src/io/json/nested_json_gpu.cu | 31 ++++-- cpp/tests/io/fst/logical_stack_test.cu | 17 +-- cpp/tests/io/nested_json_test.cpp | 117 ++++++++++++++++++++ 4 files changed, 271 insertions(+), 37 deletions(-) diff --git a/cpp/src/io/fst/logical_stack.cuh b/cpp/src/io/fst/logical_stack.cuh index c4f99736306..22385d33c7b 100644 --- a/cpp/src/io/fst/logical_stack.cuh +++ b/cpp/src/io/fst/logical_stack.cuh @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -48,6 +49,14 @@ enum class stack_op_type : int8_t { RESET = 3 ///< Operation popping all items currently on the stack }; +/** + * @brief Describes the kind of stack operations supported by the logical stack. + */ +enum class stack_op_support : bool { + NO_RESET_SUPPORT = false, ///< A stack that only supports push(x) and pop() operations + WITH_RESET_SUPPORT = true ///< A stack that supports push(x), pop(), and reset() operations +}; + namespace detail { /** @@ -130,6 +139,37 @@ struct StackSymbolToStackOp { StackSymbolToStackOpTypeT symbol_to_stack_op_type; }; +/** + * @brief Function object that maps a stack `reset` operation to `1`. + */ +template +struct NewlineToResetStackSegmentOp { + template + constexpr CUDF_HOST_DEVICE uint32_t operator()(StackSymbolT const& stack_symbol) const + { + stack_op_type stack_op = symbol_to_stack_op_type(stack_symbol); + + // Every reset operation marks the beginning of a new segment + return (stack_op == stack_op_type::RESET) ? 1 : 0; + } + + /// Function object returning a stack operation type for a given stack symbol + StackSymbolToStackOpTypeT symbol_to_stack_op_type; +}; + +/** + * @brief Function object that wraps around for values that exceed the largest value of `TargetT` + */ +template +struct ModToTargetTypeOpT { + template + constexpr CUDF_HOST_DEVICE TargetT operator()(T const& val) const + { + return static_cast( + val % (static_cast(cuda::std::numeric_limits::max()) + static_cast(1))); + } +}; + /** * @brief Binary reduction operator to compute the absolute stack level from relative stack levels * (i.e., +1 for a PUSH, -1 for a POP operation). @@ -140,9 +180,7 @@ struct AddStackLevelFromStackOp { constexpr CUDF_HOST_DEVICE StackOp operator()( StackOp const& lhs, StackOp const& rhs) const { - StackLevelT new_level = (symbol_to_stack_op_type(rhs.value) == stack_op_type::RESET) - ? 0 - : (lhs.stack_level + rhs.stack_level); + StackLevelT new_level = lhs.stack_level + rhs.stack_level; return StackOp{new_level, rhs.value}; } @@ -230,6 +268,8 @@ struct RemapEmptyStack { * onto the stack or pop something from the stack and resolves the symbol that is on top of the * stack. * + * @tparam SupportResetOperation Whether the logical stack also supports `reset` operations that + * reset the stack to the empty stack * @tparam StackLevelT Signed integer type that must be sufficient to cover [-max_stack_level, * max_stack_level] for the given sequence of stack operations. Must be signed as it needs to cover * the stack level of any arbitrary subsequence of stack operations. 
@@ -261,7 +301,8 @@ struct RemapEmptyStack { * what-is-on-top-of-the-stack * @param[in] stream The cuda stream to which to dispatch the work */ -template ; + // Type used to mark *-by-key segments after `reset` operations + using StackSegmentT = uint8_t; + // The unsigned integer type that we use for radix sorting items of type StackOpT using StackOpUnsignedT = detail::UnsignedStackOpType; static_assert(!std::is_void(), "unsupported StackOpT size"); @@ -292,6 +336,8 @@ void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols, using TransformInputItT = cub::TransformInputIterator; + constexpr bool supports_reset_op = SupportResetOperation == stack_op_support::WITH_RESET_SUPPORT; + auto const num_symbols_in = d_symbol_positions.size(); // Converting a stack symbol that may either push or pop to a stack operation: @@ -330,14 +376,44 @@ void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols, // Getting temporary storage requirements for the prefix sum of the stack level after each // operation - CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScan( - nullptr, - stack_level_scan_bytes, - stack_symbols_in, - d_kv_operations.Current(), - detail::AddStackLevelFromStackOp{symbol_to_stack_op}, - num_symbols_in, - stream)); + if constexpr (supports_reset_op) { + // Iterator that returns `1` for every symbol that corresponds to a `reset` operation + auto reset_segments_it = thrust::make_transform_iterator( + d_symbols, + detail::NewlineToResetStackSegmentOp{symbol_to_stack_op}); + + auto const fake_key_segment_it = static_cast(nullptr); + std::size_t gen_segments_scan_bytes = 0; + std::size_t scan_by_key_bytes = 0; + CUDF_CUDA_TRY(cub::DeviceScan::InclusiveSum( + nullptr, + gen_segments_scan_bytes, + reset_segments_it, + thrust::make_transform_output_iterator(fake_key_segment_it, + detail::ModToTargetTypeOpT{}), + num_symbols_in, + stream)); + CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScanByKey( + nullptr, + scan_by_key_bytes, + fake_key_segment_it, + stack_symbols_in, + d_kv_operations.Current(), + detail::AddStackLevelFromStackOp{symbol_to_stack_op}, + num_symbols_in, + cub::Equality{}, + stream)); + stack_level_scan_bytes = std::max(gen_segments_scan_bytes, scan_by_key_bytes); + } else { + CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScan( + nullptr, + stack_level_scan_bytes, + stack_symbols_in, + d_kv_operations.Current(), + detail::AddStackLevelFromStackOp{symbol_to_stack_op}, + num_symbols_in, + stream)); + } // Getting temporary storage requirements for the stable radix sort (sorting by stack level of the // operations) @@ -401,14 +477,41 @@ void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols, d_kv_operations = cub::DoubleBuffer{d_kv_ops_current.data(), d_kv_ops_alt.data()}; // Compute prefix sum of the stack level after each operation - CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScan( - temp_storage.data(), - total_temp_storage_bytes, - stack_symbols_in, - d_kv_operations.Current(), - detail::AddStackLevelFromStackOp{symbol_to_stack_op}, - num_symbols_in, - stream)); + if constexpr (supports_reset_op) { + // Iterator that returns `1` for every symbol that corresponds to a `reset` operation + auto reset_segments_it = thrust::make_transform_iterator( + d_symbols, + detail::NewlineToResetStackSegmentOp{symbol_to_stack_op}); + + rmm::device_uvector key_segments{num_symbols_in, stream}; + CUDF_CUDA_TRY(cub::DeviceScan::InclusiveSum( + temp_storage.data(), + total_temp_storage_bytes, + reset_segments_it, + thrust::make_transform_output_iterator(key_segments.data(), + 
detail::ModToTargetTypeOpT{}), + num_symbols_in, + stream)); + CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScanByKey( + temp_storage.data(), + total_temp_storage_bytes, + key_segments.data(), + stack_symbols_in, + d_kv_operations.Current(), + detail::AddStackLevelFromStackOp{symbol_to_stack_op}, + num_symbols_in, + cub::Equality{}, + stream)); + } else { + CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScan( + temp_storage.data(), + total_temp_storage_bytes, + stack_symbols_in, + d_kv_operations.Current(), + detail::AddStackLevelFromStackOp{symbol_to_stack_op}, + num_symbols_in, + stream)); + } // Stable radix sort, sorting by stack level of the operations d_kv_operations_unsigned = cub::DoubleBuffer{ diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 3702d94fd2b..496e5b25e60 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -343,27 +343,35 @@ constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NU std::array const symbol_groups{ {{"{"}, {"["}, {"}"}, {"]"}, {"\""}, {"\\"}, {"\n"}}}; -// Transition table +// Transition table for the default JSON and JSON lines formats std::array, TT_NUM_STATES> const transition_table{ {/* IN_STATE { [ } ] " \ \n OTHER */ /* TT_OOS */ {{TT_OOS, TT_OOS, TT_OOS, TT_OOS, TT_STR, TT_OOS, TT_OOS, TT_OOS}}, /* TT_STR */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_OOS, TT_ESC, TT_STR, TT_STR}}, /* TT_ESC */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR}}}}; -// Translation table (i.e., for each transition, what are the symbols that we output) +// Transition table for the JSON lines format that recovers from invalid JSON lines +std::array, TT_NUM_STATES> const + resetting_transition_table{ + {/* IN_STATE { [ } ] " \ \n OTHER */ + /* TT_OOS */ {{TT_OOS, TT_OOS, TT_OOS, TT_OOS, TT_STR, TT_OOS, TT_OOS, TT_OOS}}, + /* TT_STR */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_OOS, TT_ESC, TT_OOS, TT_STR}}, + /* TT_ESC */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_OOS, TT_STR}}}}; + +// Translation table for the default JSON and JSON lines formats std::array, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const translation_table{ {/* IN_STATE { [ } ] " \ \n OTHER */ /* TT_OOS */ {{{'{'}, {'['}, {'}'}, {']'}, {}, {}, {}, {}}}, /* TT_STR */ {{{}, {}, {}, {}, {}, {}, {}, {}}}, /* TT_ESC */ {{{}, {}, {}, {}, {}, {}, {}, {}}}}}; -// Translation table +// Translation table for the JSON lines format that recovers from invalid JSON lines std::array, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const resetting_translation_table{ {/* IN_STATE { [ } ] " \ \n OTHER */ /* TT_OOS */ {{{'{'}, {'['}, {'}'}, {']'}, {}, {}, {'\n'}, {}}}, - /* TT_STR */ {{{}, {}, {}, {}, {}, {}, {}, {}}}, - /* TT_ESC */ {{{}, {}, {}, {}, {}, {}, {}, {}}}}}; + /* TT_STR */ {{{}, {}, {}, {}, {}, {}, {'\n'}, {}}}, + /* TT_ESC */ {{{}, {}, {}, {}, {}, {}, {'\n'}, {}}}}}; // The DFA's starting state constexpr auto start_state = static_cast(TT_OOS); @@ -1415,14 +1423,19 @@ void get_stack_context(device_span json_in, constexpr auto max_translation_table_size = to_stack_op::NUM_SYMBOL_GROUPS * to_stack_op::TT_NUM_STATES; - // Translation table specialized on the choice of whether to reset on newlines outside of strings + // Transition table specialized on the choice of whether to reset on newlines + const auto transition_table = (stack_behavior == stack_behavior_t::ResetOnDelimiter) + ? 
to_stack_op::resetting_transition_table + : to_stack_op::transition_table; + + // Translation table specialized on the choice of whether to reset on newlines const auto translation_table = (stack_behavior == stack_behavior_t::ResetOnDelimiter) ? to_stack_op::resetting_translation_table : to_stack_op::translation_table; auto json_to_stack_ops_fst = fst::detail::make_fst( fst::detail::make_symbol_group_lut(to_stack_op::symbol_groups), - fst::detail::make_transition_table(to_stack_op::transition_table), + fst::detail::make_transition_table(transition_table), fst::detail::make_translation_table(translation_table), stream); @@ -1441,7 +1454,7 @@ void get_stack_context(device_span json_in, // Stack operations with indices are converted to top of the stack for each character in the input if (stack_behavior == stack_behavior_t::ResetOnDelimiter) { - fst::sparse_stack_op_to_top_of_stack( + fst::sparse_stack_op_to_top_of_stack( stack_ops.data(), device_span{stack_op_indices.data(), num_stack_ops}, JSONWithRecoveryToStackOp{}, @@ -1451,7 +1464,7 @@ void get_stack_context(device_span json_in, json_in.size(), stream); } else { - fst::sparse_stack_op_to_top_of_stack( + fst::sparse_stack_op_to_top_of_stack( stack_ops.data(), device_span{stack_op_indices.data(), num_stack_ops}, JSONToStackOp{}, diff --git a/cpp/tests/io/fst/logical_stack_test.cu b/cpp/tests/io/fst/logical_stack_test.cu index 3d6743702b8..20b8674a717 100644 --- a/cpp/tests/io/fst/logical_stack_test.cu +++ b/cpp/tests/io/fst/logical_stack_test.cu @@ -216,14 +216,15 @@ TEST_F(LogicalStackTest, GroundTruth) stream.value())); // Run algorithm - fst::sparse_stack_op_to_top_of_stack(d_stack_ops.data(), - d_stack_op_idx_span, - JSONToStackOp{}, - top_of_stack_gpu.device_ptr(), - empty_stack_symbol, - read_symbol, - string_size, - stream.value()); + fst::sparse_stack_op_to_top_of_stack( + d_stack_ops.data(), + d_stack_op_idx_span, + JSONToStackOp{}, + top_of_stack_gpu.device_ptr(), + empty_stack_symbol, + read_symbol, + string_size, + stream.value()); // Async copy results from device to host top_of_stack_gpu.device_to_host_async(stream_view); diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index 5f79d5b862b..b0ffbe3d154 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -285,6 +285,123 @@ TEST_F(JsonTest, StackContextRecovering) CUDF_TEST_EXPECT_VECTOR_EQUAL(golden_stack_context, stack_context, stack_context.size()); } +TEST_F(JsonTest, StackContextRecoveringFuzz) +{ + // Type used to represent the atomic symbol type used within the finite-state machine + using SymbolT = char; + using StackSymbolT = char; + + std::random_device rd; + std::mt19937 gen(42); + std::uniform_int_distribution distribution(0, 4); + constexpr std::size_t input_length = 1024 * 1024; + std::string input{}; + input.reserve(input_length); + + bool inside_quotes = false; + std::stack host_stack{}; + for (std::size_t i = 0; i < input_length; ++i) { + bool is_ok = true; + char current{}; + do { + int rand_char = distribution(gen); + is_ok = true; + switch (rand_char) { + case 0: current = '{'; break; + case 1: current = '['; break; + case 2: current = '}'; break; + case 3: current = '"'; break; + case 4: current = '\n'; break; + } + switch (current) { + case '"': inside_quotes = !inside_quotes; break; + case '{': + if (!inside_quotes) { host_stack.push('{'); } + break; + case '[': + if (!inside_quotes) { host_stack.push('['); } + break; + case '}': + if (!inside_quotes) { + if (host_stack.size() > 0) { + 
+          // Get the proper 'pop' stack symbol
+          current = (host_stack.top() == '{' ? '}' : ']');
+          host_stack.pop();
+        } else
+          is_ok = false;
+      }
+      break;
+    case '\n':
+      // Increase chance to have longer lines
+      if (distribution(gen) == 0) {
+        is_ok = false;
+        break;
+      } else {
+        host_stack    = {};
+        inside_quotes = false;
+        break;
+      }
+    }
+  } while (!is_ok);
+  input += current;
+  }
+
+  std::string expected_stack_context{};
+  expected_stack_context.reserve(input_length);
+  inside_quotes = false;
+  host_stack    = std::stack<StackSymbolT>{};
+  for (auto const current : input) {
+    // Write the stack context for the current input symbol
+    if (host_stack.empty()) {
+      expected_stack_context += '_';
+    } else {
+      expected_stack_context += host_stack.top();
+    }
+
+    switch (current) {
+      case '"': inside_quotes = !inside_quotes; break;
+      case '{':
+        if (!inside_quotes) { host_stack.push('{'); }
+        break;
+      case '[':
+        if (!inside_quotes) { host_stack.push('['); }
+        break;
+      case '}':
+        if (!inside_quotes && host_stack.size() > 0) { host_stack.pop(); }
+        break;
+      case ']':
+        if (!inside_quotes && host_stack.size() > 0) { host_stack.pop(); }
+        break;
+      case '\n':
+        host_stack    = {};
+        inside_quotes = false;
+        break;
+    }
+  }
+
+  // Prepare cuda stream for data transfers & kernels
+  auto const stream = cudf::get_default_stream();
+
+  // Prepare input & output buffers
+  cudf::string_scalar const d_scalar(input, true, stream);
+  auto const d_input =
+    cudf::device_span<SymbolT const>{d_scalar.data(), static_cast<size_t>(d_scalar.size())};
+  cudf::detail::hostdevice_vector<StackSymbolT> stack_context(input.size(), stream);
+
+  // Run algorithm
+  constexpr auto stack_behavior = cuio_json::stack_behavior_t::ResetOnDelimiter;
+  cuio_json::detail::get_stack_context(d_input, stack_context.device_ptr(), stack_behavior, stream);
+
+  // Copy back the results
+  stack_context.device_to_host_async(stream);
+
+  // Make sure we copied back the stack context
+  stream.synchronize();
+
+  ASSERT_EQ(expected_stack_context.size(), stack_context.size());
+  CUDF_TEST_EXPECT_VECTOR_EQUAL(expected_stack_context, stack_context, stack_context.size());
+}
+
 TEST_F(JsonTest, TokenStream)
 {
   using cuio_json::PdaTokenT;

From b4746d8064198d6287c83aeeaa470b499ddd2e10 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Tue, 31 Oct 2023 07:12:25 -0700
Subject: [PATCH 065/118] Upgrade wheels to use arrow 13 (#14339)

In #14330 we upgraded to Arrow 13. However, we only did so for conda packages.
For wheels, we couldn't do the same because pyarrow 13 started supporting two
manylinux versions, both 2.17 and 2.28. This results in ABI compatibility
issues because cudf wheels bundle an identical libarrow and were previously
configured to compile with the CXX11 ABI turned off for compatibility with the
libarrow in the 2.17 wheels. To address this, this PR modifies the CMake logic
to set that flag conditionally, based on the glibc version on the host system.
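
This mirrors how pip chooses between the manylinux2014 and manylinux_2_28
pyarrow wheels. As a point of reference, the glibc version driving the
decision can also be queried programmatically; a minimal illustrative sketch
using glibc's `gnu_get_libc_version()` (not part of this patch):

```
#include <gnu/libc-version.h>

#include <cstdio>

int main()
{
  // Prints e.g. "2.17" on manylinux2014 images or "2.28" on manylinux_2_28
  // images; the CMake logic below extracts the same value from libc.so.6.
  std::printf("glibc %s\n", gnu_get_libc_version());
  return 0;
}
```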
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Robert Maynard (https://github.com/robertmaynard) - Ray Douglass (https://github.com/raydouglass) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14339 --- .github/workflows/pr.yaml | 1 + ci/build_wheel_cudf.sh | 3 +- ci/test_wheel_cudf.sh | 15 ++++++- ci/test_wheel_dask_cudf.sh | 15 ++++++- cpp/cmake/thirdparty/get_arrow.cmake | 62 +++++++++++++++------------- dependencies.yaml | 13 +----- python/cudf/pyproject.toml | 4 +- python/cudf_kafka/pyproject.toml | 2 +- 8 files changed, 69 insertions(+), 46 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 8d6c471c912..2da47f42c29 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -101,6 +101,7 @@ jobs: uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12 with: build_type: pull-request + build-2_28-wheels: "true" script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: needs: wheel-build-cudf diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh index 7d3919b2d72..1586fec3a22 100755 --- a/ci/build_wheel_cudf.sh +++ b/ci/build_wheel_cudf.sh @@ -9,8 +9,7 @@ export SKBUILD_CONFIGURE_OPTIONS="-DCUDF_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF" ./ci/build_wheel.sh cudf ${package_dir} -mkdir -p ${package_dir}/final_dist python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/* RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist +RAPIDS_PY_WHEEL_NAME="cudf_${AUDITWHEEL_POLICY}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh index 83e24ab3ff1..8c42651e299 100755 --- a/ci/test_wheel_cudf.sh +++ b/ci/test_wheel_cudf.sh @@ -3,8 +3,21 @@ set -eou pipefail +# Set the manylinux version used for downloading the wheels so that we test the +# newer ABI wheels on the newer images that support their installation. +# Need to disable pipefail for the head not to fail, see +# https://stackoverflow.com/questions/19120263/why-exit-code-141-with-grep-q +set +o pipefail +glibc_minor_version=$(ldd --version | head -1 | grep -o "[0-9]\.[0-9]\+" | tail -1 | cut -d '.' 
-f2)
+set -o pipefail
+manylinux_version="2_17"
+if [[ ${glibc_minor_version} -ge 28 ]]; then
+    manylinux_version="2_28"
+fi
+manylinux="manylinux_${manylinux_version}"
+
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
-RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
+RAPIDS_PY_WHEEL_NAME="cudf_${manylinux}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
 
 # echo to expand wildcard before adding `[extra]` requires for pip
 python -m pip install $(echo ./dist/cudf*.whl)[test]
diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh
index f89aa43c20a..118bea753d0 100755
--- a/ci/test_wheel_dask_cudf.sh
+++ b/ci/test_wheel_dask_cudf.sh
@@ -7,7 +7,20 @@ RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
 
 # Download the cudf built in the previous step
-RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep
+# Set the manylinux version used for downloading the wheels so that we test the
+# newer ABI wheels on the newer images that support their installation.
+# Need to disable pipefail for the head not to fail, see
+# https://stackoverflow.com/questions/19120263/why-exit-code-141-with-grep-q
+set +o pipefail
+glibc_minor_version=$(ldd --version | head -1 | grep -o "[0-9]\.[0-9]\+" | tail -1 | cut -d '.' -f2)
+set -o pipefail
+manylinux_version="2_17"
+if [[ ${glibc_minor_version} -ge 28 ]]; then
+    manylinux_version="2_28"
+fi
+manylinux="manylinux_${manylinux_version}"
+
+RAPIDS_PY_WHEEL_NAME="cudf_${manylinux}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep
 python -m pip install --no-deps ./local-cudf-dep/cudf*.whl
 
 # Always install latest dask for testing
diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake
index 3b2cbc57d1c..469b968eefd 100644
--- a/cpp/cmake/thirdparty/get_arrow.cmake
+++ b/cpp/cmake/thirdparty/get_arrow.cmake
@@ -53,19 +53,35 @@ function(find_libarrow_in_python_wheel PYARROW_VERSION)
   find_package(Arrow ${PYARROW_VERSION} MODULE REQUIRED GLOBAL)
   add_library(arrow_shared ALIAS Arrow::Arrow)
 
-  # When using the libarrow inside a wheel we must build libcudf with the old ABI because pyarrow's
-  # `libarrow.so` is compiled for manylinux2014 (centos7 toolchain) which uses the old ABI. Note
-  # that these flags will often be redundant because we build wheels in manylinux containers that
-  # actually have the old libc++ anyway, but setting them explicitly ensures correct and consistent
-  # behavior in all other cases such as aarch builds on newer manylinux or testing builds in newer
-  # containers. Note that tests will not build successfully without also propagating these options
-  # to builds of GTest. Similarly, benchmarks will not work without updating GBench (and possibly
-  # NVBench) builds. We are currently ignoring these limitations since we don't anticipate using
-  # this feature except for building wheels.
-  target_compile_options(
-    Arrow::Arrow INTERFACE "$<$<COMPILE_LANGUAGE:CXX>:-D_GLIBCXX_USE_CXX11_ABI=0>"
-                           "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0>"
+  # When using the libarrow inside a wheel, whether or not libcudf may be built using the new C++11
+  # ABI is dependent on whether the libarrow inside the wheel was compiled using that ABI because we
+  # need the arrow library that we bundle in cudf to be ABI-compatible with the one inside pyarrow.
+  # We determine what options to use by checking the glibc version on the current system, which is
+  # also how pip determines which manylinux-versioned pyarrow wheel to install. Note that tests will
+  # not build successfully without also propagating these options to builds of GTest. Similarly,
+  # benchmarks will not work without updating GBench (and possibly NVBench) builds. We are currently
+  # ignoring these limitations since we don't anticipate using this feature except for building
+  # wheels.
+  EXECUTE_PROCESS(
+    COMMAND ${CMAKE_C_COMPILER} -print-file-name=libc.so.6
+    OUTPUT_VARIABLE GLIBC_EXECUTABLE
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+  )
+  EXECUTE_PROCESS(
+    COMMAND ${GLIBC_EXECUTABLE}
+    OUTPUT_VARIABLE GLIBC_OUTPUT
+    OUTPUT_STRIP_TRAILING_WHITESPACE
   )
+  STRING(REGEX MATCH "stable release version ([0-9]+\\.[0-9]+)" GLIBC_VERSION ${GLIBC_OUTPUT})
+  STRING(REPLACE "stable release version " "" GLIBC_VERSION ${GLIBC_VERSION})
+  STRING(REPLACE "." ";" GLIBC_VERSION_LIST ${GLIBC_VERSION})
+  LIST(GET GLIBC_VERSION_LIST 1 GLIBC_VERSION_MINOR)
+  if(GLIBC_VERSION_MINOR LESS 28)
+    target_compile_options(
+      Arrow::Arrow INTERFACE "$<$<COMPILE_LANGUAGE:CXX>:-D_GLIBCXX_USE_CXX11_ABI=0>"
+                             "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0>"
+    )
+  endif()
 
   rapids_export_package(BUILD Arrow cudf-exports)
   rapids_export_package(INSTALL Arrow cudf-exports)
@@ -408,22 +424,12 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
 endfunction()
 
 if(NOT DEFINED CUDF_VERSION_Arrow)
-  # Temporarily use Arrow 12.0.1 in wheels and Arrow 13.0.0 otherwise
-  if(USE_LIBARROW_FROM_PYARROW)
-    set(CUDF_VERSION_Arrow
-      # This version must be kept in sync with the libarrow version pinned for builds in
-      # dependencies.yaml.
-      12.0.1
-      CACHE STRING "The version of Arrow to find (or build)"
-    )
-  else()
-    set(CUDF_VERSION_Arrow
-      # This version must be kept in sync with the libarrow version pinned for builds in
-      # dependencies.yaml.
-      13.0.0
-      CACHE STRING "The version of Arrow to find (or build)"
-    )
-  endif()
+  set(CUDF_VERSION_Arrow
+    # This version must be kept in sync with the libarrow version pinned for builds in
+    # dependencies.yaml.
+    13.0.0
+    CACHE STRING "The version of Arrow to find (or build)"
+  )
 endif()
 
 find_and_configure_arrow(
diff --git a/dependencies.yaml b/dependencies.yaml
index cb9ef7a468c..bb61e244b97 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -242,16 +242,11 @@ dependencies:
           - cython>=3.0.0
           # TODO: Pin to numpy<1.25 until cudf requires pandas 2
           - &numpy numpy>=1.21,<1.25
-      - output_types: [conda]
+      - output_types: [conda, requirements, pyproject]
         packages:
           # Hard pin the patch version used during the build. This must be kept
           # in sync with the version pinned in get_arrow.cmake.
           - pyarrow==13.0.0.*
-      - output_types: [requirements, pyproject]
-        packages:
-          # Hard pin the patch version used during the build. This must be kept
-          # in sync with the version pinned in get_arrow.cmake.
-          - pyarrow==12.0.1.*
   build_python:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
@@ -272,14 +267,10 @@ dependencies:
           - libarrow==13.*
   pyarrow_run:
     common:
-      - output_types: [conda]
+      - output_types: [conda, requirements, pyproject]
         packages:
           # Allow runtime version to float up to minor version
           - pyarrow==13.*
-      - output_types: [requirements, pyproject]
-        packages:
-          # Allow runtime version to float up to minor version
-          - pyarrow==12.*
   cudatoolkit:
     specific:
       - output_types: conda
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index cc8a67c389f..4a453da0a4c 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -8,7 +8,7 @@ requires = [
     "ninja",
     "numpy>=1.21,<1.25",
     "protoc-wheel",
-    "pyarrow==12.0.1.*",
+    "pyarrow==13.0.0.*",
     "rmm==23.12.*",
     "scikit-build>=0.13.1",
     "setuptools",
@@ -38,7 +38,7 @@ dependencies = [
    "pandas>=1.3,<1.6.0dev0",
    "protobuf>=4.21,<5",
    "ptxcompiler",
-    "pyarrow==12.*",
+    "pyarrow==13.*",
    "rmm==23.12.*",
    "typing_extensions>=4.0.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml
index 78a7a83ac3a..5058412ef60 100644
--- a/python/cudf_kafka/pyproject.toml
+++ b/python/cudf_kafka/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
     "cython>=3.0.0",
     "numpy>=1.21,<1.25",
-    "pyarrow==12.0.1.*",
+    "pyarrow==13.0.0.*",
     "setuptools",
     "wheel",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.

From f4c95aad97ed1cd6eae81bc453ee89ff09780bd1 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic
Date: Tue, 31 Oct 2023 08:55:10 -0700
Subject: [PATCH 066/118] Fix logical type issues in the Parquet writer (#14322)

Closes #14315
Closes #14326

The Parquet writer writes time and timestamp types with a logical type whose
`isAdjustedToUTC` is `false`. However, timestamps in libcudf tables are
implicitly in UTC and don't need to be adjusted. This PR changes
`isAdjustedToUTC` to `true`. Also added a writer option to write timestamps as
local, as this is the expected behavior on the Python side.

Also changed the way the logical type is handled for UNKNOWN type columns in
`merge_row_group_metadata` - the logical type is excluded from the merged
metadata because of issues with type inference.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: https://github.com/rapidsai/cudf/pull/14322
---
 cpp/include/cudf/io/parquet.hpp          | 57 +++++++++++++++++++
 cpp/src/io/parquet/parquet.hpp           | 12 ++--
 cpp/src/io/parquet/writer_impl.cu        | 46 ++++++++++-----
 cpp/src/io/parquet/writer_impl.hpp       |  1 +
 python/cudf/cudf/_lib/cpp/io/parquet.pxd | 21 +++++++
 python/cudf/cudf/_lib/parquet.pyx        |  1 +
 .../dask_cudf/io/tests/test_parquet.py    | 14 +----
 7 files changed, 121 insertions(+), 31 deletions(-)

diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp
index 354bf839632..ea18da74d5a 100644
--- a/cpp/include/cudf/io/parquet.hpp
+++ b/cpp/include/cudf/io/parquet.hpp
@@ -532,6 +532,9 @@ class parquet_writer_options {
   // Parquet writer can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS.
   // If true then overrides any per-column setting in _metadata.
bool _write_timestamps_as_int96 = false; + // Parquet writer can write timestamps as UTC + // Defaults to true because libcudf timestamps are implicitly UTC + bool _write_timestamps_as_UTC = true; // Column chunks file paths to be set in the raw output metadata. One per output file std::vector _column_chunks_file_paths; // Maximum size of each row group (unless smaller than a single page) @@ -652,6 +655,13 @@ class parquet_writer_options { */ bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; } + /** + * @brief Returns `true` if timestamps will be written as UTC + * + * @return `true` if timestamps will be written as UTC + */ + [[nodiscard]] auto is_enabled_utc_timestamps() const { return _write_timestamps_as_UTC; } + /** * @brief Returns Column chunks file paths to be set in the raw output metadata. * @@ -789,6 +799,13 @@ class parquet_writer_options { */ void enable_int96_timestamps(bool req) { _write_timestamps_as_int96 = req; } + /** + * @brief Sets preference for writing timestamps as UTC. Write timestamps as UTC if set to `true`. + * + * @param val Boolean value to enable/disable writing of timestamps as UTC. + */ + void enable_utc_timestamps(bool val) { _write_timestamps_as_UTC = val; } + /** * @brief Sets column chunks file path to be set in the raw output metadata. * @@ -1100,6 +1117,18 @@ class parquet_writer_options_builder { return *this; } + /** + * @brief Set to true if timestamps are to be written as UTC. + * + * @param enabled Boolean value to enable/disable writing of timestamps as UTC. + * @return this for chaining + */ + parquet_writer_options_builder& utc_timestamps(bool enabled) + { + options._write_timestamps_as_UTC = enabled; + return *this; + } + /** * @brief Set to true if V2 page headers are to be written. * @@ -1171,6 +1200,8 @@ class chunked_parquet_writer_options { // Parquet writer can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS. // If true then overrides any per-column setting in _metadata. bool _write_timestamps_as_int96 = false; + // Parquet writer can write timestamps as UTC. Defaults to true. + bool _write_timestamps_as_UTC = true; // Maximum size of each row group (unless smaller than a single page) size_t _row_group_size_bytes = default_row_group_size_bytes; // Maximum number of rows in row group (unless smaller than a single page) @@ -1254,6 +1285,13 @@ class chunked_parquet_writer_options { */ bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; } + /** + * @brief Returns `true` if timestamps will be written as UTC + * + * @return `true` if timestamps will be written as UTC + */ + [[nodiscard]] auto is_enabled_utc_timestamps() const { return _write_timestamps_as_UTC; } + /** * @brief Returns maximum row group size, in bytes. * @@ -1375,6 +1413,13 @@ class chunked_parquet_writer_options { */ void enable_int96_timestamps(bool req) { _write_timestamps_as_int96 = req; } + /** + * @brief Sets preference for writing timestamps as UTC. Write timestamps as UTC if set to `true`. + * + * @param val Boolean value to enable/disable writing of timestamps as UTC. + */ + void enable_utc_timestamps(bool val) { _write_timestamps_as_UTC = val; } + /** * @brief Sets the maximum row group size, in bytes. * @@ -1539,6 +1584,18 @@ class chunked_parquet_writer_options_builder { return *this; } + /** + * @brief Set to true if timestamps are to be written as UTC. + * + * @param enabled Boolean value to enable/disable writing of timestamps as UTC. 
+ * @return this for chaining + */ + chunked_parquet_writer_options_builder& utc_timestamps(bool enabled) + { + options._write_timestamps_as_UTC = enabled; + return *this; + } + /** * @brief Set to true if V2 page headers are to be written. * diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index 699cad89703..9ab686b99d5 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -58,13 +58,17 @@ struct TimeUnit { }; struct TimeType { - bool isAdjustedToUTC = false; - TimeUnit unit; + // Default to true because the timestamps are implicitly in UTC + // Writer option overrides this default + bool isAdjustedToUTC = true; + TimeUnit unit = {TimeUnit::MILLIS}; }; struct TimestampType { - bool isAdjustedToUTC = false; - TimeUnit unit; + // Default to true because the timestamps are implicitly in UTC + // Writer option overrides this default + bool isAdjustedToUTC = true; + TimeUnit unit = {TimeUnit::MILLIS}; }; struct IntType { diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index c06acc1690b..c2b10e09b1a 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -278,6 +278,7 @@ struct leaf_schema_fn { cudf::detail::LinkedColPtr const& col; column_in_metadata const& col_meta; bool timestamp_is_int96; + bool timestamp_is_utc; template std::enable_if_t, void> operator()() @@ -404,7 +405,7 @@ struct leaf_schema_fn { col_schema.ts_scale = 1000; if (not timestamp_is_int96) { col_schema.converted_type = ConvertedType::TIMESTAMP_MILLIS; - col_schema.logical_type = LogicalType{TimestampType{false, TimeUnit::MILLIS}}; + col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::MILLIS}}; } } @@ -415,7 +416,7 @@ struct leaf_schema_fn { col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; if (not timestamp_is_int96) { col_schema.converted_type = ConvertedType::TIMESTAMP_MILLIS; - col_schema.logical_type = LogicalType{TimestampType{false, TimeUnit::MILLIS}}; + col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::MILLIS}}; } } @@ -426,7 +427,7 @@ struct leaf_schema_fn { col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; if (not timestamp_is_int96) { col_schema.converted_type = ConvertedType::TIMESTAMP_MICROS; - col_schema.logical_type = LogicalType{TimestampType{false, TimeUnit::MICROS}}; + col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::MICROS}}; } } @@ -441,7 +442,7 @@ struct leaf_schema_fn { } // set logical type if it's not int96 else { - col_schema.logical_type = LogicalType{TimestampType{false, TimeUnit::NANOS}}; + col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::NANOS}}; } } @@ -453,7 +454,7 @@ struct leaf_schema_fn { col_schema.converted_type = ConvertedType::TIME_MILLIS; col_schema.stats_dtype = statistics_dtype::dtype_int32; col_schema.ts_scale = 24 * 60 * 60 * 1000; - col_schema.logical_type = LogicalType{TimeType{false, TimeUnit::MILLIS}}; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; } template @@ -463,7 +464,7 @@ struct leaf_schema_fn { col_schema.converted_type = ConvertedType::TIME_MILLIS; col_schema.stats_dtype = statistics_dtype::dtype_int32; col_schema.ts_scale = 1000; - col_schema.logical_type = LogicalType{TimeType{false, TimeUnit::MILLIS}}; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; } template @@ -472,7 +473,7 @@ struct leaf_schema_fn { col_schema.type = 
Type::INT32; col_schema.converted_type = ConvertedType::TIME_MILLIS; col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.logical_type = LogicalType{TimeType{false, TimeUnit::MILLIS}}; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; } template <typename T> @@ -481,7 +482,7 @@ { col_schema.type = Type::INT64; col_schema.converted_type = ConvertedType::TIME_MICROS; col_schema.stats_dtype = statistics_dtype::dtype_int64; - col_schema.logical_type = LogicalType{TimeType{false, TimeUnit::MICROS}}; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MICROS}}; } // unsupported outside cudf for parquet 1.0. template <typename T> @@ -490,7 +491,7 @@ { col_schema.type = Type::INT64; col_schema.stats_dtype = statistics_dtype::dtype_int64; - col_schema.logical_type = LogicalType{TimeType{false, TimeUnit::NANOS}}; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::NANOS}}; } template <typename T> @@ -567,7 +568,8 @@ std::vector<schema_tree_node> construct_schema_tree( cudf::detail::LinkedColVector const& linked_columns, table_input_metadata& metadata, single_write_mode write_mode, - bool int96_timestamps) + bool int96_timestamps, + bool utc_timestamps) { std::vector<schema_tree_node> schema; schema_tree_node root{}; @@ -739,8 +741,9 @@ std::vector<schema_tree_node> construct_schema_tree( bool timestamp_is_int96 = int96_timestamps or col_meta.is_enabled_int96_timestamps(); - cudf::type_dispatcher(col->type(), - leaf_schema_fn{col_schema, col, col_meta, timestamp_is_int96}); + cudf::type_dispatcher( + col->type(), + leaf_schema_fn{col_schema, col, col_meta, timestamp_is_int96, utc_timestamps}); col_schema.repetition_type = col_nullable ? OPTIONAL : REQUIRED; col_schema.name = (schema[parent_idx].name == "list") ? "element" : col_meta.get_name(); @@ -1467,6 +1470,7 @@ void fill_table_meta(std::unique_ptr<table_input_metadata> const& table_meta) * @param max_dictionary_size Maximum dictionary size, in bytes * @param single_write_mode Flag to indicate that we are guaranteeing a single table write * @param int96_timestamps Flag to indicate if timestamps will be written as INT96 + * @param utc_timestamps Flag to indicate if timestamps are UTC * @param write_v2_headers True if V2 page headers are to be written * @param out_sink Sink for checking if device write is supported, should not be used to write any * data in this function @@ -1491,12 +1495,14 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, size_t max_dictionary_size, single_write_mode write_mode, bool int96_timestamps, + bool utc_timestamps, bool write_v2_headers, host_span<std::unique_ptr<data_sink> const> out_sink, rmm::cuda_stream_view stream) { - auto vec = table_to_linked_columns(input); - auto schema_tree = construct_schema_tree(vec, table_meta, write_mode, int96_timestamps); + auto vec = table_to_linked_columns(input); + auto schema_tree = + construct_schema_tree(vec, table_meta, write_mode, int96_timestamps, utc_timestamps); // Construct parquet_column_views from the schema tree leaf nodes.
std::vector<parquet_column_view> parquet_columns; @@ -2026,6 +2032,7 @@ writer::impl::impl(std::vector<std::unique_ptr<data_sink>> sinks, _max_dictionary_size(options.get_max_dictionary_size()), _max_page_fragment_size(options.get_max_page_fragment_size()), _int96_timestamps(options.is_enabled_int96_timestamps()), + _utc_timestamps(options.is_enabled_utc_timestamps()), _write_v2_headers(options.is_enabled_write_v2_headers()), _column_index_truncate_length(options.get_column_index_truncate_length()), _kv_meta(options.get_key_value_metadata()), @@ -2054,6 +2061,7 @@ writer::impl::impl(std::vector<std::unique_ptr<data_sink>> sinks, _max_dictionary_size(options.get_max_dictionary_size()), _max_page_fragment_size(options.get_max_page_fragment_size()), _int96_timestamps(options.is_enabled_int96_timestamps()), + _utc_timestamps(options.is_enabled_utc_timestamps()), _write_v2_headers(options.is_enabled_write_v2_headers()), _column_index_truncate_length(options.get_column_index_truncate_length()), _kv_meta(options.get_key_value_metadata()), @@ -2131,6 +2139,7 @@ void writer::impl::write(table_view const& input, std::vector<partition_info> co _max_dictionary_size, _single_write_mode, _int96_timestamps, + _utc_timestamps, _write_v2_headers, _out_sink, _stream); @@ -2394,6 +2403,15 @@ std::unique_ptr<std::vector<uint8_t>> writer::merge_row_group_metadata( } } + // Remove any LogicalType::UNKNOWN annotations that were passed in as they can confuse + // column type inferencing. + // See https://github.com/rapidsai/cudf/pull/14264#issuecomment-1778311615 + for (auto& se : md.schema) { + if (se.logical_type.has_value() && se.logical_type.value().type == LogicalType::UNKNOWN) { + se.logical_type = thrust::nullopt; + } + } + // Thrift-encode the resulting output file_header_s fhdr; file_ender_s fendr; diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 1d27a8400c8..3415205d179 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -157,6 +157,7 @@ class writer::impl { size_t const _max_dictionary_size; std::optional<size_type> const _max_page_fragment_size; bool const _int96_timestamps; + bool const _utc_timestamps; bool const _write_v2_headers; int32_t const _column_index_truncate_length; std::vector<std::map<std::string, std::string>> const _kv_meta; // Optional user metadata.
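For reference, a minimal usage sketch of the new writer option; this snippet is illustrative only and not part of the patch. The table `tbl` and the output path are hypothetical placeholders; the only assumption taken from the patch itself is the `utc_timestamps` builder method added above.

#include <cudf/io/parquet.hpp>
#include <cudf/table/table_view.hpp>

// Write `tbl` with timestamps annotated as local time rather than UTC,
// overriding the new isAdjustedToUTC=true default. This mirrors the explicit
// .utc_timestamps(False) call added to the Python bindings below.
void write_local_timestamps(cudf::table_view const& tbl)
{
  auto const options =
    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{"out.parquet"}, tbl)
      .utc_timestamps(false)  // timestamps are not adjusted to UTC
      .build();
  cudf::io::write_parquet(options);
}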
diff --git a/python/cudf/cudf/_lib/cpp/io/parquet.pxd b/python/cudf/cudf/_lib/cpp/io/parquet.pxd index 2b92b9b58d3..cace29b5d45 100644 --- a/python/cudf/cudf/_lib/cpp/io/parquet.pxd +++ b/python/cudf/cudf/_lib/cpp/io/parquet.pxd @@ -90,6 +90,12 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: void set_column_chunks_file_paths( vector[string] column_chunks_file_paths ) except + + void set_int96_timestamps( + bool enabled + ) except + + void set_utc_timestamps( + bool enabled + ) except + void set_row_group_size_bytes(size_t val) except + void set_row_group_size_rows(size_type val) except + void set_max_page_size_bytes(size_t val) except + @@ -129,6 +135,9 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: parquet_writer_options_builder& int96_timestamps( bool enabled ) except + + parquet_writer_options_builder& utc_timestamps( + bool enabled + ) except + parquet_writer_options_builder& row_group_size_bytes( size_t val ) except + @@ -172,6 +181,12 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: void set_compression( cudf_io_types.compression_type compression ) except + + void set_int96_timestamps( + bool enabled + ) except + + void set_utc_timestamps( + bool enabled + ) except + void set_row_group_size_bytes(size_t val) except + void set_row_group_size_rows(size_type val) except + void set_max_page_size_bytes(size_t val) except + @@ -199,6 +214,12 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: chunked_parquet_writer_options_builder& compression( cudf_io_types.compression_type compression ) except + + chunked_parquet_writer_options_builder& int96_timestamps( + bool enabled + ) except + + chunked_parquet_writer_options_builder& utc_timestamps( + bool enabled + ) except + chunked_parquet_writer_options_builder& row_group_size_bytes( size_t val ) except + diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 85fd25cf1a9..f75a6c2b20e 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -399,6 +399,7 @@ def write_parquet( .compression(comp_type) .stats_level(stat_freq) .int96_timestamps(_int96_timestamps) + .utc_timestamps(False) .build() ) if partitions_info is not None: diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 234b8fc5212..7b4e20012f7 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -148,7 +148,6 @@ def test_roundtrip_from_pandas(tmpdir): def test_strings(tmpdir): - fn = str(tmpdir) dfp = pd.DataFrame( {"a": ["aa", "bbb", "cccc"], "b": ["hello", "dog", "man"]} @@ -161,7 +160,6 @@ def test_strings(tmpdir): def test_dask_timeseries_from_pandas(tmpdir): - fn = str(tmpdir.join("test.parquet")) ddf2 = dask.datasets.timeseries(freq="D") pdf = ddf2.compute() @@ -173,7 +171,6 @@ def test_dask_timeseries_from_pandas(tmpdir): @pytest.mark.parametrize("index", [False, None]) @pytest.mark.parametrize("divisions", [False, True]) def test_dask_timeseries_from_dask(tmpdir, index, divisions): - fn = str(tmpdir) ddf2 = dask.datasets.timeseries(freq="D") ddf2.to_parquet(fn, engine="pyarrow", write_index=index) @@ -188,7 +185,6 @@ def test_dask_timeseries_from_dask(tmpdir, index, divisions): @pytest.mark.parametrize("index", [False, None]) @pytest.mark.parametrize("divisions", [False, True]) def test_dask_timeseries_from_daskcudf(tmpdir, index, divisions): - fn = str(tmpdir) ddf2 = dask_cudf.from_cudf( 
cudf.datasets.timeseries(freq="D"), npartitions=4 @@ -205,7 +201,6 @@ def test_dask_timeseries_from_daskcudf(tmpdir, index, divisions): @pytest.mark.parametrize("index", [False, True]) def test_empty(tmpdir, index): - fn = str(tmpdir) dfp = pd.DataFrame({"a": [11.0, 12.0, 12.0], "b": [4, 5, 6]})[:0] if index: @@ -218,7 +213,6 @@ def test_empty(tmpdir, index): def test_filters(tmpdir): - tmp_path = str(tmpdir) df = pd.DataFrame({"x": range(10), "y": list("aabbccddee")}) ddf = dd.from_pandas(df, npartitions=5) @@ -251,7 +245,6 @@ def test_filters(tmpdir): @pytest.mark.parametrize("numeric", [True, False]) @pytest.mark.parametrize("null", [np.nan, None]) def test_isna_filters(tmpdir, null, numeric): - tmp_path = str(tmpdir) df = pd.DataFrame( { @@ -284,7 +277,6 @@ def test_isna_filters(tmpdir, null, numeric): def test_filters_at_row_group_level(tmpdir): - tmp_path = str(tmpdir) df = pd.DataFrame({"x": range(10), "y": list("aabbccddee")}) ddf = dd.from_pandas(df, npartitions=5) @@ -405,7 +397,6 @@ def test_split_row_groups(tmpdir, row_groups, index): @need_create_meta @pytest.mark.parametrize("partition_on", [None, "a"]) def test_create_metadata_file(tmpdir, partition_on): - tmpdir = str(tmpdir) # Write ddf without a _metadata file @@ -445,7 +436,6 @@ def test_create_metadata_file(tmpdir, partition_on): @need_create_meta def test_create_metadata_file_inconsistent_schema(tmpdir): - # NOTE: This test demonstrates that the CudfEngine # can be used to generate a global `_metadata` file # even if there are inconsistent schemas in the dataset. @@ -481,9 +471,7 @@ def test_create_metadata_file_inconsistent_schema(tmpdir): # call `compute` on `ddf1`, because the dtype of # the inconsistent column ("a") may be "object" # before computing, and "int" after - # TODO: Uncomment after cudf#14326 is closed - # (See: https://github.com/rapidsai/cudf/issues/14326) - # dd.assert_eq(ddf1.compute(), ddf2) + dd.assert_eq(ddf1.compute(), ddf2) dd.assert_eq(ddf1.compute(), ddf2.compute()) From 7358ecd71fffaa449395606571f5dfac0bb5741e Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 31 Oct 2023 09:46:08 -0700 Subject: [PATCH 067/118] Update versioning strategy (#14285) See https://github.com/rapidsai/rmm/pull/1347 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Lawrence Mitchell (https://github.com/wence-) - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/14285 --- VERSION | 1 + ci/build_cpp.sh | 4 +++- ci/build_python.sh | 17 +++++++++++++---- ci/build_wheel.sh | 8 ++++---- ci/release/update-version.sh | 13 ++----------- conda/recipes/cudf/meta.yaml | 4 ++-- conda/recipes/cudf_kafka/meta.yaml | 4 ++-- conda/recipes/custreamz/meta.yaml | 4 ++-- conda/recipes/dask-cudf/meta.yaml | 4 ++-- conda/recipes/libcudf/meta.yaml | 4 ++-- python/cudf/cudf/VERSION | 1 + python/cudf/cudf/__init__.py | 3 +-- python/cudf/cudf/_version.py | 20 ++++++++++++++++++++ python/cudf/pyproject.toml | 5 ++++- python/cudf/setup.py | 4 +++- python/cudf_kafka/cudf_kafka/VERSION | 1 + python/cudf_kafka/cudf_kafka/__init__.py | 3 +++ python/cudf_kafka/cudf_kafka/_version.py | 23 +++++++++++++++++++++++ python/cudf_kafka/pyproject.toml | 5 ++++- python/cudf_kafka/setup.py | 2 +- python/custreamz/custreamz/VERSION | 1 + python/custreamz/custreamz/__init__.py | 3 ++- python/custreamz/custreamz/_version.py | 23 +++++++++++++++++++++++ python/custreamz/pyproject.toml | 5 ++++- python/custreamz/setup.py | 4 +++- 
python/dask_cudf/dask_cudf/VERSION | 1 + python/dask_cudf/dask_cudf/__init__.py | 3 +-- python/dask_cudf/dask_cudf/_version.py | 23 +++++++++++++++++++++++ python/dask_cudf/pyproject.toml | 6 ++++-- python/dask_cudf/setup.py | 5 ++++- 30 files changed, 160 insertions(+), 44 deletions(-) create mode 100644 VERSION create mode 120000 python/cudf/cudf/VERSION create mode 100644 python/cudf/cudf/_version.py create mode 120000 python/cudf_kafka/cudf_kafka/VERSION create mode 100644 python/cudf_kafka/cudf_kafka/_version.py create mode 120000 python/custreamz/custreamz/VERSION create mode 100644 python/custreamz/custreamz/_version.py create mode 120000 python/dask_cudf/dask_cudf/VERSION create mode 100644 python/dask_cudf/dask_cudf/_version.py diff --git a/VERSION b/VERSION new file mode 100644 index 00000000000..a193fff41e8 --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +23.12.00 diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index 8b757fecf5a..f1ad8ee7778 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -9,10 +9,12 @@ export CMAKE_GENERATOR=Ninja rapids-print-env +version=$(rapids-generate-version) + rapids-logger "Begin cpp build" # With boa installed conda build forward to boa -rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \ conda/recipes/libcudf rapids-upload-conda-to-s3 cpp diff --git a/ci/build_python.sh b/ci/build_python.sh index 61f160b25f5..32fe7b6b3ce 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -9,6 +9,15 @@ export CMAKE_GENERATOR=Ninja rapids-print-env +package_dir="python" +version=$(rapids-generate-version) +commit=$(git rev-parse HEAD) + +echo "${version}" > VERSION +for package_name in cudf dask_cudf cudf_kafka custreamz; do + sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" ${package_dir}/${package_name}/${package_name}/_version.py +done + rapids-logger "Begin py build" CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) @@ -16,24 +25,24 @@ CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) # TODO: Remove `--no-test` flag once importing on a CPU # node works correctly # With boa installed conda build forwards to the boa builder -rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ conda/recipes/cudf -rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/dask-cudf -rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/cudf_kafka -rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 06d0c3c7a56..a770c260bef 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -9,9 +9,8 @@ package_dir=$2 source rapids-configure-sccache source rapids-date-string -# Use gha-tools rapids-pip-wheel-version to generate wheel version then -# update the necessary files -version_override="$(rapids-pip-wheel-version ${RAPIDS_DATE_STRING})" +version=$(rapids-generate-version) +commit=$(git rev-parse HEAD) RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" @@ -22,8 +21,9 @@ PACKAGE_CUDA_SUFFIX="-${RAPIDS_PY_CUDA_SUFFIX}" # Patch project metadata files to include 
the CUDA version suffix and version override. pyproject_file="${package_dir}/pyproject.toml" -sed -i "s/^version = .*/version = \"${version_override}\"/g" ${pyproject_file} sed -i "s/name = \"${package_name}\"/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file} +echo "${version}" > VERSION +sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" "${package_dir}/${package_name}/_version.py" # For nightlies we want to ensure that we're pulling in alphas as well. The # easiest way to do so is to augment the spec with a constraint containing a diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index eac64fe1a0f..7574b4174e9 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -50,17 +50,8 @@ sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' # cpp cudf_jni update sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' java/src/main/native/CMakeLists.txt -# Python __init__.py updates -sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/cudf/cudf/__init__.py -sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/dask_cudf/dask_cudf/__init__.py -sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/cudf_kafka/cudf_kafka/__init__.py -sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/custreamz/custreamz/__init__.py - -# Python pyproject.toml updates -sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/cudf/pyproject.toml -sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/dask_cudf/pyproject.toml -sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/cudf_kafka/pyproject.toml -sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/custreamz/pyproject.toml +# Centralized version file update +echo "${NEXT_FULL_TAG}" > VERSION # Wheel testing script sed_runner "s/branch-.*/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_dask_cudf.sh diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 619df00087c..1ed07a85b88 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -1,6 +1,6 @@ # Copyright (c) 2018-2023, NVIDIA CORPORATION. -{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %} +{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version = environ['CONDA_PY'] %} {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} @@ -12,7 +12,7 @@ package: version: {{ version }} source: - git_url: ../../.. + path: ../../.. build: number: {{ GIT_DESCRIBE_NUMBER }} diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index a79c23b7d98..cdc547b4d68 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -1,6 +1,6 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. -{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %} +{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version = environ['CONDA_PY'] %} {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} @@ -12,7 +12,7 @@ package: version: {{ version }} source: - git_url: ../../.. + path: ../../.. 
build: number: {{ GIT_DESCRIBE_NUMBER }} diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index c5d14f1c884..fb6efabffd4 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -1,6 +1,6 @@ # Copyright (c) 2018-2023, NVIDIA CORPORATION. -{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %} +{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version = environ['CONDA_PY'] %} {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} @@ -12,7 +12,7 @@ package: version: {{ version }} source: - git_url: ../../.. + path: ../../.. build: number: {{ GIT_DESCRIBE_NUMBER }} diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index 444a9850c74..9dc9f76d9f5 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -1,6 +1,6 @@ # Copyright (c) 2018-2023, NVIDIA CORPORATION. -{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %} +{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version = environ['CONDA_PY'] %} {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} @@ -12,7 +12,7 @@ package: version: {{ version }} source: - git_url: ../../.. + path: ../../.. build: number: {{ GIT_DESCRIBE_NUMBER }} diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index ecd777bf91f..0459908fd00 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -1,6 +1,6 @@ # Copyright (c) 2018-2023, NVIDIA CORPORATION. -{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %} +{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} {% set cuda_major = cuda_version.split('.')[0] %} @@ -11,7 +11,7 @@ package: name: libcudf-split source: - git_url: ../../.. + path: ../../.. build: script_env: diff --git a/python/cudf/cudf/VERSION b/python/cudf/cudf/VERSION new file mode 120000 index 00000000000..d62dc733efd --- /dev/null +++ b/python/cudf/cudf/VERSION @@ -0,0 +1 @@ +../../../VERSION \ No newline at end of file diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 8d25d478676..02274a5fdd1 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -17,6 +17,7 @@ from rmm.allocators.numba import RMMNumbaManager from cudf import api, core, datasets, testing +from cudf._version import __git_commit__, __version__ from cudf.api.extensions import ( register_dataframe_accessor, register_index_accessor, @@ -99,8 +100,6 @@ rmm.register_reinitialize_hook(clear_cache) -__version__ = "23.12.00" - __all__ = [ "BaseIndex", "CategoricalDtype", diff --git a/python/cudf/cudf/_version.py b/python/cudf/cudf/_version.py new file mode 100644 index 00000000000..ecf6ddd8e3b --- /dev/null +++ b/python/cudf/cudf/_version.py @@ -0,0 +1,20 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import importlib.resources + +__version__ = ( + importlib.resources.files("cudf").joinpath("VERSION").read_text().strip() +) +__git_commit__ = "" diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 4a453da0a4c..e934846ec35 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -17,7 +17,7 @@ requires = [ [project] name = "cudf" -version = "23.12.00" +dynamic = ["version"] description = "cuDF - GPU Dataframe" readme = { file = "README.md", content-type = "text/markdown" } authors = [ @@ -78,6 +78,9 @@ Documentation = "https://docs.rapids.ai/api/cudf/stable/" [tool.setuptools] license-files = ["LICENSE"] +[tool.setuptools.dynamic] +version = {file = "cudf/VERSION"} + [tool.isort] line_length = 79 multi_line_output = 3 diff --git a/python/cudf/setup.py b/python/cudf/setup.py index 96b91b4ccc0..984cd63a7c9 100644 --- a/python/cudf/setup.py +++ b/python/cudf/setup.py @@ -6,6 +6,8 @@ packages = find_packages(include=["cudf*", "udf_cpp*"]) setup( packages=packages, - package_data={key: ["*.pxd", "*.hpp", "*.cuh"] for key in packages}, + package_data={ + key: ["VERSION", "*.pxd", "*.hpp", "*.cuh"] for key in packages + }, zip_safe=False, ) diff --git a/python/cudf_kafka/cudf_kafka/VERSION b/python/cudf_kafka/cudf_kafka/VERSION new file mode 120000 index 00000000000..d62dc733efd --- /dev/null +++ b/python/cudf_kafka/cudf_kafka/VERSION @@ -0,0 +1 @@ +../../../VERSION \ No newline at end of file diff --git a/python/cudf_kafka/cudf_kafka/__init__.py b/python/cudf_kafka/cudf_kafka/__init__.py index e69de29bb2d..43a91af9cf5 100644 --- a/python/cudf_kafka/cudf_kafka/__init__.py +++ b/python/cudf_kafka/cudf_kafka/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) 2020-2023, NVIDIA CORPORATION. + +from ._version import __git_commit__, __version__ diff --git a/python/cudf_kafka/cudf_kafka/_version.py b/python/cudf_kafka/cudf_kafka/_version.py new file mode 100644 index 00000000000..5adab566da0 --- /dev/null +++ b/python/cudf_kafka/cudf_kafka/_version.py @@ -0,0 +1,23 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import importlib.resources + +__version__ = ( + importlib.resources.files("cudf_kafka") + .joinpath("VERSION") + .read_text() + .strip() +) +__git_commit__ = "" diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 5058412ef60..5d2588fa6f7 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -12,7 +12,7 @@ requires = [ [project] name = "cudf_kafka" -version = "23.12.00" +dynamic = ["version"] description = "cuDF Kafka Datasource" readme = { file = "README.md", content-type = "text/markdown" } authors = [ @@ -38,6 +38,9 @@ Documentation = "https://docs.rapids.ai/api/cudf/stable/" [tool.setuptools] license-files = ["LICENSE"] +[tool.setuptools.dynamic] +version = {file = "cudf_kafka/VERSION"} + [tool.isort] line_length = 79 multi_line_output = 3 diff --git a/python/cudf_kafka/setup.py b/python/cudf_kafka/setup.py index d955d95858a..6f3909d4528 100644 --- a/python/cudf_kafka/setup.py +++ b/python/cudf_kafka/setup.py @@ -91,6 +91,6 @@ ), ), packages=packages, - package_data={key: ["*.pxd"] for key in packages}, + package_data={key: ["VERSION", "*.pxd"] for key in packages}, zip_safe=False, ) diff --git a/python/custreamz/custreamz/VERSION b/python/custreamz/custreamz/VERSION new file mode 120000 index 00000000000..d62dc733efd --- /dev/null +++ b/python/custreamz/custreamz/VERSION @@ -0,0 +1 @@ +../../../VERSION \ No newline at end of file diff --git a/python/custreamz/custreamz/__init__.py b/python/custreamz/custreamz/__init__.py index 52be76aab1f..3f11da14684 100644 --- a/python/custreamz/custreamz/__init__.py +++ b/python/custreamz/custreamz/__init__.py @@ -1,3 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. +from ._version import __git_commit__, __version__ from .kafka import Consumer diff --git a/python/custreamz/custreamz/_version.py b/python/custreamz/custreamz/_version.py new file mode 100644 index 00000000000..0f545f95f2b --- /dev/null +++ b/python/custreamz/custreamz/_version.py @@ -0,0 +1,23 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import importlib.resources + +__version__ = ( + importlib.resources.files("custreamz") + .joinpath("VERSION") + .read_text() + .strip() +) +__git_commit__ = "" diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index e6328ed045d..2d0059d5aa9 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -9,7 +9,7 @@ requires = [ [project] name = "custreamz" -version = "23.12.00" +dynamic = ["version"] description = "cuStreamz - GPU Accelerated Streaming" readme = { file = "README.md", content-type = "text/markdown" } authors = [ @@ -48,6 +48,9 @@ Homepage = "https://github.com/rapidsai/cudf" license-files = ["LICENSE"] zip-safe = false +[tool.setuptools.dynamic] +version = {file = "custreamz/VERSION"} + [tools.setuptools.packages.find] include = [ "custreamz", diff --git a/python/custreamz/setup.py b/python/custreamz/setup.py index 2fa45ac8087..04943bf88e2 100644 --- a/python/custreamz/setup.py +++ b/python/custreamz/setup.py @@ -2,4 +2,6 @@ from setuptools import setup -setup() +setup( + package_data={"custreamz": ["VERSION"]}, +) diff --git a/python/dask_cudf/dask_cudf/VERSION b/python/dask_cudf/dask_cudf/VERSION new file mode 120000 index 00000000000..d62dc733efd --- /dev/null +++ b/python/dask_cudf/dask_cudf/VERSION @@ -0,0 +1 @@ +../../../VERSION \ No newline at end of file diff --git a/python/dask_cudf/dask_cudf/__init__.py b/python/dask_cudf/dask_cudf/__init__.py index 7c81f5da481..c152a9e6a81 100644 --- a/python/dask_cudf/dask_cudf/__init__.py +++ b/python/dask_cudf/dask_cudf/__init__.py @@ -5,6 +5,7 @@ import cudf from . import backends +from ._version import __git_commit__, __version__ from .core import DataFrame, Series, concat, from_cudf, from_dask_dataframe from .groupby import groupby_agg from .io import read_csv, read_json, read_orc, read_text, to_orc @@ -14,8 +15,6 @@ except ImportError: pass -__version__ = "23.12.00" - __all__ = [ "DataFrame", "Series", diff --git a/python/dask_cudf/dask_cudf/_version.py b/python/dask_cudf/dask_cudf/_version.py new file mode 100644 index 00000000000..0dd62854a4e --- /dev/null +++ b/python/dask_cudf/dask_cudf/_version.py @@ -0,0 +1,23 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import importlib.resources + +__version__ = ( + importlib.resources.files("dask_cudf") + .joinpath("VERSION") + .read_text() + .strip() +) +__git_commit__ = "" diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 8461c51c573..32c7bb9fd15 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -9,7 +9,7 @@ requires = [ [project] name = "dask_cudf" -version = "23.12.00" +dynamic = ["version", "entry-points"] description = "Utilities for Dask and cuDF interactions" readme = { file = "README.md", content-type = "text/markdown" } authors = [ @@ -35,7 +35,6 @@ classifiers = [ "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", ] -dynamic = ["entry-points"] [project.optional-dependencies] test = [ @@ -52,6 +51,9 @@ Homepage = "https://github.com/rapidsai/cudf" [tool.setuptools] license-files = ["LICENSE"] +[tool.setuptools.dynamic] +version = {file = "dask_cudf/VERSION"} + [tool.isort] line_length = 79 multi_line_output = 3 diff --git a/python/dask_cudf/setup.py b/python/dask_cudf/setup.py index 3fa0f257834..c6ce219d32f 100644 --- a/python/dask_cudf/setup.py +++ b/python/dask_cudf/setup.py @@ -2,9 +2,12 @@ from setuptools import find_packages, setup +packages = find_packages(exclude=["tests", "tests.*"]) + setup( include_package_data=True, - packages=find_packages(exclude=["tests", "tests.*"]), + packages=packages, + package_data={key: ["VERSION"] for key in packages}, entry_points={ "dask.dataframe.backends": [ "cudf = dask_cudf.backends:CudfBackendEntrypoint", From cb06c20c36b2338915ed38a4c37d4db9a5bd3d79 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 31 Oct 2023 11:04:02 -0700 Subject: [PATCH 068/118] Sort dictionary data alphabetically in the ORC writer (#14295) Strings in the dictionary data streams are now sorted alphabetically. Reduces file size in some cases because compression can be more efficient. Reduces throughput up to 22% when writing strings columns (3% speedup when dictionary encoding is not used, though!). Benchmark data does not demonstrate the compression difference, but we have some user data that compresses almost 30% better. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - David Wendt (https://github.com/davidwendt) - Divye Gala (https://github.com/divyegala) - Alessandro Bellina (https://github.com/abellina) URL: https://github.com/rapidsai/cudf/pull/14295 --- cpp/include/cudf/io/orc.hpp | 56 ++++++++++++++++ cpp/src/io/orc/orc_gpu.hpp | 14 ++-- cpp/src/io/orc/stripe_enc.cu | 4 ++ cpp/src/io/orc/writer_impl.cu | 113 +++++++++++++++++++++++++++------ cpp/src/io/orc/writer_impl.hpp | 1 + cpp/tests/io/orc_test.cpp | 30 +++++++++ 6 files changed, 191 insertions(+), 27 deletions(-) diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 5801d2c1008..c2762b05aa6 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -450,6 +450,8 @@ class orc_writer_options { std::map _user_data; // Optional compression statistics std::shared_ptr _compression_stats; + // Specify whether string dictionaries should be alphabetically sorted + bool _enable_dictionary_sort = true; friend orc_writer_options_builder; @@ -572,6 +574,13 @@ class orc_writer_options { return _compression_stats; } + /** + * @brief Returns whether string dictionaries should be sorted. 
+ * + * @return `true` if string dictionaries should be sorted + */ + [[nodiscard]] bool get_enable_dictionary_sort() const { return _enable_dictionary_sort; } + // Setters /** @@ -670,6 +679,13 @@ class orc_writer_options { { _compression_stats = std::move(comp_stats); } + + /** + * @brief Sets whether string dictionaries should be sorted. + * + * @param val Boolean value to enable/disable + */ + void set_enable_dictionary_sort(bool val) { _enable_dictionary_sort = val; } }; /** @@ -810,6 +826,18 @@ class orc_writer_options_builder { return *this; } + /** + * @brief Sets whether string dictionaries should be sorted. + * + * @param val Boolean value to enable/disable + * @return this for chaining + */ + orc_writer_options_builder& enable_dictionary_sort(bool val) + { + options._enable_dictionary_sort = val; + return *this; + } + /** * @brief move orc_writer_options member once it's built. */ @@ -866,6 +894,8 @@ class chunked_orc_writer_options { std::map _user_data; // Optional compression statistics std::shared_ptr _compression_stats; + // Specify whether string dictionaries should be alphabetically sorted + bool _enable_dictionary_sort = true; friend chunked_orc_writer_options_builder; @@ -966,6 +996,13 @@ class chunked_orc_writer_options { return _compression_stats; } + /** + * @brief Returns whether string dictionaries should be sorted. + * + * @return `true` if string dictionaries should be sorted + */ + [[nodiscard]] bool get_enable_dictionary_sort() const { return _enable_dictionary_sort; } + // Setters /** @@ -1057,6 +1094,13 @@ class chunked_orc_writer_options { { _compression_stats = std::move(comp_stats); } + + /** + * @brief Sets whether string dictionaries should be sorted. + * + * @param val Boolean value to enable/disable + */ + void set_enable_dictionary_sort(bool val) { _enable_dictionary_sort = val; } }; /** @@ -1183,6 +1227,18 @@ class chunked_orc_writer_options_builder { return *this; } + /** + * @brief Sets whether string dictionaries should be sorted. + * + * @param val Boolean value to enable/disable + * @return this for chaining + */ + chunked_orc_writer_options_builder& enable_dictionary_sort(bool val) + { + options._enable_dictionary_sort = val; + return *this; + } + /** * @brief move chunked_orc_writer_options member once it's built. 
*/ diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp index 5669a20907d..243704b65d4 100644 --- a/cpp/src/io/orc/orc_gpu.hpp +++ b/cpp/src/io/orc/orc_gpu.hpp @@ -150,7 +150,8 @@ struct EncChunk { uint8_t dtype_len; // data type length int32_t scale; // scale for decimals or timestamps - uint32_t* dict_index; // dictionary index from row index + uint32_t* dict_index; // dictionary index from row index + uint32_t* dict_data_order; // map from data to sorted data indices uint32_t* decimal_offsets; orc_column_device_view const* column; }; @@ -191,11 +192,12 @@ struct stripe_dictionary { size_type num_rows = 0; // number of rows in the stripe // output - device_span data; // index of elements in the column to include in the dictionary - device_span index; // index into the dictionary for each row in the column - size_type entry_count = 0; // number of entries in the dictionary - size_type char_count = 0; // number of characters in the dictionary - bool is_enabled = false; // true if dictionary encoding is enabled for this stripe + device_span data; // index of elements in the column to include in the dictionary + device_span index; // index into the dictionary for each row in the column + device_span data_order; // map from data to sorted data indices + size_type entry_count = 0; // number of entries in the dictionary + size_type char_count = 0; // number of characters in the dictionary + bool is_enabled = false; // true if dictionary encoding is enabled for this stripe }; /** diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 5c75ba22159..b99826e070e 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -837,6 +837,10 @@ __global__ void __launch_bounds__(block_size) if (dict_idx > 0x7fff'ffffu) { dict_idx = s->chunk.dict_index[dict_idx & 0x7fff'ffffu]; } + // translate dictionary index to sorted order, if enabled + if (s->chunk.dict_data_order != nullptr) { + dict_idx = s->chunk.dict_data_order[dict_idx]; + } s->vals.u32[nz_idx] = dict_idx; } else { string_view value = column.element(row); diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 81629e03a82..ac5993e764e 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -50,6 +51,8 @@ #include #include #include +#include +#include #include #include @@ -867,16 +870,15 @@ encoded_data encode_columns(orc_table_view const& orc_table, ck.null_mask_num_rows = aligned_rowgroups[rg_idx][column.index()].size(); ck.encoding_kind = column.orc_encoding(); ck.type_kind = column.orc_kind(); - if (ck.type_kind == TypeKind::STRING) { - ck.dict_index = (ck.encoding_kind == DICTIONARY_V2) - ? column.host_stripe_dict(stripe.id).index.data() - : nullptr; - ck.dtype_len = 1; - } else { - ck.dtype_len = column.type_width(); - } - ck.scale = column.scale(); - if (ck.type_kind == TypeKind::DECIMAL) { ck.decimal_offsets = column.decimal_offsets(); } + auto const is_str_dict = + ck.type_kind == TypeKind::STRING and ck.encoding_kind == DICTIONARY_V2; + ck.dict_index = is_str_dict ? column.host_stripe_dict(stripe.id).index.data() : nullptr; + ck.dict_data_order = + is_str_dict ? column.host_stripe_dict(stripe.id).data_order.data() : nullptr; + ck.dtype_len = (ck.type_kind == TypeKind::STRING) ? 1 : column.type_width(); + ck.scale = column.scale(); + ck.decimal_offsets = + (ck.type_kind == TypeKind::DECIMAL) ? 
column.decimal_offsets() : nullptr; } } } @@ -2012,24 +2014,41 @@ struct stripe_dictionaries { hostdevice_2dvector<gpu::stripe_dictionary> views; // descriptors [string_column][stripe] std::vector<rmm::device_uvector<uint32_t>> data_owner; // dictionary data owner, per stripe std::vector<rmm::device_uvector<uint32_t>> index_owner; // dictionary index owner, per stripe + std::vector<rmm::device_uvector<uint32_t>> order_owner; // dictionary order owner, per stripe // Should be called after encoding is complete to deallocate the dictionary buffers. void on_encode_complete(rmm::cuda_stream_view stream) { data_owner.clear(); index_owner.clear(); + order_owner.clear(); for (auto& sd : views.host_view().flat_view()) { - sd.data = {}; - sd.index = {}; + sd.data = {}; + sd.index = {}; + sd.data_order = {}; } views.host_to_device_async(stream); } }; +/** + * @brief Compares two rows in a strings column + */ +struct string_rows_less { + device_span<orc_column_device_view> cols; + uint32_t col_idx; + __device__ bool operator()(size_type lhs_idx, size_type rhs_idx) const + { + auto const& col = cols[col_idx]; + return col.element<string_view>(lhs_idx) < col.element<string_view>(rhs_idx); + } +}; + // Build stripe dictionaries for string columns stripe_dictionaries build_dictionaries(orc_table_view& orc_table, file_segmentation const& segmentation, + bool sort_dictionaries, rmm::cuda_stream_view stream) { std::vector<std::vector<rmm::device_uvector<gpu::slot_type>>> hash_maps_storage( @@ -2080,6 +2099,7 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, // Data owners; can be cleared after encode std::vector<rmm::device_uvector<uint32_t>> dict_data_owner; std::vector<rmm::device_uvector<uint32_t>> dict_index_owner; + std::vector<rmm::device_uvector<uint32_t>> dict_order_owner; // Make decision about which stripes to encode with dictionary encoding for (auto col_idx : orc_table.string_column_indices) { auto& str_column = orc_table.column(col_idx); @@ -2122,15 +2142,61 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, gpu::collect_map_entries(stripe_dicts, stream); gpu::get_dictionary_indices(stripe_dicts, orc_table.d_columns, stream); - // Clear map slots; hash map storage is deallocated at the end of this function - auto device_dicts_flat = stripe_dicts.device_view().flat_view(); - thrust::for_each(rmm::exec_policy(stream), - device_dicts_flat.begin(), - device_dicts_flat.end(), - [] __device__(auto& sd) { sd.map_slots = {}; }); - stripe_dicts.device_to_host_async(stream); + // deallocate hash map storage, unused after this point + hash_maps_storage.clear(); + + // Clear map slots and attach order buffers + auto dictionaries_flat = stripe_dicts.host_view().flat_view(); + for (auto& sd : dictionaries_flat) { + if (not sd.is_enabled) { continue; } + + sd.map_slots = {}; + if (sort_dictionaries) { + dict_order_owner.emplace_back(sd.entry_count, stream); + sd.data_order = dict_order_owner.back(); + } else { + sd.data_order = {}; + } + } + stripe_dicts.host_to_device_async(stream); + + // Sort stripe dictionaries alphabetically + if (sort_dictionaries) { + auto streams = cudf::detail::fork_streams(stream, std::min<size_t>(dict_order_owner.size(), 8)); + auto stream_idx = 0; + for (auto& sd : dictionaries_flat) { + if (not sd.is_enabled) { continue; } + + auto const& current_stream = streams[stream_idx]; + + // Sort the dictionary data and create a mapping from the sorted order to the original + thrust::sequence( rmm::exec_policy_nosync(current_stream), sd.data_order.begin(), sd.data_order.end()); + thrust::sort_by_key(rmm::exec_policy_nosync(current_stream), + sd.data.begin(), + sd.data.end(), + sd.data_order.begin(), + string_rows_less{orc_table.d_columns, sd.column_idx}); + + // Create the inverse permutation - i.e.
the mapping from the original order to the sorted + auto order_copy = cudf::detail::make_device_uvector_async( sd.data_order, current_stream, rmm::mr::get_current_device_resource()); + thrust::scatter(rmm::exec_policy_nosync(current_stream), + thrust::counting_iterator<size_type>(0), + thrust::counting_iterator<size_type>(sd.data_order.size()), + order_copy.begin(), + sd.data_order.begin()); + + stream_idx = (stream_idx + 1) % streams.size(); + } + + cudf::detail::join_streams(streams, stream); + } - return {std::move(stripe_dicts), std::move(dict_data_owner), std::move(dict_index_owner)}; + return {std::move(stripe_dicts), + std::move(dict_data_owner), + std::move(dict_index_owner), + std::move(dict_order_owner)}; } /** @@ -2142,6 +2208,7 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, * @param max_stripe_size Maximum size of stripes in the output file * @param row_index_stride The row index stride * @param enable_dictionary Whether dictionary is enabled + * @param sort_dictionaries Whether to sort the dictionaries * @param compression_kind The compression kind * @param compression_blocksize The block size used for compression * @param stats_freq Column statistics granularity type for parquet/orc writers @@ -2156,6 +2223,7 @@ auto convert_table_to_orc_data(table_view const& input, stripe_size_limits max_stripe_size, size_type row_index_stride, bool enable_dictionary, + bool sort_dictionaries, CompressionKind compression_kind, size_t compression_blocksize, statistics_freq stats_freq, @@ -2180,7 +2248,7 @@ auto convert_table_to_orc_data(table_view const& input, auto segmentation = calculate_segmentation(orc_table.columns, std::move(rowgroup_bounds), max_stripe_size); - auto stripe_dicts = build_dictionaries(orc_table, segmentation, stream); + auto stripe_dicts = build_dictionaries(orc_table, segmentation, sort_dictionaries, stream); auto dec_chunk_sizes = decimal_chunk_sizes(orc_table, segmentation, stream); auto const uncompressed_block_align = uncomp_block_alignment(compression_kind); @@ -2314,6 +2382,7 @@ writer::impl::impl(std::unique_ptr<data_sink> sink, _compression_blocksize(compression_block_size(_compression_kind)), _compression_statistics(options.get_compression_statistics()), _stats_freq(options.get_statistics_freq()), + _sort_dictionaries{options.get_enable_dictionary_sort()}, _single_write_mode(mode), _kv_meta(options.get_key_value_metadata()), _out_sink(std::move(sink)) @@ -2335,6 +2404,7 @@ writer::impl::impl(std::unique_ptr<data_sink> sink, _compression_blocksize(compression_block_size(_compression_kind)), _compression_statistics(options.get_compression_statistics()), _stats_freq(options.get_statistics_freq()), + _sort_dictionaries{options.get_enable_dictionary_sort()}, _single_write_mode(mode), _kv_meta(options.get_key_value_metadata()), _out_sink(std::move(sink)) @@ -2382,6 +2452,7 @@ void writer::impl::write(table_view const& input) _max_stripe_size, _row_index_stride, _enable_dictionary, + _sort_dictionaries, _compression_kind, _compression_blocksize, _stats_freq, diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index 67c65eb9a37..0d1a83f3d85 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -346,6 +346,7 @@ class writer::impl { size_t const _compression_blocksize; std::shared_ptr<writer_compression_statistics> _compression_statistics; // Optional statistics_freq const _stats_freq; + bool const _sort_dictionaries; single_write_mode const _single_write_mode; // Special parameter only used by `write()` to // indicate that we are guaranteeing a single table
// write. This enables some internal optimizations. diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 3457c5675ad..234716749ff 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1930,4 +1930,34 @@ TEST_F(OrcStatisticsTest, AllNulls) check_all_null_stats(stats.file_stats[3]); } +TEST_F(OrcWriterTest, UnorderedDictionary) +{ + std::vector<char const*> strings{ + "BBBB", "BBBB", "CCCC", "BBBB", "CCCC", "EEEE", "CCCC", "AAAA", "DDDD", "EEEE"}; + str_col col(strings.begin(), strings.end()); + + table_view expected({col}); + + std::vector<char> out_buffer_sorted; + cudf::io::orc_writer_options out_opts_sorted = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer_sorted}, expected); + cudf::io::write_orc(out_opts_sorted); + + cudf::io::orc_reader_options in_opts_sorted = cudf::io::orc_reader_options::builder( + cudf::io::source_info{out_buffer_sorted.data(), out_buffer_sorted.size()}); + auto const from_sorted = cudf::io::read_orc(in_opts_sorted).tbl; + + std::vector<char> out_buffer_unsorted; + cudf::io::orc_writer_options out_opts_unsorted = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer_unsorted}, expected) + .enable_dictionary_sort(false); + cudf::io::write_orc(out_opts_unsorted); + + cudf::io::orc_reader_options in_opts_unsorted = cudf::io::orc_reader_options::builder( + cudf::io::source_info{out_buffer_unsorted.data(), out_buffer_unsorted.size()}); + auto const from_unsorted = cudf::io::read_orc(in_opts_unsorted).tbl; + + CUDF_TEST_EXPECT_TABLES_EQUAL(*from_sorted, *from_unsorted); +} + CUDF_TEST_PROGRAM_MAIN() From ec080eba750e51fe624dd40020084690d88b9d38 Mon Sep 17 00:00:00 2001 From: Divye Gala Date: Tue, 31 Oct 2023 16:16:28 -0400 Subject: [PATCH 069/118] Fix overflow check in `cudf::merge` (#14345) https://github.com/rapidsai/cudf/pull/14250 added a check to ensure `cudf::merge` throws when the total number of merged rows exceeds the `cudf::size_type` limit, however @bdice pointed out that the check was not correct because the accumulation was still occurring in `cudf::size_type`. This PR computes the accumulation in `std::size_t`. Authors: - Divye Gala (https://github.com/divyegala) Approvers: - Bradley Dice (https://github.com/bdice) - MithunR (https://github.com/mythrocks) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14345 --- cpp/src/merge/merge.cu | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu index e47abd6ede4..ee29c207cf1 100644 --- a/cpp/src/merge/merge.cu +++ b/cpp/src/merge/merge.cu @@ -638,13 +638,14 @@ table_ptr_type merge(std::vector<table_view> const& tables_to_merge, CUDF_EXPECTS(key_cols.size() == column_order.size(), "Mismatched size between key_cols and column_order"); - CUDF_EXPECTS(std::accumulate(tables_to_merge.cbegin(), - tables_to_merge.cend(), - cudf::size_type{0}, - [](auto const& running_sum, auto const& tbl) { - return running_sum + tbl.num_rows(); - }) <= std::numeric_limits<size_type>::max(), - "Total number of merged rows exceeds row limit"); + CUDF_EXPECTS( + std::accumulate(tables_to_merge.cbegin(), + tables_to_merge.cend(), + std::size_t{0}, + [](auto const& running_sum, auto const& tbl) { + return running_sum + static_cast<std::size_t>(tbl.num_rows()); + }) <= static_cast<std::size_t>(std::numeric_limits<size_type>::max()), + "Total number of merged rows exceeds row limit"); // This utility will ensure all corresponding dictionary columns have matching keys.
// It will return any new dictionary columns created as well as updated table_views. From f07d9cca355edaf86448407231e18adf4725a11b Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 1 Nov 2023 06:15:58 -0700 Subject: [PATCH 070/118] Add the new manylinux builds to the build job (#14351) #14339 added the ability to build cudf wheels against a newer manylinux, but those jobs were not added to the build matrix, only the PR matrix, so the newer ABI wheels aren't currently being published. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/14351 --- .github/workflows/build.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 1a7aa00aebf..2539057c105 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -72,6 +72,7 @@ jobs: uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} + build-2_28-wheels: "true" branch: ${{ inputs.branch }} sha: ${{ inputs.sha }} date: ${{ inputs.date }} From 56fe5dbb32bb6d4f4ab954e1abea9218fed404e1 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 1 Nov 2023 09:19:29 -0400 Subject: [PATCH 071/118] Expose stream parameter to get_json_object API (#14297) Add stream parameter to public APIs `cudf::get_json_object()` Also removed the API from the `strings` namespace since it does not fit with the other strings library functions. This resulted in updating the source file locations as well. Reference #13744 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Karthikeyan (https://github.com/karthikeyann) - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14297 --- cpp/CMakeLists.txt | 2 +- cpp/benchmarks/CMakeLists.txt | 2 +- cpp/benchmarks/{string => json}/json.cu | 4 +- cpp/include/cudf/{strings => json}/json.hpp | 13 +-- cpp/include/cudf/strings/detail/json.hpp | 43 -------- cpp/include/doxygen_groups.h | 5 +- cpp/src/{strings => }/json/json_path.cu | 19 ++-- cpp/tests/CMakeLists.txt | 5 +- cpp/tests/{strings => json}/json_tests.cpp | 108 +++++++++----------- java/src/main/native/src/ColumnViewJni.cpp | 4 +- python/cudf/cudf/_lib/cpp/strings/json.pxd | 4 +- 11 files changed, 81 insertions(+), 128 deletions(-) rename cpp/benchmarks/{string => json}/json.cu (98%) rename cpp/include/cudf/{strings => json}/json.hpp (94%) delete mode 100644 cpp/include/cudf/strings/detail/json.hpp rename cpp/src/{strings => }/json/json_path.cu (98%) rename cpp/tests/{strings => json}/json_tests.cpp (84%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f7662006cac..dc12564c656 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -440,6 +440,7 @@ add_library( src/join/mixed_join_size_kernel_nulls.cu src/join/mixed_join_size_kernels_semi.cu src/join/semi_join.cu + src/json/json_path.cu src/lists/contains.cu src/lists/combine/concatenate_list_elements.cu src/lists/combine/concatenate_rows.cu @@ -571,7 +572,6 @@ add_library( src/strings/filter_chars.cu src/strings/like.cu src/strings/padding.cu - src/strings/json/json_path.cu src/strings/regex/regcomp.cpp src/strings/regex/regexec.cpp src/strings/regex/regex_program.cpp diff 
--git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index a3e2b4ed6db..6858a3fc69f 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -320,7 +320,7 @@ ConfigureNVBench( # ################################################################################################## # * json benchmark ------------------------------------------------------------------- -ConfigureBench(JSON_BENCH string/json.cu) +ConfigureBench(JSON_BENCH json/json.cu) ConfigureNVBench(FST_NVBENCH io/fst.cu) ConfigureNVBench(JSON_READER_NVBENCH io/json/nested_json.cpp io/json/json_reader_input.cpp) ConfigureNVBench(JSON_WRITER_NVBENCH io/json/json_writer.cpp) diff --git a/cpp/benchmarks/string/json.cu b/cpp/benchmarks/json/json.cu similarity index 98% rename from cpp/benchmarks/string/json.cu rename to cpp/benchmarks/json/json.cu index 7e89edf3e17..5dc30aebe38 100644 --- a/cpp/benchmarks/string/json.cu +++ b/cpp/benchmarks/json/json.cu @@ -21,9 +21,9 @@ #include #include +#include #include #include -#include #include #include #include @@ -196,7 +196,7 @@ void BM_case(benchmark::State& state, std::string query_arg) for (auto _ : state) { cuda_event_timer raii(state, true); - auto result = cudf::strings::get_json_object(scv, json_path); + auto result = cudf::get_json_object(scv, json_path); CUDF_CUDA_TRY(cudaStreamSynchronize(0)); } diff --git a/cpp/include/cudf/strings/json.hpp b/cpp/include/cudf/json/json.hpp similarity index 94% rename from cpp/include/cudf/strings/json.hpp rename to cpp/include/cudf/json/json.hpp index 8fabee6b9a5..944e0c26dd6 100644 --- a/cpp/include/cudf/strings/json.hpp +++ b/cpp/include/cudf/json/json.hpp @@ -16,16 +16,16 @@ #pragma once #include +#include #include #include namespace cudf { -namespace strings { /** - * @addtogroup strings_json + * @addtogroup json_object * @{ * @file */ @@ -155,20 +155,21 @@ class get_json_object_options { * https://tools.ietf.org/id/draft-goessner-dispatch-jsonpath-00.html * Implements only the operators: $ . [] * * + * @throw std::invalid_argument if provided an invalid operator or an empty name + * * @param col The input strings column. Each row must contain a valid json string * @param json_path The JSONPath string to be applied to each row * @param options Options for controlling the behavior of the function - * @param mr Resource for allocating device memory. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Resource for allocating device memory * @return New strings column containing the retrieved json object strings - * - * @throw std::invalid_argument if provided an invalid operator or an empty name */ std::unique_ptr get_json_object( cudf::strings_column_view const& col, cudf::string_scalar const& json_path, get_json_object_options options = get_json_object_options{}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group -} // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/detail/json.hpp b/cpp/include/cudf/strings/detail/json.hpp deleted file mode 100644 index 0fb06d36570..00000000000 --- a/cpp/include/cudf/strings/detail/json.hpp +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -#include - -namespace cudf { -namespace strings { -namespace detail { - -/** - * @copydoc cudf::strings::get_json_object - * - * @param stream CUDA stream used for device memory operations and kernel launches - */ -std::unique_ptr get_json_object(cudf::strings_column_view const& col, - cudf::string_scalar const& json_path, - cudf::strings::get_json_object_options options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - -} // namespace detail -} // namespace strings -} // namespace cudf diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h index 4da2807bbe6..8845b84613d 100644 --- a/cpp/include/doxygen_groups.h +++ b/cpp/include/doxygen_groups.h @@ -130,7 +130,6 @@ * @defgroup strings_replace Replacing * @defgroup strings_split Splitting * @defgroup strings_extract Extracting - * @defgroup strings_json JSON * @defgroup strings_regex Regex * @} * @defgroup dictionary_apis Dictionary @@ -146,6 +145,10 @@ * @defgroup io_datasources Data Sources * @defgroup io_datasinks Data Sinks * @} + * @defgroup json_apis JSON + * @{ + * @defgroup json_object JSON Path + * @} * @defgroup lists_apis Lists * @{ * @defgroup lists_combine Combining diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/json/json_path.cu similarity index 98% rename from cpp/src/strings/json/json_path.cu rename to cpp/src/json/json_path.cu index c56752f5429..8217e34723c 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/json/json_path.cu @@ -20,9 +20,9 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -41,7 +41,6 @@ #include namespace cudf { -namespace strings { namespace detail { namespace { @@ -224,7 +223,9 @@ enum json_element_type { NONE, OBJECT, ARRAY, VALUE }; class json_state : private parser { public: __device__ json_state() : parser() {} - __device__ json_state(char const* _input, int64_t _input_len, get_json_object_options _options) + __device__ json_state(char const* _input, + int64_t _input_len, + cudf::get_json_object_options _options) : parser(_input, _input_len), options(_options) @@ -956,9 +957,6 @@ __launch_bounds__(block_size) __global__ } } -/** - * @copydoc cudf::strings::detail::get_json_object - */ std::unique_ptr get_json_object(cudf::strings_column_view const& col, cudf::string_scalar const& json_path, get_json_object_options options, @@ -1011,7 +1009,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c cudf::detail::get_value(offsets_view, col.size(), stream); // allocate output string column - auto chars = create_chars_child_column(output_size, stream, mr); + auto chars = cudf::strings::detail::create_chars_child_column(output_size, stream, mr); // potential optimization : if we know that all outputs are valid, we could skip creating // the validity mask altogether @@ -1041,17 +1039,14 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c } // namespace } // namespace detail -/** - * @copydoc cudf::strings::get_json_object - */ std::unique_ptr get_json_object(cudf::strings_column_view 
const& col, cudf::string_scalar const& json_path, get_json_object_options options, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::get_json_object(col, json_path, options, cudf::get_default_stream(), mr); + return detail::get_json_object(col, json_path, options, stream, mr); } -} // namespace strings } // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index f856d106d03..e966ef3fb04 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -523,7 +523,6 @@ ConfigureTest( strings/format_lists_tests.cpp strings/integers_tests.cpp strings/ipv4_tests.cpp - strings/json_tests.cpp strings/like_tests.cpp strings/pad_tests.cpp strings/repeat_strings_tests.cpp @@ -537,6 +536,10 @@ ConfigureTest( strings/urls_tests.cpp ) +# ################################################################################################## +# * json path test -------------------------------------------------------------------------------- +ConfigureTest(JSON_PATH_TEST json/json_tests.cpp) + # ################################################################################################## # * structs test ---------------------------------------------------------------------------------- ConfigureTest(STRUCTS_TEST structs/structs_column_tests.cpp structs/utilities_tests.cpp) diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/json/json_tests.cpp similarity index 84% rename from cpp/tests/strings/json_tests.cpp rename to cpp/tests/json/json_tests.cpp index d74bb9258fa..a03880eef5d 100644 --- a/cpp/tests/strings/json_tests.cpp +++ b/cpp/tests/json/json_tests.cpp @@ -14,8 +14,8 @@ * limitations under the License. */ +#include #include -#include #include #include @@ -85,7 +85,7 @@ TEST_F(JsonPathTests, GetJsonObjectRootOp) // root cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); auto expected = drop_whitespace(input); @@ -98,7 +98,7 @@ TEST_F(JsonPathTests, GetJsonObjectChildOp) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); // clang-format off @@ -147,7 +147,7 @@ TEST_F(JsonPathTests, GetJsonObjectChildOp) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store.book"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); // clang-format off @@ -193,7 +193,7 @@ TEST_F(JsonPathTests, GetJsonObjectWildcardOp) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store.*"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); // clang-format off @@ -242,7 +242,7 @@ TEST_F(JsonPathTests, GetJsonObjectWildcardOp) { cudf::test::strings_column_wrapper input{json_string}; std::string 
json_path("*"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); // clang-format off @@ -297,7 +297,7 @@ TEST_F(JsonPathTests, GetJsonObjectSubscriptOp) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store.book[2]"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); // clang-format off @@ -319,7 +319,7 @@ TEST_F(JsonPathTests, GetJsonObjectSubscriptOp) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store['bicycle']"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); // clang-format off @@ -338,7 +338,7 @@ TEST_F(JsonPathTests, GetJsonObjectSubscriptOp) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store.book[*]"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); // clang-format off @@ -387,7 +387,7 @@ TEST_F(JsonPathTests, GetJsonObjectFilter) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store.book[*]['isbn']"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); cudf::test::strings_column_wrapper expected_raw{R"(["0-553-21311-3","0-395-19395-8"])"}; @@ -399,7 +399,7 @@ TEST_F(JsonPathTests, GetJsonObjectFilter) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store.book[*].category"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); cudf::test::strings_column_wrapper expected_raw{ @@ -412,7 +412,7 @@ TEST_F(JsonPathTests, GetJsonObjectFilter) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store.book[*].title"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); cudf::test::strings_column_wrapper expected_raw{ @@ -425,7 +425,7 @@ TEST_F(JsonPathTests, GetJsonObjectFilter) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store.book.*.price"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); cudf::test::strings_column_wrapper expected_raw{"[8.95,12.99,8.99,22.99]"}; @@ -440,7 +440,7 @@ TEST_F(JsonPathTests, GetJsonObjectFilter) // spark: fiction cudf::test::strings_column_wrapper input{json_string}; std::string 
json_path("$.store.book[2].category"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); cudf::test::strings_column_wrapper expected_raw{"fiction"}; @@ -457,7 +457,7 @@ TEST_F(JsonPathTests, GetJsonObjectNullInputs) cudf::test::strings_column_wrapper input({str, str, str, str}, {1, 0, 1, 0}); std::string json_path("$.a"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); cudf::test::strings_column_wrapper expected_raw({"b", "", "b", ""}, {1, 0, 1, 0}); @@ -473,7 +473,7 @@ TEST_F(JsonPathTests, GetJsonObjectEmptyQuery) { cudf::test::strings_column_wrapper input{R"({"a" : "b"})"}; std::string json_path(""); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); cudf::test::strings_column_wrapper expected({""}, {0}); @@ -487,7 +487,7 @@ TEST_F(JsonPathTests, GetJsonObjectEmptyInputsAndOutputs) { cudf::test::strings_column_wrapper input{""}; std::string json_path("$"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); cudf::test::strings_column_wrapper expected({""}, {0}); @@ -500,7 +500,7 @@ TEST_F(JsonPathTests, GetJsonObjectEmptyInputsAndOutputs) { cudf::test::strings_column_wrapper input{R"({"store": { "bicycle" : "" } })"}; std::string json_path("$.store.bicycle"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); cudf::test::strings_column_wrapper expected({""}, {1}); @@ -512,7 +512,7 @@ TEST_F(JsonPathTests, GetJsonObjectEmptyInput) { cudf::test::strings_column_wrapper input{}; std::string json_path("$"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, input); } @@ -525,7 +525,7 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("$$"); auto query = [&]() { - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); }; EXPECT_THROW(query(), cudf::logic_error); } @@ -535,7 +535,7 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("$[auh46h-]"); auto query = [&]() { - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); }; EXPECT_THROW(query(), cudf::logic_error); } @@ -545,7 +545,7 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("$[[]]"); auto query = [&]() { - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = 
cudf::get_json_object(cudf::strings_column_view(input), json_path); }; EXPECT_THROW(query(), cudf::logic_error); } @@ -555,7 +555,7 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("$[-1]"); auto query = [&]() { - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); }; EXPECT_THROW(query(), cudf::logic_error); } @@ -565,7 +565,7 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("."); auto query = [&]() { - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); }; EXPECT_THROW(query(), std::invalid_argument); } @@ -574,7 +574,7 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("]["); auto query = [&]() { - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); }; EXPECT_THROW(query(), std::invalid_argument); } @@ -583,7 +583,7 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("6hw6,56i3"); auto query = [&]() { - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); }; EXPECT_THROW(query(), std::invalid_argument); } @@ -596,7 +596,7 @@ TEST_F(JsonPathTests, GetJsonObjectInvalidQuery) { cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("$[*].c"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); cudf::test::strings_column_wrapper expected({""}, {0}); @@ -607,7 +607,7 @@ TEST_F(JsonPathTests, GetJsonObjectInvalidQuery) { cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("$[*].c[2]"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); cudf::test::strings_column_wrapper expected({""}, {0}); @@ -618,7 +618,7 @@ TEST_F(JsonPathTests, GetJsonObjectInvalidQuery) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store.book.price"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); cudf::test::strings_column_wrapper expected({""}, {0}); @@ -629,7 +629,7 @@ TEST_F(JsonPathTests, GetJsonObjectInvalidQuery) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store.book[4]"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); cudf::test::strings_column_wrapper expected({""}, {0}); @@ -672,7 +672,7 @@ TEST_F(JsonPathTests, MixedOutput) cudf::test::strings_column_wrapper input(input_strings.begin(), input_strings.end()); { std::string json_path("$.a"); - auto result = 
cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); // clang-format off cudf::test::strings_column_wrapper expected({ @@ -694,7 +694,7 @@ TEST_F(JsonPathTests, MixedOutput) { std::string json_path("$.a[1]"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); // clang-format off cudf::test::strings_column_wrapper expected({ @@ -713,7 +713,7 @@ TEST_F(JsonPathTests, MixedOutput) { std::string json_path("$.a.b"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); // clang-format off cudf::test::strings_column_wrapper expected({ @@ -731,7 +731,7 @@ TEST_F(JsonPathTests, MixedOutput) { std::string json_path("$.a[*]"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); // clang-format off cudf::test::strings_column_wrapper expected({ @@ -752,7 +752,7 @@ TEST_F(JsonPathTests, MixedOutput) { std::string json_path("$.a.b[*]"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); // clang-format off cudf::test::strings_column_wrapper expected({ @@ -779,13 +779,12 @@ TEST_F(JsonPathTests, StripQuotes) std::string str("{\"a\" : \"b\"}"); cudf::test::strings_column_wrapper input({str, str}); - cudf::strings::get_json_object_options options; + cudf::get_json_object_options options; options.set_strip_quotes_from_single_strings(false); std::string json_path("$.a"); - auto result_raw = - cudf::strings::get_json_object(cudf::strings_column_view(input), json_path, options); - auto result = drop_whitespace(*result_raw); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path, options); + auto result = drop_whitespace(*result_raw); cudf::test::strings_column_wrapper expected_raw({"\"b\"", "\"b\""}); auto expected = drop_whitespace(expected_raw); @@ -798,11 +797,10 @@ TEST_F(JsonPathTests, StripQuotes) cudf::test::strings_column_wrapper input{R"({"store": { "bicycle" : "" } })"}; std::string json_path("$.store.bicycle"); - cudf::strings::get_json_object_options options; + cudf::get_json_object_options options; options.set_strip_quotes_from_single_strings(true); - auto result = - cudf::strings::get_json_object(cudf::strings_column_view(input), json_path, options); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path, options); cudf::test::strings_column_wrapper expected({""}); @@ -859,11 +857,10 @@ TEST_F(JsonPathTests, AllowSingleQuotes) { std::string json_path("$.a"); - cudf::strings::get_json_object_options options; + cudf::get_json_object_options options; options.set_allow_single_quotes(true); - auto result = - cudf::strings::get_json_object(cudf::strings_column_view(input), json_path, options); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path, options); // clang-format off cudf::test::strings_column_wrapper expected({ @@ -903,11 +900,10 @@ TEST_F(JsonPathTests, StringsWithSpecialChars) { std::string json_path("$.item"); - cudf::strings::get_json_object_options options; + cudf::get_json_object_options options; 
options.set_allow_single_quotes(true); - auto result = - cudf::strings::get_json_object(cudf::strings_column_view(input), json_path, options); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path, options); // clang-format off cudf::test::strings_column_wrapper expected({ @@ -929,11 +925,10 @@ TEST_F(JsonPathTests, StringsWithSpecialChars) { std::string json_path("$.a"); - cudf::strings::get_json_object_options options; + cudf::get_json_object_options options; options.set_allow_single_quotes(true); - auto result = - cudf::strings::get_json_object(cudf::strings_column_view(input), json_path, options); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path, options); // clang-format off cudf::test::strings_column_wrapper expected({ @@ -962,11 +957,10 @@ TEST_F(JsonPathTests, EscapeSequences) { std::string json_path("$.a"); - cudf::strings::get_json_object_options options; + cudf::get_json_object_options options; options.set_allow_single_quotes(true); - auto result = - cudf::strings::get_json_object(cudf::strings_column_view(input), json_path, options); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path, options); // clang-format off cudf::test::strings_column_wrapper expected({ @@ -998,12 +992,12 @@ TEST_F(JsonPathTests, MissingFieldsAsNulls) auto const& missing_fields_output, bool default_valid = true) { cudf::test::strings_column_wrapper input{input_string}; - cudf::strings::get_json_object_options options; + cudf::get_json_object_options options; // Test default behavior options.set_missing_fields_as_nulls(false); auto const default_result = - cudf::strings::get_json_object(cudf::strings_column_view(input), {json_path_string}, options); + cudf::get_json_object(cudf::strings_column_view(input), {json_path_string}, options); cudf::test::strings_column_wrapper default_expected({default_output}, {default_valid}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(default_expected, *default_result); @@ -1011,7 +1005,7 @@ TEST_F(JsonPathTests, MissingFieldsAsNulls) // Test with missing fields as null options.set_missing_fields_as_nulls(true); auto const missing_fields_result = - cudf::strings::get_json_object(cudf::strings_column_view(input), {json_path_string}, options); + cudf::get_json_object(cudf::strings_column_view(input), {json_path_string}, options); cudf::test::strings_column_wrapper missing_fields_expected({missing_fields_output}, {1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(missing_fields_expected, *missing_fields_result); diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 462f0d8eac9..7a626daff1f 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -62,7 +63,6 @@ #include #include #include -#include #include #include #include @@ -2443,7 +2443,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getJSONObject(JNIEnv *env cudf::column_view *n_column_view = reinterpret_cast(j_view_handle); cudf::strings_column_view n_strings_col_view(*n_column_view); cudf::string_scalar *n_scalar_path = reinterpret_cast(j_scalar_handle); - return release_as_jlong(cudf::strings::get_json_object(n_strings_col_view, *n_scalar_path)); + return release_as_jlong(cudf::get_json_object(n_strings_col_view, *n_scalar_path)); } CATCH_STD(env, 0) } diff --git a/python/cudf/cudf/_lib/cpp/strings/json.pxd b/python/cudf/cudf/_lib/cpp/strings/json.pxd index 
a017e1c5382..eed627c96b5 100644 --- a/python/cudf/cudf/_lib/cpp/strings/json.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/json.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -9,7 +9,7 @@ from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.scalar.scalar cimport scalar, string_scalar -cdef extern from "cudf/strings/json.hpp" namespace "cudf::strings" nogil: +cdef extern from "cudf/json/json.hpp" namespace "cudf" nogil: cdef cppclass get_json_object_options: get_json_object_options() except + # getters From f97e74f00b7a6bac37c9603def95a11b06cb013f Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 3 Nov 2023 15:58:29 -0400 Subject: [PATCH 072/118] Improve performance of nvtext::tokenize_with_vocabulary for long strings (#14336) Improves `nvtext::tokenize_with_vocabulary` performance for long strings. Also adds additional tests and an nvbench benchmark. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/14336 --- cpp/benchmarks/CMakeLists.txt | 2 +- cpp/benchmarks/text/vocab.cpp | 88 ++++++++++ cpp/src/text/vocabulary_tokenize.cu | 247 ++++++++++++++++++++++++++-- cpp/tests/text/tokenize_tests.cpp | 63 ++++++- 4 files changed, 375 insertions(+), 25 deletions(-) create mode 100644 cpp/benchmarks/text/vocab.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 6858a3fc69f..9c3a05a2f5f 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -278,7 +278,7 @@ ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp) ConfigureNVBench( TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp - text/normalize.cpp text/replace.cpp text/tokenize.cpp + text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/text/vocab.cpp b/cpp/benchmarks/text/vocab.cpp new file mode 100644 index 00000000000..6922b7214ff --- /dev/null +++ b/cpp/benchmarks/text/vocab.cpp @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include + +#include +#include + +#include +#include +#include +#include + +#include + +static void bench_vocab_tokenize(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + auto const column = [num_rows, row_width] { + data_profile const profile = data_profile_builder().no_validity().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const col = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); + return cudf::strings::filter_characters_of_type( + cudf::strings_column_view(col->view()), + cudf::strings::string_character_types::ALL_TYPES, + cudf::string_scalar(" "), + cudf::strings::string_character_types::ALPHANUM); + }(); + cudf::strings_column_view input(column->view()); + + auto const vocab_col = [] { + data_profile const profile = data_profile_builder().no_validity().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, 5); + auto const col = create_random_column(cudf::type_id::STRING, row_count{100}, profile); + return cudf::strings::filter_characters_of_type( + cudf::strings_column_view(col->view()), + cudf::strings::string_character_types::ALL_TYPES, + cudf::string_scalar(""), + cudf::strings::string_character_types::ALPHANUM); + }(); + auto const vocab = nvtext::load_vocabulary(cudf::strings_column_view(vocab_col->view())); + + auto token_count = [input] { + auto const counts = nvtext::count_tokens(input); + auto const agg = cudf::make_sum_aggregation(); + auto const count = cudf::reduce(counts->view(), *agg, counts->type()); + return static_cast*>(count.get()) + ->value(cudf::get_default_stream()); + }(); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + auto chars_size = input.chars_size() + cudf::strings_column_view(vocab_col->view()).chars_size(); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(token_count); + + auto const delimiter = cudf::string_scalar(""); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = nvtext::tokenize_with_vocabulary(input, *vocab, delimiter); + }); +} + +NVBENCH_BENCH(bench_vocab_tokenize) + .set_name("vocab_tokenize") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) + .add_int64_axis("num_rows", {262144, 524288, 1048576, 2097152, 4194304, 16777216}); diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu index f998c9ec239..41f8c0a8731 100644 --- a/cpp/src/text/vocabulary_tokenize.cu +++ b/cpp/src/text/vocabulary_tokenize.cu @@ -21,10 +21,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include #include @@ -37,6 +39,15 @@ #include +#include +#include +#include +#include +#include +#include + +#include + namespace nvtext { namespace detail { namespace { @@ -162,6 +173,119 @@ std::unique_ptr load_vocabulary(cudf::strings_column_view c namespace detail { namespace { +/** + * @brief Threshold to decide on using string or warp parallel functions. + * + * If the average byte length of a string in a column exceeds this value then + * the warp-parallel function is used to compute the output sizes. + * Otherwise, a regular string-parallel function is used. 
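+ * (For reference, the dispatch in tokenize_with_vocabulary below computes the
+ * average as input.chars_size() / (input.size() - input.null_count()) and
+ * takes the warp-parallel path only when that average meets or exceeds this
+ * threshold.)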
+ * + * This value was found using the vocab_tokenize benchmark results. + */ +constexpr cudf::size_type AVG_CHAR_BYTES_THRESHOLD = 128; + +constexpr int block_size = 256; + +__device__ bool is_delimiter(cudf::string_view const& d_delimiters, cudf::char_utf8 chr) +{ + return d_delimiters.empty() ? (chr <= ' ') : // whitespace check + thrust::any_of(thrust::seq, + d_delimiters.begin(), + d_delimiters.end(), + [chr] __device__(cudf::char_utf8 c) { return c == chr; }); +} + +struct mark_delimiters_fn { + char const* d_chars; + cudf::string_view const d_delimiter; + int8_t* d_results; + + __device__ void operator()(cudf::size_type idx) const + { + auto const ptr = d_chars + idx; + if (cudf::strings::detail::is_utf8_continuation_char(*ptr)) { return; } + cudf::char_utf8 chr = 0; + auto ch_size = cudf::strings::detail::to_char_utf8(ptr, chr); + auto const output = is_delimiter(d_delimiter, chr); + while (ch_size > 0) { + d_results[idx++] = output; + --ch_size; + } + } +}; + +__global__ void token_counts_fn(cudf::column_device_view const d_strings, + cudf::string_view const d_delimiter, + cudf::size_type* d_counts, + int8_t* d_results) +{ + // string per warp + auto const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + if (idx >= (static_cast(d_strings.size()) * + static_cast(cudf::detail::warp_size))) { + return; + } + auto const str_idx = static_cast(idx / cudf::detail::warp_size); + auto const lane_idx = static_cast(idx % cudf::detail::warp_size); + + if (d_strings.is_null(str_idx)) { + d_counts[str_idx] = 0; + return; + } + auto const d_str = d_strings.element(str_idx); + if (d_str.empty()) { + d_counts[str_idx] = 0; + return; + } + + auto const offsets = + d_strings.child(cudf::strings_column_view::offsets_column_index).data(); + auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()]; + auto const chars_begin = + d_strings.child(cudf::strings_column_view::chars_column_index).data() + + offsets[d_strings.offset()]; + + auto const begin = d_str.data(); + auto const end = begin + d_str.size_bytes(); + auto const d_output = d_results + offset; + auto const d_output_end = d_output + d_str.size_bytes(); + + using warp_reduce = cub::WarpReduce; + __shared__ typename warp_reduce::TempStorage warp_storage; + + cudf::size_type count = 0; + if (lane_idx == 0) { + cudf::char_utf8 chr = 0; + auto ch_size = cudf::strings::detail::to_char_utf8(begin, chr); + auto output = 1; + if (begin > chars_begin) { + auto ptr = begin - 1; + while (ptr > chars_begin && cudf::strings::detail::is_utf8_continuation_char(*ptr)) { + --ptr; + } + cudf::strings::detail::to_char_utf8(ptr, chr); + output = !is_delimiter(d_delimiter, chr); + } + auto ptr = d_output; + while (ch_size > 0) { + *ptr++ = output; + --ch_size; + } + count = ((begin + ch_size) == end); + } + __syncwarp(); + + for (auto itr = d_output + lane_idx + 1; itr < d_output_end; itr += cudf::detail::warp_size) { + // add one if at the edge of a token or at the string's end + count += ((*itr && !(*(itr - 1))) || (itr + 1 == d_output_end)); + } + __syncwarp(); + + // add up the counts from the other threads to compute the total token count for this string + auto const total_count = warp_reduce(warp_storage).Reduce(count, cub::Sum()); + if (lane_idx == 0) { d_counts[str_idx] = total_count; } +} + /** * @brief Tokenizes each string and uses the map to assign token id values * @@ -197,6 +321,33 @@ struct vocabulary_tokenizer_fn { } }; +template +struct transform_tokenizer_fn { + cudf::string_view const d_delimiter; + 
MapRefType d_map; + cudf::size_type const default_id; + + __device__ cudf::size_type operator()(cudf::string_view d_str) const + { + auto const begin = d_str.data(); + auto const end = begin + d_str.size_bytes(); + + auto itr = begin; + while (itr < end) { + cudf::char_utf8 chr = 0; + auto const ch_size = cudf::strings::detail::to_char_utf8(itr, chr); + if (!is_delimiter(d_delimiter, chr)) break; + itr += ch_size; + } + + auto const size = static_cast(thrust::distance(itr, end)); + auto const token = cudf::string_view{itr, size}; + // lookup token in map + auto const fitr = d_map.find(token); + return (fitr != d_map.end()) ? fitr->second : default_id; + } +}; + } // namespace std::unique_ptr tokenize_with_vocabulary(cudf::strings_column_view const& input, @@ -209,28 +360,94 @@ std::unique_ptr tokenize_with_vocabulary(cudf::strings_column_view CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid"); auto const output_type = cudf::data_type{cudf::type_to_id()}; - if (input.is_empty()) { return cudf::make_empty_column(output_type); } + if (input.size() == input.null_count()) { return cudf::make_empty_column(output_type); } // count the tokens per string and build the offsets from the counts auto const d_strings = cudf::column_device_view::create(input.parent(), stream); auto const d_delimiter = delimiter.value(stream); - auto const sizes_itr = - cudf::detail::make_counting_transform_iterator(0, strings_tokenizer{*d_strings, d_delimiter}); - auto [token_offsets, total_count] = - cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + input.size(), stream, mr); + auto map_ref = vocabulary._impl->get_map_ref(); + auto const zero_itr = thrust::make_counting_iterator(0); + + if ((input.chars_size() / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) { + auto const sizes_itr = + cudf::detail::make_counting_transform_iterator(0, strings_tokenizer{*d_strings, d_delimiter}); + auto [token_offsets, total_count] = + cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + input.size(), stream, mr); + + // build the output column to hold all the token ids + auto tokens = cudf::make_numeric_column( + output_type, total_count, cudf::mask_state::UNALLOCATED, stream, mr); + auto d_tokens = tokens->mutable_view().data(); + auto d_offsets = token_offsets->view().data(); + vocabulary_tokenizer_fn tokenizer{ + *d_strings, d_delimiter, map_ref, default_id, d_offsets, d_tokens}; + thrust::for_each_n(rmm::exec_policy(stream), zero_itr, input.size(), tokenizer); + return cudf::make_lists_column(input.size(), + std::move(token_offsets), + std::move(tokens), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + stream, + mr); + } + + // longer strings perform better with warp-parallel approach + + auto const first_offset = (input.offset() == 0) ? 0 + : cudf::detail::get_value( + input.offsets(), input.offset(), stream); + auto const last_offset = (input.offset() == 0 && input.size() == input.offsets().size() - 1) + ? 
input.chars().size() + : cudf::detail::get_value( + input.offsets(), input.size() + input.offset(), stream); + auto const chars_size = last_offset - first_offset; + auto const d_input_chars = input.chars().data() + first_offset; + + rmm::device_uvector d_token_counts(input.size(), stream); + rmm::device_uvector d_marks(chars_size, stream); + + // mark position of all delimiters + thrust::for_each_n(rmm::exec_policy(stream), + zero_itr, + chars_size, + mark_delimiters_fn{d_input_chars, d_delimiter, d_marks.data()}); + + // launch warp per string to compute token counts + cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size}; + token_counts_fn<<>>( + *d_strings, d_delimiter, d_token_counts.data(), d_marks.data()); + auto [token_offsets, total_count] = cudf::detail::make_offsets_child_column( + d_token_counts.begin(), d_token_counts.end(), stream, mr); + + rmm::device_uvector d_tmp_offsets(total_count + 1, stream); + d_tmp_offsets.set_element(total_count, chars_size, stream); + thrust::copy_if(rmm::exec_policy(stream), + zero_itr, + thrust::counting_iterator(chars_size), + d_tmp_offsets.begin(), + [d_marks = d_marks.data()] __device__(auto idx) { + if (idx == 0) return true; + return d_marks[idx] && !d_marks[idx - 1]; + }); + + auto tmp_offsets = + std::make_unique(std::move(d_tmp_offsets), rmm::device_buffer{}, 0); + auto tmp_chars = cudf::column_view(input.chars().type(), chars_size, d_input_chars, nullptr, 0); + auto const tmp_input = cudf::column_view( + input.parent().type(), total_count, nullptr, nullptr, 0, 0, {tmp_offsets->view(), tmp_chars}); + + auto const d_tmp_strings = cudf::column_device_view::create(tmp_input, stream); - // build the output column to hold all the token ids auto tokens = cudf::make_numeric_column(output_type, total_count, cudf::mask_state::UNALLOCATED, stream, mr); - auto map_ref = vocabulary._impl->get_map_ref(); - auto d_offsets = token_offsets->view().data(); - auto d_tokens = tokens->mutable_view().data(); - vocabulary_tokenizer_fn tokenizer{ - *d_strings, d_delimiter, map_ref, default_id, d_offsets, d_tokens}; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - input.size(), - tokenizer); + auto d_tokens = tokens->mutable_view().data(); + + transform_tokenizer_fn tokenizer{d_delimiter, map_ref, default_id}; + thrust::transform(rmm::exec_policy(stream), + d_tmp_strings->begin(), + d_tmp_strings->end(), + d_tokens, + tokenizer); return cudf::make_lists_column(input.size(), std::move(token_offsets), diff --git a/cpp/tests/text/tokenize_tests.cpp b/cpp/tests/text/tokenize_tests.cpp index fbc706ea290..8118183a458 100644 --- a/cpp/tests/text/tokenize_tests.cpp +++ b/cpp/tests/text/tokenize_tests.cpp @@ -208,14 +208,16 @@ TEST_F(TextTokenizeTest, Vocabulary) {"ate", "chased", "cheese", "dog", "fox", "jumped", "mouse", "mousé", "over", "the"}); auto vocab = nvtext::load_vocabulary(cudf::strings_column_view(vocabulary)); - auto validity = cudf::test::iterators::null_at(4); - cudf::test::strings_column_wrapper input({"the fox jumped over the dog", - "the dog chased the cat", - "the cat chased the mouse", - "the mousé ate cheese", - "", - ""}, - validity); + auto validity = cudf::test::iterators::null_at(5); + auto input = cudf::test::strings_column_wrapper({" the fox jumped over the dog ", + " the dog chased the cat", + "", + "the cat chased the mouse ", + "the mousé ate cheese", + "", + "dog"}, + validity); + auto input_view = cudf::strings_column_view(input); auto delimiter = cudf::string_scalar(" "); auto 
default_id = -7; // should be the token for the missing 'cat' @@ -225,12 +227,55 @@ TEST_F(TextTokenizeTest, Vocabulary) // clang-format off LCW expected({LCW{ 9, 4, 5, 8, 9, 3}, LCW{ 9, 3, 1, 9,-7}, + LCW{}, LCW{ 9,-7, 1, 9, 6}, LCW{ 9, 7, 0, 2}, - LCW{}, LCW{}}, + LCW{}, LCW{3}}, validity); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+ + auto sliced = cudf::slice(input, {1, 4}).front(); + auto sliced_expected = cudf::slice(expected, {1, 4}).front(); + + input_view = cudf::strings_column_view(sliced); + + results = nvtext::tokenize_with_vocabulary(input_view, *vocab, delimiter, default_id); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), sliced_expected); +} + +TEST_F(TextTokenizeTest, VocabularyLongStrings) +{ + cudf::test::strings_column_wrapper vocabulary( // leaving out 'cat' on purpose + {"ate", "chased", "cheese", "dog", "fox", "jumped", "mouse", "mousé", "over", "the"}); + auto vocab = nvtext::load_vocabulary(cudf::strings_column_view(vocabulary));
+ + std::vector h_strings( + 4, + "the fox jumped chased the dog cheese mouse at the over there dog mouse cat plus the horse " + "jumped over the mouse house with the dog"); + cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end()); + auto input_view = cudf::strings_column_view(input); + auto delimiter = cudf::string_scalar(" "); + auto default_id = -1; + auto results = nvtext::tokenize_with_vocabulary(input_view, *vocab, delimiter, default_id);
+ + using LCW = cudf::test::lists_column_wrapper<cudf::size_type>; + // clang-format off + LCW expected({LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3}, + LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3}, + LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3}, + LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3}}); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+ + auto sliced = cudf::slice(input, {1, 3}).front(); + auto sliced_expected = cudf::slice(expected, {1, 3}).front(); + + input_view = cudf::strings_column_view(sliced); + + results = nvtext::tokenize_with_vocabulary(input_view, *vocab, delimiter, default_id); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), sliced_expected); } TEST_F(TextTokenizeTest, TokenizeErrors) From c8c3e5cb6f0482b4070efd8b87484b32439c9d3a Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 6 Nov 2023 09:41:36 -0500 Subject: [PATCH 073/118] Rework nvtext::byte_pair_encoding API (#14337) Rewrite of the `nvtext::byte_pair_encoding` API to better match behavior and performance requirements. The API now strictly does BPE without any tokenizing (word delimiting) on the input. The code has also been rewritten to improve performance on very long strings.
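To make the behavior change concrete, here is a minimal usage sketch assuming the post-rework signatures (`load_merge_pairs` building the table from a strings column of "lhs rhs" rows, and a space as the default separator); the strings and merge pairs are illustrative, not taken from this patch's tests:

```cpp
#include <nvtext/byte_pair_encoding.hpp>

#include <cudf_test/column_wrapper.hpp>

void bpe_sketch()
{
  // merge pairs in priority order, one "lhs rhs" pair per row
  cudf::test::strings_column_wrapper pairs({"e n", "en t"});
  auto const merges = nvtext::load_merge_pairs(cudf::strings_column_view(pairs));

  cudf::test::strings_column_wrapper input({"tent"});
  // no tokenizing is applied to the input row before encoding
  auto const result =
    nvtext::byte_pair_encoding(cudf::strings_column_view(input), *merges);
  // "tent" -> "t ent": (e,n) merges first, then (en,t); the default
  // separator (a space) marks the one remaining unmerged boundary
}
```

The key contract change is that separators appear only at boundaries left over after all merges, rather than at pre-tokenized word breaks.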
Authors: - David Wendt (https://github.com/davidwendt) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/14337 --- cpp/include/nvtext/byte_pair_encoding.hpp | 3 +- cpp/src/text/bpe/byte_pair_encoding.cu | 816 ++++++++++------------ cpp/src/text/bpe/byte_pair_encoding.cuh | 115 ++- cpp/src/text/bpe/load_merge_pairs.cu | 75 +- cpp/tests/text/bpe_tests.cpp | 106 +-- 5 files changed, 577 insertions(+), 538 deletions(-) diff --git a/cpp/include/nvtext/byte_pair_encoding.hpp b/cpp/include/nvtext/byte_pair_encoding.hpp index 1f4851d7057..632a3cc279f 100644 --- a/cpp/include/nvtext/byte_pair_encoding.hpp +++ b/cpp/include/nvtext/byte_pair_encoding.hpp @@ -36,7 +36,7 @@ namespace nvtext { */ struct bpe_merge_pairs { struct bpe_merge_pairs_impl; - std::unique_ptr impl{}; ///< Implementation of the BPE merge pairs table. + bpe_merge_pairs_impl* impl{}; ///< Implementation of the BPE merge pairs table. /** * @brief Construct a new bpe merge pairs object @@ -61,6 +61,7 @@ struct bpe_merge_pairs { rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); ~bpe_merge_pairs(); + bpe_merge_pairs(); }; /** diff --git a/cpp/src/text/bpe/byte_pair_encoding.cu b/cpp/src/text/bpe/byte_pair_encoding.cu index 42cd9bcbcbe..5be35119003 100644 --- a/cpp/src/text/bpe/byte_pair_encoding.cu +++ b/cpp/src/text/bpe/byte_pair_encoding.cu @@ -21,8 +21,10 @@ #include #include #include +#include #include -#include +#include +#include #include #include #include @@ -30,468 +32,294 @@ #include #include -#include #include -#include #include #include -#include -#include #include #include -#include +#include #include -#include -#include -#include +#include +#include namespace nvtext { namespace detail { - namespace { -template -constexpr bool is_whitespace(CharType ch) -{ - return ch <= ' '; -} +constexpr int block_size = 512; /** - * @brief Resolve a substring up to the first whitespace character. + * @brief Produces offsets to unpairable locations in the given chars array * - * This will return a substring of the input starting with the first byte - * up to the first whitespace character found or the end of the string. - * Any whitespace is expected only at the end of the string. + * Launched as a thread per byte of the chars array. + * The output is non-zero offsets to locations of unpairable substrings. + * An unpairable substring does not exist in the given map and so will + * never be paired. Fortunately, this can be used as an artificial + * boundary providing increased parallelism in the BPE kernel. * - * @param d_str Input string to resolve. - * @return Substring of the input excluding any trailing whitespace. 
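+ *
+ * For example (an illustrative sketch, not data from this patch): given the
+ * chars "aXbc", if neither "X" nor "b" appears anywhere in the given map,
+ * then no merge can ever span the X/b boundary, so the offset of "b" is
+ * emitted and "aX" and "bc" can be encoded as independent sections.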
+ * @tparam MapRefType The type of the map finder object */ -__device__ cudf::string_view get_first_token(cudf::string_view const& d_str) -{ - auto const begin = d_str.data(); - auto const end = thrust::find_if( - thrust::seq, begin, begin + d_str.size_bytes(), [](auto ch) { return is_whitespace(ch); }); - auto const size = static_cast(thrust::distance(begin, end)); - return cudf::string_view(begin, size); -} +template +struct bpe_unpairable_offsets_fn { + cudf::device_span d_chars; + cudf::size_type offset; + MapRefType const d_map; + __device__ cudf::size_type operator()(cudf::size_type idx) + { + if (!cudf::strings::detail::is_begin_utf8_char(d_chars[idx])) { return 0; } + + auto const itr = d_chars.data() + idx; + auto const end = d_chars.end(); + auto const lhs = cudf::string_view(itr, cudf::strings::detail::bytes_in_utf8_byte(*itr)); + auto const next = itr + lhs.size_bytes(); + auto output = 0; + if (next < end) { + auto const rhs = cudf::string_view(next, cudf::strings::detail::bytes_in_utf8_byte(*next)); + // see if both halves exist anywhere in the table, if not these are unpairable + if (d_map.find(lhs) == d_map.end() && d_map.find(rhs) == d_map.end()) { + output = idx + lhs.size_bytes() + offset; // offset for artificial boundary + } + } + return output; + } +}; /** - * @brief Main byte pair encoding algorithm function for each string. + * @brief Performs byte-pair-encoding * - * @see The byte_pair_encoding_fn::operator() function below for details. + * Computes the locations where the separator will be inserted in `d_spaces_data`. + * This is launched as a string per block. + * + * The process first initializes all characters to 1 per position in `d_spaces_data`. + * All pairs are realized and their ranks stored in `d_ranks_data`. + * + * Iteratively, the minimum rank is located, the corresponding `d_spaces_data` location + * is set to 0 resulting in new potential pairs. The process repeats accounting for + * the rank of the newly formed pairs. + * + * Once there are no more rankable pairs, the process finishes and the `d_spaces_data` + * values identify the location to insert the separator. + * + * @tparam MapRefType The type of the map finder object + * @param d_strings Input data + * @param d_map For looking up individual string candidates + * @param d_spaces_data Output the location where separator will be inserted + * @param d_ranks_data Working memory to hold pair ranks + * @param d_rerank_data Working memory to hold locations where reranking is required */ template -struct byte_pair_encoding_fn { - cudf::column_device_view const d_merges; - cudf::column_device_view const d_strings; - MapRefType const d_map; - cudf::size_type* d_sizes; // output size of encoded string - string_hasher_type const hasher; - cudf::size_type* d_byte_indices; - - /** - * @brief Parse the merge pair into components. - * - * The two substrings are separated by a single space. - * - * @param idx Index of merge pair to dissect. - * @return The left and right halves of the merge pair. 
- */ - __device__ thrust::pair dissect_merge_pair( - cudf::size_type idx) - { - auto const d_pair = d_merges.element(idx); - auto const lhs = d_pair.data(); - auto const end_str = d_pair.data() + d_pair.size_bytes(); - auto const rhs = thrust::find(thrust::seq, lhs, end_str, ' '); // space always expected - // check for malformed pair entry to prevent segfault - if (rhs == end_str) { return thrust::make_pair(cudf::string_view{}, cudf::string_view{}); } - auto const lhs_size = static_cast(thrust::distance(lhs, rhs)); - auto const rhs_size = static_cast(thrust::distance(rhs + 1, end_str)); - return thrust::make_pair(cudf::string_view(lhs, lhs_size), - cudf::string_view(rhs + 1, rhs_size)); +__global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, + MapRefType const d_map, + int8_t* d_spaces_data, // working memory + cudf::size_type* d_ranks_data, // more working memory + int8_t* d_rerank_data // and one more working memory +) +{ + // string per block + auto const str_idx = + static_cast(cudf::detail::grid_1d::global_thread_id() / block_size); + auto const lane_idx = static_cast(threadIdx.x); + + auto const d_str = d_strings.element(str_idx); + auto const offsets = + d_strings.child(cudf::strings_column_view::offsets_column_index).data(); + auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()]; + + auto const d_spaces = d_spaces_data + offset; + auto const end_spaces = d_spaces + d_str.size_bytes(); + auto const d_ranks = d_ranks_data + offset; + auto const end_ranks = d_ranks + d_str.size_bytes(); + auto const d_rerank = d_rerank_data + offset; + auto const end_rerank = d_rerank + d_str.size_bytes(); + + auto constexpr max_rank = cuda::std::numeric_limits::max(); + + __shared__ cudf::size_type block_min_rank; + using block_reduce = cub::BlockReduce; + __shared__ typename block_reduce::TempStorage temp_storage; + auto const num_valid = block_size < d_str.size_bytes() ? block_size : d_str.size_bytes(); + + // init all the re-rank identifiers to zero + for (auto itr = d_rerank + lane_idx; itr < end_rerank; itr += block_size) { + *itr = 0; } - - /** - * @brief Get the next substring of the given string. - * - * This will find the next sequence of characters identified by the - * given byte indices iterator values. The beginning of the sequence - * starts at `begin` and the end of the sequence is the first non-zero - * index found between (begin,end) exclusive. - * - * @tparam Iterator The byte indices iterator type - * @param begin Start of indices to check - * @param end End of indices to check - * @param d_str String to substring - * @return The substring found. 
- */ - template - __device__ cudf::string_view next_substr(Iterator begin, - Iterator end, - cudf::string_view const& d_str) - { - auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) { return v != 0; }); - auto const size = static_cast(thrust::distance(begin, next)); - return cudf::string_view(d_str.data() + *begin, size); + // init all ranks to max + for (auto itr = d_ranks + lane_idx; itr < end_ranks; itr += block_size) { + *itr = max_rank; } - - /** - * @brief Look up the pair of strings in the d_map/d_merges - * - * @param lhs Left half of the string - * @param rhs Right half of the string - * @return Position of merge pair within d_map - */ - __device__ auto get_merge_pair(cudf::string_view const& lhs, cudf::string_view const& rhs) - { - __shared__ char shmem[48 * 1024]; // max for Pascal - auto const total_size = lhs.size_bytes() + rhs.size_bytes() + 1; - auto const thread_memory_size = static_cast(sizeof(shmem) / blockDim.x); - - // Edge case check. - // Empirically found only two merge pair strings that were greater than 70 bytes - // and they both looked like ignorable errors. - if (thread_memory_size < total_size) { return d_map.end(); } - - // build the target string in shared memory - char* ptr = &shmem[threadIdx.x * thread_memory_size]; - - // build a temp string like: temp = lhs + ' ' + rhs - memcpy(ptr, lhs.data(), lhs.size_bytes()); - memcpy(ptr + lhs.size_bytes(), " ", 1); - memcpy(ptr + lhs.size_bytes() + 1, rhs.data(), rhs.size_bytes()); - - auto const d_str = cudf::string_view(ptr, total_size); - return d_map.find(d_str); + // init all spaces to 1 as appropriate + for (auto itr = d_spaces + lane_idx; itr < end_spaces; itr += block_size) { + auto const index = thrust::distance(d_spaces, itr); + *itr = static_cast(cudf::strings::detail::is_begin_utf8_char(d_str.data()[index])); } + __syncthreads(); - /** - * @brief Byte encode each string. - * - * Each string is iteratively scanned for the minimum rank of adjacent substring pairs - * as found within the `d_map` table. Once the minimum pair is located, that pair - * is removed -- virtually by zero-ing the index value between any matching adjacent pairs. - * - * The iteration ends once there are no more adjacent pairs or there are no more - * matches found in `d_map`. At the end, the indices for each string reflect the - * encoding pattern and can be used to build the output. - * - * This function also computes the size of the encoded output of each string - * by simply counting the number of non-zero indices values remaining. This saves - * an extra kernel launch normally required to compute the offsets of the output column. 
- * - * @param idx The index of the string in `d_strings` to encode - */ - __device__ void operator()(cudf::size_type idx) - { - if (d_strings.is_null(idx)) { - d_sizes[idx] = 0; - return; + // for finding the next half of a pair + auto next_substr = [d_str, d_spaces, end = end_spaces](int8_t* begin) { + auto const next = thrust::find(thrust::seq, begin + 1, end, 1); + auto const size = static_cast(thrust::distance(begin, next)); + return cudf::string_view(d_str.data() + thrust::distance(d_spaces, begin), size); + }; + // for locating adjacent pairs after merging a pair + auto find_prev = [begin = d_spaces](int8_t* ptr) { + while (ptr > begin && *ptr == 0) { + --ptr; } - auto const d_str = get_first_token(d_strings.element(idx)); - if (d_str.empty()) { - d_sizes[idx] = 0; - return; + return ptr; + }; + + auto min_rank = max_rank; + + // store all the initial ranks for each pair + // every character but the first one will have a initial rank + // + // Example: + // string: abcdefghij + // spaces: 1111111111 + // ranks: *948516327 + for (auto itr = d_spaces + lane_idx; itr < end_spaces; itr += block_size) { + if (*itr == 0) { continue; } // skips any UTF-8 continuation bytes + // resolve pair and lookup its rank + auto const lhs = next_substr(itr); // retrieve lhs of the pair + auto const next_itr = itr + lhs.size_bytes(); + if (next_itr < end_spaces) { + auto const rhs = next_substr(next_itr); // retrieve rhs of the pair + if (!rhs.empty()) { + auto rank = max_rank; + auto const mp = merge_pair_type{lhs, rhs}; + auto const map_itr = d_map.find(mp); // lookup pair in merges table; + if (map_itr != d_map.end()) { rank = map_itr->second; } // found a match; + d_ranks[thrust::distance(d_spaces, next_itr)] = rank; // store the rank + if (rank < min_rank) { min_rank = rank; } + } } - - auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index) - .element(idx); - auto const d_indices = d_byte_indices + offset; - - // initialize the byte indices for this string; - // set the index value to 0 for any intermediate UTF-8 bytes - thrust::transform(thrust::seq, - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(d_str.size_bytes()), - d_indices, - [data = d_str.data()](auto idx) { - auto const byte = static_cast(data[idx]); - return cudf::strings::detail::is_begin_utf8_char(byte) ? 
idx : 0; - }); - - auto const begin = d_indices; - auto const end = d_indices + d_str.size_bytes(); - - // keep processing the string until there are no more adjacent pairs found in d_map - cudf::size_type min_rank = 0; - while (min_rank < cuda::std::numeric_limits::max()) { - // initialize working variables - min_rank = cuda::std::numeric_limits::max(); - - auto lhs = next_substr(begin, end, d_str); - auto itr = begin + lhs.size_bytes(); - - auto min_itr = itr; // these are set along with - auto min_size = lhs.size_bytes(); // the min_rank variable - - // check each adjacent pair against the d_map - while (itr < end) { - auto const rhs = next_substr(itr, end, d_str); - if (rhs.empty()) break; // no more adjacent pairs - - auto const map_itr = get_merge_pair(lhs, rhs); - if (map_itr != d_map.end()) { - // found a match; record the rank (and other min_ vars) - auto const rank = map_itr->second; - if (rank < min_rank) { - min_rank = rank; - min_itr = itr; - min_size = rhs.size_bytes(); - } + } + // compute the min rank across the block + auto const reduce_rank = block_reduce(temp_storage).Reduce(min_rank, cub::Min(), num_valid); + if (lane_idx == 0) { block_min_rank = reduce_rank; } + __syncthreads(); + + // loop through the ranks processing the current minimum until there are no more + while (block_min_rank < max_rank) { + // search the d_ranks for matches to block_min_rank + for (auto itr = d_ranks + lane_idx; itr < end_ranks; itr += block_size) { + if (*itr == block_min_rank) { + auto ptr = itr - 1; // check for adjacent min-rank (edge-case) + while (ptr > d_ranks && *ptr == max_rank) { + --ptr; } - // next substring - lhs = rhs; - itr += rhs.size_bytes(); + // set the output value to 0 at this position (erases separator, merges pair) + // using example string above, the min-rank is 1 at position 5 + // string: abcdefghij + // spaces: 1111101111 (set position 5 to 0) + if (*ptr != block_min_rank) { d_spaces[thrust::distance(d_ranks, itr)] = 0; } } - - // if any pair matched, remove every occurrence from the string - if (min_rank < cuda::std::numeric_limits::max()) { - // remove the first pair we found - itr = min_itr; - *itr = 0; - - // continue scanning for other occurrences in the remainder of the string - itr += min_size; - if (itr < end) { - auto const d_pair = dissect_merge_pair(min_rank); - - lhs = next_substr(itr, end, d_str); - itr += lhs.size_bytes(); - while (itr < end) { - auto rhs = next_substr(itr, end, d_str); - if (d_pair.first == lhs && d_pair.second == rhs) { - *itr = 0; // removes the pair from this string - itr += rhs.size_bytes(); - if (itr >= end) { break; } // done checking for pairs - // skip to the next adjacent pair - rhs = next_substr(itr, end, d_str); - } - // next substring - lhs = rhs; - itr += rhs.size_bytes(); - } + } + __syncthreads(); + + // identify all the re-rank locations (logic above invalidated adjacent pairs) + // using example string above, the adjacent pairs have to be re-ranked + // string: abcdefghij + // spaces: 1111101111 (pair 'e,f' is now merged) + // rerank: 0000101000 ('ef' and 'fg' need re-ranking as 'd,ef' and 'ef,g' + for (auto itr = d_ranks + lane_idx; itr < end_ranks; itr += block_size) { + auto const index = thrust::distance(d_ranks, itr); + if (*itr == block_min_rank && d_spaces[index] == 0) { + // find previous pair mid-point + auto ptr = find_prev(d_spaces + index - 1); + if (ptr > d_spaces) { d_rerank[thrust::distance(d_spaces, ptr)] = 1; } + // find next pair mid-point + ptr = thrust::find(thrust::seq, d_spaces + index + 1, 
end_spaces, 1); + if (ptr < end_spaces) { d_rerank[thrust::distance(d_spaces, ptr)] = 1; } + *itr = max_rank; // reset this rank + } + } + __syncthreads(); + + // compute the ranks for the newly created pairs + min_rank = max_rank; // and record the new minimum along the way + for (auto itr = d_rerank + lane_idx; itr < end_rerank; itr += block_size) { + auto const index = thrust::distance(d_rerank, itr); + auto rank = d_ranks[index]; + if (*itr) { + *itr = 0; // reset re-rank + // build lhs of pair + auto const ptr = find_prev(d_spaces + index - 1); + auto const size = static_cast(thrust::distance(ptr, d_spaces + index)); + auto const lhs = cudf::string_view(d_str.data() + thrust::distance(d_spaces, ptr), size); + auto const rhs = next_substr(d_spaces + index); // retrieve rhs of pair + rank = max_rank; + if (!rhs.empty()) { + auto const mp = merge_pair_type{lhs, rhs}; + auto const map_itr = d_map.find(mp); // lookup rank for this pair; + if (map_itr != d_map.end()) { rank = map_itr->second; } // found a match } + d_ranks[index] = rank; // store new rank } + if (rank < min_rank) { min_rank = rank; } } - // compute and store the output size for this string's encoding - auto const encoded_size = d_str.size_bytes() + // number of original bytes + - thrust::count_if( // number of non-zero byte indices - thrust::seq, - d_indices, - d_indices + d_str.size_bytes(), - [](auto v) { return v != 0; }); - d_sizes[idx] = static_cast(encoded_size); - } -}; - -/** - * @brief Build the output string encoding. - * - * This copies each string to the output inserting a space at each non-zero byte index. - * - * @code{.txt} - * d_strings = ["helloworld", "testthis"] - * d_byte_indices = [ 0000050000 00004000] - * result is ["hello world", "test this"] - * @endcode - */ -struct build_encoding_fn { - cudf::column_device_view const d_strings; - cudf::size_type const* d_byte_indices; - cudf::size_type const* d_offsets; - char* d_chars{}; - - __device__ void operator()(cudf::size_type idx) - { - if (d_strings.is_null(idx)) { return; } - auto const d_str = get_first_token(d_strings.element(idx)); - if (d_str.empty()) { return; } - - auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index) - .element(idx); - auto const d_indices = d_byte_indices + offset; - auto d_output = d_chars ? d_chars + d_offsets[idx] : nullptr; - - // copy chars while indices[i]==0, - // insert space each time indices[i]!=0 - auto const begin = d_indices; - auto const end = d_indices + d_str.size_bytes(); - auto d_input = d_str.data(); - *d_output++ = *d_input++; - auto itr = begin + 1; - while (itr < end) { - if (*itr++) *d_output++ = ' '; - *d_output++ = *d_input++; - } - // https://github.com/rapidsai/cudf/pull/10270/files#r826319405 - } -}; + // re-compute the minimum rank across the block (since new pairs are created above) + auto const reduce_rank = block_reduce(temp_storage).Reduce(min_rank, cub::Min(), num_valid); + if (lane_idx == 0) { block_min_rank = reduce_rank; } + __syncthreads(); + } // if no min ranks are found we are done, otherwise start again +} /** - * @brief Perform byte pair encoding on each string in the input column. - * - * The result is a strings column of the same size where each string has been encoded. + * @brief Computes the output size of each strings row * - * The encoding is performed iteratively. Each pass determines the string's lowest - * ranked merge pair as determined by the strings in `merges_table`. 
This pair - * is removed (virtually) from each string before starting the next iteration. + * This launches as a string per block. + * The non-zero values in `d_spaces_data` for each string is added to + * the current string size to produce the total output bytes. * - * Once all pairs have exhausted for all strings, the output is constructed from - * the results by adding spaces between each remaining pair in each string. - * - * @param input Strings to encode. - * @param merge_pairs Merge pairs data and map used for encoding. - * @param stream CUDA stream used for device memory operations and kernel launches + * @param d_strings Input data + * @param d_spaces_data Output the location where separator will be inserted + * @param d_sizes Output sizes of each row */ -std::unique_ptr byte_pair_encoding( - cudf::strings_column_view const& input, - bpe_merge_pairs::bpe_merge_pairs_impl const& merge_pairs, - rmm::cuda_stream_view stream) +__global__ void bpe_finalize(cudf::column_device_view const d_strings, + int8_t* d_spaces_data, // where separators are inserted + cudf::size_type* d_sizes // output sizes of encoded strings +) { - auto const d_merges = merge_pairs.get_merge_pairs(); - CUDF_EXPECTS(d_merges.size() > 0, "Merge pairs table must not be empty"); - - // build working vector to hold index values per byte - rmm::device_uvector d_byte_indices(input.chars().size(), stream); - - auto const d_strings = cudf::column_device_view::create(input.parent(), stream); - - auto offsets = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, - static_cast(input.size() + 1), - cudf::mask_state::UNALLOCATED, - stream, - rmm::mr::get_current_device_resource()); - auto d_offsets = offsets->mutable_view().data(); + // string per block + auto const str_idx = + static_cast(cudf::detail::grid_1d::global_thread_id() / block_size); + auto const lane_idx = static_cast(threadIdx.x); + + if (d_strings.is_null(str_idx)) { + d_sizes[str_idx] = 0; + return; + } + auto const d_str = d_strings.element(str_idx); + if (d_str.empty()) { + d_sizes[str_idx] = 0; + return; + } - auto map_ref = merge_pairs.get_merge_pairs_ref(); - byte_pair_encoding_fn fn{ - d_merges, *d_strings, map_ref, d_offsets, string_hasher_type{}, d_byte_indices.data()}; - thrust::for_each_n( - rmm::exec_policy(stream), thrust::make_counting_iterator(0), input.size(), fn); + auto const offsets = + d_strings.child(cudf::strings_column_view::offsets_column_index).data(); + auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()]; - // build the output: add spaces between the remaining pairs in each string - thrust::exclusive_scan( - rmm::exec_policy(stream), d_offsets, d_offsets + input.size() + 1, d_offsets); + auto const d_spaces = d_spaces_data + offset; + auto const end_spaces = d_spaces + d_str.size_bytes(); + auto const num_valid = block_size < d_str.size_bytes() ? 
block_size : d_str.size_bytes(); - auto const bytes = - cudf::detail::get_value(offsets->view(), input.size(), stream); - auto chars = cudf::strings::detail::create_chars_child_column( - bytes, stream, rmm::mr::get_current_device_resource()); - auto d_chars = chars->mutable_view().data(); + using block_reduce = cub::BlockReduce; + __shared__ typename block_reduce::TempStorage temp_storage; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - input.size(), - build_encoding_fn{*d_strings, d_byte_indices.data(), d_offsets, d_chars}); + // reset the first position -- no separator to be added here + if (lane_idx == 0) { *d_spaces = 0; } - return make_strings_column( - input.size(), std::move(offsets), std::move(chars), 0, rmm::device_buffer{}); -} - -/** - * @brief Detect space to not-space transitions inside each string. - * - * This handles sliced input and null strings as well. - * It is parallelized over bytes and returns true only for valid left edges - * -- non-space preceded by a space. - */ -struct edge_of_space_fn { - cudf::column_device_view const d_strings; - __device__ bool operator()(cudf::size_type offset) - { - auto const d_chars = - d_strings.child(cudf::strings_column_view::chars_column_index).data(); - if (is_whitespace(d_chars[offset]) || !is_whitespace(d_chars[offset - 1])) { return false; } - - auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index); - auto const d_offsets = offsets.data() + d_strings.offset(); - // ignore offsets outside sliced range - if (offset < d_offsets[0] || offset >= d_offsets[d_strings.size()]) { return false; } - - auto itr = - thrust::lower_bound(thrust::seq, d_offsets, d_offsets + d_strings.size() + 1, offset); - // ignore offsets at existing string boundaries - if (*itr == offset) { return false; } - - // count only edges for valid strings - auto const index = static_cast(thrust::distance(d_offsets, itr)) - 1; - return d_strings.is_valid(index); + // compute the output size for this string by counting the resulting separator positions + auto bytes = 0; + for (auto itr = d_spaces + lane_idx; itr < end_spaces; itr += block_size) { + bytes += (*itr > 0); } -}; - -/** - * @brief Create new offsets by identifying substrings by whitespace. - * - * This is similar to cudf::strings::split_record but does not fully split - * and only returns new offsets. The behavior is more like a view-only slice - * of the chars child with the result still including trailing delimiters. - * - * The encoding algorithm ignores the trailing whitespace of each string. - * - * @param input Strings to tokenize. - * @param stream CUDA stream used for device memory operations and kernel launches - * @return New offsets including those at the edge of each space. 
- */ -std::unique_ptr space_offsets(cudf::strings_column_view const& input, - cudf::column_device_view const& d_strings, - rmm::cuda_stream_view stream) -{ - // count space offsets - auto const begin = thrust::make_counting_iterator(1); - auto const end = thrust::make_counting_iterator(input.chars().size()); - edge_of_space_fn edge_of_space{d_strings}; - auto const space_count = thrust::count_if(rmm::exec_policy(stream), begin, end, edge_of_space); - - // copy space offsets - rmm::device_uvector space_offsets(space_count, stream); - thrust::copy_if(rmm::exec_policy(stream), begin, end, space_offsets.data(), edge_of_space); - - // create output offsets - auto result = - cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, - static_cast(space_count + input.size() + 1), - cudf::mask_state::UNALLOCATED, - stream, - rmm::mr::get_current_device_resource()); - - // combine current offsets with space offsets - thrust::merge(rmm::exec_policy(stream), - input.offsets_begin(), - input.offsets_end(), - space_offsets.begin(), - space_offsets.end(), - result->mutable_view().begin()); - - return result; + auto const total_bytes = block_reduce(temp_storage).Sum(bytes, num_valid); + if (lane_idx == 0) { d_sizes[str_idx] = total_bytes + d_str.size_bytes(); } } -/** - * @brief Build new offsets that can be used to build a list column for calling join. - * - * This essentially returns the number of tokens for each string. - */ -struct list_offsets_fn { - cudf::column_device_view const d_strings; - __device__ cudf::size_type operator()(cudf::size_type idx) - { - if (d_strings.is_null(idx)) return 0; - auto const d_str = d_strings.element(idx); - if (d_str.empty()) return 1; // empty is a single valid result - - auto const begin = thrust::make_counting_iterator(1); - auto const end = thrust::make_counting_iterator(d_str.size_bytes()); - - // this counts the number of non-adjacent delimiters - auto const result = - thrust::count_if(thrust::seq, begin, end, [data = d_str.data()](auto chidx) { - return !is_whitespace(data[chidx]) && is_whitespace(data[chidx - 1]); - }); - return static_cast(result) + 1; - } -}; - } // namespace std::unique_ptr byte_pair_encoding(cudf::strings_column_view const& input, @@ -500,54 +328,120 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (input.is_empty() || input.chars_size() == 0) + if (input.is_empty() || input.chars_size() == 0) { return cudf::make_empty_column(cudf::type_id::STRING); + } + + CUDF_EXPECTS(separator.is_valid(stream), "separator parameter must be valid"); + auto const d_separator = separator.value(stream); + CUDF_EXPECTS(d_separator.size_bytes() == 1, "for now, separator must be a single-byte character"); auto const d_strings = cudf::column_device_view::create(input.parent(), stream); - auto const offsets = space_offsets(input, *d_strings, stream); - - // build a view using the new offsets and the current input chars column - auto const input_view = cudf::column_view(cudf::data_type{cudf::type_id::STRING}, - offsets->size() - 1, - nullptr, // no parent data - nullptr, // null-mask - 0, // null-count - 0, // offset - {offsets->view(), input.chars()}); - - // run BPE on this view - auto const bpe_column = - byte_pair_encoding(cudf::strings_column_view(input_view), *(merge_pairs.impl), stream); - - // recombine the result: - // compute the offsets needed to build a list view - auto const list_offsets = [d_strings = *d_strings, stream] { - auto offsets_itr = 
thrust::make_transform_iterator( - thrust::make_counting_iterator(0), list_offsets_fn{d_strings}); - auto offsets_column = std::get<0>(cudf::detail::make_offsets_child_column( - offsets_itr, offsets_itr + d_strings.size(), stream, rmm::mr::get_current_device_resource())); - return offsets_column; - }(); - - // build a list column_view using the BPE output and the list_offsets - auto const list_join = cudf::column_view(cudf::data_type{cudf::type_id::LIST}, - input.size(), - nullptr, // no parent data in list column - input.null_mask(), - input.null_count(), - 0, - {list_offsets->view(), bpe_column->view()}); - - // build the output strings column - auto result = - cudf::strings::detail::join_list_elements(cudf::lists_column_view(list_join), - separator, - cudf::string_scalar(""), - cudf::strings::separator_on_nulls::NO, - cudf::strings::output_if_empty_list::EMPTY_STRING, - stream, - mr); - return result; + + auto const first_offset = (input.offset() == 0) ? 0 + : cudf::detail::get_value( + input.offsets(), input.offset(), stream); + auto const last_offset = (input.offset() == 0 && input.size() == input.offsets().size() - 1) + ? input.chars().size() + : cudf::detail::get_value( + input.offsets(), input.size() + input.offset(), stream); + auto const chars_size = last_offset - first_offset; + auto const d_input_chars = input.chars().data() + first_offset; + + auto const offset_data_type = cudf::data_type{cudf::type_to_id()}; + auto offsets = cudf::make_numeric_column( + offset_data_type, input.size() + 1, cudf::mask_state::UNALLOCATED, stream, mr); + auto d_offsets = offsets->mutable_view().data(); + + rmm::device_uvector d_spaces(chars_size, stream); // identifies non-merged pairs + // used for various purposes below: unpairable-offsets, pair ranks, separator insert positions + rmm::device_uvector d_working(chars_size, stream); + + auto const chars_begin = thrust::counting_iterator(0); + auto const chars_end = thrust::counting_iterator(chars_size); + + { + // this kernel locates unpairable sections of strings to create artificial string row + // boundaries; the boundary values are recorded as offsets in d_up_offsets + auto const d_up_offsets = d_working.data(); // store unpairable offsets here + auto const mp_map = merge_pairs.impl->get_mp_table_ref(); // lookup table + auto const d_chars_span = cudf::device_span(d_input_chars, chars_size); + auto up_fn = bpe_unpairable_offsets_fn{d_chars_span, first_offset, mp_map}; + thrust::transform(rmm::exec_policy_nosync(stream), chars_begin, chars_end, d_up_offsets, up_fn); + auto const up_end = // remove all but the unpairable offsets + thrust::remove(rmm::exec_policy_nosync(stream), d_up_offsets, d_up_offsets + chars_size, 0); + auto const unpairables = thrust::distance(d_up_offsets, up_end); // number of unpairables + + // new string boundaries created by combining unpairable offsets with the existing offsets + auto tmp_offsets = rmm::device_uvector(unpairables + input.size() + 1, stream); + thrust::merge(rmm::exec_policy_nosync(stream), + input.offsets_begin(), + input.offsets_end(), + d_up_offsets, + up_end, + tmp_offsets.begin()); + // remove any adjacent duplicate offsets (i.e. 
empty or null rows)
+    auto const offsets_end =
+      thrust::unique(rmm::exec_policy_nosync(stream), tmp_offsets.begin(), tmp_offsets.end());
+    auto const offsets_total =
+      static_cast<cudf::size_type>(thrust::distance(tmp_offsets.begin(), offsets_end));
+    tmp_offsets.resize(offsets_total, stream);
+
+    // temp column created with the merged offsets and the original chars data
+    auto const col_offsets =
+      cudf::column_view(cudf::device_span<cudf::size_type const>(tmp_offsets));
+    auto const tmp_size  = offsets_total - 1;
+    auto const tmp_input = cudf::column_view(
+      input.parent().type(), tmp_size, nullptr, nullptr, 0, 0, {col_offsets, input.chars()});
+    auto const d_tmp_strings = cudf::column_device_view::create(tmp_input, stream);
+
+    // launch the byte-pair-encoding kernel on the temp column
+    rmm::device_uvector<int8_t> d_rerank(chars_size, stream);  // more working memory;
+    auto const d_ranks  = d_working.data();  // store pair ranks here
+    auto const pair_map = merge_pairs.impl->get_merge_pairs_ref();
+    bpe_parallel_fn<<<tmp_size, block_size, 0, stream.value()>>>(
+      *d_tmp_strings, pair_map, d_spaces.data(), d_ranks, d_rerank.data());
+  }
+
+  // compute the output sizes and store them in the d_offsets vector
+  bpe_finalize<<<input.size(), block_size, 0, stream.value()>>>(
+    *d_strings, d_spaces.data(), d_offsets);
+
+  // convert sizes to offsets in-place
+  auto const bytes =
+    cudf::detail::sizes_to_offsets(d_offsets, d_offsets + input.size() + 1, d_offsets, stream);
+  CUDF_EXPECTS(bytes <= static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
+               "Size of output exceeds the column size limit",
+               std::overflow_error);
+
+  // build the output: inserting separators to the input character data
+  auto chars   = cudf::strings::detail::create_chars_child_column(bytes, stream, mr);
+  auto d_chars = chars->mutable_view().data<char>();
+
+  auto const d_inserts = d_working.data();  // stores the insert positions
+  auto offsets_at_non_zero = [d_spaces = d_spaces.data()] __device__(auto idx) {
+    return d_spaces[idx] > 0;  // separator to be inserted here
+  };
+  auto const copy_end = thrust::copy_if(
+    rmm::exec_policy_nosync(stream), chars_begin + 1, chars_end, d_inserts, offsets_at_non_zero);
+
+  // this will insert the single-byte separator into positions specified in d_inserts
+  auto const sep_char = thrust::constant_iterator<char>(separator.to_string(stream)[0]);
+  thrust::merge_by_key(rmm::exec_policy_nosync(stream),
+                       d_inserts,      // where to insert separator byte
+                       copy_end,       //
+                       chars_begin,    // all indices
+                       chars_end,      //
+                       sep_char,       // byte to insert
+                       d_input_chars,  // original data
+                       thrust::make_discard_iterator(),
+                       d_chars);  // result
+
+  return cudf::make_strings_column(input.size(),
+                                   std::move(offsets),
+                                   std::move(chars),
+                                   input.null_count(),
+                                   cudf::detail::copy_bitmask(input.parent(), stream, mr));
 }

 }  // namespace detail

diff --git a/cpp/src/text/bpe/byte_pair_encoding.cuh b/cpp/src/text/bpe/byte_pair_encoding.cuh
index cefd32e8f60..2a170317909 100644
--- a/cpp/src/text/bpe/byte_pair_encoding.cuh
+++ b/cpp/src/text/bpe/byte_pair_encoding.cuh
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -30,20 +31,31 @@
 #include

+#include
+#include
+#include
+#include
+
 #include
 #include

 namespace nvtext {
 namespace detail {

-using hash_value_type = uint32_t;
 using string_hasher_type = cudf::hashing::detail::MurmurHash3_x86_32;
+using hash_value_type    = string_hasher_type::result_type;
+using merge_pair_type    = thrust::pair<cudf::string_view, cudf::string_view>;
+
+using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor<default_allocator<char>>;

 /**
  * @brief Hasher function used for building and using the cuco static-map
  *
  * This takes advantage of heterogeneous lookup feature in 
cuco static-map which - * allows inserting with one type (index) and looking up with a different type (string). + * allows inserting with one type (index) and looking up with a different type (merge_pair_type). + * + * The merge-pairs are in adjacent rows so each index will access two rows of string values. + * The hash of each string is combined for the returned result. */ struct bpe_hasher { cudf::column_device_view const d_strings; @@ -51,49 +63,115 @@ struct bpe_hasher { // used by insert __device__ hash_value_type operator()(cudf::size_type index) const { - return hasher(d_strings.element(index)); + index *= 2; + auto const lhs = d_strings.element(index); + auto const rhs = d_strings.element(index + 1); + return cudf::hashing::detail::hash_combine(hasher(lhs), hasher(rhs)); } // used by find - __device__ hash_value_type operator()(cudf::string_view const& s) const { return hasher(s); } + __device__ hash_value_type operator()(merge_pair_type const& mp) const + { + return cudf::hashing::detail::hash_combine(hasher(mp.first), hasher(mp.second)); + } }; /** * @brief Equal function used for building and using the cuco static-map * * This takes advantage of heterogeneous lookup feature in cuco static-map which - * allows inserting with one type (index) and looking up with a different type (string). + * allows inserting with one type (index) and looking up with a different type (merge_pair_type). + * + * The merge-pairs are in adjacent rows so each index will access two rows of string values. + * All rows from the input merge-pairs are unique. */ struct bpe_equal { cudf::column_device_view const d_strings; // used by insert __device__ bool operator()(cudf::size_type lhs, cudf::size_type rhs) const noexcept { - return d_strings.element(lhs) == d_strings.element(rhs); + return lhs == rhs; // all rows are unique } // used by find - __device__ bool operator()(cudf::size_type lhs, cudf::string_view const& rhs) const noexcept + __device__ bool operator()(cudf::size_type lhs, merge_pair_type const& rhs) const noexcept { - return d_strings.element(lhs) == rhs; + lhs *= 2; + auto const left = d_strings.element(lhs); + auto const right = d_strings.element(lhs + 1); + return (left == rhs.first) && (right == rhs.second); } }; -using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor>; - -using probe_scheme = cuco::experimental::linear_probing<1, bpe_hasher>; +using bpe_probe_scheme = cuco::experimental::linear_probing<1, bpe_hasher>; using merge_pairs_map_type = cuco::experimental::static_map, cuda::thread_scope_device, bpe_equal, - probe_scheme, + bpe_probe_scheme, hash_table_allocator_type>; +/** + * @brief Hasher function used for building and using the cuco static-map + * + * This takes advantage of heterogeneous lookup feature in cuco static-map which + * allows inserting with one type (index) and looking up with a different type (merge_pair_type). + * + * Each component of the merge-pairs (left and right) are stored individually in the map. 
+ */ +struct mp_hasher { + cudf::column_device_view const d_strings; + string_hasher_type hasher{}; + // used by insert + __device__ hash_value_type operator()(cudf::size_type index) const + { + auto const d_str = d_strings.element(index); + return hasher(d_str); + } + // used by find + __device__ hash_value_type operator()(cudf::string_view const& d_str) const + { + return hasher(d_str); + } +}; + +/** + * @brief Equal function used for building and using the cuco static-map + * + * This takes advantage of heterogeneous lookup feature in cuco static-map which + * allows inserting with one type (index) and looking up with a different type (string). + */ +struct mp_equal { + cudf::column_device_view const d_strings; + // used by insert + __device__ bool operator()(cudf::size_type lhs, cudf::size_type rhs) const noexcept + { + auto const left = d_strings.element(lhs); + auto const right = d_strings.element(rhs); + return left == right; + } + // used by find + __device__ bool operator()(cudf::size_type lhs, cudf::string_view const& rhs) const noexcept + { + auto const left = d_strings.element(lhs); + return left == rhs; + } +}; + +using mp_probe_scheme = cuco::experimental::linear_probing<1, mp_hasher>; + +using mp_table_map_type = cuco::experimental::static_map, + cuda::thread_scope_device, + mp_equal, + mp_probe_scheme, + hash_table_allocator_type>; + } // namespace detail -// since column_device_view::create returns is a little more than -// std::unique_ptr this helper simplifies the return type in a more maintainable -// way +// since column_device_view::create() returns is a little more than +// std::unique_ptr this helper simplifies the return type for us using col_device_view = std::invoke_result_t; @@ -101,14 +179,17 @@ using col_device_view = std::invoke_result_t const merge_pairs; col_device_view const d_merge_pairs; - std::unique_ptr merge_pairs_map; + std::unique_ptr merge_pairs_map; // for BPE + std::unique_ptr mp_table_map; // for locating unpairables bpe_merge_pairs_impl(std::unique_ptr&& merge_pairs, col_device_view&& d_merge_pairs, - std::unique_ptr&& merge_pairs_map); + std::unique_ptr&& merge_pairs_map, + std::unique_ptr&& mp_table_map); auto const get_merge_pairs() const { return *d_merge_pairs; } auto get_merge_pairs_ref() const { return merge_pairs_map->ref(cuco::experimental::op::find); } + auto get_mp_table_ref() const { return mp_table_map->ref(cuco::experimental::op::find); } }; } // namespace nvtext diff --git a/cpp/src/text/bpe/load_merge_pairs.cu b/cpp/src/text/bpe/load_merge_pairs.cu index 77f0ebba43f..80073df5804 100644 --- a/cpp/src/text/bpe/load_merge_pairs.cu +++ b/cpp/src/text/bpe/load_merge_pairs.cu @@ -22,14 +22,13 @@ #include #include #include +#include #include #include #include #include -#include - #include #include #include @@ -88,32 +87,51 @@ std::unique_ptr load_file_to_column(std::string const& filename_me std::unique_ptr initialize_merge_pairs_map( cudf::column_device_view const& input, rmm::cuda_stream_view stream) { - // Ensure capacity is at least (size/0.7) as documented here: - // https://github.com/NVIDIA/cuCollections/blob/6ec8b6dcdeceea07ab4456d32461a05c18864411/include/cuco/static_map.cuh#L179-L182 auto merge_pairs_map = std::make_unique( - static_cast(input.size() * 2), // capacity is 2x; + static_cast(input.size()), cuco::empty_key{-1}, - cuco::empty_value{-1}, // empty value is not used + cuco::empty_value{-1}, bpe_equal{input}, - probe_scheme{bpe_hasher{input}}, + bpe_probe_scheme{bpe_hasher{input}}, 
hash_table_allocator_type{default_allocator{}, stream}, stream.value()); auto iter = cudf::detail::make_counting_transform_iterator( 0, [] __device__(cudf::size_type idx) { return cuco::make_pair(idx, idx); }); - merge_pairs_map->insert_async(iter, iter + input.size(), stream.value()); + merge_pairs_map->insert_async(iter, iter + (input.size() / 2), stream.value()); return merge_pairs_map; } +std::unique_ptr initialize_mp_table_map( + cudf::column_device_view const& input, rmm::cuda_stream_view stream) +{ + auto mp_table_map = std::make_unique( + static_cast(input.size()), + cuco::empty_key{-1}, + cuco::empty_value{-1}, + mp_equal{input}, + mp_probe_scheme{mp_hasher{input}}, + hash_table_allocator_type{default_allocator{}, stream}, + stream.value()); + + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [] __device__(cudf::size_type idx) { return cuco::make_pair(idx, idx); }); + + mp_table_map->insert_async(iter, iter + input.size(), stream.value()); + + return mp_table_map; +} + std::unique_ptr create_bpe_merge_pairs_impl( std::unique_ptr&& input, rmm::cuda_stream_view stream) { - auto d_input = cudf::column_device_view::create(input->view(), stream); - auto merge_pairs = initialize_merge_pairs_map(*d_input, stream); + auto d_input = cudf::column_device_view::create(input->view(), stream); + auto merge_pairs = initialize_merge_pairs_map(*d_input, stream); + auto mp_table_map = initialize_mp_table_map(*d_input, stream); return std::make_unique( - std::move(input), std::move(d_input), std::move(merge_pairs)); + std::move(input), std::move(d_input), std::move(merge_pairs), std::move(mp_table_map)); } std::unique_ptr create_bpe_merge_pairs_impl( @@ -121,8 +139,9 @@ std::unique_ptr create_bpe_merge_pairs_im rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return create_bpe_merge_pairs_impl(std::make_unique(input.parent(), stream, mr), - stream); + auto pairs = cudf::strings::split_record(input, cudf::string_scalar(" "), 1, stream, mr); + auto content = pairs->release(); + return create_bpe_merge_pairs_impl(std::move(content.children.back()), stream); } } // namespace @@ -135,6 +154,15 @@ std::unique_ptr load_merge_pairs_file(std::string const& filena return std::make_unique(std::move(input_column), stream, mr); } +std::unique_ptr load_merge_pairs(cudf::strings_column_view const& merge_pairs, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(!merge_pairs.is_empty(), "Merge pairs must not be empty"); + CUDF_EXPECTS(!merge_pairs.has_nulls(), "Merge pairs may not contain nulls"); + return std::make_unique(merge_pairs, stream, mr); +} + } // namespace detail std::unique_ptr load_merge_pairs_file(std::string const& filename_merges, @@ -144,31 +172,42 @@ std::unique_ptr load_merge_pairs_file(std::string const& filena return detail::load_merge_pairs_file(filename_merges, cudf::get_default_stream(), mr); } +std::unique_ptr load_merge_pairs(cudf::strings_column_view const& merge_pairs, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::load_merge_pairs(merge_pairs, stream, mr); +} + bpe_merge_pairs::bpe_merge_pairs_impl::bpe_merge_pairs_impl( std::unique_ptr&& merge_pairs, std::unique_ptr>&& d_merge_pairs, - std::unique_ptr&& merge_pairs_map) + std::unique_ptr&& merge_pairs_map, + std::unique_ptr&& mp_table_map) : merge_pairs(std::move(merge_pairs)), d_merge_pairs(std::move(d_merge_pairs)), - merge_pairs_map(std::move(merge_pairs_map)) + 
merge_pairs_map(std::move(merge_pairs_map)), + mp_table_map(std::move(mp_table_map)) { } bpe_merge_pairs::bpe_merge_pairs(std::unique_ptr&& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource*) - : impl(detail::create_bpe_merge_pairs_impl(std::move(input), stream)) + : impl(detail::create_bpe_merge_pairs_impl(std::move(input), stream).release()) { } bpe_merge_pairs::bpe_merge_pairs(cudf::strings_column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : impl(detail::create_bpe_merge_pairs_impl(input, stream, mr)) + : impl(detail::create_bpe_merge_pairs_impl(input, stream, mr).release()) { } -bpe_merge_pairs::~bpe_merge_pairs() = default; +bpe_merge_pairs::bpe_merge_pairs() = default; +bpe_merge_pairs::~bpe_merge_pairs() { delete impl; } } // namespace nvtext diff --git a/cpp/tests/text/bpe_tests.cpp b/cpp/tests/text/bpe_tests.cpp index 044c0ab0804..a13b61e0ba4 100644 --- a/cpp/tests/text/bpe_tests.cpp +++ b/cpp/tests/text/bpe_tests.cpp @@ -30,38 +30,35 @@ TEST_F(TextBytePairEncoding, BytePairEncoding) { // partial table based on values from https://huggingface.co/gpt2/raw/main/merges.txt auto mpt = cudf::test::strings_column_wrapper({ - "e n", // 12 - "i t", // 14 - "i s", // 15 - "e s", // 18 - "en t", // 42 - "c e", // 88 - "es t", // 139 - "en ce", // 338 - "T h", // 561 - "Th is", // 956 - "t est", // 9032 - "s ent", // 33830 + "e n", // 14 + "i t", // 16 + "i s", // 17 + "e s", // 20 + "en t", // 44 + "c e", // 90 + "es t", // 141 + "en ce", // 340 + "t h", // 146 + "h i", // 5049 + "th is", // 5407 + "t est", // 9034 + "s i", // 13142 + "s ent" // 33832 }); - nvtext::bpe_merge_pairs merge_pairs{cudf::strings_column_view(mpt)}; + auto merge_pairs = nvtext::load_merge_pairs(cudf::strings_column_view(mpt)); auto validity = cudf::test::iterators::null_at(4); - cudf::test::strings_column_wrapper input({" This\tis it\n", - "This is test-sentence-1", - "This is test sentence-2", - "This-is test sentence 3", - "", - ""}, - validity); + cudf::test::strings_column_wrapper input( + {"thisisit", "thisis test-sentence-1", "thisistestsentence-2", "this-istestsentence 3", "", ""}, + validity); auto sv = cudf::strings_column_view(input); - auto results = nvtext::byte_pair_encoding(sv, merge_pairs); - - auto expected = cudf::test::strings_column_wrapper({" This is it", - "This is test - sent ence - 1", - "This is test sent ence - 2", - "This - is test sent ence 3", + auto results = nvtext::byte_pair_encoding(sv, *merge_pairs); + auto expected = cudf::test::strings_column_wrapper({"this is it", + "this is test - sent ence - 1", + "this is test sent ence - 2", + "this - is test sent ence 3", "", ""}, validity); @@ -70,41 +67,68 @@ TEST_F(TextBytePairEncoding, BytePairEncoding) auto sliced = cudf::slice(input, {1, 4}).front(); auto sliced_expected = cudf::slice(expected, {1, 4}).front(); - results = nvtext::byte_pair_encoding(cudf::strings_column_view(sliced), merge_pairs); + sv = cudf::strings_column_view(sliced); + results = nvtext::byte_pair_encoding(sv, *merge_pairs); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), sliced_expected); } TEST_F(TextBytePairEncoding, BytePairEncodingSeparator) { auto mpt = cudf::test::strings_column_wrapper( - {"e n", "i t", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"}); - nvtext::bpe_merge_pairs merge_pairs{cudf::strings_column_view(mpt)}; + {"Ġ t", "Ġt he", "h e", "e n", "i t", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"}); + + auto merge_pairs = 
nvtext::load_merge_pairs(cudf::strings_column_view(mpt));

   cudf::test::strings_column_wrapper input(
-    {"test-sentence-1", "test sentence-2", "test sentence 3", " test sentence 4 "});
+    {"Ġthe test sentence", "test Ġthe sentence", "Ġthetest sentence", "testĠthesentence"});
   auto sv = cudf::strings_column_view(input);

-  auto results = nvtext::byte_pair_encoding(sv, merge_pairs, std::string(" Ġ"));
+  auto results = nvtext::byte_pair_encoding(sv, *merge_pairs, std::string("$"));
+
+  auto expected = cudf::test::strings_column_wrapper({"Ġthe$ $test$ $sent$ence",
+                                                      "test$ $Ġthe$ $sent$ence",
+                                                      "Ġthe$test$ $sent$ence",
+                                                      "test$Ġthe$sent$ence"});
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
+}
+
+TEST_F(TextBytePairEncoding, BPEAdjacentPairs)
+{
+  auto mpt = cudf::test::strings_column_wrapper({
+    "▁ H",    // 157
+    "m m",    // 10742
+    "? !",    // 50675
+    "▁H mm",  // 174381
+    "mm m",   // 262776
+    "?! !",   // 352313
+    "? !?",   // 352314
+    "mm mm",  // 387733
+    "▁H m",   // 471269
+    "?! ?!",  // 506981
+    "?!? !",  // 506982
+  });
+  auto merge_pairs = nvtext::load_merge_pairs(cudf::strings_column_view(mpt));
+
+  cudf::test::strings_column_wrapper input({"▁Hmmmmm", "?!?!?!"});

-  auto expected = cudf::test::strings_column_wrapper(
-    {"test - sent ence - 1", "test Ġsent ence - 2", "test Ġsent ence Ġ3", " Ġtest Ġsent ence Ġ4"});
+  auto results = nvtext::byte_pair_encoding(cudf::strings_column_view(input), *merge_pairs);
+  auto expected = cudf::test::strings_column_wrapper({"▁Hmm mmm", "?!?! ?!"});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
 }

 TEST_F(TextBytePairEncoding, BPE_Empty)
 {
-  auto mpt = cudf::test::strings_column_wrapper({"i s", "i t"});
-  nvtext::bpe_merge_pairs merge_pairs{mpt.release()};
-  auto empty = cudf::make_empty_column(cudf::type_id::STRING);
-  auto results = nvtext::byte_pair_encoding(cudf::strings_column_view(empty->view()), merge_pairs);
+  auto mpt         = cudf::test::strings_column_wrapper({"i s", "i t"});
+  auto merge_pairs = nvtext::load_merge_pairs(cudf::strings_column_view(mpt));
+  auto empty       = cudf::make_empty_column(cudf::type_id::STRING);
+  auto results = nvtext::byte_pair_encoding(cudf::strings_column_view(empty->view()), *merge_pairs);
   EXPECT_EQ(0, results->size());
 }

 TEST_F(TextBytePairEncoding, BPE_Error)
 {
   auto empty = cudf::make_empty_column(cudf::type_id::STRING);
-  nvtext::bpe_merge_pairs merge_pairs{std::move(empty)};
-  cudf::test::strings_column_wrapper input({"isit"});
-  EXPECT_THROW(nvtext::byte_pair_encoding(cudf::strings_column_view(input), merge_pairs),
-               cudf::logic_error);
+  EXPECT_THROW(nvtext::load_merge_pairs(cudf::strings_column_view(*empty)), cudf::logic_error);
+
+  auto null_pairs = cudf::test::strings_column_wrapper({"", ""}, {1, 0});
+  EXPECT_THROW(nvtext::load_merge_pairs(cudf::strings_column_view(null_pairs)), cudf::logic_error);
 }

From 70c4283dbd6700bba43440e07b293b2294ea6634 Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora"
Date: Mon, 6 Nov 2023 13:15:25 -0600
Subject: [PATCH 074/118] Register ``partd`` encode dispatch in ``dask_cudf`` (#14287)

This PR enables "disk"-based shuffling of `cudf`-backed Dask-DataFrame collections, but does **not** yet add the `shuffle="disk"` option to the `dask_cudf.DataFrame.shuffle/sort_values` APIs. We now use basic (slow) `pickle` logic to convert `cudf.DataFrame` objects to/from `bytes` here, so I'd like to consider further optimizations before making the `shuffle="disk"` option "official".
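
For context, a minimal sketch of the round trip this dispatch wires up (illustrative only, not part of the change; it assumes a `dask` version that provides `partd_encode_dispatch` and that `partd` is installed):

```python
import pickle

import cudf
import dask_cudf  # noqa: F401 -- importing dask_cudf registers the dispatch below

from dask.dataframe.dispatch import partd_encode_dispatch

df = cudf.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})

# The dispatch hands back a partd.Encode factory configured with
# pickle-based dumps/loads and a cudf.concat-based join (see the
# registration in the diff below).
encode_factory = partd_encode_dispatch(df)

# The same pickle round trip the disk store performs on each partition:
payload = pickle.dumps(df, protocol=pickle.HIGHEST_PROTOCOL)
assert pickle.loads(payload).equals(df)
```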
Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - Ray Douglass (https://github.com/raydouglass)
  - gpuCI (https://github.com/GPUtester)
  - Mike Wendt (https://github.com/mike-wendt)
  - AJ Schmidt (https://github.com/ajschmidt8)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/14287
---
 python/dask_cudf/dask_cudf/backends.py        | 25 +++++++++++++++++++
 python/dask_cudf/dask_cudf/tests/test_sort.py | 11 ++++++++
 2 files changed, 36 insertions(+)

diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index 65d9c438fba..b0da82eaeee 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -481,6 +481,31 @@ def sizeof_cudf_series_index(obj):
     return obj.memory_usage()


+# TODO: Remove try/except when cudf is pinned to dask>=2023.10.0
+try:
+    from dask.dataframe.dispatch import partd_encode_dispatch
+
+    @partd_encode_dispatch.register(cudf.DataFrame)
+    def _simple_cudf_encode(_):
+        # Basic pickle-based encoding for a partd k-v store
+        import pickle
+        from functools import partial
+
+        import partd
+
+        def join(dfs):
+            if not dfs:
+                return cudf.DataFrame()
+            else:
+                return cudf.concat(dfs)
+
+        dumps = partial(pickle.dumps, protocol=pickle.HIGHEST_PROTOCOL)
+        return partial(partd.Encode, dumps, pickle.loads, join)
+
+except ImportError:
+    pass
+
+
 def _default_backend(func, *args, **kwargs):
     # Utility to call a dask.dataframe function with
     # the default ("pandas") backend

diff --git a/python/dask_cudf/dask_cudf/tests/test_sort.py b/python/dask_cudf/dask_cudf/tests/test_sort.py
index 94609b180d6..e58255cda06 100644
--- a/python/dask_cudf/dask_cudf/tests/test_sort.py
+++ b/python/dask_cudf/dask_cudf/tests/test_sort.py
@@ -114,3 +114,14 @@ def test_sort_values_empty_string(by):
     if "a" in by:
         expect = df.sort_values(by)
         assert dd.assert_eq(got, expect, check_index=False)
+
+
+def test_disk_shuffle():
+    try:
+        from dask.dataframe.dispatch import partd_encode_dispatch  # noqa: F401
+    except ImportError:
+        pytest.skip("need a version of dask that has partd_encode_dispatch")
+    df = cudf.DataFrame({"a": [1, 2, 3] * 20, "b": [4, 5, 6, 7] * 15})
+    ddf = dd.from_pandas(df, npartitions=4)
+    got = dd.DataFrame.shuffle(ddf, "a", shuffle="disk")
+    dd.assert_eq(got, df)

From f102ba810922600008cbf4e0ba9441e93963c7fb Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Mon, 6 Nov 2023 15:56:34 -0800
Subject: [PATCH 075/118] Expose streams in public unary APIs (#14342)

Contributes to #925

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/14342
---
 cpp/include/cudf/detail/unary.hpp | 10 -----
 cpp/include/cudf/unary.hpp        | 15 ++++++-
 cpp/src/interop/to_arrow.cu       |  8 ++--
 cpp/src/unary/cast_ops.cu         |  3 +-
 cpp/src/unary/math_ops.cu         |  3 +-
 cpp/src/unary/nan_ops.cu          | 11 ++++--
 cpp/src/unary/null_ops.cu         | 11 ++++--
 cpp/tests/CMakeLists.txt          |  9 +++--
 cpp/tests/streams/unary_test.cpp  | 65 +++++++++++++++++++++++++++++++
 9 files changed, 106 insertions(+), 29 deletions(-)
 create mode 100644 cpp/tests/streams/unary_test.cpp

diff --git a/cpp/include/cudf/detail/unary.hpp b/cpp/include/cudf/detail/unary.hpp
index 3fbdf4a5a8f..12f864de572 
100644 --- a/cpp/include/cudf/detail/unary.hpp +++ b/cpp/include/cudf/detail/unary.hpp @@ -64,8 +64,6 @@ std::unique_ptr true_if(InputIterator begin, /** * @copydoc cudf::unary_operation - * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr unary_operation(cudf::column_view const& input, cudf::unary_operator op, @@ -74,8 +72,6 @@ std::unique_ptr unary_operation(cudf::column_view const& input, /** * @copydoc cudf::is_valid - * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr is_valid(cudf::column_view const& input, rmm::cuda_stream_view stream, @@ -83,8 +79,6 @@ std::unique_ptr is_valid(cudf::column_view const& input, /** * @copydoc cudf::cast - * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr cast(column_view const& input, data_type type, @@ -93,8 +87,6 @@ std::unique_ptr cast(column_view const& input, /** * @copydoc cudf::is_nan - * - * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr is_nan(cudf::column_view const& input, rmm::cuda_stream_view stream, @@ -102,8 +94,6 @@ std::unique_ptr is_nan(cudf::column_view const& input, /** * @copydoc cudf::is_not_nan - * - * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr is_not_nan(cudf::column_view const& input, rmm::cuda_stream_view stream, diff --git a/cpp/include/cudf/unary.hpp b/cpp/include/cudf/unary.hpp index 1130c41afe5..64e802d88dd 100644 --- a/cpp/include/cudf/unary.hpp +++ b/cpp/include/cudf/unary.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #pragma once #include +#include #include @@ -65,6 +66,7 @@ enum class unary_operator : int32_t { * * @param input A `column_view` as input * @param op operation to perform + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * * @returns Column of same size as `input` containing result of the operation @@ -72,6 +74,7 @@ enum class unary_operator : int32_t { std::unique_ptr unary_operation( cudf::column_view const& input, cudf::unary_operator op, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -79,6 +82,7 @@ std::unique_ptr unary_operation( * indicates the value is null and `false` indicates the value is valid. * * @param input A `column_view` as input + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * * @returns A non-nullable column of `type_id::BOOL8` elements with `true` @@ -86,6 +90,7 @@ std::unique_ptr unary_operation( */ std::unique_ptr is_null( cudf::column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -93,6 +98,7 @@ std::unique_ptr is_null( * indicates the value is valid and `false` indicates the value is null. 
* * @param input A `column_view` as input + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * * @returns A non-nullable column of `type_id::BOOL8` elements with `false` @@ -100,6 +106,7 @@ std::unique_ptr is_null( */ std::unique_ptr is_valid( cudf::column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -109,6 +116,7 @@ std::unique_ptr is_valid( * * @param input Input column * @param out_type Desired datatype of output column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * * @returns Column of same size as `input` containing result of the cast operation @@ -117,6 +125,7 @@ std::unique_ptr is_valid( std::unique_ptr cast( column_view const& input, data_type out_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -127,12 +136,14 @@ std::unique_ptr cast( * @throws cudf::logic_error if `input` is a non-floating point type * * @param input A column of floating-point elements + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * * @returns A non-nullable column of `type_id::BOOL8` elements with `true` representing `NAN` values */ std::unique_ptr is_nan( cudf::column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -143,6 +154,7 @@ std::unique_ptr is_nan( * @throws cudf::logic_error if `input` is a non-floating point type * * @param input A column of floating-point elements + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * * @returns A non-nullable column of `type_id::BOOL8` elements with `false` representing `NAN` @@ -150,6 +162,7 @@ std::unique_ptr is_nan( */ std::unique_ptr is_not_nan( cudf::column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 28230cf8e74..3a9fe50d25b 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -382,10 +382,10 @@ std::shared_ptr dispatch_to_arrow::operator()( { // Arrow dictionary requires indices to be signed integer std::unique_ptr dict_indices = - cast(cudf::dictionary_column_view(input).get_indices_annotated(), - cudf::data_type{type_id::INT32}, - stream, - rmm::mr::get_current_device_resource()); + detail::cast(cudf::dictionary_column_view(input).get_indices_annotated(), + cudf::data_type{type_id::INT32}, + stream, + rmm::mr::get_current_device_resource()); auto indices = dispatch_to_arrow{}.operator()( dict_indices->view(), dict_indices->type().id(), {}, ar_mr, stream); auto dict_keys = cudf::dictionary_column_view(input).keys(); diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu index 6fa87b1f709..8421f32056e 100644 --- a/cpp/src/unary/cast_ops.cu +++ b/cpp/src/unary/cast_ops.cu @@ -415,10 +415,11 
@@ std::unique_ptr cast(column_view const& input, std::unique_ptr cast(column_view const& input, data_type type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::cast(input, type, cudf::get_default_stream(), mr); + return detail::cast(input, type, stream, mr); } } // namespace cudf diff --git a/cpp/src/unary/math_ops.cu b/cpp/src/unary/math_ops.cu index d84e0171b49..88922362319 100644 --- a/cpp/src/unary/math_ops.cu +++ b/cpp/src/unary/math_ops.cu @@ -646,10 +646,11 @@ std::unique_ptr unary_operation(cudf::column_view const& input, std::unique_ptr unary_operation(cudf::column_view const& input, cudf::unary_operator op, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::unary_operation(input, op, cudf::get_default_stream(), mr); + return detail::unary_operation(input, op, stream, mr); } } // namespace cudf diff --git a/cpp/src/unary/nan_ops.cu b/cpp/src/unary/nan_ops.cu index 2cf83466b03..092ad3b6731 100644 --- a/cpp/src/unary/nan_ops.cu +++ b/cpp/src/unary/nan_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -91,17 +91,20 @@ std::unique_ptr is_not_nan(cudf::column_view const& input, } // namespace detail -std::unique_ptr is_nan(cudf::column_view const& input, rmm::mr::device_memory_resource* mr) +std::unique_ptr is_nan(cudf::column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_nan(input, cudf::get_default_stream(), mr); + return detail::is_nan(input, stream, mr); } std::unique_ptr is_not_nan(cudf::column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_not_nan(input, cudf::get_default_stream(), mr); + return detail::is_not_nan(input, stream, mr); } } // namespace cudf diff --git a/cpp/src/unary/null_ops.cu b/cpp/src/unary/null_ops.cu index e64c68fdae6..6bdd65dd42d 100644 --- a/cpp/src/unary/null_ops.cu +++ b/cpp/src/unary/null_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -55,17 +55,20 @@ std::unique_ptr is_valid(cudf::column_view const& input, } // namespace detail -std::unique_ptr is_null(cudf::column_view const& input, rmm::mr::device_memory_resource* mr) +std::unique_ptr is_null(cudf::column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_null(input, cudf::get_default_stream(), mr); + return detail::is_null(input, stream, mr); } std::unique_ptr is_valid(cudf::column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_valid(input, cudf::get_default_stream(), mr); + return detail::is_valid(input, stream, mr); } } // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index e966ef3fb04..47e266ced71 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -629,14 +629,17 @@ ConfigureTest( ConfigureTest(STREAM_BINARYOP_TEST streams/binaryop_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_COPYING_TEST streams/copying_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_INTEROP_TEST streams/interop_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_JSONIO_TEST streams/io/json_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_NULL_MASK_TEST streams/null_mask_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) -ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing) ConfigureTest( STREAM_STRINGS_TEST streams/strings/case_test.cpp @@ -653,12 +656,10 @@ ConfigureTest( STREAM_MODE testing ) -ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing) ConfigureTest( STREAM_TEXT_TEST streams/text/ngrams_test.cpp streams/text/tokenize_test.cpp STREAM_MODE testing ) -ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing) -ConfigureTest(STREAM_JSONIO_TEST streams/io/json_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_UNARY_TEST streams/unary_test.cpp STREAM_MODE testing) # ################################################################################################## # Install tests #################################################################################### diff --git a/cpp/tests/streams/unary_test.cpp b/cpp/tests/streams/unary_test.cpp new file mode 100644 index 00000000000..1734c0c4e9f --- /dev/null +++ b/cpp/tests/streams/unary_test.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +class UnaryTest : public cudf::test::BaseFixture {}; + +TEST_F(UnaryTest, UnaryOperation) +{ + cudf::test::fixed_width_column_wrapper const column{10, 20, 30, 40, 50}; + + cudf::unary_operation(column, cudf::unary_operator::ABS, cudf::test::get_default_stream()); +} + +TEST_F(UnaryTest, IsNull) +{ + cudf::test::fixed_width_column_wrapper const column{10, 20, 30, 40, 50}; + + cudf::is_null(column, cudf::test::get_default_stream()); +} + +TEST_F(UnaryTest, IsValid) +{ + cudf::test::fixed_width_column_wrapper const column{10, 20, 30, 40, 50}; + + cudf::is_valid(column, cudf::test::get_default_stream()); +} + +TEST_F(UnaryTest, Cast) +{ + cudf::test::fixed_width_column_wrapper const column{10, 20, 30, 40, 50}; + + cudf::cast(column, cudf::data_type{cudf::type_id::INT64}, cudf::test::get_default_stream()); +} + +TEST_F(UnaryTest, IsNan) +{ + cudf::test::fixed_width_column_wrapper const column{10, 20, 30, 40, 50}; + + cudf::is_nan(column, cudf::test::get_default_stream()); +} + +TEST_F(UnaryTest, IsNotNan) +{ + cudf::test::fixed_width_column_wrapper const column{10, 20, 30, 40, 50}; + + cudf::is_not_nan(column, cudf::test::get_default_stream()); +} From 16051a718509c218010e6912d6f8e0fca7a7aa24 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 7 Nov 2023 11:15:43 -0500 Subject: [PATCH 076/118] Expose stream parameter in public nvtext replace APIs (#14329) Add stream parameter to public APIs: - `nvtext::replace_tokens()` - `nvtext::filter_tokens` - `nvtext::normalize_spaces()` - `nvtext::normalize_characters()` Also cleaned up some of the doxygen comments and added stream gtests. Reference #13744 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/14329 --- cpp/include/nvtext/normalize.hpp | 18 +++++--- cpp/include/nvtext/replace.hpp | 28 +++++++----- cpp/src/text/normalize.cu | 10 +++-- cpp/src/text/replace.cu | 12 ++--- cpp/tests/CMakeLists.txt | 3 +- cpp/tests/streams/text/replace_test.cpp | 60 +++++++++++++++++++++++++ 6 files changed, 101 insertions(+), 30 deletions(-) create mode 100644 cpp/tests/streams/text/replace_test.cpp diff --git a/cpp/include/nvtext/normalize.hpp b/cpp/include/nvtext/normalize.hpp index 1be25b4f1f4..3cbff5c744b 100644 --- a/cpp/include/nvtext/normalize.hpp +++ b/cpp/include/nvtext/normalize.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -44,12 +44,14 @@ namespace nvtext { * A null input element at row `i` produces a corresponding null entry * for row `i` in the output column. * - * @param strings Strings column to normalize. - * @param mr Device memory resource used to allocate the returned column's device memory. 
+ * @param input Strings column to normalize + * @param mr Device memory resource used to allocate the returned column's device memory + * @param stream CUDA stream used for device memory operations and kernel launches * @return New strings columns of normalized strings. */ std::unique_ptr normalize_spaces( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -89,16 +91,18 @@ std::unique_ptr normalize_spaces( * This function requires about 16x the number of character bytes in the input * strings column as working memory. * - * @param strings The input strings to normalize. + * @param input The input strings to normalize * @param do_lower_case If true, upper-case characters are converted to * lower-case and accents are stripped from those characters. * If false, accented and upper-case characters are not transformed. - * @param mr Memory resource to allocate any returned objects. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Memory resource to allocate any returned objects * @return Normalized strings column */ std::unique_ptr normalize_characters( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, bool do_lower_case, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/nvtext/replace.hpp b/cpp/include/nvtext/replace.hpp index 0dde7f195b9..88cf7d41901 100644 --- a/cpp/include/nvtext/replace.hpp +++ b/cpp/include/nvtext/replace.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -73,19 +73,21 @@ namespace nvtext { * @throw cudf::logic_error if targets or replacements contain nulls * @throw cudf::logic_error if delimiter is invalid * - * @param strings Strings column to replace. - * @param targets Strings to compare against tokens found in `strings` + * @param input Strings column to replace + * @param targets Strings to compare against tokens found in `input` * @param replacements Replacement strings for each string in `targets` * @param delimiter Characters used to separate each string into tokens. * The default of empty string will identify tokens using whitespace. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of with replaced strings. 
+ * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings columns of with replaced strings */ std::unique_ptr replace_tokens( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::strings_column_view const& targets, cudf::strings_column_view const& replacements, cudf::string_scalar const& delimiter = cudf::string_scalar{""}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -120,19 +122,21 @@ std::unique_ptr replace_tokens( * * @throw cudf::logic_error if `delimiter` or `replacement` is invalid * - * @param strings Strings column to replace. - * @param min_token_length The minimum number of characters to retain a token in the output string. - * @param replacement Optional replacement string to be used in place of removed tokens. + * @param input Strings column to replace + * @param min_token_length The minimum number of characters to retain a token in the output string + * @param replacement Optional replacement string to be used in place of removed tokens * @param delimiter Characters used to separate each string into tokens. * The default of empty string will identify tokens using whitespace. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of with replaced strings. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings columns of with replaced strings */ std::unique_ptr filter_tokens( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::size_type min_token_length, cudf::string_scalar const& replacement = cudf::string_scalar{""}, cudf::string_scalar const& delimiter = cudf::string_scalar{""}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index 1b07b0785f5..0fc1d221b15 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -242,22 +242,24 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con // external APIs -std::unique_ptr normalize_spaces(cudf::strings_column_view const& strings, +std::unique_ptr normalize_spaces(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::normalize_spaces(strings, cudf::get_default_stream(), mr); + return detail::normalize_spaces(input, stream, mr); } /** * @copydoc nvtext::normalize_characters */ -std::unique_ptr normalize_characters(cudf::strings_column_view const& strings, +std::unique_ptr normalize_characters(cudf::strings_column_view const& input, bool do_lower_case, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::normalize_characters(strings, do_lower_case, cudf::get_default_stream(), mr); + return detail::normalize_characters(input, do_lower_case, stream, mr); } } // namespace nvtext diff --git a/cpp/src/text/replace.cu b/cpp/src/text/replace.cu index 34916e121dc..a4b28fe2dab 100644 --- a/cpp/src/text/replace.cu +++ b/cpp/src/text/replace.cu @@ -274,26 +274,26 @@ 
std::unique_ptr filter_tokens(cudf::strings_column_view const& str // external APIs -std::unique_ptr replace_tokens(cudf::strings_column_view const& strings, +std::unique_ptr replace_tokens(cudf::strings_column_view const& input, cudf::strings_column_view const& targets, cudf::strings_column_view const& replacements, cudf::string_scalar const& delimiter, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_tokens( - strings, targets, replacements, delimiter, cudf::get_default_stream(), mr); + return detail::replace_tokens(input, targets, replacements, delimiter, stream, mr); } -std::unique_ptr filter_tokens(cudf::strings_column_view const& strings, +std::unique_ptr filter_tokens(cudf::strings_column_view const& input, cudf::size_type min_token_length, cudf::string_scalar const& replacement, cudf::string_scalar const& delimiter, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::filter_tokens( - strings, min_token_length, replacement, delimiter, cudf::get_default_stream(), mr); + return detail::filter_tokens(input, min_token_length, replacement, delimiter, stream, mr); } } // namespace nvtext diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 47e266ced71..196a4f2d038 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -657,7 +657,8 @@ ConfigureTest( testing ) ConfigureTest( - STREAM_TEXT_TEST streams/text/ngrams_test.cpp streams/text/tokenize_test.cpp STREAM_MODE testing + STREAM_TEXT_TEST streams/text/ngrams_test.cpp streams/text/replace_test.cpp + streams/text/tokenize_test.cpp STREAM_MODE testing ) ConfigureTest(STREAM_UNARY_TEST streams/unary_test.cpp STREAM_MODE testing) diff --git a/cpp/tests/streams/text/replace_test.cpp b/cpp/tests/streams/text/replace_test.cpp new file mode 100644 index 00000000000..7617f886f9d --- /dev/null +++ b/cpp/tests/streams/text/replace_test.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+class TextReplaceTest : public cudf::test::BaseFixture {};
+
+TEST_F(TextReplaceTest, Replace)
+{
+  auto const input   = cudf::test::strings_column_wrapper({"the fox jumped over the dog"});
+  auto const targets = cudf::test::strings_column_wrapper({"the", "dog"});
+  auto const repls   = cudf::test::strings_column_wrapper({"_", ""});
+  auto const delimiter = cudf::string_scalar{" ", true, cudf::test::get_default_stream()};
+  nvtext::replace_tokens(cudf::strings_column_view(input),
+                         cudf::strings_column_view(targets),
+                         cudf::strings_column_view(repls),
+                         delimiter,
+                         cudf::test::get_default_stream());
+}
+
+TEST_F(TextReplaceTest, Filter)
+{
+  auto const input = cudf::test::strings_column_wrapper({"one two three", "four five six"});
+  auto const delimiter = cudf::string_scalar{" ", true, cudf::test::get_default_stream()};
+  auto const repl = cudf::string_scalar{"_", true, cudf::test::get_default_stream()};
+  nvtext::filter_tokens(
+    cudf::strings_column_view(input), 1, delimiter, repl, cudf::test::get_default_stream());
+}
+
+TEST_F(TextReplaceTest, NormalizeSpaces)
+{
+  auto input =
+    cudf::test::strings_column_wrapper({"the\tquick brown\nfox", "jumped\rover the lazy\r\t\n"});
+  nvtext::normalize_spaces(cudf::strings_column_view(input), cudf::test::get_default_stream());
+}
+
+TEST_F(TextReplaceTest, NormalizeCharacters)
+{
+  auto input = cudf::test::strings_column_wrapper({"abc£def", "éè â îô\taeio", "\tĂĆĖÑ Ü"});
+  nvtext::normalize_characters(
+    cudf::strings_column_view(input), false, cudf::test::get_default_stream());
+}

From 2463b3ad53a58a14da39dc87dfca8f63ebd0b641 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR
Date: Tue, 7 Nov 2023 12:55:03 -0600
Subject: [PATCH 077/118] Fix a pytest typo in `test_kurt_skew_error` (#14368)

This PR fixes a pytest typo that the pytest framework somehow silently
ignores, allowing the test to continue executing.

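For reference, `pytest.warns` asserts that the enclosed block *emits* a warning,
while `pytest.raises` asserts that it *raises* an exception; the fixed test checks
for a `FutureWarning`, so `pytest.warns` is the appropriate context manager. A
minimal standalone sketch of the difference (the functions below are hypothetical
and not part of the cudf test suite):

```python
import warnings

import pytest


def deprecated_op():
    # Emits a warning but completes normally.
    warnings.warn("use new_op instead", FutureWarning)


def failing_op():
    # Raises an exception instead of returning.
    raise ValueError("bad input")


def test_warns():
    # Passes because a FutureWarning is emitted inside the block.
    with pytest.warns(FutureWarning):
        deprecated_op()


def test_raises():
    # Passes because a ValueError is raised inside the block.
    with pytest.raises(ValueError):
        failing_op()
```
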
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14368 --- python/cudf/cudf/tests/test_stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 3ac605a1a4d..5f010668383 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -272,7 +272,7 @@ def test_kurt_skew_error(op): gs = cudf.Series(["ab", "cd"]) ps = gs.to_pandas() - with pytest.raises(FutureWarning): + with pytest.warns(FutureWarning): assert_exceptions_equal( getattr(gs, op), getattr(ps, op), From 723f0e4f6d2fbddc03907bdf34abfa5e54ac61de Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 7 Nov 2023 19:56:45 -0600 Subject: [PATCH 078/118] Upgrade to arrow 14 (#14371) This PR upgrades `arrow` libraries to `14` Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ray Douglass (https://github.com/raydouglass) - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14371 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 4 ++-- conda/environments/all_cuda-120_arch-x86_64.yaml | 4 ++-- conda/recipes/cudf/meta.yaml | 4 ++-- conda/recipes/libcudf/conda_build_config.yaml | 2 +- cpp/cmake/thirdparty/get_arrow.cmake | 2 +- dependencies.yaml | 8 ++++---- python/cudf/pyproject.toml | 4 ++-- python/cudf_kafka/pyproject.toml | 2 +- 8 files changed, 15 insertions(+), 15 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 3d3d4f15d05..ef3bf4baaf8 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -40,7 +40,7 @@ dependencies: - hypothesis - identify>=2.5.20 - ipython -- libarrow==13.0.0.* +- libarrow-all==14.0.0.* - libcufile-dev=1.4.0.31 - libcufile=1.4.0.31 - libcurand-dev=10.3.0.86 @@ -69,7 +69,7 @@ dependencies: - pre-commit - protobuf>=4.21,<5 - ptxcompiler -- pyarrow==13.0.0.* +- pyarrow==14.0.0.* - pydata-sphinx-theme!=0.14.2 - pytest - pytest-benchmark diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 4f39424bbc6..672e01138cd 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -42,7 +42,7 @@ dependencies: - hypothesis - identify>=2.5.20 - ipython -- libarrow==13.0.0.* +- libarrow-all==14.0.0.* - libcufile-dev - libcurand-dev - libkvikio==23.12.* @@ -67,7 +67,7 @@ dependencies: - pip - pre-commit - protobuf>=4.21,<5 -- pyarrow==13.0.0.* +- pyarrow==14.0.0.* - pydata-sphinx-theme!=0.14.2 - pytest - pytest-benchmark diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 1ed07a85b88..10ff90bc018 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -55,13 +55,13 @@ requirements: - cuda-version ={{ cuda_version }} - sysroot_{{ target_platform }} {{ sysroot_version }} host: - - protobuf ==4.23.* + - protobuf ==4.24.* - python - cython >=3.0.0 - scikit-build >=0.13.1 - setuptools - dlpack >=0.5,<0.6.0a0 - - pyarrow ==13.0.0.* + - pyarrow ==14.0.0.* - libcudf ={{ version }} - rmm ={{ minor_version }} {% if cuda_major == "11" %} diff --git a/conda/recipes/libcudf/conda_build_config.yaml 
b/conda/recipes/libcudf/conda_build_config.yaml index fe692614b8e..38efbd4fe4f 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -23,7 +23,7 @@ gtest_version: - ">=1.13.0" libarrow_version: - - "==13.0.0" + - "==14.0.0" dlpack_version: - ">=0.5,<0.6.0a0" diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 469b968eefd..d85d5d5f7d3 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -427,7 +427,7 @@ if(NOT DEFINED CUDF_VERSION_Arrow) set(CUDF_VERSION_Arrow # This version must be kept in sync with the libarrow version pinned for builds in # dependencies.yaml. - 13.0.0 + 14.0.0 CACHE STRING "The version of Arrow to find (or build)" ) endif() diff --git a/dependencies.yaml b/dependencies.yaml index bb61e244b97..ef249fc2c81 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -224,7 +224,7 @@ dependencies: - &gmock gmock>=1.13.0 # Hard pin the patch version used during the build. This must be kept # in sync with the version pinned in get_arrow.cmake. - - libarrow==13.0.0.* + - libarrow-all==14.0.0.* - librdkafka>=1.9.0,<1.10.0a0 # Align nvcomp version with rapids-cmake - nvcomp==2.6.1 @@ -246,7 +246,7 @@ dependencies: packages: # Hard pin the patch version used during the build. This must be kept # in sync with the version pinned in get_arrow.cmake. - - pyarrow==13.0.0.* + - pyarrow==14.0.0.* build_python: common: - output_types: [conda, requirements, pyproject] @@ -264,13 +264,13 @@ dependencies: - output_types: conda packages: # Allow runtime version to float up to minor version - - libarrow==13.* + - libarrow-all==14.* pyarrow_run: common: - output_types: [conda, requirements, pyproject] packages: # Allow runtime version to float up to minor version - - pyarrow==13.* + - pyarrow==14.* cudatoolkit: specific: - output_types: conda diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index e934846ec35..0d4fad3818d 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -8,7 +8,7 @@ requires = [ "ninja", "numpy>=1.21,<1.25", "protoc-wheel", - "pyarrow==13.0.0.*", + "pyarrow==14.0.0.*", "rmm==23.12.*", "scikit-build>=0.13.1", "setuptools", @@ -38,7 +38,7 @@ dependencies = [ "pandas>=1.3,<1.6.0dev0", "protobuf>=4.21,<5", "ptxcompiler", - "pyarrow==13.*", + "pyarrow==14.*", "rmm==23.12.*", "typing_extensions>=4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 5d2588fa6f7..293586cbba0 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -5,7 +5,7 @@ requires = [ "cython>=3.0.0", "numpy>=1.21,<1.25", - "pyarrow==13.0.0.*", + "pyarrow==14.0.0.*", "setuptools", "wheel", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. From 0438ac727970f897fd9ce1beae550ef487d92021 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 8 Nov 2023 09:10:39 -0600 Subject: [PATCH 079/118] Use 23.12 workflows. 
--- .github/workflows/pr.yaml | 4 ++-- .github/workflows/test.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index b2384f0fb8d..40cf0dcd2c1 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -133,7 +133,7 @@ jobs: unit-tests-cudf-pandas: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: pull-request @@ -142,7 +142,7 @@ jobs: # run the Pandas unit tests using PR branch needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: matrix_filter: map(select(.ARCH == "amd64")) | max_by(.CUDA_VER) | [.] build_type: pull-request diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index e09a6d03eea..0b6b55069f6 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -99,7 +99,7 @@ jobs: unit-tests-cudf-pandas: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -109,7 +109,7 @@ jobs: pandas-tests: # run the Pandas unit tests secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: matrix_filter: map(select(.ARCH == "amd64")) | max_by(.CUDA_VER) | [.] build_type: nightly From ec4c47da40897a5e35166f265a659f24c408abe5 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 8 Nov 2023 09:40:42 -0600 Subject: [PATCH 080/118] Add manylinux tag. --- ci/cudf_pandas_scripts/run_tests.sh | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh index cc578b50fd0..7eab3221e5e 100755 --- a/ci/cudf_pandas_scripts/run_tests.sh +++ b/ci/cudf_pandas_scripts/run_tests.sh @@ -31,8 +31,21 @@ done if [ "$no_cudf" = true ]; then echo "Skipping cudf install" else + # Set the manylinux version used for downloading the wheels so that we test the + # newer ABI wheels on the newer images that support their installation. + # Need to disable pipefail for the head not to fail, see + # https://stackoverflow.com/questions/19120263/why-exit-code-141-with-grep-q + set +o pipefail + glibc_minor_version=$(ldd --version | head -1 | grep -o "[0-9]\.[0-9]\+" | tail -1 | cut -d '.' 
-f2) + set -o pipefail + manylinux_version="2_17" + if [[ ${glibc_minor_version} -ge 28 ]]; then + manylinux_version="2_28" + fi + manylinux="manylinux_${manylinux_version}" + RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" - RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep + RAPIDS_PY_WHEEL_NAME="cudf_${manylinux}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep python -m pip install $(ls ./local-cudf-dep/cudf*.whl)[test,cudf_pandas_tests] fi From 14449b697f5ba294733af713c71f7ae4de4b5ca6 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 8 Nov 2023 09:52:40 -0600 Subject: [PATCH 081/118] Update wheel names and references to outdated branch. --- ci/build_wheel_cudf.sh | 13 ++----------- ci/cudf_pandas_scripts/pandas-tests/run.sh | 19 ++++++++++++------- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh index 9b3c4a4a091..456a3a289d1 100755 --- a/ci/build_wheel_cudf.sh +++ b/ci/build_wheel_cudf.sh @@ -7,19 +7,10 @@ package_dir="python/cudf" export SKBUILD_CONFIGURE_OPTIONS="-DCUDF_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF" -# Force a build using the latest version of the code before this PR -CUDF_BUILD_BRANCH=${1:-""} -WHEEL_NAME="cudf" -if [[ "${CUDF_BUILD_BRANCH}" == "main" ]]; then - MAIN_COMMIT=$(git merge-base HEAD origin/branch-23.10-xdf) - git checkout $MAIN_COMMIT - WHEEL_NAME="${WHEEL_NAME}_${CUDF_BUILD_BRANCH}" -fi - -./ci/build_wheel.sh ${WHEEL_NAME} ${package_dir} +./ci/build_wheel.sh cudf ${package_dir} python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/* RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -RAPIDS_PY_WHEEL_NAME="${WHEEL_NAME}_${AUDITWHEEL_POLICY}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist +RAPIDS_PY_WHEEL_NAME="cudf_${AUDITWHEEL_POLICY}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist diff --git a/ci/cudf_pandas_scripts/pandas-tests/run.sh b/ci/cudf_pandas_scripts/pandas-tests/run.sh index 920625b452f..d36b609799b 100755 --- a/ci/cudf_pandas_scripts/pandas-tests/run.sh +++ b/ci/cudf_pandas_scripts/pandas-tests/run.sh @@ -8,16 +8,21 @@ PANDAS_TESTS_BRANCH=${1} rapids-logger "Running Pandas tests using $PANDAS_TESTS_BRANCH branch" rapids-logger "PR number: $RAPIDS_REF_NAME" - -COMMIT=$(git rev-parse HEAD) -WHEEL_NAME="cudf" -if [[ "${PANDAS_TESTS_BRANCH}" == "main" ]]; then - COMMIT=$(git merge-base HEAD origin/branch-23.10-xdf) - WHEEL_NAME="${WHEEL_NAME}_${PANDAS_TESTS_BRANCH}" +# Set the manylinux version used for downloading the wheels so that we test the +# newer ABI wheels on the newer images that support their installation. +# Need to disable pipefail for the head not to fail, see +# https://stackoverflow.com/questions/19120263/why-exit-code-141-with-grep-q +set +o pipefail +glibc_minor_version=$(ldd --version | head -1 | grep -o "[0-9]\.[0-9]\+" | tail -1 | cut -d '.' 
-f2) +set -o pipefail +manylinux_version="2_17" +if [[ ${glibc_minor_version} -ge 28 ]]; then + manylinux_version="2_28" fi +manylinux="manylinux_${manylinux_version}" RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -RAPIDS_PY_WHEEL_NAME="${WHEEL_NAME}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep +RAPIDS_PY_WHEEL_NAME="cudf_${manylinux}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep python -m pip install $(ls ./local-cudf-dep/cudf*.whl)[test,pandas_tests] git checkout $COMMIT From d3dcc75c8f82d3ba2bcf1efed493ebbf02b2e6a1 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Wed, 8 Nov 2023 12:15:57 -0500 Subject: [PATCH 082/118] Update README (#14374) Authors: - Ashwin Srinath (https://github.com/shwina) Approvers: - Bradley Dice (https://github.com/bdice) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/14374 --- README.md | 73 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 39 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 5f2ce014dba..677cfc89d52 100644 --- a/README.md +++ b/README.md @@ -1,57 +1,62 @@ #
 cuDF - GPU DataFrames
-**NOTE:** For the latest stable [README.md](https://github.com/rapidsai/cudf/blob/main/README.md) ensure you are on the `main` branch. +## 📢 cuDF can now be used as a no-code-change accelerator for pandas! To learn more, see [here](https://rapids.ai/cudf-pandas/)! -## Resources - -- [cuDF Reference Documentation](https://docs.rapids.ai/api/cudf/stable/): Python API reference, tutorials, and topic guides. -- [libcudf Reference Documentation](https://docs.rapids.ai/api/libcudf/stable/): C/C++ CUDA library API reference. -- [Getting Started](https://rapids.ai/start.html): Instructions for installing cuDF. -- [RAPIDS Community](https://rapids.ai/community.html): Get help, contribute, and collaborate. -- [GitHub repository](https://github.com/rapidsai/cudf): Download the cuDF source code. -- [Issue tracker](https://github.com/rapidsai/cudf/issues): Report issues or request features. - -## Overview - -Built based on the [Apache Arrow](http://arrow.apache.org/) columnar memory format, cuDF is a GPU DataFrame library for loading, joining, aggregating, filtering, and otherwise manipulating data. +cuDF is a GPU DataFrame library for loading joining, aggregating, +filtering, and otherwise manipulating data. cuDF leverages +[libcudf](https://docs.rapids.ai/api/libcudf/stable/), a +blazing-fast C++/CUDA dataframe library and the [Apache +Arrow](https://arrow.apache.org/) columnar format to provide a +GPU-accelerated pandas API. -cuDF provides a pandas-like API that will be familiar to data engineers & data scientists, so they can use it to easily accelerate their workflows without going into the details of CUDA programming. +You can import `cudf` directly and use it like `pandas`: -For example, the following snippet downloads a CSV, then uses the GPU to parse it into rows and columns and run calculations: ```python -import cudf, requests +import cudf +import requests from io import StringIO url = "https://github.com/plotly/datasets/raw/master/tips.csv" -content = requests.get(url).content.decode('utf-8') +content = requests.get(url).content.decode("utf-8") tips_df = cudf.read_csv(StringIO(content)) -tips_df['tip_percentage'] = tips_df['tip'] / tips_df['total_bill'] * 100 +tips_df["tip_percentage"] = tips_df["tip"] / tips_df["total_bill"] * 100 # display average tip by dining party size -print(tips_df.groupby('size').tip_percentage.mean()) +print(tips_df.groupby("size").tip_percentage.mean()) ``` -Output: -``` -size -1 21.729201548727808 -2 16.571919173482897 -3 15.215685473711837 -4 14.594900639351332 -5 14.149548965142023 -6 15.622920072028379 -Name: tip_percentage, dtype: float64 -``` +Or, you can use cuDF as a no-code-change accelerator for pandas, using +[`cudf.pandas`](https://docs.rapids.ai/api/cudf/stable/cudf_pandas). +`cudf.pandas` supports 100% of the pandas API, utilizing cuDF for +supported operations and falling back to pandas when needed: -For additional examples, browse our complete [API documentation](https://docs.rapids.ai/api/cudf/stable/), or check out our more detailed [notebooks](https://github.com/rapidsai/notebooks-contrib). +```python +%load_ext cudf.pandas # pandas operations now use the GPU! -## Quick Start +import pandas as pd +import requests +from io import StringIO -Please see the [Demo Docker Repository](https://hub.docker.com/r/rapidsai/rapidsai/), choosing a tag based on the NVIDIA CUDA version you're running. This provides a ready to run Docker container with example notebooks and data, showcasing how you can utilize cuDF. 
+url = "https://github.com/plotly/datasets/raw/master/tips.csv" +content = requests.get(url).content.decode("utf-8") -## Installation +tips_df = pd.read_csv(StringIO(content)) +tips_df["tip_percentage"] = tips_df["tip"] / tips_df["total_bill"] * 100 +# display average tip by dining party size +print(tips_df.groupby("size").tip_percentage.mean()) +``` + +## Resources + +- [Try cudf.pandas now](https://nvda.ws/rapids-cudf): Explore `cudf.pandas` on a free GPU enabled instance on Google Colab! +- [Install](https://rapids.ai/start.html): Instructions for installing cuDF and other [RAPIDS](https://rapids.ai) libraries. +- [cudf (Python) documentation](https://docs.rapids.ai/api/cudf/stable/) +- [libcudf (C++/CUDA) documentation](https://docs.rapids.ai/api/libcudf/stable/) +- [RAPIDS Community](https://rapids.ai/community.html): Get help, contribute, and collaborate. + +## Installation ### CUDA/GPU requirements From 3b9cb4c29980287d1f13aa6b3056fe1accea71af Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 8 Nov 2023 11:26:43 -0600 Subject: [PATCH 083/118] Update test_no_cuinit.py --- python/cudf/cudf/tests/test_no_cuinit.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cudf/cudf/tests/test_no_cuinit.py b/python/cudf/cudf/tests/test_no_cuinit.py index b142b0dab33..45d812fe9a2 100644 --- a/python/cudf/cudf/tests/test_no_cuinit.py +++ b/python/cudf/cudf/tests/test_no_cuinit.py @@ -66,6 +66,7 @@ def test_cudf_import_no_cuinit(cuda_gdb): env=env, capture_output=True, text=True, + cwd="/", ) cuInit_called = output.stdout.find("in cuInit ()") From 8cdedd8b7d7ad60b241c14b2334f127cf06a1e12 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 8 Nov 2023 13:33:45 -0600 Subject: [PATCH 084/118] Add cwd="/" to test_s3.py. --- python/cudf/cudf/tests/test_s3.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index d16cbd2377a..b92f84b677c 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -544,6 +544,7 @@ def test_no_s3fs_on_cudf_import(): sys.executable, "-c", "import cudf; import sys; print('pyarrow._s3fs' in sys.modules)", - ] + ], + cwd="/", ) assert output.strip() == b"False" From c4e6c092946c6ee93924c0e548e28b5a56a5e482 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 8 Nov 2023 14:34:53 -0800 Subject: [PATCH 085/118] Add python tests for Parquet DELTA_BINARY_PACKED encoder (#14316) During the review of #14100 there was a suggestion to add a test of writing using cudf and then reading the resulting file back with pyarrow. This PR adds the necessary python bindings to perform this test. NOTE: there is currently an issue with encoding 32-bit values where the deltas exceed 32-bits. parquet-mr and arrow truncate the deltas for the INT32 physical type and allow values to overflow, whereas cudf currently uses 64-bit deltas, which avoids the overflow, but can result in requiring 33-bits when encoding. The current cudf behavior is allowed by the specification (and in fact is readable by parquet-mr), but using the extra bit is not in the Parquet spirit of least output file size. This will be addressed in follow-on work. 
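For reference, a condensed sketch of the roundtrip the new tests exercise: write
with cudf using v2 page headers and dictionary encoding disabled (so integer
columns take the DELTA_BINARY_PACKED path), then read the file back through
pandas/pyarrow to confirm interoperability. The file name and data below are
illustrative, not the exact test inputs:

```python
import cudf
import pandas as pd

# Small consecutive int64 values: the deltas are tiny, so the 33-bit
# int32 caveat described above does not come into play here.
gdf = cudf.DataFrame({"x": list(range(1000))})

gdf.to_parquet(
    "delta.parquet",
    compression=None,
    header_version="2.0",   # v2 page headers enable the newer encodings
    use_dictionary=False,   # steer the encoder away from dictionary pages
)

# Read back with pyarrow (via pandas) and compare.
pdf = pd.read_parquet("delta.parquet")
assert pdf["x"].equals(gdf["x"].to_pandas())
```
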
Authors: - Ed Seidl (https://github.com/etseidl) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - MithunR (https://github.com/mythrocks) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/14316 --- cpp/include/cudf/io/types.hpp | 6 +-- cpp/src/io/parquet/delta_enc.cuh | 2 + cpp/src/io/parquet/page_enc.cu | 14 ++++-- python/cudf/cudf/_lib/cpp/io/parquet.pxd | 16 +++++++ python/cudf/cudf/_lib/cpp/io/types.pxd | 5 +++ python/cudf/cudf/_lib/parquet.pyx | 16 +++++++ python/cudf/cudf/core/dataframe.py | 4 ++ python/cudf/cudf/io/parquet.py | 13 ++++-- python/cudf/cudf/tests/test_parquet.py | 54 ++++++++++++++++-------- python/cudf/cudf/utils/ioutils.py | 8 ++++ 10 files changed, 110 insertions(+), 28 deletions(-) diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index abf400da102..50119e60882 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -195,9 +195,9 @@ class writer_compression_statistics { * @brief Control use of dictionary encoding for parquet writer */ enum dictionary_policy { - NEVER, ///< Never use dictionary encoding - ADAPTIVE, ///< Use dictionary when it will not impact compression - ALWAYS ///< Use dictionary reqardless of impact on compression + NEVER = 0, ///< Never use dictionary encoding + ADAPTIVE = 1, ///< Use dictionary when it will not impact compression + ALWAYS = 2 ///< Use dictionary regardless of impact on compression }; /** diff --git a/cpp/src/io/parquet/delta_enc.cuh b/cpp/src/io/parquet/delta_enc.cuh index 28f8cdfe2c1..b0a7493fcab 100644 --- a/cpp/src/io/parquet/delta_enc.cuh +++ b/cpp/src/io/parquet/delta_enc.cuh @@ -46,6 +46,8 @@ inline __device__ void put_zz128(uint8_t*& p, zigzag128_t v) // too much shared memory. // The parquet spec requires block_size to be a multiple of 128, and values_per_mini_block // to be a multiple of 32. +// TODO: if these are ever made configurable, be sure to fix the page size calculation in +// delta_data_len() (page_enc.cu). constexpr int block_size = 128; constexpr int num_mini_blocks = 4; constexpr int values_per_mini_block = block_size / num_mini_blocks; diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 9acafd50585..2b7980c93e9 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -393,13 +393,20 @@ __device__ size_t delta_data_len(Type physical_type, cudf::type_id type_id, uint auto const vals_per_block = delta::block_size; size_t const num_blocks = util::div_rounding_up_unsafe(num_values, vals_per_block); - // need max dtype_len + 1 bytes for min_delta + // need max dtype_len + 1 bytes for min_delta (because we only encode 7 bits per byte) // one byte per mini block for the bitwidth - // and block_size * dtype_len bytes for the actual encoded data - auto const block_size = dtype_len + 1 + delta::num_mini_blocks + vals_per_block * dtype_len; + auto const mini_block_header_size = dtype_len + 1 + delta::num_mini_blocks; + // each encoded value can be at most sizeof(type) * 8 + 1 bits + auto const max_bits = dtype_len * 8 + 1; + // each data block will then be max_bits * values per block. vals_per_block is guaranteed to be + // divisible by 128 (via static assert on delta::block_size), but do safe division anyway. 
+ auto const bytes_per_block = cudf::util::div_rounding_up_unsafe(max_bits * vals_per_block, 8); + auto const block_size = mini_block_header_size + bytes_per_block; // delta header is 2 bytes for the block_size, 1 byte for number of mini-blocks, // max 5 bytes for number of values, and max dtype_len + 1 for first value. + // TODO: if we ever allow configurable block sizes then this calculation will need to be + // modified. auto const header_size = 2 + 1 + 5 + dtype_len + 1; return header_size + num_blocks * block_size; @@ -1279,6 +1286,7 @@ __device__ void finish_page_encode(state_buf* s, uint8_t const* const base = s->page.page_data + s->page.max_hdr_size; auto const actual_data_size = static_cast(end_ptr - base); if (actual_data_size > s->page.max_data_size) { + // FIXME(ets): this needs to do error propagation back to the host CUDF_UNREACHABLE("detected possible page data corruption"); } s->page.max_data_size = actual_data_size; diff --git a/python/cudf/cudf/_lib/cpp/io/parquet.pxd b/python/cudf/cudf/_lib/cpp/io/parquet.pxd index cace29b5d45..a6a7ba034aa 100644 --- a/python/cudf/cudf/_lib/cpp/io/parquet.pxd +++ b/python/cudf/cudf/_lib/cpp/io/parquet.pxd @@ -100,6 +100,8 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: void set_row_group_size_rows(size_type val) except + void set_max_page_size_bytes(size_t val) except + void set_max_page_size_rows(size_type val) except + + void enable_write_v2_headers(bool val) except + + void set_dictionary_policy(cudf_io_types.dictionary_policy policy)except + @staticmethod parquet_writer_options_builder builder( @@ -150,6 +152,12 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: parquet_writer_options_builder& max_page_size_rows( size_type val ) except + + parquet_writer_options_builder& write_v2_headers( + bool val + ) except + + parquet_writer_options_builder& dictionary_policy( + cudf_io_types.dictionary_policy val + ) except + parquet_writer_options build() except + @@ -191,6 +199,8 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: void set_row_group_size_rows(size_type val) except + void set_max_page_size_bytes(size_t val) except + void set_max_page_size_rows(size_type val) except + + void enable_write_v2_headers(bool val) except + + void set_dictionary_policy(cudf_io_types.dictionary_policy policy)except + @staticmethod chunked_parquet_writer_options_builder builder( @@ -232,6 +242,12 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: chunked_parquet_writer_options_builder& max_page_size_rows( size_type val ) except + + parquet_writer_options_builder& write_v2_headers( + bool val + ) except + + parquet_writer_options_builder& dictionary_policy( + cudf_io_types.dictionary_policy val + ) except + chunked_parquet_writer_options build() except + diff --git a/python/cudf/cudf/_lib/cpp/io/types.pxd b/python/cudf/cudf/_lib/cpp/io/types.pxd index 01eaca82692..d8cc329b0a0 100644 --- a/python/cudf/cudf/_lib/cpp/io/types.pxd +++ b/python/cudf/cudf/_lib/cpp/io/types.pxd @@ -52,6 +52,11 @@ cdef extern from "cudf/io/types.hpp" \ STATISTICS_PAGE = 2, STATISTICS_COLUMN = 3, + ctypedef enum dictionary_policy: + NEVER = 0, + ADAPTIVE = 1, + ALWAYS = 2, + cdef cppclass column_name_info: string name vector[column_name_info] children diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index f75a6c2b20e..d8d363686cc 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -321,6 +321,8 @@ def write_parquet( object 
max_page_size_rows=None, object partitions_info=None, object force_nullable_schema=False, + header_version="1.0", + use_dictionary=True, ): """ Cython function to call into libcudf API, see `write_parquet`. @@ -383,6 +385,18 @@ def write_parquet( tmp_user_data[str.encode("pandas")] = str.encode(pandas_metadata) user_data.push_back(tmp_user_data) + if header_version not in ("1.0", "2.0"): + raise ValueError( + f"Invalid parquet header version: {header_version}. " + "Valid values are '1.0' and '2.0'" + ) + + dict_policy = ( + cudf_io_types.dictionary_policy.ALWAYS + if use_dictionary + else cudf_io_types.dictionary_policy.NEVER + ) + cdef cudf_io_types.compression_type comp_type = _get_comp_type(compression) cdef cudf_io_types.statistics_freq stat_freq = _get_stat_freq(statistics) @@ -399,6 +413,8 @@ def write_parquet( .compression(comp_type) .stats_level(stat_freq) .int96_timestamps(_int96_timestamps) + .write_v2_headers(header_version == "2.0") + .dictionary_policy(dict_policy) .utc_timestamps(False) .build() ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 6f7047b9d41..16eead6ea81 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6492,6 +6492,8 @@ def to_parquet( max_page_size_rows=None, storage_options=None, return_metadata=False, + use_dictionary=True, + header_version="1.0", *args, **kwargs, ): @@ -6516,6 +6518,8 @@ def to_parquet( max_page_size_rows=max_page_size_rows, storage_options=storage_options, return_metadata=return_metadata, + use_dictionary=use_dictionary, + header_version=header_version, *args, **kwargs, ) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index b96440cce2f..bcc24a85cf9 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -66,6 +66,8 @@ def _write_parquet( partitions_info=None, storage_options=None, force_nullable_schema=False, + header_version="1.0", + use_dictionary=True, ): if is_list_like(paths) and len(paths) > 1: if partitions_info is None: @@ -96,6 +98,8 @@ def _write_parquet( "max_page_size_rows": max_page_size_rows, "partitions_info": partitions_info, "force_nullable_schema": force_nullable_schema, + "header_version": header_version, + "use_dictionary": use_dictionary, } if all(ioutils.is_fsspec_open_file(buf) for buf in paths_or_bufs): with ExitStack() as stack: @@ -204,7 +208,6 @@ def write_to_dataset( fs.mkdirs(root_path, exist_ok=True) if partition_cols is not None and len(partition_cols) > 0: - ( full_paths, metadata_file_paths, @@ -712,7 +715,6 @@ def _parquet_to_frame( dataset_kwargs=None, **kwargs, ): - # If this is not a partitioned read, only need # one call to `_read_parquet` if not partition_keys: @@ -756,7 +758,7 @@ def _parquet_to_frame( ) ) # Add partition columns to the last DataFrame - for (name, value) in part_key: + for name, value in part_key: _len = len(dfs[-1]) if partition_categories and name in partition_categories: # Build the categorical column from `codes` @@ -869,6 +871,8 @@ def to_parquet( storage_options=None, return_metadata=False, force_nullable_schema=False, + header_version="1.0", + use_dictionary=True, *args, **kwargs, ): @@ -943,6 +947,8 @@ def to_parquet( partitions_info=partition_info, storage_options=storage_options, force_nullable_schema=force_nullable_schema, + header_version=header_version, + use_dictionary=use_dictionary, ) else: @@ -1045,7 +1051,6 @@ def _get_groups_and_offsets( preserve_index=False, **kwargs, ): - if not (set(df._data) - set(partition_cols)): 
warnings.warn("No data left to save outside partition columns") diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 338081fd4f7..0c59fd0e5aa 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1280,32 +1280,29 @@ def test_parquet_reader_v2(tmpdir, simple_pdf): simple_pdf.to_parquet(pdf_fname, data_page_version="2.0") assert_eq(cudf.read_parquet(pdf_fname), simple_pdf) + cudf.from_pandas(simple_pdf).to_parquet(pdf_fname, header_version="2.0") + assert_eq(cudf.read_parquet(pdf_fname), simple_pdf) + @pytest.mark.parametrize("nrows", [1, 100000]) @pytest.mark.parametrize("add_nulls", [True, False]) -def test_delta_binary(nrows, add_nulls, tmpdir): +@pytest.mark.parametrize( + "dtype", + [ + "int8", + "int16", + "int32", + "int64", + ], +) +def test_delta_binary(nrows, add_nulls, dtype, tmpdir): null_frequency = 0.25 if add_nulls else 0 # Create a pandas dataframe with random data of mixed types arrow_table = dg.rand_dataframe( dtypes_meta=[ { - "dtype": "int8", - "null_frequency": null_frequency, - "cardinality": nrows, - }, - { - "dtype": "int16", - "null_frequency": null_frequency, - "cardinality": nrows, - }, - { - "dtype": "int32", - "null_frequency": null_frequency, - "cardinality": nrows, - }, - { - "dtype": "int64", + "dtype": dtype, "null_frequency": null_frequency, "cardinality": nrows, }, @@ -1330,6 +1327,28 @@ def test_delta_binary(nrows, add_nulls, tmpdir): pcdf = cudf.from_pandas(test_pdf) assert_eq(cdf, pcdf) + # Write back out with cudf and make sure pyarrow can read it + cudf_fname = tmpdir.join("cudfv2.parquet") + pcdf.to_parquet( + cudf_fname, + compression=None, + header_version="2.0", + use_dictionary=False, + ) + + # FIXME(ets): should probably not use more bits than the data type + try: + cdf2 = cudf.from_pandas(pd.read_parquet(cudf_fname)) + except OSError as e: + if dtype == "int32" and nrows == 100000: + pytest.mark.xfail( + reason="arrow does not support 33-bit delta encoding" + ) + else: + raise e + else: + assert_eq(cdf2, cdf) + @pytest.mark.parametrize( "data", @@ -1464,7 +1483,6 @@ def test_parquet_writer_int96_timestamps(tmpdir, pdf, gdf): def test_multifile_parquet_folder(tmpdir): - test_pdf1 = make_pdf(nrows=10, nvalids=10 // 2) test_pdf2 = make_pdf(nrows=20) expect = pd.concat([test_pdf1, test_pdf2]) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index d2739b35049..6641bd8290a 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -287,6 +287,14 @@ include the file path metadata (relative to `root_path`). To request metadata binary blob when using with ``partition_cols``, Pass ``return_metadata=True`` instead of specifying ``metadata_file_path`` +use_dictionary : bool, default True + When ``False``, prevents the use of dictionary encoding for Parquet page + data. When ``True``, dictionary encoding is preferred when not disabled due + to dictionary size constraints. +header_version : {{'1.0', '2.0'}}, default "1.0" + Controls whether to use version 1.0 or version 2.0 page headers when + encoding. Version 1.0 is more portable, but version 2.0 enables the + use of newer encoding schemes. force_nullable_schema : bool, default False. If True, writes all columns as `null` in schema. 
If False, columns are written as `null` if they contain null values, From 7da0336ff9a0f1f06e06973c0ce21497e3549611 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 8 Nov 2023 20:23:15 -0500 Subject: [PATCH 086/118] Split up scan_inclusive.cu to improve its compile time (#14358) Splits out the `strings` and `struct` specializations in `scan_inclusive.cu` into separate source files to improve compile time. Each specialization is unique code with limited aggregation types. No functional changes. Just code moved around. Found while working on #14234 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14358 --- cpp/CMakeLists.txt | 2 + cpp/include/cudf/strings/detail/scan.hpp | 47 ++++++++ cpp/include/cudf/structs/detail/scan.hpp | 45 ++++++++ cpp/src/reductions/scan/scan_inclusive.cu | 112 +----------------- cpp/src/strings/scan/scan_inclusive.cu | 132 ++++++++++++++++++++++ cpp/src/structs/scan/scan_inclusive.cu | 89 +++++++++++++++ 6 files changed, 319 insertions(+), 108 deletions(-) create mode 100644 cpp/include/cudf/strings/detail/scan.hpp create mode 100644 cpp/include/cudf/structs/detail/scan.hpp create mode 100644 cpp/src/strings/scan/scan_inclusive.cu create mode 100644 cpp/src/structs/scan/scan_inclusive.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index dc12564c656..bd9c936626a 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -582,6 +582,7 @@ add_library( src/strings/replace/replace.cu src/strings/replace/replace_re.cu src/strings/reverse.cu + src/strings/scan/scan_inclusive.cu src/strings/search/findall.cu src/strings/search/find.cu src/strings/search/find_multiple.cu @@ -598,6 +599,7 @@ add_library( src/strings/utilities.cu src/strings/wrap.cu src/structs/copying/concatenate.cu + src/structs/scan/scan_inclusive.cu src/structs/structs_column_factories.cu src/structs/structs_column_view.cpp src/structs/utilities.cpp diff --git a/cpp/include/cudf/strings/detail/scan.hpp b/cpp/include/cudf/strings/detail/scan.hpp new file mode 100644 index 00000000000..611e32e28cd --- /dev/null +++ b/cpp/include/cudf/strings/detail/scan.hpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include + +namespace cudf { +namespace strings { +namespace detail { +/** + * @brief Scan function for strings + * + * Called by cudf::scan() with only min and max aggregates. 
+ * + * @tparam Op Either DeviceMin or DeviceMax operations + * + * @param input Input strings column + * @param mask Mask for scan + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column + */ +template +std::unique_ptr scan_inclusive(column_view const& input, + bitmask_type const* mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +} // namespace detail +} // namespace strings +} // namespace cudf diff --git a/cpp/include/cudf/structs/detail/scan.hpp b/cpp/include/cudf/structs/detail/scan.hpp new file mode 100644 index 00000000000..531e0a6c65f --- /dev/null +++ b/cpp/include/cudf/structs/detail/scan.hpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include + +namespace cudf { +namespace structs { +namespace detail { +/** + * @brief Scan function for struct column type + * + * Called by cudf::scan() with only min and max aggregates. + * + * @tparam Op Either DeviceMin or DeviceMax operations + * + * @param input Input column + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New struct column + */ +template +std::unique_ptr scan_inclusive(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +} // namespace detail +} // namespace structs +} // namespace cudf diff --git a/cpp/src/reductions/scan/scan_inclusive.cu b/cpp/src/reductions/scan/scan_inclusive.cu index e74fce62caf..91aa1cac487 100644 --- a/cpp/src/reductions/scan/scan_inclusive.cu +++ b/cpp/src/reductions/scan/scan_inclusive.cu @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include @@ -25,9 +24,10 @@ #include #include #include +#include +#include #include -#include #include #include @@ -68,43 +68,6 @@ std::pair mask_scan(column_view const& input_view namespace { -/** - * @brief Min/Max inclusive scan operator - * - * This operator will accept index values, check them and then - * run the `Op` operation on the individual element objects. - * The returned result is the appropriate index value. - * - * This was specifically created to workaround a thrust issue - * https://github.com/NVIDIA/thrust/issues/1479 - * where invalid values are passed to the operator. 
- */
-template <typename Element, typename Op>
-struct min_max_scan_operator {
-  column_device_view const col;      ///< strings column device view
-  Element const null_replacement{};  ///< value used when element is null
-  bool const has_nulls;              ///< true if col has null elements
-
-  min_max_scan_operator(column_device_view const& col, bool has_nulls = true)
-    : col{col}, null_replacement{Op::template identity<Element>()}, has_nulls{has_nulls}
-  {
-    // verify validity bitmask is non-null, otherwise, is_null_nocheck() will crash
-    if (has_nulls) CUDF_EXPECTS(col.nullable(), "column with nulls must have a validity bitmask");
-  }
-
-  __device__ inline size_type operator()(size_type lhs, size_type rhs) const
-  {
-    // thrust::inclusive_scan may pass us garbage values so we need to protect ourselves;
-    // in these cases the return value does not matter since the result is not used
-    if (lhs < 0 || rhs < 0 || lhs >= col.size() || rhs >= col.size()) return 0;
-    Element d_lhs =
-      has_nulls && col.is_null_nocheck(lhs) ? null_replacement : col.element<Element>(lhs);
-    Element d_rhs =
-      has_nulls && col.is_null_nocheck(rhs) ? null_replacement : col.element<Element>(rhs);
-    return Op{}(d_lhs, d_rhs) == d_lhs ? lhs : rhs;
-  }
-};
-
 template <typename Op, typename T>
 struct scan_functor {
   static std::unique_ptr<column> invoke(column_view const& input_view,
@@ -127,11 +90,6 @@ struct scan_functor {
   }
 };
 
-struct null_iterator {
-  bitmask_type const* mask;
-  __device__ bool operator()(size_type idx) const { return !bit_is_set(mask, idx); }
-};
-
 template <typename Op>
 struct scan_functor<Op, cudf::string_view> {
   static std::unique_ptr<column> invoke(column_view const& input_view,
@@ -139,38 +97,7 @@ struct scan_functor {
                                 rmm::cuda_stream_view stream,
                                 rmm::mr::device_memory_resource* mr)
   {
-    auto d_input = column_device_view::create(input_view, stream);
-
-    // build indices of the scan operation results
-    rmm::device_uvector<size_type> result_map(input_view.size(), stream);
-    thrust::inclusive_scan(
-      rmm::exec_policy(stream),
-      thrust::counting_iterator<size_type>(0),
-      thrust::counting_iterator<size_type>(input_view.size()),
-      result_map.begin(),
-      min_max_scan_operator<cudf::string_view, Op>{*d_input, input_view.has_nulls()});
-
-    if (input_view.has_nulls()) {
-      // fill the null rows with out-of-bounds values so gather records them as null;
-      // this prevents un-sanitized null entries in the output
-      auto null_itr = detail::make_counting_transform_iterator(0, null_iterator{mask});
-      auto oob_val  = thrust::constant_iterator<size_type>(input_view.size());
-      thrust::scatter_if(rmm::exec_policy(stream),
-                         oob_val,
-                         oob_val + input_view.size(),
-                         thrust::counting_iterator<size_type>(0),
-                         null_itr,
-                         result_map.data());
-    }
-
-    // call gather using the indices to build the output column
-    auto result_table = cudf::detail::gather(cudf::table_view({input_view}),
-                                             result_map,
-                                             out_of_bounds_policy::NULLIFY,
-                                             negative_index_policy::NOT_ALLOWED,
-                                             stream,
-                                             mr);
-    return std::move(result_table->release().front());
+    return cudf::strings::detail::scan_inclusive(input_view, mask, stream, mr);
   }
 };
 
@@ -181,38 +108,7 @@ struct scan_functor {
                                 rmm::cuda_stream_view stream,
                                 rmm::mr::device_memory_resource* mr)
   {
-    // Create a gather map containing indices of the prefix min/max elements.
-    auto gather_map = rmm::device_uvector<size_type>(input.size(), stream);
-    auto const binop_generator =
-      cudf::reduction::detail::comparison_binop_generator::create<Op>(input, stream);
-    thrust::inclusive_scan(rmm::exec_policy(stream),
-                           thrust::counting_iterator<size_type>(0),
-                           thrust::counting_iterator<size_type>(input.size()),
-                           gather_map.begin(),
-                           binop_generator.binop());
-
-    // Gather the children columns of the input column. Must use `get_sliced_child` to properly
-    // handle input in case it is a sliced view.
-    auto const input_children = [&] {
-      auto const it = cudf::detail::make_counting_transform_iterator(
-        0, [structs_view = structs_column_view{input}, &stream](auto const child_idx) {
-          return structs_view.get_sliced_child(child_idx, stream);
-        });
-      return std::vector<column_view>(it, it + input.num_children());
-    }();
-
-    // Gather the children elements of the prefix min/max struct elements for the output.
-    auto scanned_children = cudf::detail::gather(table_view{input_children},
-                                                 gather_map,
-                                                 out_of_bounds_policy::DONT_CHECK,
-                                                 negative_index_policy::NOT_ALLOWED,
-                                                 stream,
-                                                 mr)
-                              ->release();
-
-    // Don't need to set a null mask because that will be handled at the caller.
-    return make_structs_column(
-      input.size(), std::move(scanned_children), 0, rmm::device_buffer{0, stream, mr}, stream, mr);
+    return cudf::structs::detail::scan_inclusive(input, stream, mr);
   }
 };
 
diff --git a/cpp/src/strings/scan/scan_inclusive.cu b/cpp/src/strings/scan/scan_inclusive.cu
new file mode 100644
index 00000000000..0cf492fa295
--- /dev/null
+++ b/cpp/src/strings/scan/scan_inclusive.cu
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/detail/gather.hpp>
+#include <cudf/detail/iterator.cuh>
+#include <cudf/detail/utilities/device_operators.cuh>
+#include <cudf/strings/detail/scan.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/scan.h>
+#include <thrust/scatter.h>
+
+namespace cudf {
+namespace strings {
+namespace detail {
+namespace {
+
+/**
+ * @brief Min/Max inclusive scan operator
+ *
+ * This operator will accept index values, check them and then
+ * run the `Op` operation on the individual element objects.
+ * The returned result is the appropriate index value.
+ *
+ * This was specifically created to workaround a thrust issue
+ * https://github.com/NVIDIA/thrust/issues/1479
+ * where invalid values are passed to the operator.
+ */
+template <typename Element, typename Op>
+struct min_max_scan_operator {
+  column_device_view const col;      ///< strings column device view
+  Element const null_replacement{};  ///< value used when element is null
+  bool const has_nulls;              ///< true if col has null elements
+
+  min_max_scan_operator(column_device_view const& col, bool has_nulls = true)
+    : col{col}, null_replacement{Op::template identity<Element>()}, has_nulls{has_nulls}
+  {
+    // verify validity bitmask is non-null, otherwise, is_null_nocheck() will crash
+    if (has_nulls) CUDF_EXPECTS(col.nullable(), "column with nulls must have a validity bitmask");
+  }
+
+  __device__ inline size_type operator()(size_type lhs, size_type rhs) const
+  {
+    // thrust::inclusive_scan may pass us garbage values so we need to protect ourselves;
+    // in these cases the return value does not matter since the result is not used
+    if (lhs < 0 || rhs < 0 || lhs >= col.size() || rhs >= col.size()) return 0;
+    Element d_lhs =
+      has_nulls && col.is_null_nocheck(lhs) ? null_replacement : col.element<Element>(lhs);
+    Element d_rhs =
+      has_nulls && col.is_null_nocheck(rhs) ? null_replacement : col.element<Element>(rhs);
+    return Op{}(d_lhs, d_rhs) == d_lhs ? lhs : rhs;
+  }
+};
+
+struct null_iterator {
+  bitmask_type const* mask;
+  __device__ bool operator()(size_type idx) const { return !bit_is_set(mask, idx); }
+};
+
+}  // namespace
+
+template <typename Op>
+std::unique_ptr<column> scan_inclusive(column_view const& input,
+                                       bitmask_type const* mask,
+                                       rmm::cuda_stream_view stream,
+                                       rmm::mr::device_memory_resource* mr)
+{
+  auto d_input = column_device_view::create(input, stream);
+
+  // build indices of the scan operation results
+  rmm::device_uvector<size_type> result_map(input.size(), stream);
+  thrust::inclusive_scan(rmm::exec_policy(stream),
+                         thrust::counting_iterator<size_type>(0),
+                         thrust::counting_iterator<size_type>(input.size()),
+                         result_map.begin(),
+                         min_max_scan_operator<string_view, Op>{*d_input, input.has_nulls()});
+
+  if (input.has_nulls()) {
+    // fill the null rows with out-of-bounds values so gather records them as null;
+    // this prevents un-sanitized null entries in the output
+    auto null_itr = cudf::detail::make_counting_transform_iterator(0, null_iterator{mask});
+    auto oob_val  = thrust::constant_iterator<size_type>(input.size());
+    thrust::scatter_if(rmm::exec_policy(stream),
+                       oob_val,
+                       oob_val + input.size(),
+                       thrust::counting_iterator<size_type>(0),
+                       null_itr,
+                       result_map.data());
+  }
+
+  // call gather using the indices to build the output column
+  auto result_table = cudf::detail::gather(cudf::table_view({input}),
+                                           result_map,
+                                           cudf::out_of_bounds_policy::NULLIFY,
+                                           cudf::detail::negative_index_policy::NOT_ALLOWED,
+                                           stream,
+                                           mr);
+  return std::move(result_table->release().front());
+}
+
+template std::unique_ptr<column> scan_inclusive<DeviceMin>(column_view const& input,
+                                                           bitmask_type const* mask,
+                                                           rmm::cuda_stream_view stream,
+                                                           rmm::mr::device_memory_resource* mr);
+
+template std::unique_ptr<column> scan_inclusive<DeviceMax>(column_view const& input,
+                                                           bitmask_type const* mask,
+                                                           rmm::cuda_stream_view stream,
+                                                           rmm::mr::device_memory_resource* mr);
+
+}  // namespace detail
+}  // namespace strings
+}  // namespace cudf
diff --git a/cpp/src/structs/scan/scan_inclusive.cu b/cpp/src/structs/scan/scan_inclusive.cu
new file mode 100644
index 00000000000..823e4472960
--- /dev/null
+++ b/cpp/src/structs/scan/scan_inclusive.cu
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <reductions/nested_type_minmax_util.cuh>
+
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/gather.hpp>
+#include <cudf/detail/iterator.cuh>
+#include <cudf/structs/structs_column_view.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/scan.h>
+
+#include <vector>
+
+namespace cudf {
+namespace structs {
+namespace detail {
+namespace {
+
+}  // namespace
+
+template <typename Op>
+std::unique_ptr<column> scan_inclusive(column_view const& input,
+                                       rmm::cuda_stream_view stream,
+                                       rmm::mr::device_memory_resource* mr)
+{
+  // Create a gather map containing indices of the prefix min/max elements.
+  auto gather_map = rmm::device_uvector<size_type>(input.size(), stream);
+  auto const binop_generator =
+    cudf::reduction::detail::comparison_binop_generator::create<Op>(input, stream);
+  thrust::inclusive_scan(rmm::exec_policy(stream),
+                         thrust::counting_iterator<size_type>(0),
+                         thrust::counting_iterator<size_type>(input.size()),
+                         gather_map.begin(),
+                         binop_generator.binop());
+
+  // Gather the children columns of the input column. Must use `get_sliced_child` to properly
+  // handle input in case it is a sliced view.
+  auto const input_children = [&] {
+    auto const it = cudf::detail::make_counting_transform_iterator(
+      0, [structs_view = structs_column_view{input}, &stream](auto const child_idx) {
+        return structs_view.get_sliced_child(child_idx, stream);
+      });
+    return std::vector<column_view>(it, it + input.num_children());
+  }();
+
+  // Gather the children elements of the prefix min/max struct elements for the output.
+  auto scanned_children = cudf::detail::gather(table_view{input_children},
+                                               gather_map,
+                                               cudf::out_of_bounds_policy::DONT_CHECK,
+                                               cudf::detail::negative_index_policy::NOT_ALLOWED,
+                                               stream,
+                                               mr)
+                            ->release();
+
+  // Don't need to set a null mask because that will be handled at the caller.
+  return make_structs_column(
+    input.size(), std::move(scanned_children), 0, rmm::device_buffer{0, stream, mr}, stream, mr);
+}
+
+template std::unique_ptr<column> scan_inclusive<DeviceMin>(column_view const& input_view,
+                                                           rmm::cuda_stream_view stream,
+                                                           rmm::mr::device_memory_resource* mr);
+
+template std::unique_ptr<column> scan_inclusive<DeviceMax>(column_view const& input_view,
+                                                           rmm::cuda_stream_view stream,
+                                                           rmm::mr::device_memory_resource* mr);
+
+}  // namespace detail
+}  // namespace structs
+}  // namespace cudf
From 1c6f6b4f9d5ef80fd549a92c7257e1124867f7f9 Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Thu, 9 Nov 2023 12:52:38 -0600
Subject: [PATCH 087/118] Update to Arrow 14.0.1. (#14387)

Updates Arrow to 14.0.1.
Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Robert Maynard (https://github.com/robertmaynard)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/cudf/pull/14387
---
 conda/environments/all_cuda-118_arch-x86_64.yaml |  4 ++--
 conda/environments/all_cuda-120_arch-x86_64.yaml |  4 ++--
 conda/recipes/cudf/meta.yaml                     |  2 +-
 conda/recipes/libcudf/conda_build_config.yaml    |  2 +-
 cpp/cmake/thirdparty/get_arrow.cmake             |  2 +-
 dependencies.yaml                                | 10 ++++++----
 python/cudf/pyproject.toml                       |  4 ++--
 python/cudf_kafka/pyproject.toml                 |  2 +-
 8 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 2c98fe472d6..4d5c56e4a7d 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -40,7 +40,7 @@ dependencies:
 - hypothesis
 - identify>=2.5.20
 - ipython
-- libarrow-all==14.0.0.*
+- libarrow-all==14.0.1.*
 - libcufile-dev=1.4.0.31
 - libcufile=1.4.0.31
 - libcurand-dev=10.3.0.86
@@ -69,7 +69,7 @@ dependencies:
 - pre-commit
 - protobuf>=4.21,<5
 - ptxcompiler
-- pyarrow==14.0.0.*
+- pyarrow==14.0.1.*
 - pydata-sphinx-theme!=0.14.2
 - pytest
 - pytest-benchmark
diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml
index fc1f42fec26..8606932dae4 100644
--- a/conda/environments/all_cuda-120_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -42,7 +42,7 @@ dependencies:
 - hypothesis
 - identify>=2.5.20
 - ipython
-- libarrow-all==14.0.0.*
+- libarrow-all==14.0.1.*
 - libcufile-dev
 - libcurand-dev
 - libkvikio==23.12.*
@@ -67,7 +67,7 @@ dependencies:
 - pip
 - pre-commit
 - protobuf>=4.21,<5
-- pyarrow==14.0.0.*
+- pyarrow==14.0.1.*
 - pydata-sphinx-theme!=0.14.2
 - pytest
 - pytest-benchmark
diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index c22f3da7fb6..9b5c5f3d14b 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -61,7 +61,7 @@ requirements:
     - scikit-build >=0.13.1
     - setuptools
    - dlpack >=0.5,<0.6.0a0
-    - pyarrow ==14.0.0.*
+    - pyarrow ==14.0.1.*
    - libcudf ={{ version }}
    - rmm ={{ minor_version }}
    {% if cuda_major == "11" %}
diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml
index 38efbd4fe4f..05b2135184b 100644
--- a/conda/recipes/libcudf/conda_build_config.yaml
+++ b/conda/recipes/libcudf/conda_build_config.yaml
@@ -23,7 +23,7 @@ gtest_version:
   - ">=1.13.0"

 libarrow_version:
-  - "==14.0.0"
+  - "==14.0.1"

 dlpack_version:
   - ">=0.5,<0.6.0a0"
diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake
index d85d5d5f7d3..05aa5730b4d 100644
--- a/cpp/cmake/thirdparty/get_arrow.cmake
+++ b/cpp/cmake/thirdparty/get_arrow.cmake
@@ -427,7 +427,7 @@ if(NOT DEFINED CUDF_VERSION_Arrow)
   set(CUDF_VERSION_Arrow
       # This version must be kept in sync with the libarrow version pinned for builds in
      # dependencies.yaml.
-      14.0.0
+      14.0.1
      CACHE STRING "The version of Arrow to find (or build)"
  )
 endif()
diff --git a/dependencies.yaml b/dependencies.yaml
index 74b640dbed2..35d08239a4c 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -241,7 +241,7 @@ dependencies:
           - &gmock gmock>=1.13.0
           # Hard pin the patch version used during the build. This must be kept
          # in sync with the version pinned in get_arrow.cmake.
-          - libarrow-all==14.0.0.*
+          - libarrow-all==14.0.1.*
          - librdkafka>=1.9.0,<1.10.0a0
          # Align nvcomp version with rapids-cmake
          - nvcomp==2.6.1
          - spdlog>=1.11.0,<1.12
   build_wheels:
     common:
@@ -263,7 +263,7 @@ dependencies:
       packages:
         # Hard pin the patch version used during the build. This must be kept
        # in sync with the version pinned in get_arrow.cmake.
-        - pyarrow==14.0.0.*
+        - pyarrow==14.0.1.*
   build_python:
     common:
       - output_types: [conda, requirements, pyproject]
@@ -281,13 +281,15 @@ dependencies:
       - output_types: conda
         packages:
           # Allow runtime version to float up to minor version
-          - libarrow-all==14.*
+          # Disallow libarrow 14.0.0 due to a CVE
+          - libarrow-all>=14.0.1,<15.0.0a0
   pyarrow_run:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
           # Allow runtime version to float up to minor version
-          - pyarrow==14.*
+          # Disallow pyarrow 14.0.0 due to a CVE
+          - pyarrow>=14.0.1,<15.0.0a0
   cudatoolkit:
     specific:
       - output_types: conda
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index f882fa2d583..1c687269e55 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -8,7 +8,7 @@ requires = [
     "ninja",
    "numpy>=1.21,<1.25",
    "protoc-wheel",
-    "pyarrow==14.0.0.*",
+    "pyarrow==14.0.1.*",
    "rmm==23.12.*",
    "scikit-build>=0.13.1",
    "setuptools",
@@ -38,7 +38,7 @@ dependencies = [
     "pandas>=1.3,<1.6.0dev0",
    "protobuf>=4.21,<5",
    "ptxcompiler",
-    "pyarrow==14.*",
+    "pyarrow>=14.0.1,<15.0.0a0",
    "rich",
    "rmm==23.12.*",
    "typing_extensions>=4.0.0",
diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml
index 293586cbba0..f5cbd480e9c 100644
--- a/python/cudf_kafka/pyproject.toml
+++ b/python/cudf_kafka/pyproject.toml
@@ -5,7 +5,7 @@
 requires = [
     "cython>=3.0.0",
    "numpy>=1.21,<1.25",
-    "pyarrow==14.0.0.*",
+    "pyarrow==14.0.1.*",
    "setuptools",
    "wheel",
 ]  # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
From f5d3fc1676ec32f30081e8ff84f07b79645cedd3 Mon Sep 17 00:00:00 2001
From: Robert Maynard
Date: Thu, 9 Nov 2023 14:04:25 -0500
Subject: [PATCH 088/118] cudf jit parser now supports .pragma instructions
 with quotes (#14348)

During testing of CUDA 12.3 it was found that Python user-defined functions
could generate `.pragma` instructions that contain quotes, such as "nounroll"
and "used_bytes_mask N". The jit parser now makes sure to escape quotes that
are part of a `.pragma` instruction so that the generated source can be
properly passed to nvrtc/jitify as a CUDA header.
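
As a minimal sketch of the escaping step (a hypothetical standalone form of the
character loop added to `parse_instruction`; the function name and `main` driver
are illustrative, not part of the parser API):

```cpp
#include <iostream>
#include <string>

// Sketch: escape double quotes in a PTX .pragma operand so the generated
// inline-asm line remains a valid CUDA string literal. This mirrors the
// loop added to parse_instruction(); the standalone form is illustrative.
std::string escape_pragma_quotes(std::string const& piece)
{
  std::string transformed_piece;
  for (auto const c : piece) {
    if (c == '"') {
      transformed_piece += "\\\"";  // emit a backslash before the quote
    } else {
      transformed_piece += c;
    }
  }
  return transformed_piece;
}

int main()
{
  std::cout << escape_pragma_quotes(".pragma \"nounroll\";") << "\n";
  // prints: .pragma \"nounroll\";
}
```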
Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/14348 --- cpp/src/jit/parser.cpp | 15 ++ cpp/tests/CMakeLists.txt | 5 + cpp/tests/jit/parse_ptx_function.cpp | 218 +++++++++++++++++++++++++++ 3 files changed, 238 insertions(+) create mode 100644 cpp/tests/jit/parse_ptx_function.cpp diff --git a/cpp/src/jit/parser.cpp b/cpp/src/jit/parser.cpp index 1bc126d3be9..e59c1089318 100644 --- a/cpp/src/jit/parser.cpp +++ b/cpp/src/jit/parser.cpp @@ -114,6 +114,7 @@ std::string ptx_parser::parse_instruction(std::string const& src) size_t start = 0; size_t stop = 0; bool is_instruction = true; + bool is_pragma_instruction = false; bool is_param_loading_instruction = false; std::string constraint; std::string register_type; @@ -181,6 +182,9 @@ std::string ptx_parser::parse_instruction(std::string const& src) "value through the first function parameter. Thus the `st.param.***` instructions " "are not processed. *** */" + "\");" + original_code; // Our port does not support return value; + } else if (piece.find(".pragma") != std::string::npos) { + is_pragma_instruction = true; + output += " " + piece; } else if (piece[0] == '@') { output += " @" + remove_nonalphanumeric(piece.substr(1, piece.size() - 1)); } else { @@ -200,6 +204,17 @@ std::string ptx_parser::parse_instruction(std::string const& src) } // Here we get to see the actual type of the input arguments. input_arg_list[remove_nonalphanumeric(piece)] = register_type_to_cpp_type(register_type); + } else if (is_pragma_instruction) { + // quote any string + std::string transformed_piece; + for (const auto& c : piece) { + if (c == '"') { + transformed_piece += "\\\""; + } else { + transformed_piece += c; + } + } + output += transformed_piece; } else { output += escape_percent(std::string(src, start, stop - start)); } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 196a4f2d038..b0382d15807 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -620,6 +620,11 @@ ConfigureTest( # * bin tests ---------------------------------------------------------------------------------- ConfigureTest(LABEL_BINS_TEST labeling/label_bins_tests.cpp) +# ################################################################################################## +# * jit tests ---------------------------------------------------------------------------------- +ConfigureTest(JIT_PARSER_TEST jit/parse_ptx_function.cpp) +target_include_directories(JIT_PARSER_TEST PRIVATE "$") + # ################################################################################################## # * stream testing --------------------------------------------------------------------------------- ConfigureTest( diff --git a/cpp/tests/jit/parse_ptx_function.cpp b/cpp/tests/jit/parse_ptx_function.cpp new file mode 100644 index 00000000000..5f00c5f561a --- /dev/null +++ b/cpp/tests/jit/parse_ptx_function.cpp @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +struct JitParseTest : public ::testing::Test {}; + +TEST_F(JitParseTest, PTXNoFunction) +{ + std::string raw_ptx = R"( +.visible .entry _ZN3cub17CUB_101702_750_NS11EmptyKernelIvEEvv() +{ + ret; +})"; + + EXPECT_THROW(cudf::jit::parse_single_function_ptx(raw_ptx, "GENERIC_OP", "float", {0}), + cudf::logic_error); +} + +inline bool ptx_equal(std::string input, std::string expected) +{ + // Remove all whitespace and newline characters and compare + // This allows us to handle things like excess newline characters + // and trailing whitespace in the 'input' + + auto whitespace_or_newline = [](unsigned char c) { return std::isspace(c) || c == '\n'; }; + input.erase(std::remove_if(input.begin(), input.end(), whitespace_or_newline), input.end()); + expected.erase(std::remove_if(expected.begin(), expected.end(), whitespace_or_newline), + expected.end()); + return input == expected; +} + +TEST_F(JitParseTest, SimplePTX) +{ + std::string raw_ptx = R"( +.visible .func (.param .b32 func_retval0) _ZN8__main__7add$241Eff( + .param .b64 _ZN8__main__7add$241Eff_param_0, + .param .b32 _ZN8__main__7add$241Eff_param_1, + .param .b32 _ZN8__main__7add$241Eff_param_2 +) +{ + ret; +} +)"; + + std::string expected = R"( +__device__ __inline__ void GENERIC_OP( + float* _ZN8__main__7add_241Eff_param_0, + int _ZN8__main__7add_241Eff_param_1, + int _ZN8__main__7add_241Eff_param_2 +){ + asm volatile ("{"); + asm volatile ("bra RETTGT;"); + asm volatile ("RETTGT:}");} +)"; + + std::string cuda_source = + cudf::jit::parse_single_function_ptx(raw_ptx, "GENERIC_OP", "float", {0}); + + EXPECT_TRUE(ptx_equal(cuda_source, expected)); +} + +TEST_F(JitParseTest, PTXWithPragma) +{ + std::string raw_ptx = R"( +.visible .func _ZN3cub17CUB_101702_750_NS11EmptyKernelIvEEvv() +{ +$L__BB0_151: + .pragma "nounroll"; + mov.u32 % r1517, % r1516; + mov.u32 % r1516, % r1515; + mov.u32 % r1515, % r1505; + mov.u32 % r1457, 0; +$L__BB0_152: + .pragma "nounroll"; +})"; + + std::string expected = R"( +__device__ __inline__ void EmptyKern(){ + asm volatile ("{"); asm volatile (" $L__BB0_151: .pragma \"nounroll\";"); + /** $L__BB0_151: + .pragma "nounroll" */ + + asm volatile (" mov.u32 _ r1517, _ r1516;"); + /** mov.u32 % r1517, % r1516 */ + + asm volatile (" mov.u32 _ r1516, _ r1515;"); + /** mov.u32 % r1516, % r1515 */ + + asm volatile (" mov.u32 _ r1515, _ r1505;"); + /** mov.u32 % r1515, % r1505 */ + + asm volatile (" mov.u32 _ r1457, 0;"); + /** mov.u32 % r1457, 0 */ + + asm volatile (" $L__BB0_152: .pragma \"nounroll\";"); + /** $L__BB0_152: + .pragma "nounroll" */ + + asm volatile ("RETTGT:}");} +)"; + + std::string cuda_source = cudf::jit::parse_single_function_ptx(raw_ptx, "EmptyKern", "void", {0}); + EXPECT_TRUE(ptx_equal(cuda_source, expected)); +} + +TEST_F(JitParseTest, PTXWithPragmaWithSpaces) +{ + std::string raw_ptx = R"( +.visible .func _ZN3cub17CUB_101702_750_NS11EmptyKernelIvEEvv() +{ + $L__BB0_58: + ld.param.u32 % r1419, [% rd419 + 80]; + setp.ne.s32 % p394, % r1419, 22; + mov.u32 % r2050, 0; + mov.u32 % r2048, % r2050; + @ % p394 bra $L__BB0_380; + + 
ld.param.u8 % rs1369, [% rd419 + 208]; + setp.eq.s16 % p395, % rs1369, 0; + selp.b32 % r1422, % r1925, 0, % p395; + ld.param.u32 % r1423, [% rd419 + 112]; + add.s32 % r427, % r1422, % r1423; + ld.param.u64 % rd1249, [% rd419 + 120]; + cvta.to.global.u64 % rd1250, % rd1249; + .pragma "used_bytes_mask 4095"; + ld.global.v4.u32{ % r1424, % r1425, % r1426, % r1427}, [% rd1250]; + ld.global.v2.u64{ % rd1251, % rd1252}, [% rd1250 + 16]; + ld.global.s32 % rd230, [% rd1250 + 32]; + setp.gt.s32 % p396, % r1424, 6; + @ % p396 bra $L__BB0_376; +} +} +)"; + + std::string expected = R"( +__device__ __inline__ void LongKernel(){ + asm volatile ("{"); asm volatile (" $L__BB0_58: cvt.u32.u32 _ %0, [_ rd419 + 80];": : "r"(r1419)); + /** $L__BB0_58: + ld.param.u32 % r1419, [% rd419 + 80] */ + + asm volatile (" setp.ne.s32 _ p394, _ r1419, 22;"); + /** setp.ne.s32 % p394, % r1419, 22 */ + + asm volatile (" mov.u32 _ r2050, 0;"); + /** mov.u32 % r2050, 0 */ + + asm volatile (" mov.u32 _ r2048, _ r2050;"); + /** mov.u32 % r2048, % r2050 */ + + asm volatile (" @ _ p394 bra $L__BB0_380;"); + /** @ % p394 bra $L__BB0_380 */ + + asm volatile (" cvt.u8.u8 _ %0, [_ rd419 + 208];": : "h"( static_cast(rs1369))); + /** ld.param.u8 % rs1369, [% rd419 + 208] */ + + asm volatile (" setp.eq.s16 _ p395, _ rs1369, 0;"); + /** setp.eq.s16 % p395, % rs1369, 0 */ + + asm volatile (" selp.b32 _ r1422, _ r1925, 0, _ p395;"); + /** selp.b32 % r1422, % r1925, 0, % p395 */ + + asm volatile (" cvt.u32.u32 _ %0, [_ rd419 + 112];": : "r"(r1423)); + /** ld.param.u32 % r1423, [% rd419 + 112] */ + + asm volatile (" add.s32 _ r427, _ r1422, _ r1423;"); + /** add.s32 % r427, % r1422, % r1423 */ + + asm volatile (" mov.u64 _ %0, [_ rd419 + 120];": : "l"(rd1249)); + /** ld.param.u64 % rd1249, [% rd419 + 120] */ + + asm volatile (" cvta.to.global.u64 _ rd1250, _ rd1249;"); + /** cvta.to.global.u64 % rd1250, % rd1249 */ + + asm volatile (" .pragma \"used_bytes_mask 4095\";"); + /** .pragma "used_bytes_mask 4095" */ + + asm volatile (" ld.global.v4.u32{ _ r1424, _ r1425, _ r1426, _ r1427}, [_ rd1250];"); + /** ld.global.v4.u32{ % r1424, % r1425, % r1426, % r1427}, [% rd1250] */ + + asm volatile (" ld.global.v2.u64{ _ rd1251, _ rd1252}, [_ rd1250 + 16];"); + /** ld.global.v2.u64{ % rd1251, % rd1252}, [% rd1250 + 16] */ + + asm volatile (" ld.global.s32 _ rd230, [_ rd1250 + 32];"); + /** ld.global.s32 % rd230, [% rd1250 + 32] */ + + asm volatile (" setp.gt.s32 _ p396, _ r1424, 6;"); + /** setp.gt.s32 % p396, % r1424, 6 */ + + asm volatile (" @ _ p396 bra $L__BB0_376;"); + /** @ % p396 bra $L__BB0_376 */ + + asm volatile ("RETTGT:}");} + )"; + + std::string cuda_source = + cudf::jit::parse_single_function_ptx(raw_ptx, "LongKernel", "void", {0}); + EXPECT_TRUE(ptx_equal(cuda_source, expected)); +} + +CUDF_TEST_PROGRAM_MAIN() From 9be4de53268d49665bc0d700f12f1192207fff79 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 9 Nov 2023 16:00:43 -0800 Subject: [PATCH 089/118] Upgrade to nvCOMP 3.0.4 (#13815) Update the nvCOMP version used for cuIO compression/decompression to 3.0.4. 
Authors: - Vukasin Milovanovic (https://github.com/vuule) - Bradley Dice (https://github.com/bdice) Approvers: - Bradley Dice (https://github.com/bdice) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/13815 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-120_arch-x86_64.yaml | 2 +- conda/recipes/libcudf/conda_build_config.yaml | 2 +- dependencies.yaml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 4d5c56e4a7d..a479d517c24 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -60,7 +60,7 @@ dependencies: - numpy>=1.21,<1.25 - numpydoc - nvcc_linux-64=11.8 -- nvcomp==2.6.1 +- nvcomp==3.0.4 - nvtx>=0.2.1 - packaging - pandas>=1.3,<1.6.0dev0 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 8606932dae4..d1779aaeeac 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -59,7 +59,7 @@ dependencies: - numba>=0.57,<0.58 - numpy>=1.21,<1.25 - numpydoc -- nvcomp==2.6.1 +- nvcomp==3.0.4 - nvtx>=0.2.1 - packaging - pandas>=1.3,<1.6.0dev0 diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index 05b2135184b..fa06ed048b7 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -38,7 +38,7 @@ spdlog_version: - ">=1.11.0,<1.12" nvcomp_version: - - "=2.6.1" + - "=3.0.4" zlib_version: - ">=1.2.13" diff --git a/dependencies.yaml b/dependencies.yaml index 35d08239a4c..3850347aa63 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -244,7 +244,7 @@ dependencies: - libarrow-all==14.0.1.* - librdkafka>=1.9.0,<1.10.0a0 # Align nvcomp version with rapids-cmake - - nvcomp==2.6.1 + - nvcomp==3.0.4 - spdlog>=1.11.0,<1.12 build_wheels: common: From 87d2a36f04f431a8c5236d2aee723ec79b9dc5f9 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 9 Nov 2023 16:54:25 -0800 Subject: [PATCH 090/118] Remove Cython libcpp wrappers (#14382) All of these wrappers have now been upstreamed into Cython as of Cython 3.0.3. 
Contributes to #14023 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) - Jake Awe (https://github.com/AyodeAwe) URL: https://github.com/rapidsai/cudf/pull/14382 --- .../all_cuda-118_arch-x86_64.yaml | 2 +- .../all_cuda-120_arch-x86_64.yaml | 2 +- conda/recipes/cudf/meta.yaml | 2 +- conda/recipes/cudf_kafka/meta.yaml | 2 +- dependencies.yaml | 2 +- python/cudf/cudf/_lib/column.pyx | 3 +- python/cudf/cudf/_lib/concat.pyx | 3 +- python/cudf/cudf/_lib/copying.pyx | 5 +- python/cudf/cudf/_lib/cpp/copying.pxd | 2 +- python/cudf/cudf/_lib/cpp/groupby.pxd | 4 +- python/cudf/cudf/_lib/cpp/io/orc.pxd | 2 +- python/cudf/cudf/_lib/cpp/io/parquet.pxd | 4 +- python/cudf/cudf/_lib/cpp/io/timezone.pxd | 2 +- python/cudf/cudf/_lib/cpp/libcpp/__init__.pxd | 0 python/cudf/cudf/_lib/cpp/libcpp/__init__.py | 0 .../cudf/cudf/_lib/cpp/libcpp/functional.pxd | 7 --- python/cudf/cudf/_lib/cpp/libcpp/memory.pxd | 12 ----- python/cudf/cudf/_lib/cpp/libcpp/optional.pxd | 50 ------------------- python/cudf/cudf/_lib/expressions.pyx | 3 +- python/cudf/cudf/_lib/groupby.pyx | 3 +- python/cudf/cudf/_lib/join.pyx | 3 +- python/cudf/cudf/_lib/null_mask.pyx | 3 +- python/cudf/cudf/_lib/parquet.pyx | 3 +- python/cudf/cudf/_lib/timezone.pyx | 2 +- python/cudf/pyproject.toml | 2 +- python/cudf_kafka/cudf_kafka/_lib/kafka.pyx | 3 +- python/cudf_kafka/pyproject.toml | 2 +- 27 files changed, 27 insertions(+), 101 deletions(-) delete mode 100644 python/cudf/cudf/_lib/cpp/libcpp/__init__.pxd delete mode 100644 python/cudf/cudf/_lib/cpp/libcpp/__init__.py delete mode 100644 python/cudf/cudf/_lib/cpp/libcpp/functional.pxd delete mode 100644 python/cudf/cudf/_lib/cpp/libcpp/memory.pxd delete mode 100644 python/cudf/cudf/_lib/cpp/libcpp/optional.pxd diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index a479d517c24..adf4fcad32d 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -24,7 +24,7 @@ dependencies: - cudatoolkit - cupy>=12.0.0 - cxx-compiler -- cython>=3.0.0 +- cython>=3.0.3 - dask-core>=2023.9.2 - dask-cuda==23.12.* - dask>=2023.9.2 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index d1779aaeeac..a69ef587570 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -26,7 +26,7 @@ dependencies: - cuda-version=12.0 - cupy>=12.0.0 - cxx-compiler -- cython>=3.0.0 +- cython>=3.0.3 - dask-core>=2023.9.2 - dask-cuda==23.12.* - dask>=2023.9.2 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 9b5c5f3d14b..27edde1c98a 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -57,7 +57,7 @@ requirements: host: - protobuf ==4.24.* - python - - cython >=3.0.0 + - cython >=3.0.3 - scikit-build >=0.13.1 - setuptools - dlpack >=0.5,<0.6.0a0 diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index cdc547b4d68..9440f8bf124 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -51,7 +51,7 @@ requirements: {% endif %} host: - python - - cython >=3.0.0 + - cython >=3.0.3 - cuda-version ={{ cuda_version }} - cudf ={{ version }} - libcudf_kafka ={{ version }} diff --git a/dependencies.yaml b/dependencies.yaml index 
3850347aa63..a16b51f4483 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -256,7 +256,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - cython>=3.0.0 + - cython>=3.0.3 # TODO: Pin to numpy<1.25 until cudf requires pandas 2 - &numpy numpy>=1.21,<1.25 - output_types: [conda, requirements, pyproject] diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index f751d73b142..0edf9f8aa95 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -24,7 +24,7 @@ from cudf.utils.dtypes import _get_base_dtype from cpython.buffer cimport PyObject_CheckBuffer from libc.stdint cimport uintptr_t -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector @@ -47,7 +47,6 @@ from cudf._lib.cpp.column.column_factories cimport ( make_numeric_column, ) from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.null_mask cimport null_count as cpp_null_count from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.scalar cimport DeviceScalar diff --git a/python/cudf/cudf/_lib/concat.pyx b/python/cudf/cudf/_lib/concat.pyx index feaf75ef237..1ec4719631e 100644 --- a/python/cudf/cudf/_lib/concat.pyx +++ b/python/cudf/cudf/_lib/concat.pyx @@ -1,7 +1,7 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp cimport bool -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector @@ -12,7 +12,6 @@ from cudf._lib.cpp.concatenate cimport ( concatenate_masks as libcudf_concatenate_masks, concatenate_tables as libcudf_concatenate_tables, ) -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.table.table cimport table, table_view from cudf._lib.utils cimport ( data_from_unique_ptr, diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index f57bc15ed57..ea6ee76c14a 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -24,12 +24,13 @@ from cudf._lib.utils cimport table_view_from_columns, table_view_from_table from cudf._lib.reduce import minmax from cudf.core.abc import Serializable +from libcpp.functional cimport reference_wrapper +from libcpp.memory cimport make_unique + cimport cudf._lib.cpp.contiguous_split as cpp_contiguous_split cimport cudf._lib.cpp.copying as cpp_copying from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view, mutable_column_view -from cudf._lib.cpp.libcpp.functional cimport reference_wrapper -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.lists.gather cimport ( segmented_gather as cpp_segmented_gather, ) diff --git a/python/cudf/cudf/_lib/cpp/copying.pxd b/python/cudf/cudf/_lib/cpp/copying.pxd index 20725c252fc..5637b55ac1c 100644 --- a/python/cudf/cudf/_lib/cpp/copying.pxd +++ b/python/cudf/cudf/_lib/cpp/copying.pxd @@ -2,6 +2,7 @@ from libc.stdint cimport int32_t, int64_t, uint8_t from libcpp cimport bool +from libcpp.functional cimport reference_wrapper from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector @@ -9,7 +10,6 @@ from rmm._lib.device_buffer cimport device_buffer from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view, mutable_column_view -from cudf._lib.cpp.libcpp.functional cimport 
reference_wrapper from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view diff --git a/python/cudf/cudf/_lib/cpp/groupby.pxd b/python/cudf/cudf/_lib/cpp/groupby.pxd index 2ecdf76842f..0266404fc50 100644 --- a/python/cudf/cudf/_lib/cpp/groupby.pxd +++ b/python/cudf/cudf/_lib/cpp/groupby.pxd @@ -1,6 +1,7 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp cimport bool +from libcpp.functional cimport reference_wrapper from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair from libcpp.vector cimport vector @@ -11,7 +12,6 @@ from cudf._lib.cpp.aggregation cimport ( ) from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.libcpp.functional cimport reference_wrapper from cudf._lib.cpp.replace cimport replace_policy from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd index dd6f919a74d..d5ac8574fe4 100644 --- a/python/cudf/cudf/_lib/cpp/io/orc.pxd +++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd @@ -4,12 +4,12 @@ from libc.stdint cimport uint8_t from libcpp cimport bool from libcpp.map cimport map from libcpp.memory cimport shared_ptr, unique_ptr +from libcpp.optional cimport optional from libcpp.string cimport string from libcpp.vector cimport vector cimport cudf._lib.cpp.io.types as cudf_io_types cimport cudf._lib.cpp.table.table_view as cudf_table_view -from cudf._lib.cpp.libcpp.optional cimport optional from cudf._lib.cpp.types cimport data_type, size_type diff --git a/python/cudf/cudf/_lib/cpp/io/parquet.pxd b/python/cudf/cudf/_lib/cpp/io/parquet.pxd index a6a7ba034aa..cdd1bde0274 100644 --- a/python/cudf/cudf/_lib/cpp/io/parquet.pxd +++ b/python/cudf/cudf/_lib/cpp/io/parquet.pxd @@ -2,16 +2,16 @@ from libc.stdint cimport uint8_t from libcpp cimport bool +from libcpp.functional cimport reference_wrapper from libcpp.map cimport map from libcpp.memory cimport shared_ptr, unique_ptr +from libcpp.optional cimport optional from libcpp.string cimport string from libcpp.vector cimport vector cimport cudf._lib.cpp.io.types as cudf_io_types cimport cudf._lib.cpp.table.table_view as cudf_table_view from cudf._lib.cpp.expressions cimport expression -from cudf._lib.cpp.libcpp.functional cimport reference_wrapper -from cudf._lib.cpp.libcpp.optional cimport optional from cudf._lib.cpp.types cimport data_type, size_type diff --git a/python/cudf/cudf/_lib/cpp/io/timezone.pxd b/python/cudf/cudf/_lib/cpp/io/timezone.pxd index ba481d9a1d3..927c2118473 100644 --- a/python/cudf/cudf/_lib/cpp/io/timezone.pxd +++ b/python/cudf/cudf/_lib/cpp/io/timezone.pxd @@ -2,9 +2,9 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr +from libcpp.optional cimport optional from libcpp.string cimport string -from cudf._lib.cpp.libcpp.optional cimport optional from cudf._lib.cpp.table.table cimport table diff --git a/python/cudf/cudf/_lib/cpp/libcpp/__init__.pxd b/python/cudf/cudf/_lib/cpp/libcpp/__init__.pxd deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/cpp/libcpp/__init__.py b/python/cudf/cudf/_lib/cpp/libcpp/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/cpp/libcpp/functional.pxd b/python/cudf/cudf/_lib/cpp/libcpp/functional.pxd deleted file mode 100644 index 
f3e2d6d0878..00000000000
--- a/python/cudf/cudf/_lib/cpp/libcpp/functional.pxd
+++ /dev/null
@@ -1,7 +0,0 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
-
-
-cdef extern from "<functional>" namespace "std" nogil:
-    cdef cppclass reference_wrapper[T]:
-        reference_wrapper()
-        reference_wrapper(T)
diff --git a/python/cudf/cudf/_lib/cpp/libcpp/memory.pxd b/python/cudf/cudf/_lib/cpp/libcpp/memory.pxd
deleted file mode 100644
index 2178f1a940c..00000000000
--- a/python/cudf/cudf/_lib/cpp/libcpp/memory.pxd
+++ /dev/null
@@ -1,12 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
-
-from libcpp.memory cimport unique_ptr
-
-
-cdef extern from "<memory>" namespace "std" nogil:
-    # The Cython standard header does not have except +, so C++
-    # exceptions from make_unique are not caught and translated to
-    # Python ones. This is not perfectly ergonomic, we always have to
-    # wrap make_unique in move, but at least we can catch exceptions.
-    # See https://github.com/cython/cython/issues/5560
-    unique_ptr[T] make_unique[T](...) except +
diff --git a/python/cudf/cudf/_lib/cpp/libcpp/optional.pxd b/python/cudf/cudf/_lib/cpp/libcpp/optional.pxd
deleted file mode 100644
index a78c18f3f7a..00000000000
--- a/python/cudf/cudf/_lib/cpp/libcpp/optional.pxd
+++ /dev/null
@@ -1,50 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION &
-# AFFILIATES. All rights reserved. SPDX-License-Identifier:
-# Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from libcpp cimport bool
-
-
-cdef extern from "<optional>" namespace "std" nogil:
-    cdef cppclass nullopt_t:
-        nullopt_t()
-
-    cdef nullopt_t nullopt
-
-    cdef cppclass optional[T]:
-        ctypedef T value_type
-        optional()
-        optional(nullopt_t)
-        optional(optional&) except +
-        optional(T&) except +
-        bool has_value()
-        T& value()
-        T& value_or[U](U& default_value)
-        void swap(optional&)
-        void reset()
-        T& emplace(...)
-        T& operator*()
-        optional& operator=(optional&)
-        optional& operator=[U](U&)
-        bool operator bool()
-        bool operator!()
-        bool operator==[U](optional&, U&)
-        bool operator!=[U](optional&, U&)
-        bool operator<[U](optional&, U&)
-        bool operator>[U](optional&, U&)
-        bool operator<=[U](optional&, U&)
-        bool operator>=[U](optional&, U&)
-
-    optional[T] make_optional[T](...) except +
diff --git a/python/cudf/cudf/_lib/expressions.pyx b/python/cudf/cudf/_lib/expressions.pyx
index 8d7545ffe15..01a080f635f 100644
--- a/python/cudf/cudf/_lib/expressions.pyx
+++ b/python/cudf/cudf/_lib/expressions.pyx
@@ -4,12 +4,11 @@ from enum import Enum

 from cython.operator cimport dereference
 from libc.stdint cimport int64_t
-from libcpp.memory cimport unique_ptr
+from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.string cimport string
 from libcpp.utility cimport move

 from cudf._lib.cpp cimport expressions as libcudf_exp
-from cudf._lib.cpp.libcpp.memory cimport make_unique
 from cudf._lib.cpp.types cimport size_type

 # Necessary for proper casting, see below.
diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index a26d820de6f..b3778e45cde 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -24,6 +24,8 @@ from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns from cudf._lib.scalar import as_device_scalar +from libcpp.functional cimport reference_wrapper + cimport cudf._lib.cpp.groupby as libcudf_groupby cimport cudf._lib.cpp.types as libcudf_types from cudf._lib.aggregation cimport ( @@ -33,7 +35,6 @@ from cudf._lib.aggregation cimport ( make_groupby_scan_aggregation, ) from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.libcpp.functional cimport reference_wrapper from cudf._lib.cpp.replace cimport replace_policy from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table, table_view diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx index 416680aae24..378be978cc0 100644 --- a/python/cudf/cudf/_lib/join.pyx +++ b/python/cudf/cudf/_lib/join.pyx @@ -2,7 +2,7 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.pair cimport pair from libcpp.utility cimport move @@ -11,7 +11,6 @@ from rmm._lib.device_buffer cimport device_buffer cimport cudf._lib.cpp.join as cpp_join from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport data_type, size_type, type_id from cudf._lib.utils cimport table_view_from_columns diff --git a/python/cudf/cudf/_lib/null_mask.pyx b/python/cudf/cudf/_lib/null_mask.pyx index 5b4538629f6..1f98140d9e4 100644 --- a/python/cudf/cudf/_lib/null_mask.pyx +++ b/python/cudf/cudf/_lib/null_mask.pyx @@ -6,13 +6,12 @@ from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer from cudf.core.buffer import acquire_spill_lock, as_buffer -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.pair cimport pair from libcpp.utility cimport move from cudf._lib.column cimport Column from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.null_mask cimport ( bitmask_allocation_size_bytes as cpp_bitmask_allocation_size_bytes, bitmask_and as cpp_bitmask_and, diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index d8d363686cc..4acb1ce10b1 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -32,7 +32,7 @@ from cudf._lib.utils import _index_level_name, generate_pandas_metadata from libc.stdint cimport uint8_t from libcpp cimport bool from libcpp.map cimport map -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.string cimport string from libcpp.unordered_map cimport unordered_map from libcpp.utility cimport move @@ -52,7 +52,6 @@ from cudf._lib.cpp.io.parquet cimport ( write_parquet as parquet_writer, ) from cudf._lib.cpp.io.types cimport column_in_metadata, table_input_metadata -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport data_type, size_type from cudf._lib.io.datasource cimport NativeFileDatasource diff --git a/python/cudf/cudf/_lib/timezone.pyx 
b/python/cudf/cudf/_lib/timezone.pyx
index 4d76cbfcdb5..808d1321b0b 100644
--- a/python/cudf/cudf/_lib/timezone.pyx
+++ b/python/cudf/cudf/_lib/timezone.pyx
@@ -1,13 +1,13 @@
 # Copyright (c) 2023, NVIDIA CORPORATION.

 from libcpp.memory cimport unique_ptr
+from libcpp.optional cimport make_optional
 from libcpp.string cimport string
 from libcpp.utility cimport move

 from cudf._lib.cpp.io.timezone cimport (
     make_timezone_transition_table as cpp_make_timezone_transition_table,
 )
-from cudf._lib.cpp.libcpp.optional cimport make_optional
 from cudf._lib.cpp.table.table cimport table
 from cudf._lib.utils cimport columns_from_unique_ptr
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index 1c687269e55..b38970271d7 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 requires = [
     "cmake>=3.26.4",
-    "cython>=3.0.0",
+    "cython>=3.0.3",
    "ninja",
    "numpy>=1.21,<1.25",
    "protoc-wheel",
diff --git a/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx b/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx
index 4d732478723..2fbaacff7c6 100644
--- a/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx
+++ b/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx
@@ -3,12 +3,11 @@
 from libc.stdint cimport int32_t, int64_t
 from libcpp cimport bool, nullptr
 from libcpp.map cimport map
-from libcpp.memory cimport unique_ptr
+from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.string cimport string
 from libcpp.utility cimport move

 from cudf._lib.cpp.io.datasource cimport datasource
-from cudf._lib.cpp.libcpp.memory cimport make_unique

 from cudf_kafka._lib.kafka cimport kafka_consumer
diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml
index f5cbd480e9c..4829f06ab09 100644
--- a/python/cudf_kafka/pyproject.toml
+++ b/python/cudf_kafka/pyproject.toml
@@ -3,7 +3,7 @@

 [build-system]
 requires = [
-    "cython>=3.0.0",
+    "cython>=3.0.3",
    "numpy>=1.21,<1.25",
    "pyarrow==14.0.1.*",
    "setuptools",
From 04d13d81b0bb4c2b3db2bfc9d9e28432e0a73c44 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 13 Nov 2023 09:05:19 -0500
Subject: [PATCH 091/118] Normalizing offsets iterator (#14234)

Creates a normalizing offsets iterator that returns int64 values for either
int32 or int64 offsets column data.
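
A minimal usage sketch (not code from this PR; `row_sizes`, its parameters, and
the device lambda are illustrative) of reading offsets through the factory
added below without branching on the offset type:

```cuda
#include <cudf/column/column_view.hpp>
#include <cudf/detail/offsets_iterator_factory.cuh>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform.h>

// Sketch: write the byte-length of each string row into d_sizes; the
// offsetalator yields int64_t whether the offsets column is INT32 or INT64.
// Requires nvcc --extended-lambda for the device lambda.
void row_sizes(cudf::column_view const& offsets,  // offsets child column
               int64_t* d_sizes,                  // device buffer of num_rows entries
               cudf::size_type num_rows,
               rmm::cuda_stream_view stream)
{
  auto const itr = cudf::detail::offsetalator_factory::make_input_iterator(offsets);
  thrust::transform(rmm::exec_policy(stream),
                    thrust::counting_iterator<cudf::size_type>(0),
                    thrust::counting_iterator<cudf::size_type>(num_rows),
                    d_sizes,
                    [itr] __device__(cudf::size_type i) { return itr[i + 1] - itr[i]; });
}
```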
Depends on #14206 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Divye Gala (https://github.com/divyegala) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/14234 --- .../cudf/column/column_device_view.cuh | 8 +- cpp/include/cudf/detail/indexalator.cuh | 151 ++++++++++++++-- .../cudf/detail/normalizing_iterator.cuh | 160 +---------------- cpp/include/cudf/detail/offsets_iterator.cuh | 165 ++++++++++++++++++ .../cudf/detail/offsets_iterator_factory.cuh | 47 +++++ cpp/tests/CMakeLists.txt | 1 + cpp/tests/iterator/indexalator_test.cu | 37 ---- cpp/tests/iterator/offsetalator_test.cu | 140 +++++++++++++++ 8 files changed, 502 insertions(+), 207 deletions(-) create mode 100644 cpp/include/cudf/detail/offsets_iterator.cuh create mode 100644 cpp/include/cudf/detail/offsets_iterator_factory.cuh create mode 100644 cpp/tests/iterator/offsetalator_test.cu diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 35851a99822..b1ff0bbaea7 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include #include @@ -442,10 +443,11 @@ class alignas(16) column_device_view : public detail::column_device_view_base { __device__ T element(size_type element_index) const noexcept { size_type index = element_index + offset(); // account for this view's _offset - auto const* d_offsets = d_children[strings_column_view::offsets_column_index].data(); char const* d_strings = d_children[strings_column_view::chars_column_index].data(); - size_type offset = d_offsets[index]; - return string_view{d_strings + offset, d_offsets[index + 1] - offset}; + auto const offsets = d_children[strings_column_view::offsets_column_index]; + auto const itr = cudf::detail::input_offsetalator(offsets.head(), offsets.type()); + auto const offset = itr[index]; + return string_view{d_strings + offset, static_cast(itr[index + 1] - offset)}; } private: diff --git a/cpp/include/cudf/detail/indexalator.cuh b/cpp/include/cudf/detail/indexalator.cuh index 6532dae3695..4d261c54b29 100644 --- a/cpp/include/cudf/detail/indexalator.cuh +++ b/cpp/include/cudf/detail/indexalator.cuh @@ -56,10 +56,69 @@ namespace detail { * auto result = thrust::find(thrust::device, begin, end, size_type{12} ); * @endcode */ -using input_indexalator = input_normalator; +struct input_indexalator : base_normalator { + friend struct base_normalator; // for CRTP + + using reference = cudf::size_type const; // this keeps STL and thrust happy + + input_indexalator() = default; + input_indexalator(input_indexalator const&) = default; + input_indexalator(input_indexalator&&) = default; + input_indexalator& operator=(input_indexalator const&) = default; + input_indexalator& operator=(input_indexalator&&) = default; + + /** + * @brief Indirection operator returns the value at the current iterator position + */ + __device__ inline cudf::size_type operator*() const { return operator[](0); } + + /** + * @brief Dispatch functor for resolving a Integer value from any integer type + */ + struct normalize_type { + template ())> + __device__ cudf::size_type operator()(void const* tp) + { + return static_cast(*static_cast(tp)); + } + template ())> + __device__ cudf::size_type operator()(void const*) + { + CUDF_UNREACHABLE("only integral types are supported"); + } + }; + + /** + * @brief Array subscript operator returns a value 
at the input + * `idx` position as a `Integer` value. + */ + __device__ inline cudf::size_type operator[](size_type idx) const + { + void const* tp = p_ + (idx * this->width_); + return type_dispatcher(this->dtype_, normalize_type{}, tp); + } + + /** + * @brief Create an input index normalizing iterator + * + * Use the indexalator_factory to create an iterator instance. + * + * @param data Pointer to an integer array in device memory. + * @param dtype Type of data in data + * @param offset Applied to the data pointer per size of the type + */ + CUDF_HOST_DEVICE input_indexalator(void const* data, data_type dtype, cudf::size_type offset = 0) + : base_normalator(dtype), p_{static_cast(data)} + { + p_ += offset * this->width_; + } + + protected: + char const* p_; /// pointer to the integer data in device memory +}; /** - * @brief The index normalizing output iterator. + * @brief The index normalizing output iterator * * This is an iterator that can be used for index types (integers) without * requiring a type-specific instance. It can be used for any iterator @@ -82,7 +141,75 @@ using input_indexalator = input_normalator; * thrust::less()); * @endcode */ -using output_indexalator = output_normalator; +struct output_indexalator : base_normalator { + friend struct base_normalator; // for CRTP + + using reference = output_indexalator const&; // required for output iterators + + output_indexalator() = default; + output_indexalator(output_indexalator const&) = default; + output_indexalator(output_indexalator&&) = default; + output_indexalator& operator=(output_indexalator const&) = default; + output_indexalator& operator=(output_indexalator&&) = default; + + /** + * @brief Indirection operator returns this iterator instance in order + * to capture the `operator=(Integer)` calls. + */ + __device__ inline reference operator*() const { return *this; } + + /** + * @brief Array subscript operator returns an iterator instance at the specified `idx` position. + * + * This allows capturing the subsequent `operator=(Integer)` call in this class. + */ + __device__ inline output_indexalator const operator[](size_type idx) const + { + output_indexalator tmp{*this}; + tmp.p_ += (idx * this->width_); + return tmp; + } + + /** + * @brief Dispatch functor for setting the index value from a size_type value. + */ + struct normalize_type { + template ())> + __device__ void operator()(void* tp, cudf::size_type const value) + { + (*static_cast(tp)) = static_cast(value); + } + template ())> + __device__ void operator()(void*, cudf::size_type const) + { + CUDF_UNREACHABLE("only index types are supported"); + } + }; + + /** + * @brief Assign an Integer value to the current iterator position + */ + __device__ inline reference operator=(cudf::size_type const value) const + { + void* tp = p_; + type_dispatcher(this->dtype_, normalize_type{}, tp, value); + return *this; + } + + /** + * @brief Create an output normalizing iterator + * + * @param data Pointer to an integer array in device memory. + * @param dtype Type of data in data + */ + CUDF_HOST_DEVICE output_indexalator(void* data, data_type dtype) + : base_normalator(dtype), p_{static_cast(data)} + { + } + + protected: + char* p_; /// pointer to the integer data in device memory +}; /** * @brief Use this class to create an indexalator instance. @@ -92,14 +219,12 @@ struct indexalator_factory { * @brief A type_dispatcher functor to create an input iterator from an indices column. 
*/ struct input_indexalator_fn { - template ()>* = nullptr> + template ())> input_indexalator operator()(column_view const& indices) { return input_indexalator(indices.data(), indices.type()); } - template ()>* = nullptr> + template ())> input_indexalator operator()(Args&&... args) { CUDF_FAIL("indices must be an index type"); @@ -110,16 +235,14 @@ struct indexalator_factory { * @brief Use this class to create an indexalator to a scalar index. */ struct input_indexalator_scalar_fn { - template ()>* = nullptr> + template ())> input_indexalator operator()(scalar const& index) { // note: using static_cast const&>(index) creates a copy auto const scalar_impl = static_cast const*>(&index); return input_indexalator(scalar_impl->data(), index.type()); } - template ()>* = nullptr> + template ())> input_indexalator operator()(Args&&... args) { CUDF_FAIL("scalar must be an index type"); @@ -130,14 +253,12 @@ struct indexalator_factory { * @brief A type_dispatcher functor to create an output iterator from an indices column. */ struct output_indexalator_fn { - template ()>* = nullptr> + template ())> output_indexalator operator()(mutable_column_view const& indices) { return output_indexalator(indices.data(), indices.type()); } - template ()>* = nullptr> + template ())> output_indexalator operator()(Args&&... args) { CUDF_FAIL("indices must be an index type"); diff --git a/cpp/include/cudf/detail/normalizing_iterator.cuh b/cpp/include/cudf/detail/normalizing_iterator.cuh index 35a695d47df..8f90afc3e57 100644 --- a/cpp/include/cudf/detail/normalizing_iterator.cuh +++ b/cpp/include/cudf/detail/normalizing_iterator.cuh @@ -33,7 +33,7 @@ namespace detail { * @tparam Integer The type the iterator normalizes to */ template -struct base_normalator { +struct alignas(16) base_normalator { static_assert(cudf::is_index_type()); using difference_type = std::ptrdiff_t; using value_type = Integer; @@ -204,7 +204,7 @@ struct base_normalator { private: struct integer_sizeof_fn { - template ()>* = nullptr> + template ())> CUDF_HOST_DEVICE constexpr std::size_t operator()() const { #ifndef __CUDA_ARCH__ @@ -213,7 +213,7 @@ struct base_normalator { CUDF_UNREACHABLE("only integral types are supported"); #endif } - template ()>* = nullptr> + template ())> CUDF_HOST_DEVICE constexpr std::size_t operator()() const noexcept { return sizeof(T); @@ -229,160 +229,16 @@ struct base_normalator { width_ = static_cast(type_dispatcher(dtype, integer_sizeof_fn{})); } - int32_t width_; /// integer type width = 1,2,4, or 8 - data_type dtype_; /// for type-dispatcher calls -}; - -/** - * @brief The integer normalizing input iterator - * - * This is an iterator that can be used for index types (integers) without - * requiring a type-specific instance. It can be used for any iterator - * interface for reading an array of integer values of type - * int8, int16, int32, int64, uint8, uint16, uint32, or uint64. 
- * Reading specific elements always return a type of `Integer` - * - * @tparam Integer Type returned by all read functions - */ -template -struct input_normalator : base_normalator, Integer> { - friend struct base_normalator, Integer>; // for CRTP - - using reference = Integer const; // this keeps STL and thrust happy - - input_normalator() = default; - input_normalator(input_normalator const&) = default; - input_normalator(input_normalator&&) = default; - input_normalator& operator=(input_normalator const&) = default; - input_normalator& operator=(input_normalator&&) = default; - - /** - * @brief Indirection operator returns the value at the current iterator position - */ - __device__ inline Integer operator*() const { return operator[](0); } - - /** - * @brief Dispatch functor for resolving a Integer value from any integer type - */ - struct normalize_type { - template ()>* = nullptr> - __device__ Integer operator()(void const* tp) - { - return static_cast(*static_cast(tp)); - } - template ()>* = nullptr> - __device__ Integer operator()(void const*) - { - CUDF_UNREACHABLE("only integral types are supported"); - } - }; - /** - * @brief Array subscript operator returns a value at the input - * `idx` position as a `Integer` value. - */ - __device__ inline Integer operator[](size_type idx) const - { - void const* tp = p_ + (idx * this->width_); - return type_dispatcher(this->dtype_, normalize_type{}, tp); - } - - /** - * @brief Create an input index normalizing iterator. - * - * Use the indexalator_factory to create an iterator instance. - * - * @param data Pointer to an integer array in device memory. - * @param data_type Type of data in data - */ - CUDF_HOST_DEVICE input_normalator(void const* data, data_type dtype, cudf::size_type offset = 0) - : base_normalator, Integer>(dtype), p_{static_cast(data)} - { - p_ += offset * this->width_; - } - - char const* p_; /// pointer to the integer data in device memory -}; - -/** - * @brief The integer normalizing output iterator - * - * This is an iterator that can be used for index types (integers) without - * requiring a type-specific instance. It can be used for any iterator - * interface for writing an array of integer values of type - * int8, int16, int32, int64, uint8, uint16, uint32, or uint64. - * Setting specific elements always accept the `Integer` type values. - * - * @tparam Integer The type used for all write functions - */ -template -struct output_normalator : base_normalator, Integer> { - friend struct base_normalator, Integer>; // for CRTP - - using reference = output_normalator const&; // required for output iterators - - output_normalator() = default; - output_normalator(output_normalator const&) = default; - output_normalator(output_normalator&&) = default; - output_normalator& operator=(output_normalator const&) = default; - output_normalator& operator=(output_normalator&&) = default; - - /** - * @brief Indirection operator returns this iterator instance in order - * to capture the `operator=(Integer)` calls. - */ - __device__ inline output_normalator const& operator*() const { return *this; } - - /** - * @brief Array subscript operator returns an iterator instance at the specified `idx` position. - * - * This allows capturing the subsequent `operator=(Integer)` call in this class. 
- */ - __device__ inline output_normalator const operator[](size_type idx) const - { - output_normalator tmp{*this}; - tmp.p_ += (idx * this->width_); - return tmp; - } - - /** - * @brief Dispatch functor for setting the index value from a size_type value. - */ - struct normalize_type { - template ()>* = nullptr> - __device__ void operator()(void* tp, Integer const value) - { - (*static_cast(tp)) = static_cast(value); - } - template ()>* = nullptr> - __device__ void operator()(void*, Integer const) - { - CUDF_UNREACHABLE("only index types are supported"); - } - }; - - /** - * @brief Assign an Integer value to the current iterator position - */ - __device__ inline output_normalator const& operator=(Integer const value) const - { - void* tp = p_; - type_dispatcher(this->dtype_, normalize_type{}, tp, value); - return *this; - } - - /** - * @brief Create an output normalizing iterator - * - * @param data Pointer to an integer array in device memory. - * @param data_type Type of data in data + * @brief Constructor assigns width and type member variables for base class. */ - CUDF_HOST_DEVICE output_normalator(void* data, data_type dtype) - : base_normalator, Integer>(dtype), p_{static_cast(data)} + explicit CUDF_HOST_DEVICE base_normalator(data_type dtype, int32_t width) + : width_(width), dtype_(dtype) { } - char* p_; /// pointer to the integer data in device memory + int32_t width_; /// integer type width = 1,2,4, or 8 + data_type dtype_; /// for type-dispatcher calls }; } // namespace detail diff --git a/cpp/include/cudf/detail/offsets_iterator.cuh b/cpp/include/cudf/detail/offsets_iterator.cuh new file mode 100644 index 00000000000..3eb77b32353 --- /dev/null +++ b/cpp/include/cudf/detail/offsets_iterator.cuh @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cudf { +namespace detail { + +/** + * @brief The offsets normalizing input iterator + * + * This is an iterator that can be used for offsets where the underlying + * type may be int32_t or int64_t. + * + * Use the offsetalator_factory to create an appropriate input iterator + * from an offsets column_view. + */ +struct input_offsetalator : base_normalator { + friend struct base_normalator; // for CRTP + + using reference = int64_t const; // this keeps STL and thrust happy + + input_offsetalator() = default; + input_offsetalator(input_offsetalator const&) = default; + input_offsetalator(input_offsetalator&&) = default; + input_offsetalator& operator=(input_offsetalator const&) = default; + input_offsetalator& operator=(input_offsetalator&&) = default; + + /** + * @brief Indirection operator returns the value at the current iterator position + */ + __device__ inline int64_t operator*() const { return operator[](0); } + + /** + * @brief Array subscript operator returns a value at the input + * `idx` position as a int64_t value. 
diff --git a/cpp/include/cudf/detail/offsets_iterator.cuh b/cpp/include/cudf/detail/offsets_iterator.cuh
new file mode 100644
index 00000000000..3eb77b32353
--- /dev/null
+++ b/cpp/include/cudf/detail/offsets_iterator.cuh
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/detail/normalizing_iterator.cuh>
+#include <cudf/types.hpp>
+
+namespace cudf {
+namespace detail {
+
+/**
+ * @brief The offsets normalizing input iterator
+ *
+ * This is an iterator that can be used for offsets where the underlying
+ * type may be int32_t or int64_t.
+ *
+ * Use the offsetalator_factory to create an appropriate input iterator
+ * from an offsets column_view.
+ */
+struct input_offsetalator : base_normalator<input_offsetalator, int64_t> {
+  friend struct base_normalator<input_offsetalator, int64_t>;  // for CRTP
+
+  using reference = int64_t const;  // this keeps STL and thrust happy
+
+  input_offsetalator()                                     = default;
+  input_offsetalator(input_offsetalator const&)            = default;
+  input_offsetalator(input_offsetalator&&)                 = default;
+  input_offsetalator& operator=(input_offsetalator const&) = default;
+  input_offsetalator& operator=(input_offsetalator&&)      = default;
+
+  /**
+   * @brief Indirection operator returns the value at the current iterator position
+   */
+  __device__ inline int64_t operator*() const { return operator[](0); }
+
+  /**
+   * @brief Array subscript operator returns a value at the input
+   * `idx` position as an int64_t value.
+   */
+  __device__ inline int64_t operator[](size_type idx) const
+  {
+    void const* tp = p_ + (idx * this->width_);
+    return this->width_ == sizeof(int32_t) ? static_cast<int64_t>(*static_cast<int32_t const*>(tp))
+                                           : *static_cast<int64_t const*>(tp);
+  }
+
+  /**
+   * @brief Create an input offsets normalizing iterator.
+   *
+   * Use the offsetalator_factory to create an iterator instance.
+   *
+   * @param data Pointer to an integer array in device memory.
+   * @param dtype Type of data in data
+   */
+  CUDF_HOST_DEVICE input_offsetalator(void const* data, data_type dtype)
+    : base_normalator<input_offsetalator, int64_t>(
+        dtype, dtype.id() == type_id::INT32 ? sizeof(int32_t) : sizeof(int64_t)),
+      p_{static_cast<char const*>(data)}
+  {
+#ifndef __CUDA_ARCH__
+    CUDF_EXPECTS(dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64,
+                 "Unexpected offsets type");
+#else
+    cudf_assert((dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64) &&
+                "Unexpected offsets type");
+#endif
+  }
+
+ protected:
+  char const* p_;  /// pointer to the integer data in device memory
+};
+
+/**
+ * @brief The offsets normalizing output iterator
+ *
+ * This is an iterator that can be used for storing offsets values
+ * where the underlying type may be either int32_t or int64_t.
+ *
+ * Use the offsetalator_factory to create an appropriate output iterator
+ * from a mutable_column_view.
+ *
+ */
+struct output_offsetalator : base_normalator<output_offsetalator, int64_t> {
+  friend struct base_normalator<output_offsetalator, int64_t>;  // for CRTP
+
+  using reference = output_offsetalator const&;  // required for output iterators
+
+  output_offsetalator()                                      = default;
+  output_offsetalator(output_offsetalator const&)            = default;
+  output_offsetalator(output_offsetalator&&)                 = default;
+  output_offsetalator& operator=(output_offsetalator const&) = default;
+  output_offsetalator& operator=(output_offsetalator&&)      = default;
+
+  /**
+   * @brief Indirection operator returns this iterator instance in order
+   * to capture the `operator=(int64)` calls.
+   */
+  __device__ inline output_offsetalator const& operator*() const { return *this; }
+
+  /**
+   * @brief Array subscript operator returns an iterator instance at the specified `idx` position.
+   *
+   * This allows capturing the subsequent `operator=(int64)` call in this class.
+   */
+  __device__ inline output_offsetalator const operator[](size_type idx) const
+  {
+    output_offsetalator tmp{*this};
+    tmp.p_ += (idx * this->width_);
+    return tmp;
+  }
+
+  /**
+   * @brief Assign an offset value to the current iterator position
+   */
+  __device__ inline output_offsetalator const& operator=(int64_t const value) const
+  {
+    void* tp = p_;
+    if (this->width_ == sizeof(int32_t)) {
+      (*static_cast<int32_t*>(tp)) = static_cast<int32_t>(value);
+    } else {
+      (*static_cast<int64_t*>(tp)) = value;
+    }
+    return *this;
+  }
+
+  /**
+   * @brief Create an output offsets iterator
+   *
+   * @param data Pointer to an integer array in device memory.
+   * @param dtype Type of data in data
+   */
+  CUDF_HOST_DEVICE output_offsetalator(void* data, data_type dtype)
+    : base_normalator<output_offsetalator, int64_t>(
+        dtype, dtype.id() == type_id::INT32 ? sizeof(int32_t) : sizeof(int64_t)),
+      p_{static_cast<char*>(data)}
+  {
+#ifndef __CUDA_ARCH__
+    CUDF_EXPECTS(dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64,
+                 "Unexpected offsets type");
+#else
+    cudf_assert((dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64) &&
+                "Unexpected offsets type");
+#endif
+  }
+
+ protected:
+  char* p_;  /// pointer to the integer data in device memory
+};
+
+}  // namespace detail
+}  // namespace cudf
diff --git a/cpp/include/cudf/detail/offsets_iterator_factory.cuh b/cpp/include/cudf/detail/offsets_iterator_factory.cuh
new file mode 100644
index 00000000000..5b4c6b825d2
--- /dev/null
+++ b/cpp/include/cudf/detail/offsets_iterator_factory.cuh
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/column/column_view.hpp>
+#include <cudf/detail/offsets_iterator.cuh>
+
+namespace cudf {
+namespace detail {
+
+/**
+ * @brief Use this class to create an offsetalator instance.
+ */
+struct offsetalator_factory {
+  /**
+   * @brief Create an input offsetalator instance from an offsets column
+   */
+  static input_offsetalator make_input_iterator(column_view const& offsets)
+  {
+    return input_offsetalator(offsets.head(), offsets.type());
+  }
+
+  /**
+   * @brief Create an output offsetalator instance from an offsets column
+   */
+  static output_offsetalator make_output_iterator(mutable_column_view const& offsets)
+  {
+    return output_offsetalator(offsets.head(), offsets.type());
+  }
+};
+
+}  // namespace detail
+}  // namespace cudf
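
With the factory in place, callers can walk an offsets column without caring whether it stores INT32 or INT64. A hedged usage sketch (assumes a CUDA translation unit with cudf detail headers; `copy_offsets` is a hypothetical helper, not a cudf API — the iterators themselves dereference on the device, so they are handed to device algorithms):

```
#include <cudf/column/column_view.hpp>
#include <cudf/detail/offsets_iterator_factory.cuh>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/copy.h>

// Copy one offsets column into another, normalizing through int64_t; the
// source and destination may have different physical widths.
void copy_offsets(cudf::column_view const& src,
                  cudf::mutable_column_view const& dst,
                  rmm::cuda_stream_view stream)
{
  auto const in = cudf::detail::offsetalator_factory::make_input_iterator(src);
  auto out      = cudf::detail::offsetalator_factory::make_output_iterator(dst);
  thrust::copy_n(rmm::exec_policy(stream), in, src.size(), out);
}
```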
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index b0382d15807..7b628649051 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -393,6 +393,7 @@ set_tests_properties(
 ConfigureTest(
   ITERATOR_TEST
   iterator/indexalator_test.cu
+  iterator/offsetalator_test.cu
   iterator/optional_iterator_test_chrono.cu
   iterator/optional_iterator_test_numeric.cu
   iterator/pair_iterator_test_chrono.cu
diff --git a/cpp/tests/iterator/indexalator_test.cu b/cpp/tests/iterator/indexalator_test.cu
index 3e8bcd5cb0d..0c10853ec02 100644
--- a/cpp/tests/iterator/indexalator_test.cu
+++ b/cpp/tests/iterator/indexalator_test.cu
@@ -157,40 +157,3 @@ TYPED_TEST(IndexalatorTest, output_iterator)
   expected = cudf::test::fixed_width_column_wrapper<T>({0, 1, 1, 2, 3, 4, 5, 5, 7});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected);
 }
-
-/**
- * For testing creating and using the indexalator in device code.
- */
-struct device_functor_fn {
-  cudf::column_device_view const d_col;
-  __device__ cudf::size_type operator()(cudf::size_type idx)
-  {
-    auto itr = cudf::detail::input_indexalator(d_col.head(), d_col.type());
-    return itr[idx] * 3;
-  }
-};
-
-TYPED_TEST(IndexalatorTest, device_indexalator)
-{
-  using T = TypeParam;
-
-  auto d_col1 =
-    cudf::test::fixed_width_column_wrapper<T, int64_t>({0, 6, 7, 14, 23, 33, 43, 45, 63});
-  auto d_col2 =
-    cudf::test::fixed_width_column_wrapper<T, int64_t>({0, 0, 0, 0, 0, 0, 0, 0, 0});
-  auto input  = cudf::column_view(d_col1);
-  auto output = cudf::mutable_column_view(d_col2);
-  auto stream = cudf::get_default_stream();
-
-  auto d_input = cudf::column_device_view::create(input, stream);
-
-  thrust::transform(rmm::exec_policy(stream),
-                    thrust::counting_iterator<cudf::size_type>(0),
-                    thrust::counting_iterator<cudf::size_type>(input.size()),
-                    output.begin<T>(),
-                    device_functor_fn{*d_input});
-
-  auto expected =
-    cudf::test::fixed_width_column_wrapper<T, int64_t>({0, 18, 21, 42, 69, 99, 129, 135, 189});
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected);
-}
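
The deleted test above exercised constructing an indexalator directly in device code; the same pattern reappears for the offsetalator in the new test file below. A hedged sketch of that device-side construction (a hypothetical functor, not part of cudf, but built from the same `input_indexalator(head, type)` call the removed test used):

```
#include <cudf/column/column_device_view.cuh>
#include <cudf/detail/indexalator.cuh>

// A device functor that builds the indexalator per-call; whatever the
// column's physical index type, itr[idx] reads as cudf::size_type.
struct times_two_fn {
  cudf::column_device_view d_col;
  __device__ cudf::size_type operator()(cudf::size_type idx) const
  {
    auto const itr = cudf::detail::input_indexalator(d_col.head(), d_col.type());
    return itr[idx] * 2;
  }
};
```

In practice such a functor is handed to `thrust::transform` with a counting iterator, exactly as the relocated test did.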
diff --git a/cpp/tests/iterator/offsetalator_test.cu b/cpp/tests/iterator/offsetalator_test.cu
new file mode 100644
index 00000000000..e569e58f42a
--- /dev/null
+++ b/cpp/tests/iterator/offsetalator_test.cu
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+#include <tests/iterator/iterator_tests.cuh>
+
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/column_wrapper.hpp>
+
+#include <cudf/detail/offsets_iterator_factory.cuh>
+
+#include <thrust/fill.h>
+#include <thrust/gather.h>
+#include <thrust/host_vector.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/lower_bound.h>
+#include <thrust/scatter.h>
+#include <thrust/sequence.h>
+#include <thrust/transform.h>
+
+using TestingTypes = cudf::test::Types<int32_t, int64_t>;
+
+template <typename T>
+struct OffsetalatorTest : public IteratorTest<T> {};
+
+TYPED_TEST_SUITE(OffsetalatorTest, TestingTypes);
+
+TYPED_TEST(OffsetalatorTest, input_iterator)
+{
+  using T = TypeParam;
+
+  auto host_values = cudf::test::make_type_param_vector<T>({0, 6, 0, -14, 13, 64, -13, -20, 45});
+
+  auto d_col = cudf::test::fixed_width_column_wrapper<T>(host_values.begin(), host_values.end());
+
+  auto expected_values = thrust::host_vector<int64_t>(host_values.size());
+  std::transform(host_values.begin(), host_values.end(), expected_values.begin(), [](auto v) {
+    return static_cast<int64_t>(v);
+  });
+
+  auto it_dev = cudf::detail::offsetalator_factory::make_input_iterator(d_col);
+  this->iterator_test_thrust(expected_values, it_dev, host_values.size());
+}
+
+TYPED_TEST(OffsetalatorTest, output_iterator)
+{
+  using T = TypeParam;
+
+  auto d_col1 = cudf::test::fixed_width_column_wrapper<T>({0, 6, 7, 14, 23, 33, 43, 45, 63});
+  auto d_col2 = cudf::test::fixed_width_column_wrapper<T>({0, 0, 0, 0, 0, 0, 0, 0, 0});
+  auto itr    = cudf::detail::offsetalator_factory::make_output_iterator(d_col2);
+  auto input  = cudf::column_view(d_col1);
+  auto stream = cudf::get_default_stream();
+
+  auto map   = cudf::test::fixed_width_column_wrapper<int32_t>({0, 2, 4, 6, 8, 1, 3, 5, 7});
+  auto d_map = cudf::column_view(map);
+  thrust::gather(rmm::exec_policy_nosync(stream),
+                 d_map.begin<int32_t>(),
+                 d_map.end<int32_t>(),
+                 input.begin<T>(),
+                 itr);
+  auto expected = cudf::test::fixed_width_column_wrapper<T>({0, 7, 23, 43, 63, 6, 14, 33, 45});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected);
+
+  thrust::scatter(rmm::exec_policy_nosync(stream),
+                  input.begin<T>(),
+                  input.end<T>(),
+                  d_map.begin<int32_t>(),
+                  itr);
+  expected = cudf::test::fixed_width_column_wrapper<T>({0, 33, 6, 43, 7, 45, 14, 63, 23});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected);
+
+  thrust::fill(rmm::exec_policy(stream), itr, itr + input.size(), 77);
+  expected = cudf::test::fixed_width_column_wrapper<T>({77, 77, 77, 77, 77, 77, 77, 77, 77});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected);
+
+  thrust::sequence(rmm::exec_policy(stream), itr, itr + input.size());
+  expected = cudf::test::fixed_width_column_wrapper<T>({0, 1, 2, 3, 4, 5, 6, 7, 8});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected);
+
+  auto offsets =
+    cudf::test::fixed_width_column_wrapper<T>({0, 10, 20, 30, 40, 50, 60, 70, 80});
+  auto d_offsets = cudf::column_view(offsets);
+  thrust::lower_bound(rmm::exec_policy(stream),
+                      d_offsets.begin<T>(),
+                      d_offsets.end<T>(),
+                      input.begin<T>(),
+                      input.end<T>(),
+                      itr);
+  expected = cudf::test::fixed_width_column_wrapper<T>({0, 1, 1, 2, 3, 4, 5, 5, 7});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected);
+}
+
+namespace {
+/**
+ * For testing creating and using the offsetalator in device code.
+ */
+struct device_functor_fn {
+  cudf::column_device_view const d_col;
+  __device__ int32_t operator()(int idx)
+  {
+    auto const itr = cudf::detail::input_offsetalator(d_col.head(), d_col.type());
+    return static_cast<int32_t>(itr[idx] * 3);
+  }
+};
+}  // namespace
+
+TYPED_TEST(OffsetalatorTest, device_offsetalator)
+{
+  using T = TypeParam;
+
+  auto d_col1 = cudf::test::fixed_width_column_wrapper<T>({0, 6, 7, 14, 23, 33, 43, 45, 63});
+  auto d_col2 = cudf::test::fixed_width_column_wrapper<int32_t>({0, 0, 0, 0, 0, 0, 0, 0, 0});
+  auto input  = cudf::column_view(d_col1);
+  auto output = cudf::mutable_column_view(d_col2);
+  auto stream = cudf::get_default_stream();
+
+  auto d_input = cudf::column_device_view::create(input, stream);
+
+  thrust::transform(rmm::exec_policy(stream),
+                    thrust::counting_iterator<int>(0),
+                    thrust::counting_iterator<int>(input.size()),
+                    output.begin<int32_t>(),
+                    device_functor_fn{*d_input});
+
+  auto expected =
+    cudf::test::fixed_width_column_wrapper<int32_t>({0, 18, 21, 42, 69, 99, 129, 135, 189});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected);
+}

From 4313cfa9b3fcff41f67b48ac8797dc015d441ecc Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Mon, 13 Nov 2023 11:40:36 -0800
Subject: [PATCH 092/118] Use new rapids-dask-dependency metapackage for
 managing dask versions (#14364)

* Update dependency lists
* Update wheel building to stop needing manual installations
* Update wheel dependency with alpha spec
* Rename the package
* Update update-version.sh
* Update conda/recipes/dask-cudf/meta.yaml

Co-authored-by: GALI PREM SAGAR

* Make pip/conda dependencies consistent and fix recipe
* dfg
* Apply suggestions from code review

---------

Co-authored-by: GALI PREM SAGAR
---
 ci/build_wheel.sh                                |  2 ++
 ci/release/update-version.sh                     |  1 +
 ci/test_wheel_dask_cudf.sh                       |  3 ---
 conda/environments/all_cuda-118_arch-x86_64.yaml |  4 +---
 conda/environments/all_cuda-120_arch-x86_64.yaml |  4 +---
 conda/recipes/dask-cudf/meta.yaml                |  8 +-------
 dependencies.yaml                                |  4 +---
 python/dask_cudf/pyproject.toml                  |  3 +--
 8 files changed, 8 insertions(+), 21 deletions(-)

diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
index 08716cdb3d9..ae1d9c3fb1a 100755
--- a/ci/build_wheel.sh
+++ b/ci/build_wheel.sh
@@ -36,6 +36,8 @@ fi

 if [[ ${package_name} == "dask_cudf" ]]; then
     sed -r -i "s/cudf==(.*)\"/cudf${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file}
+    sed -r -i "s/dask-cuda==(.*)\"/dask-cuda==\1${alpha_spec}\"/g" ${pyproject_file}
+    sed -r -i "s/rapids-dask-dependency==(.*)\"/rapids-dask-dependency==\1${alpha_spec}\"/g" ${pyproject_file}
 else
     sed -r -i "s/rmm(.*)\"/rmm${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" ${pyproject_file}
     # ptxcompiler and cubinlinker aren't version constrained
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 7574b4174e9..843abd3c3c1 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -80,6 +80,7 @@ DEPENDENCIES=(
   kvikio
   libkvikio
   librmm
+  rapids-dask-dependency
   rmm
 )
 for DEP in "${DEPENDENCIES[@]}"; do
diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh
index 118bea753d0..e9162b816aa 100755
--- a/ci/test_wheel_dask_cudf.sh
+++ b/ci/test_wheel_dask_cudf.sh
@@ -23,9 +23,6 @@ manylinux="manylinux_${manylinux_version}"
 RAPIDS_PY_WHEEL_NAME="cudf_${manylinux}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep
 python -m pip install --no-deps ./local-cudf-dep/cudf*.whl

-# Always install latest dask for testing
-python -m pip install git+https://github.com/dask/dask.git@main
git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.12 - # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install $(echo ./dist/dask_cudf*.whl)[test] diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index adf4fcad32d..9b85888a7b3 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -25,10 +25,7 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-core>=2023.9.2 - dask-cuda==23.12.* -- dask>=2023.9.2 -- distributed>=2023.9.2 - dlpack>=0.5,<0.6.0a0 - doxygen=1.9.1 - fastavro>=0.22.9 @@ -80,6 +77,7 @@ dependencies: - python-snappy>=0.6.0 - python>=3.9,<3.11 - pytorch<1.12.0 +- rapids-dask-dependency==23.12.* - rich - rmm==23.12.* - s3fs>=2022.3.0 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index a69ef587570..da2b4e109b3 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -27,10 +27,7 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-core>=2023.9.2 - dask-cuda==23.12.* -- dask>=2023.9.2 -- distributed>=2023.9.2 - dlpack>=0.5,<0.6.0a0 - doxygen=1.9.1 - fastavro>=0.22.9 @@ -78,6 +75,7 @@ dependencies: - python-snappy>=0.6.0 - python>=3.9,<3.11 - pytorch<1.12.0 +- rapids-dask-dependency==23.12.* - rich - rmm==23.12.* - s3fs>=2022.3.0 diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index 9dc9f76d9f5..16638926492 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -37,17 +37,11 @@ build: requirements: host: - python - - cudf ={{ version }} - - dask >=2023.9.2 - - dask-core >=2023.9.2 - - distributed >=2023.9.2 - cuda-version ={{ cuda_version }} run: - python - cudf ={{ version }} - - dask >=2023.9.2 - - dask-core >=2023.9.2 - - distributed >=2023.9.2 + - rapids-dask-dependency ={{ minor_version }} - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} test: diff --git a/dependencies.yaml b/dependencies.yaml index a16b51f4483..b971a682571 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -500,12 +500,10 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - dask>=2023.9.2 - - distributed>=2023.9.2 + - rapids-dask-dependency==23.12.* - output_types: conda packages: - cupy>=12.0.0 - - dask-core>=2023.9.2 # dask-core in conda is the actual package & dask is the meta package - output_types: pyproject packages: - &cudf cudf==23.12.* diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 32c7bb9fd15..0306da3de46 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -20,11 +20,10 @@ requires-python = ">=3.9" dependencies = [ "cudf==23.12.*", "cupy-cuda11x>=12.0.0", - "dask>=2023.9.2", - "distributed>=2023.9.2", "fsspec>=0.6.0", "numpy>=1.21,<1.25", "pandas>=1.3,<1.6.0dev0", + "rapids-dask-dependency==23.12.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [ "Intended Audience :: Developers", From 5d09d38bc8ea44e1bdf1fa29e11a820c7417bac5 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 14 Nov 2023 00:51:42 -0500 Subject: [PATCH 093/118] Always build nvbench statically so we don't need to package it (#14399) Corrects failures seen in C++ CI where libnvbench.so can't be found Authors: - Robert Maynard (https://github.com/robertmaynard) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14399 --- cpp/cmake/thirdparty/get_nvbench.cmake | 2 +- cpp/cmake/thirdparty/patches/nvbench_override.json | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/cpp/cmake/thirdparty/get_nvbench.cmake b/cpp/cmake/thirdparty/get_nvbench.cmake index f0642145fa0..bbd22693ba4 100644 --- a/cpp/cmake/thirdparty/get_nvbench.cmake +++ b/cpp/cmake/thirdparty/get_nvbench.cmake @@ -21,7 +21,7 @@ function(find_and_configure_nvbench) set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches") rapids_cpm_package_override("${cudf_patch_dir}/nvbench_override.json") - rapids_cpm_nvbench() + rapids_cpm_nvbench(BUILD_STATIC) endfunction() diff --git a/cpp/cmake/thirdparty/patches/nvbench_override.json b/cpp/cmake/thirdparty/patches/nvbench_override.json index 7be868081b6..ad9b19c29c1 100644 --- a/cpp/cmake/thirdparty/patches/nvbench_override.json +++ b/cpp/cmake/thirdparty/patches/nvbench_override.json @@ -7,11 +7,6 @@ "file" : "${current_json_dir}/nvbench_global_setup.diff", "issue" : "Fix add support for global setup to initialize RMM in nvbench [https://github.com/NVIDIA/nvbench/pull/123]", "fixed_in" : "" - }, - { - "file" : "nvbench/use_existing_fmt.diff", - "issue" : "Fix add support for using an existing fmt [https://github.com/NVIDIA/nvbench/pull/125]", - "fixed_in" : "" } ] } From e982d3736f095e680298af85bde732d9b5a73122 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Tue, 14 Nov 2023 09:51:02 -0500 Subject: [PATCH 094/118] cudf.pandas: cuDF subpath checking in module `__getattr__` (#14388) Closes https://github.com/rapidsai/cudf/issues/14384. `x.startswith(y)` is not a good enough check for if `x` is a subdirectory of `y`. It causes `pandasai` to be reported as a sub-package of `pandas`. Authors: - Ashwin Srinath (https://github.com/shwina) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/14388 --- python/cudf/cudf/pandas/module_accelerator.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/pandas/module_accelerator.py b/python/cudf/cudf/pandas/module_accelerator.py index eb35c4adaaf..180d75d96e8 100644 --- a/python/cudf/cudf/pandas/module_accelerator.py +++ b/python/cudf/cudf/pandas/module_accelerator.py @@ -10,6 +10,7 @@ import importlib.abc import importlib.machinery import os +import pathlib import sys import threading import warnings @@ -554,9 +555,10 @@ def getattr_real_or_wrapped( frame = sys._getframe() # We cannot possibly be at the top level. 
assert frame.f_back - calling_module = frame.f_back.f_code.co_filename + calling_module = pathlib.PurePath(frame.f_back.f_code.co_filename) use_real = any( - calling_module.startswith(path) for path in loader._denylist + calling_module.is_relative_to(path) + for path in loader._denylist ) try: if use_real: From 7f3fba164c4dd28c701ea2941d0525fc782a639c Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Tue, 14 Nov 2023 12:02:10 -0500 Subject: [PATCH 095/118] Refactor cudf_kafka to use skbuild (#14292) Refactor the currently outdated cudf_kafka build setup to use skbuild instead. Authors: - Jeremy Dyer (https://github.com/jdye64) - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/14292 --- build.sh | 2 +- ci/release/update-version.sh | 1 + .../all_cuda-120_arch-x86_64.yaml | 1 - conda/recipes/cudf_kafka/build.sh | 13 --- .../cudf_kafka/conda_build_config.yaml | 6 ++ conda/recipes/cudf_kafka/meta.yaml | 21 ++-- cpp/libcudf_kafka/CMakeLists.txt | 8 +- .../cmake/thirdparty/get_cudf.cmake | 16 +-- cpp/libcudf_kafka/tests/CMakeLists.txt | 2 +- dependencies.yaml | 13 +-- python/cudf/cudf/_lib/CMakeLists.txt | 6 -- python/cudf_kafka/CMakeLists.txt | 47 +++++++++ python/cudf_kafka/LICENSE | 1 + python/cudf_kafka/README.md | 1 + .../cudf_kafka/cudf_kafka/_lib/CMakeLists.txt | 62 ++++++++++++ python/cudf_kafka/cudf_kafka/_lib/kafka.pxd | 4 +- python/cudf_kafka/pyproject.toml | 1 + python/cudf_kafka/setup.py | 97 ++----------------- 18 files changed, 160 insertions(+), 142 deletions(-) create mode 100644 python/cudf_kafka/CMakeLists.txt create mode 120000 python/cudf_kafka/LICENSE create mode 120000 python/cudf_kafka/README.md create mode 100644 python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt diff --git a/build.sh b/build.sh index 2ad69712e5d..e5beb51dedf 100755 --- a/build.sh +++ b/build.sh @@ -369,7 +369,7 @@ fi # build cudf_kafka Python package if hasArg cudf_kafka; then cd ${REPODIR}/python/cudf_kafka - SKBUILD_CONFIGURE_OPTIONS="-DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR}" \ + SKBUILD_CONFIGURE_OPTIONS="-DCMAKE_PREFIX_PATH=${INSTALL_PREFIX} -DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR} ${EXTRA_CMAKE_ARGS}" \ SKBUILD_BUILD_OPTIONS="-j${PARALLEL_LEVEL:-1}" \ python -m pip install --no-build-isolation --no-deps . 
fi diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 843abd3c3c1..4f1cbc47d1d 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -43,6 +43,7 @@ sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' # Python CMakeLists updates sed_runner 's/'"cudf_version .*)"'/'"cudf_version ${NEXT_FULL_TAG})"'/g' python/cudf/CMakeLists.txt +sed_runner 's/'"cudf_kafka_version .*)"'/'"cudf_kafka_version ${NEXT_FULL_TAG})"'/g' python/cudf_kafka/CMakeLists.txt # cpp libcudf_kafka update sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' cpp/libcudf_kafka/CMakeLists.txt diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index da2b4e109b3..a3eeb3dd99f 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -17,7 +17,6 @@ dependencies: - cmake>=3.26.4 - cramjam - cuda-cudart-dev -- cuda-gdb - cuda-nvcc - cuda-nvrtc-dev - cuda-nvtx-dev diff --git a/conda/recipes/cudf_kafka/build.sh b/conda/recipes/cudf_kafka/build.sh index f4bb6e1bc91..9458349d101 100644 --- a/conda/recipes/cudf_kafka/build.sh +++ b/conda/recipes/cudf_kafka/build.sh @@ -1,16 +1,3 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. -# This assumes the script is executed from the root of the repo directory -# Need to set CUDA_HOME inside conda environments because the hacked together -# setup.py for cudf-kafka searches that way. -# TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates -# cudf_kafka to use scikit-build -CUDA_MAJOR=${RAPIDS_CUDA_VERSION%%.*} -if [[ ${CUDA_MAJOR} == "12" ]]; then - target_name="x86_64-linux" - if [[ ! $(arch) == "x86_64" ]]; then - target_name="sbsa-linux" - fi - export CUDA_HOME="${PREFIX}/targets/${target_name}/" -fi ./build.sh -v cudf_kafka diff --git a/conda/recipes/cudf_kafka/conda_build_config.yaml b/conda/recipes/cudf_kafka/conda_build_config.yaml index b63a136ad2d..c98c2701653 100644 --- a/conda/recipes/cudf_kafka/conda_build_config.yaml +++ b/conda/recipes/cudf_kafka/conda_build_config.yaml @@ -9,3 +9,9 @@ sysroot_version: cmake_version: - ">=3.26.4" + +cuda_compiler: + - cuda-nvcc + +cuda11_compiler: + - nvcc diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index 9440f8bf124..343ec2519f1 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -33,28 +33,31 @@ build: - SCCACHE_S3_KEY_PREFIX=cudf-kafka-linux64 # [linux64] - SCCACHE_S3_USE_SSL - SCCACHE_S3_NO_CREDENTIALS - # TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates - # cudf_kafka to use scikit-build - - RAPIDS_CUDA_VERSION + ignore_run_exports_from: + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} + {% endif %} requirements: build: - cmake {{ cmake_version }} + - ninja - {{ compiler('c') }} - {{ compiler('cxx') }} - - ninja - - sysroot_{{ target_platform }} {{ sysroot_version }} - # TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates - # cudf_kafka to use scikit-build - {% if cuda_major == "12" %} - - cuda-gdb + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} ={{ cuda_version }} + {% else %} + - {{ compiler('cuda') }} {% endif %} + - cuda-version ={{ cuda_version }} + - sysroot_{{ target_platform }} {{ sysroot_version }} host: - python - cython >=3.0.3 - cuda-version ={{ cuda_version }} - cudf ={{ version }} - libcudf_kafka ={{ version }} + - 
scikit-build >=0.13.1
    - setuptools
    {% if cuda_major == "12" %}
    - cuda-cudart-dev
diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt
index 1a15a3ec2cd..4128afa3935 100644
--- a/cpp/libcudf_kafka/CMakeLists.txt
+++ b/cpp/libcudf_kafka/CMakeLists.txt
@@ -21,7 +21,7 @@ include(rapids-export)
 include(rapids-find)

 project(
-  CUDA_KAFKA
+  CUDF_KAFKA
   VERSION 23.12.00
   LANGUAGES CXX
 )
@@ -64,7 +64,7 @@ add_library(cudf_kafka SHARED src/kafka_consumer.cpp src/kafka_callback.cpp)
 # ##################################################################################################
 # * include paths ---------------------------------------------------------------------------------
 target_include_directories(
-  cudf_kafka PUBLIC "$<BUILD_INTERFACE:${CUDA_KAFKA_SOURCE_DIR}/include>"
+  cudf_kafka PUBLIC "$<BUILD_INTERFACE:${CUDF_KAFKA_SOURCE_DIR}/include>"
                     "$<INSTALL_INTERFACE:include>"
 )
@@ -85,6 +85,8 @@ set_target_properties(
             CXX_STANDARD_REQUIRED ON
 )

+add_library(cudf_kafka::cudf_kafka ALIAS cudf_kafka)
+
 # ##################################################################################################
 # * cudf_kafka Install ----------------------------------------------------------------------------
 rapids_cmake_install_lib_dir(lib_dir)
@@ -94,7 +96,7 @@ install(
   EXPORT cudf_kafka-exports
 )

-install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include DESTINATION include)
+install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})

 rapids_export(
   INSTALL cudf_kafka
diff --git a/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake b/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake
index aa4c5b60e7a..20aa9873f43 100644
--- a/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake
+++ b/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -35,21 +35,21 @@ function(find_and_configure_cudf VERSION)
   endif()
 endfunction()

-set(CUDA_KAFKA_MIN_VERSION_cudf
-    "${CUDA_KAFKA_VERSION_MAJOR}.${CUDA_KAFKA_VERSION_MINOR}.${CUDA_KAFKA_VERSION_PATCH}"
+set(CUDF_KAFKA_MIN_VERSION
+    "${CUDF_KAFKA_VERSION_MAJOR}.${CUDF_KAFKA_VERSION_MINOR}.${CUDF_KAFKA_VERSION_PATCH}"
 )
-find_and_configure_cudf(${CUDA_KAFKA_MIN_VERSION_cudf})
+find_and_configure_cudf(${CUDF_KAFKA_MIN_VERSION})

 if(cudf_REQUIRES_CUDA)
-  rapids_cuda_init_architectures(CUDA_KAFKA)
+  rapids_cuda_init_architectures(CUDF_KAFKA)

   # Since we are building cudf as part of ourselves we need to enable the CUDA language in the
   # top-most scope
   enable_language(CUDA)

-  # Since CUDA_KAFKA only enables CUDA optionally we need to manually include the file that
+  # Since CUDF_KAFKA only enables CUDA optionally we need to manually include the file that
   # rapids_cuda_init_architectures relies on `project` calling
-  if(DEFINED CMAKE_PROJECT_CUDA_KAFKA_INCLUDE)
-    include("${CMAKE_PROJECT_CUDA_KAFKA_INCLUDE}")
+  if(DEFINED CMAKE_PROJECT_CUDF_KAFKA_INCLUDE)
+    include("${CMAKE_PROJECT_CUDF_KAFKA_INCLUDE}")
   endif()
 endif()
diff --git a/cpp/libcudf_kafka/tests/CMakeLists.txt b/cpp/libcudf_kafka/tests/CMakeLists.txt
index 68a5327b455..b819cb6fc3b 100644
--- a/cpp/libcudf_kafka/tests/CMakeLists.txt
+++ b/cpp/libcudf_kafka/tests/CMakeLists.txt
@@ -26,7 +26,7 @@ function(ConfigureTest test_name)
   add_executable(${test_name} ${ARGN})
   set_target_properties(
     ${test_name}
-    PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$<BUILD_INTERFACE:${CUDA_KAFKA_BINARY_DIR}/gtests>"
+    PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$<BUILD_INTERFACE:${CUDF_KAFKA_BINARY_DIR}/gtests>"
                INSTALL_RPATH "\$ORIGIN/../../../lib"
   )
   target_link_libraries(
diff --git a/dependencies.yaml b/dependencies.yaml
index b971a682571..97149a5e2ba 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -9,8 +9,8 @@ files:
       - build_all
       - build_cpp
       - build_wheels
-      - build_python
       - build_python_common
+      - build_python_cudf
       - cudatoolkit
       - develop
      - docs
@@ -71,8 +71,8 @@ files:
     table: build-system
     includes:
       - build_all
-      - build_python
       - build_python_common
+      - build_python_cudf
       - build_wheels
   py_run_cudf:
     output: pyproject
@@ -138,8 +138,8 @@ files:
     extras:
       table: build-system
     includes:
-      - build_wheels
       - build_python_common
+      - build_wheels
   py_run_cudf_kafka:
     output: pyproject
@@ -259,16 +259,16 @@ dependencies:
       - cython>=3.0.3
       # TODO: Pin to numpy<1.25 until cudf requires pandas 2
       - &numpy numpy>=1.21,<1.25
+      - scikit-build>=0.13.1
     - output_types: [conda, requirements, pyproject]
       packages:
         # Hard pin the patch version used during the build. This must be kept
         # in sync with the version pinned in get_arrow.cmake.
         - pyarrow==14.0.1.*
-  build_python:
+  build_python_cudf:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - scikit-build>=0.13.1
           - rmm==23.12.*
       - output_types: conda
         packages:
@@ -302,9 +302,6 @@ dependencies:
       - cuda-nvrtc-dev
       - cuda-nvtx-dev
      - libcurand-dev
-      # TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates
-      # cudf_kafka to use scikit-build
-      - cuda-gdb
     - matrix:
         cuda: "11.8"
       packages:
diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index 1b543b94589..c041c7f4842 100644
--- a/python/cudf/cudf/_lib/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -81,12 +81,6 @@ target_link_libraries(strings_udf cudf_strings_udf)
 # necessary.
The relevant command is tar -xf /opt/_internal/static-libs-for-embedding-only.tar.xz -C # /opt/_internal" find_package(NumPy REQUIRED) -set(targets_using_numpy interop avro csv orc json parquet) -foreach(target IN LISTS targets_using_numpy) - target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") - # Switch to the line below when we switch back to FindPython.cmake in CMake 3.24. - # target_include_directories(${target} PRIVATE "${Python_NumPy_INCLUDE_DIRS}") -endforeach() set(targets_using_dlpack interop) foreach(target IN LISTS targets_using_dlpack) diff --git a/python/cudf_kafka/CMakeLists.txt b/python/cudf_kafka/CMakeLists.txt new file mode 100644 index 00000000000..d55c3fdc076 --- /dev/null +++ b/python/cudf_kafka/CMakeLists.txt @@ -0,0 +1,47 @@ +# ============================================================================= +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) + +set(cudf_kafka_version 23.12.00) + +include(../../fetch_rapids.cmake) + +project( + cudf-kafka-python + VERSION ${cudf_kafka_version} + LANGUAGES # TODO: Building Python extension modules via the python_extension_module requires the C + # language to be enabled here. The test project that is built in scikit-build to verify + # various linking options for the python library is hardcoded to build with C, so until + # that is fixed we need to keep C. + C CXX +) + +find_package(cudf_kafka ${cudf_kafka_version} REQUIRED) + +if(NOT cudf_kafka_FOUND) + message( + FATAL_ERROR + "cudf_kafka package not found. cudf_kafka C++ is required to build this Python package." + ) +endif() + +include(rapids-cython) +rapids_cython_init() + +add_subdirectory(cudf_kafka/_lib) + +if(DEFINED cython_lib_dir) + rapids_cython_add_rpath_entries(TARGET cudf_kafka PATHS "${cython_lib_dir}") +endif() diff --git a/python/cudf_kafka/LICENSE b/python/cudf_kafka/LICENSE new file mode 120000 index 00000000000..30cff7403da --- /dev/null +++ b/python/cudf_kafka/LICENSE @@ -0,0 +1 @@ +../../LICENSE \ No newline at end of file diff --git a/python/cudf_kafka/README.md b/python/cudf_kafka/README.md new file mode 120000 index 00000000000..fe840054137 --- /dev/null +++ b/python/cudf_kafka/README.md @@ -0,0 +1 @@ +../../README.md \ No newline at end of file diff --git a/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt b/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt new file mode 100644 index 00000000000..3262b7d5ebe --- /dev/null +++ b/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt @@ -0,0 +1,62 @@ +# ============================================================================= +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +set(cython_sources kafka.pyx) +set(linked_libraries cudf_kafka::cudf_kafka) + +rapids_cython_create_modules( + CXX ASSOCIATED_TARGETS cudf_kafka + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" +) + +# TODO: Finding NumPy currently requires finding Development due to a bug in CMake. This bug was +# fixed in https://gitlab.kitware.com/cmake/cmake/-/merge_requests/7410 and will be available in +# CMake 3.24, so we can remove the Development component once we upgrade to CMake 3.24. +# find_package(Python REQUIRED COMPONENTS Development NumPy) + +# Note: The bug noted above prevents us from finding NumPy successfully using FindPython.cmake +# inside the manylinux images used to build wheels because manylinux images do not contain +# libpython.so and therefore Development cannot be found. Until we upgrade to CMake 3.24, we should +# use FindNumpy.cmake instead (provided by scikit-build). When we switch to 3.24 we can try +# switching back, but it may not work if that implicitly still requires Python libraries. In that +# case we'll need to follow up with the CMake team to remove that dependency. The stopgap solution +# is to unpack the static lib tarballs in the wheel building jobs so that there are at least static +# libs to be found, but that should be a last resort since it implies a dependency that isn't really +# necessary. The relevant command is tar -xf /opt/_internal/static-libs-for-embedding-only.tar.xz -C +# /opt/_internal" +find_package(NumPy REQUIRED) + +find_package(Python 3.9 REQUIRED COMPONENTS Interpreter) + +execute_process( + COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_include())" + OUTPUT_VARIABLE PYARROW_INCLUDE_DIR + ERROR_VARIABLE PYARROW_ERROR + RESULT_VARIABLE PYARROW_RESULT + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +if(${PYARROW_RESULT}) + message(FATAL_ERROR "Error while trying to obtain pyarrow include directory:\n${PYARROW_ERROR}") +endif() + +# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts +# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the +# requirement for arrow headers infects all of cudf. That in turn requires including numpy headers. +# These requirements will go away once all scalar-related Cython code is removed from cudf. 
+foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") + target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") +endforeach() diff --git a/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd b/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd index ca729c62512..068837d04ee 100644 --- a/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd +++ b/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd @@ -11,12 +11,12 @@ from cudf._lib.cpp.io.datasource cimport datasource from cudf._lib.io.datasource cimport Datasource -cdef extern from "kafka_callback.hpp" \ +cdef extern from "cudf_kafka/kafka_callback.hpp" \ namespace "cudf::io::external::kafka" nogil: ctypedef object (*python_callable_type)() -cdef extern from "kafka_consumer.hpp" \ +cdef extern from "cudf_kafka/kafka_consumer.hpp" \ namespace "cudf::io::external::kafka" nogil: cpdef cppclass kafka_consumer: diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 4829f06ab09..15431161d75 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -6,6 +6,7 @@ requires = [ "cython>=3.0.3", "numpy>=1.21,<1.25", "pyarrow==14.0.1.*", + "scikit-build>=0.13.1", "setuptools", "wheel", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/cudf_kafka/setup.py b/python/cudf_kafka/setup.py index 6f3909d4528..6a99e9ed968 100644 --- a/python/cudf_kafka/setup.py +++ b/python/cudf_kafka/setup.py @@ -1,96 +1,13 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. -import os -import shutil -import sysconfig -from distutils.sysconfig import get_python_lib - -import numpy as np -import pyarrow as pa -from Cython.Build import cythonize -from setuptools import find_packages, setup -from setuptools.extension import Extension - -cython_files = ["cudf_kafka/_lib/*.pyx"] - -CUDA_HOME = os.environ.get("CUDA_HOME", False) -if not CUDA_HOME: - path_to_cuda_gdb = shutil.which("cuda-gdb") - if path_to_cuda_gdb is None: - raise OSError( - "Could not locate CUDA. " - "Please set the environment variable " - "CUDA_HOME to the path to the CUDA installation " - "and try again." 
- ) - CUDA_HOME = os.path.dirname(os.path.dirname(path_to_cuda_gdb)) - -if not os.path.isdir(CUDA_HOME): - raise OSError(f"Invalid CUDA_HOME: directory does not exist: {CUDA_HOME}") - -cuda_include_dir = os.path.join(CUDA_HOME, "include") - -CUDF_ROOT = os.environ.get( - "CUDF_ROOT", - os.path.abspath( - os.path.join( - os.path.dirname(os.path.abspath(__file__)), "../../cpp/build/" - ) - ), -) -CUDF_KAFKA_ROOT = os.environ.get( - "CUDF_KAFKA_ROOT", "../../cpp/libcudf_kafka/build" -) - -try: - nthreads = int(os.environ.get("PARALLEL_LEVEL", "0") or "0") -except Exception: - nthreads = 0 - -extensions = [ - Extension( - "*", - sources=cython_files, - include_dirs=[ - os.path.abspath(os.path.join(CUDF_ROOT, "../include/cudf")), - os.path.abspath(os.path.join(CUDF_ROOT, "../include")), - os.path.abspath( - os.path.join(CUDF_ROOT, "../libcudf_kafka/include/cudf_kafka") - ), - os.path.join(CUDF_ROOT, "include"), - os.path.join(CUDF_ROOT, "_deps/libcudacxx-src/include"), - os.path.join( - os.path.dirname(sysconfig.get_path("include")), - "rapids/libcudacxx", - ), - os.path.dirname(sysconfig.get_path("include")), - np.get_include(), - pa.get_include(), - cuda_include_dir, - ], - library_dirs=( - [ - get_python_lib(), - os.path.join(os.sys.prefix, "lib"), - CUDF_KAFKA_ROOT, - ] - ), - libraries=["cudf", "cudf_kafka"], - language="c++", - extra_compile_args=["-std=c++17", "-DFMT_HEADER_ONLY=1"], - ) -] +# Copyright (c) 2018-2023, NVIDIA CORPORATION. +from setuptools import find_packages +from skbuild import setup packages = find_packages(include=["cudf_kafka*"]) + setup( - # Include the separately-compiled shared library - ext_modules=cythonize( - extensions, - nthreads=nthreads, - compiler_directives=dict( - profile=False, language_level=3, embedsignature=True - ), - ), packages=packages, - package_data={key: ["VERSION", "*.pxd"] for key in packages}, + package_data={ + key: ["VERSION", "*.pxd", "*.hpp", "*.cuh"] for key in packages + }, zip_safe=False, ) From b0c1b7b82ccdf1a7e4159cb3bffa1984092440d4 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 14 Nov 2023 12:48:32 -0500 Subject: [PATCH 096/118] Add BytePairEncoder class to cuDF (#13891) Adds a new BytePairEncoding class to cuDF ``` >>> import cudf >>> from cudf.core.byte_pair_encoding import BytePairEncoder >>> mps = cudf.read_text('merges.txt', delimiter='\n', strip_delimiters=True) >>> bpe = BytePairEncoder(mps) >>> str_series = cudf.Series(['This is a sentence', 'thisisit']) >>> bpe(str_series) 0 This is a sent ence 1 this is it dtype: object ``` This class wraps the existing `nvtext::byte_pair_encoding` APIs to load the merge-pairs data and encode a column of strings. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13891 --- .../cudf/_lib/cpp/nvtext/byte_pair_encode.pxd | 24 ++++++++ python/cudf/cudf/_lib/nvtext/CMakeLists.txt | 4 +- .../cudf/_lib/nvtext/byte_pair_encode.pyx | 50 ++++++++++++++++ python/cudf/cudf/core/byte_pair_encoding.py | 59 +++++++++++++++++++ .../cudf/cudf/tests/text/test_text_methods.py | 41 +++++++++++++ 5 files changed, 176 insertions(+), 2 deletions(-) create mode 100644 python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd create mode 100644 python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx create mode 100644 python/cudf/cudf/core/byte_pair_encoding.py diff --git a/python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd b/python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd new file mode 100644 index 00000000000..e678e4e84db --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd @@ -0,0 +1,24 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string + +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.scalar.scalar cimport string_scalar + + +cdef extern from "nvtext/byte_pair_encoding.hpp" namespace "nvtext" nogil: + + cdef struct bpe_merge_pairs "nvtext::bpe_merge_pairs": + pass + + cdef unique_ptr[bpe_merge_pairs] load_merge_pairs( + const column_view &merge_pairs + ) except + + + cdef unique_ptr[column] byte_pair_encoding( + const column_view &strings, + const bpe_merge_pairs &merge_pairs, + const string_scalar &separator + ) except + diff --git a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt index d4e2392ee04..d7cbdeb5bda 100644 --- a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt +++ b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt @@ -13,8 +13,8 @@ # ============================================================================= set(cython_sources - edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx ngrams_tokenize.pyx normalize.pyx - replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx + byte_pair_encode.pyx edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx + ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx ) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx new file mode 100644 index 00000000000..cfc76afa8a5 --- /dev/null +++ b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx @@ -0,0 +1,50 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. 
+
+
+from cudf.core.buffer import acquire_spill_lock
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+
+from cudf._lib.column cimport Column
+from cudf._lib.cpp.column.column cimport column
+from cudf._lib.cpp.column.column_view cimport column_view
+from cudf._lib.cpp.nvtext.byte_pair_encode cimport (
+    bpe_merge_pairs as cpp_bpe_merge_pairs,
+    byte_pair_encoding as cpp_byte_pair_encoding,
+    load_merge_pairs as cpp_load_merge_pairs,
+)
+from cudf._lib.cpp.scalar.scalar cimport string_scalar
+from cudf._lib.scalar cimport DeviceScalar
+
+
+cdef class BPEMergePairs:
+    cdef unique_ptr[cpp_bpe_merge_pairs] c_obj
+
+    def __cinit__(self, Column merge_pairs):
+        cdef column_view c_pairs = merge_pairs.view()
+        with nogil:
+            self.c_obj = move(cpp_load_merge_pairs(c_pairs))
+
+
+@acquire_spill_lock()
+def byte_pair_encoding(
+    Column strings,
+    BPEMergePairs merge_pairs,
+    object separator
+):
+    cdef column_view c_strings = strings.view()
+    cdef DeviceScalar d_separator = separator.device_value
+    cdef const string_scalar* c_separator = <const string_scalar*>d_separator\
+        .get_raw_ptr()
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = move(
+            cpp_byte_pair_encoding(
+                c_strings,
+                merge_pairs.c_obj.get()[0],
+                c_separator[0]
+            )
+        )
+
+    return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py
new file mode 100644
index 00000000000..4c881022ecf
--- /dev/null
+++ b/python/cudf/cudf/core/byte_pair_encoding.py
@@ -0,0 +1,59 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+
+from __future__ import annotations
+
+import cudf
+from cudf._lib.nvtext.byte_pair_encode import (
+    BPEMergePairs as cpp_merge_pairs,
+    byte_pair_encoding as cpp_byte_pair_encoding,
+)
+
+
+class BytePairEncoder:
+    """
+    Given a merge pairs strings series, performs byte pair encoding on
+    a strings series using the provided separator.
+
+    Parameters
+    ----------
+    merges_pairs : str
+        Strings column of merge pairs
+
+    Returns
+    -------
+    BytePairEncoder
+    """
+
+    def __init__(self, merges_pair: "cudf.Series"):
+        self.merge_pairs = cpp_merge_pairs(merges_pair._column)
+
+    def __call__(self, text, separator: str = " "):
+        """
+
+        Parameters
+        ----------
+        text : cudf string series
+            The strings to be encoded.
+
+        Returns
+        -------
+        Encoded strings
+
+        Examples
+        --------
+        >>> import cudf
+        >>> from cudf.core.byte_pair_encoding import BytePairEncoder
+        >>> mps = cudf.Series(["e n", "i t", "i s", "e s", "en t",
+        ...                    "c e", "es t", "en ce", "T h", "Th is",
"t est", "s ent", "t h", "th is"]) + >>> bpe = BytePairEncoder(mps) + >>> str_series = cudf.Series(['This is the sentence', 'thisisit']) + >>> bpe(str_series) + 0 This is a sent ence + 1 this is it + dtype: object + """ + sep = cudf.Scalar(separator, dtype="str") + result = cpp_byte_pair_encoding(text._column, self.merge_pairs, sep) + + return cudf.Series(result) diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index e565df8f3da..2dccd583b23 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -7,6 +7,7 @@ import pytest import cudf +from cudf.core.byte_pair_encoding import BytePairEncoder from cudf.core.tokenize_vocabulary import TokenizeVocabulary from cudf.testing._utils import assert_eq @@ -1024,3 +1025,43 @@ def test_jaccard_index_random_strings(): actual = str1.str.jaccard_index(str2, jaccard_width) assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "separator, input, results", + [ + (" ", "thetestsentence", "the test sent ence"), + ("_", "sentenceistest", "sent_ence_is_test"), + ("$", "istestsentencehere", "is$test$sent$ence$he$r$e"), + ], +) +def test_byte_pair_encoding(separator, input, results): + pairs_table = cudf.Series( + [ + "t he", + "h e", + "e n", + "i t", + "i s", + "e s", + "en t", + "c e", + "es t", + "en ce", + "t h", + "h i", + "th is", + "t est", + "s i", + "s ent", + ] + ) + encoder = BytePairEncoder(pairs_table) + + strings = cudf.Series([input, None, "", input]) + + expected = cudf.Series([results, None, "", results]) + + actual = encoder(strings, separator) + assert type(expected) == type(actual) + assert_eq(expected, actual) From b446a6f187241e765c925da1053ece2679313a06 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 14 Nov 2023 12:49:19 -0500 Subject: [PATCH 097/118] Fix token-count logic in nvtext::tokenize_with_vocabulary (#14393) Fixes a bug introduced in #14336 when trying to simplify the token-counting logic as per this discussion https://github.com/rapidsai/cudf/pull/14336#discussion_r1378173552 The simplification caused an error which was found when running the nvtext benchmarks. The appropriate gtest has been updated to cover this case now. 
Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/14393
---
 cpp/benchmarks/text/vocab.cpp       |  2 +-
 cpp/src/text/vocabulary_tokenize.cu |  8 ++++++--
 cpp/tests/text/tokenize_tests.cpp   | 12 ++++++------
 3 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/cpp/benchmarks/text/vocab.cpp b/cpp/benchmarks/text/vocab.cpp
index 6922b7214ff..80942e2697d 100644
--- a/cpp/benchmarks/text/vocab.cpp
+++ b/cpp/benchmarks/text/vocab.cpp
@@ -53,7 +53,7 @@ static void bench_vocab_tokenize(nvbench::state& state)

   auto const vocab_col = [] {
     data_profile const profile = data_profile_builder().no_validity().distribution(
-      cudf::type_id::STRING, distribution_id::NORMAL, 0, 5);
+      cudf::type_id::STRING, distribution_id::NORMAL, 0, 15);
     auto const col = create_random_column(cudf::type_id::STRING, row_count{100}, profile);
     return cudf::strings::filter_characters_of_type(
       cudf::strings_column_view(col->view()),
diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu
index 41f8c0a8731..511f1995374 100644
--- a/cpp/src/text/vocabulary_tokenize.cu
+++ b/cpp/src/text/vocabulary_tokenize.cu
@@ -276,8 +276,12 @@ __global__ void token_counts_fn(cudf::column_device_view const d_strings,
   __syncwarp();

   for (auto itr = d_output + lane_idx + 1; itr < d_output_end; itr += cudf::detail::warp_size) {
-    // add one if at the edge of a token or at the string's end
-    count += ((*itr && !(*(itr - 1))) || (itr + 1 == d_output_end));
+    // add one if at the edge of a token or if at the string's end
+    if (*itr) {
+      count += !(*(itr - 1));
+    } else {
+      count += (itr + 1 == d_output_end);
+    }
   }
   __syncwarp();
diff --git a/cpp/tests/text/tokenize_tests.cpp b/cpp/tests/text/tokenize_tests.cpp
index 8118183a458..ea36e13de6f 100644
--- a/cpp/tests/text/tokenize_tests.cpp
+++ b/cpp/tests/text/tokenize_tests.cpp
@@ -246,14 +246,14 @@ TEST_F(TextTokenizeTest, Vocabulary)

 TEST_F(TextTokenizeTest, VocabularyLongStrings)
 {
-  cudf::test::strings_column_wrapper vocabulary(  // leaving out 'cat' on purpose
+  cudf::test::strings_column_wrapper vocabulary(
     {"ate", "chased", "cheese", "dog", "fox", "jumped", "mouse", "mousé", "over", "the"});
   auto vocab = nvtext::load_vocabulary(cudf::strings_column_view(vocabulary));

   std::vector<std::string> h_strings(
     4,
     "the fox jumped chased the dog cheese mouse at the over there dog mouse cat plus the horse "
-    "jumped over the mouse house with the dog");
+    "jumped over the mousé house with the dog ");
   cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end());
   auto input_view = cudf::strings_column_view(input);
   auto delimiter  = cudf::string_scalar(" ");
@@ -262,10 +262,10 @@ TEST_F(TextTokenizeTest, VocabularyLongStrings)

   using LCW = cudf::test::lists_column_wrapper<cudf::size_type>;
   // clang-format off
-  LCW expected({LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3},
-                LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3},
-                LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3},
-                LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3}});
+  LCW expected({LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3},
+                LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3},
+                LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1,
9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3}, + LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3}}); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); From 8106a0c3d2050786f42152a280bd9315b897379e Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 14 Nov 2023 16:03:54 -0600 Subject: [PATCH 098/118] Cleanup remaining usages of dask dependencies (#14407) This PR switches remaining usages of `dask` dependencies to use `rapids-dask-dependency` Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - Jake Awe (https://github.com/AyodeAwe) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14407 --- conda/recipes/custreamz/meta.yaml | 4 +--- conda/recipes/dask-cudf/run_test.sh | 36 ----------------------------- 2 files changed, 1 insertion(+), 39 deletions(-) delete mode 100644 conda/recipes/dask-cudf/run_test.sh diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index fb6efabffd4..b8c5918ea60 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -45,9 +45,7 @@ requirements: - streamz - cudf ={{ version }} - cudf_kafka ={{ version }} - - dask >=2023.9.2 - - dask-core >=2023.9.2 - - distributed >=2023.9.2 + - rapids-dask-dependency ={{ version }} - python-confluent-kafka >=1.9.0,<1.10.0a0 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} diff --git a/conda/recipes/dask-cudf/run_test.sh b/conda/recipes/dask-cudf/run_test.sh deleted file mode 100644 index e7238d00f2b..00000000000 --- a/conda/recipes/dask-cudf/run_test.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash -# Copyright (c) 2020-2023, NVIDIA CORPORATION. - -set -e - -# Logger function for build status output -function logger() { - echo -e "\n>>>> $@\n" -} - -# Importing cudf on arm64 CPU only nodes is currently not working due to a -# difference in reported gpu devices between arm64 and amd64 -ARCH=$(arch) - -if [ "${ARCH}" = "aarch64" ]; then - logger "Skipping tests on arm64" - exit 0 -fi - -# Dask & Distributed option to install main(nightly) or `conda-forge` packages. -export INSTALL_DASK_MAIN=1 - -# Dask version to install when `INSTALL_DASK_MAIN=0` -export DASK_STABLE_VERSION="2023.9.2" - -# Install the conda-forge or nightly version of dask and distributed -if [[ "${INSTALL_DASK_MAIN}" == 1 ]]; then - rapids-logger "rapids-mamba-retry install -c dask/label/dev 'dask/label/dev::dask' 'dask/label/dev::distributed'" - rapids-mamba-retry install -c dask/label/dev "dask/label/dev::dask" "dask/label/dev::distributed" -else - rapids-logger "rapids-mamba-retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall" - rapids-mamba-retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall -fi - -logger "python -c 'import dask_cudf'" -python -c "import dask_cudf" From 27b052d01ebdfd3690b90588971817423614acc0 Mon Sep 17 00:00:00 2001 From: shrshi Date: Tue, 14 Nov 2023 14:39:07 -0800 Subject: [PATCH 099/118] Added streams to CSV reader and writer api (#14340) This PR contributes to https://github.com/rapidsai/cudf/issues/13744. 
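
For reference before the next patch: the `BytePairEncoder` added in PATCH 096 above is a thin wrapper over nvtext's C++ entry points declared in its `.pxd`. A hedged sketch of calling those entry points directly from C++ (the exact upstream signatures, such as a defaulted separator argument, may differ slightly from this illustration):

```
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <nvtext/byte_pair_encoding.hpp>

#include <memory>

std::unique_ptr<cudf::column> bpe_encode(cudf::strings_column_view const& input,
                                         cudf::strings_column_view const& merge_pairs)
{
  // load_merge_pairs builds the device-side merge-pair table once...
  auto const merges = nvtext::load_merge_pairs(merge_pairs);
  // ...and byte_pair_encoding applies it to every row of the strings column.
  auto const separator = cudf::string_scalar(" ");
  return nvtext::byte_pair_encoding(input, *merges, separator);
}
```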
-Added stream parameters to public APIs `cudf::io::read_csv` `cudf::io::write_csv` -Added stream gtests Authors: - https://github.com/shrshi - Karthikeyan (https://github.com/karthikeyann) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Vukasin Milovanovic (https://github.com/vuule) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/14340 --- cpp/include/cudf/io/csv.hpp | 4 + cpp/include/cudf/io/detail/csv.hpp | 1 - cpp/include/cudf_test/column_wrapper.hpp | 16 ++-- cpp/src/io/csv/writer_impl.cu | 38 +++++---- cpp/src/io/functions.cpp | 12 ++- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/io/csv_test.cpp | 102 +++++++++++++++++++++++ 7 files changed, 150 insertions(+), 24 deletions(-) create mode 100644 cpp/tests/streams/io/csv_test.cpp diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index ac885c54356..435583e805d 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -1307,6 +1307,7 @@ class csv_reader_options_builder { * @endcode * * @param options Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the table in the returned * table_with_metadata * @@ -1314,6 +1315,7 @@ class csv_reader_options_builder { */ table_with_metadata read_csv( csv_reader_options options, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group @@ -1715,9 +1717,11 @@ class csv_writer_options_builder { * @endcode * * @param options Settings for controlling writing behavior + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ void write_csv(csv_writer_options const& options, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp index 9fdc7a47fb9..40ddcf385b0 100644 --- a/cpp/include/cudf/io/detail/csv.hpp +++ b/cpp/include/cudf/io/detail/csv.hpp @@ -17,7 +17,6 @@ #pragma once #include -#include #include diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index e94dfea9dcf..b9f2e0d9868 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -803,7 +803,8 @@ class strings_column_wrapper : public detail::column_wrapper { offsets, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_bitmask = cudf::detail::make_device_uvector_sync( null_mask, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); - wrapped = cudf::make_strings_column(d_chars, d_offsets, d_bitmask, null_count); + wrapped = cudf::make_strings_column( + d_chars, d_offsets, d_bitmask, null_count, cudf::test::get_default_stream()); } /** @@ -1846,7 +1847,8 @@ class structs_column_wrapper : public detail::column_wrapper { child_column_wrappers.end(), std::back_inserter(child_columns), [&](auto const& column_wrapper) { - return std::make_unique(column_wrapper.get()); + return std::make_unique(column_wrapper.get(), + cudf::test::get_default_stream()); }); init(std::move(child_columns), validity); } @@ -1882,7 +1884,8 @@ class 
structs_column_wrapper : public detail::column_wrapper { child_column_wrappers.end(), std::back_inserter(child_columns), [&](auto const& column_wrapper) { - return std::make_unique(column_wrapper.get()); + return std::make_unique(column_wrapper.get(), + cudf::test::get_default_stream()); }); init(std::move(child_columns), validity_iter); } @@ -1906,8 +1909,11 @@ class structs_column_wrapper : public detail::column_wrapper { return cudf::test::detail::make_null_mask(validity.begin(), validity.end()); }(); - wrapped = cudf::make_structs_column( - num_rows, std::move(child_columns), null_count, std::move(null_mask)); + wrapped = cudf::make_structs_column(num_rows, + std::move(child_columns), + null_count, + std::move(null_mask), + cudf::test::get_default_stream()); } template diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index 8c586306ad5..6e9c634804c 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -146,6 +146,12 @@ struct column_to_strings_fn { { } + ~column_to_strings_fn() = default; + column_to_strings_fn(column_to_strings_fn const&) = delete; + column_to_strings_fn& operator=(column_to_strings_fn const&) = delete; + column_to_strings_fn(column_to_strings_fn&&) = delete; + column_to_strings_fn& operator=(column_to_strings_fn&&) = delete; + // Note: `null` replacement with `na_rep` deferred to `concatenate()` // instead of column-wise; might be faster // @@ -160,8 +166,9 @@ struct column_to_strings_fn { std::enable_if_t, std::unique_ptr> operator()( column_view const& column) const { - return cudf::strings::detail::from_booleans( - column, options_.get_true_value(), options_.get_false_value(), stream_, mr_); + string_scalar true_string{options_.get_true_value(), true, stream_}; + string_scalar false_string{options_.get_false_value(), true, stream_}; + return cudf::strings::detail::from_booleans(column, true_string, false_string, stream_, mr_); } // strings: @@ -367,10 +374,10 @@ void write_chunked(data_sink* out_sink, CUDF_EXPECTS(str_column_view.size() > 0, "Unexpected empty strings column."); - cudf::string_scalar newline{options.get_line_terminator()}; + cudf::string_scalar newline{options.get_line_terminator(), true, stream}; auto p_str_col_w_nl = cudf::strings::detail::join_strings(str_column_view, newline, - string_scalar("", false), + string_scalar{"", false, stream}, stream, rmm::mr::get_current_device_resource()); strings_column_view strings_column{p_str_col_w_nl->view()}; @@ -455,12 +462,14 @@ void write_csv(data_sink* out_sink, // populate vector of string-converted columns: // - std::transform(sub_view.begin(), - sub_view.end(), - std::back_inserter(str_column_vec), - [converter](auto const& current_col) { - return cudf::type_dispatcher(current_col.type(), converter, current_col); - }); + std::transform( + sub_view.begin(), + sub_view.end(), + std::back_inserter(str_column_vec), + [&converter = std::as_const(converter)](auto const& current_col) { + return cudf::type_dispatcher( + current_col.type(), converter, current_col); + }); // create string table view from str_column_vec: // @@ -470,18 +479,19 @@ void write_csv(data_sink* out_sink, // concatenate columns in each row into one big string column // (using null representation and delimiter): // - std::string delimiter_str{options.get_inter_column_delimiter()}; auto str_concat_col = [&] { + cudf::string_scalar delimiter_str{ + std::string{options.get_inter_column_delimiter()}, true, stream}; + cudf::string_scalar options_narep{options.get_na_rep(), true, stream}; if 
(str_table_view.num_columns() > 1) return cudf::strings::detail::concatenate(str_table_view, delimiter_str, - options.get_na_rep(), + options_narep, strings::separator_on_nulls::YES, stream, rmm::mr::get_current_device_resource()); - cudf::string_scalar narep{options.get_na_rep()}; return cudf::strings::detail::replace_nulls( - str_table_view.column(0), narep, stream, rmm::mr::get_current_device_resource()); + str_table_view.column(0), options_narep, stream, rmm::mr::get_current_device_resource()); }(); write_chunked(out_sink, str_concat_col->view(), options, stream, mr); diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 00d56008611..964e40e36cd 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -230,7 +230,9 @@ void write_json(json_writer_options const& options, mr); } -table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_resource* mr) +table_with_metadata read_csv(csv_reader_options options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); @@ -245,12 +247,14 @@ table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_ return cudf::io::detail::csv::read_csv( // std::move(datasources[0]), options, - cudf::get_default_stream(), + stream, mr); } // Freeform API wraps the detail writer class API -void write_csv(csv_writer_options const& options, rmm::mr::device_memory_resource* mr) +void write_csv(csv_writer_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using namespace cudf::io::detail; @@ -262,7 +266,7 @@ void write_csv(csv_writer_options const& options, rmm::mr::device_memory_resourc options.get_table(), options.get_names(), options, - cudf::get_default_stream(), + stream, mr); } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 7b628649051..1be8566fb0f 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -635,6 +635,7 @@ ConfigureTest( ConfigureTest(STREAM_BINARYOP_TEST streams/binaryop_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_COPYING_TEST streams/copying_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_CSVIO_TEST streams/io/csv_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing) diff --git a/cpp/tests/streams/io/csv_test.cpp b/cpp/tests/streams/io/csv_test.cpp new file mode 100644 index 00000000000..88514fa412c --- /dev/null +++ b/cpp/tests/streams/io/csv_test.cpp @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +auto const temp_env = static_cast( + ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); + +class CSVTest : public cudf::test::BaseFixture {}; + +TEST_F(CSVTest, CSVWriter) +{ + constexpr auto num_rows = 10; + + std::vector zeros(num_rows, 0); + std::vector ones(num_rows, 1); + auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return numeric::decimal128{ones[i], numeric::scale_type{12}}; + }); + auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return numeric::decimal128{ones[i], numeric::scale_type{-12}}; + }); + + cudf::test::fixed_width_column_wrapper col0(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col1(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col2(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col3(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col4(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col5(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col6(col6_data, col6_data + num_rows); + cudf::test::fixed_width_column_wrapper col7(col7_data, col7_data + num_rows); + + std::vector col8_data(num_rows, "rapids"); + cudf::test::strings_column_wrapper col8(col8_data.begin(), col8_data.end()); + + cudf::table_view tab({col0, col1, col2, col3, col4, col5, col6, col7, col8}); + + auto const filepath = temp_env->get_temp_dir() + "multicolumn.csv"; + auto w_options = cudf::io::csv_writer_options::builder(cudf::io::sink_info{filepath}, tab) + .include_header(false) + .inter_column_delimiter(','); + cudf::io::write_csv(w_options.build(), cudf::test::get_default_stream()); +} + +TEST_F(CSVTest, CSVReader) +{ + constexpr auto num_rows = 10; + + std::vector zeros(num_rows, 0); + std::vector ones(num_rows, 1); + auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return numeric::decimal128{ones[i], numeric::scale_type{12}}; + }); + auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return numeric::decimal128{ones[i], numeric::scale_type{-12}}; + }); + + cudf::test::fixed_width_column_wrapper col0(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col1(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col2(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col3(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col4(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col5(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col6(col6_data, col6_data + num_rows); + cudf::test::fixed_width_column_wrapper col7(col7_data, col7_data + num_rows); + + std::vector col8_data(num_rows, "rapids"); + cudf::test::strings_column_wrapper col8(col8_data.begin(), col8_data.end()); + + cudf::table_view tab({col0, col1, col2, col3, col4, col5, col6, col7, col8}); + + auto const filepath = temp_env->get_temp_dir() + "multicolumn.csv"; + auto w_options = cudf::io::csv_writer_options::builder(cudf::io::sink_info{filepath}, tab) + .include_header(false) + .inter_column_delimiter(','); + cudf::io::write_csv(w_options.build(), cudf::test::get_default_stream()); +} From 330d389b26a05676d9f079503a3d96b571762337 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 14 Nov 2023 17:56:55 -0500 
Subject: [PATCH 100/118] Ensure nvbench initializes nvml context when built statically (#14411) Port https://github.com/NVIDIA/nvbench/pull/148 to cudf so that nvbench benchmarks work now that we always use a static version of nvbench. Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14411 --- cpp/cmake/thirdparty/patches/nvbench_override.json | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cpp/cmake/thirdparty/patches/nvbench_override.json b/cpp/cmake/thirdparty/patches/nvbench_override.json index ad9b19c29c1..f85bdb9486c 100644 --- a/cpp/cmake/thirdparty/patches/nvbench_override.json +++ b/cpp/cmake/thirdparty/patches/nvbench_override.json @@ -7,6 +7,11 @@ "file" : "${current_json_dir}/nvbench_global_setup.diff", "issue" : "Fix add support for global setup to initialize RMM in nvbench [https://github.com/NVIDIA/nvbench/pull/123]", "fixed_in" : "" + }, + { + "file" : "nvbench/nvml_with_static_builds.diff", + "issue" : "Add support for nvml with static nvbench [https://github.com/NVIDIA/nvbench/pull/148]", + "fixed_in" : "" } ] } From 8a0a08f34ff804a7329ea640aa1e0a9b188d2162 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 14 Nov 2023 17:55:16 -1000 Subject: [PATCH 101/118] Fix as_column(pd.Timestamp/Timedelta, length=) not respecting length (#14390) Noticed this while trying to clean up `as_column` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14390 --- python/cudf/cudf/core/column/column.py | 5 ++++- python/cudf/cudf/tests/test_column.py | 13 ++++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index a5e99abd79e..b4f65693d85 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2102,7 +2102,10 @@ def as_column( elif isinstance(arbitrary, (pd.Timestamp, pd.Timedelta)): # This will always treat NaTs as nulls since it's not technically a # discrete value like NaN - data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True)) + length = length or 1 + data = as_column( + pa.array(pd.Series([arbitrary] * length), from_pandas=True) + ) if dtype is not None: data = data.astype(dtype) diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index db0446d506c..0546638f388 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -193,12 +193,15 @@ def test_column_mixed_dtype(data, error): @pytest.mark.parametrize("nan_as_null", [True, False]) -def test_as_column_scalar_with_nan(nan_as_null): - size = 10 - scalar = np.nan - +@pytest.mark.parametrize( + "scalar", + [np.nan, pd.Timedelta(days=1), pd.Timestamp(2020, 1, 1)], + ids=repr, +) +@pytest.mark.parametrize("size", [1, 10]) +def test_as_column_scalar_with_nan(nan_as_null, scalar, size): expected = ( - cudf.Series([np.nan] * size, nan_as_null=nan_as_null) + cudf.Series([scalar] * size, nan_as_null=nan_as_null) .dropna() .to_numpy() ) From ab2248ea8e693143823d02bb8b806c65bfc3bf32 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 14 Nov 2023 23:30:51 -0800 Subject: [PATCH 102/118] Fix and disable encoding for nanosecond statistics in ORC writer (#14367) Issue 
https://github.com/rapidsai/cudf/issues/14325 Use uint when reading/writing nano stats because nanoseconds have int32 encoding (different from both uint32 and sint32, _obviously_), which does not use zigzag. sint32 uses zigzag, and uint32 does not allow negative numbers, so we can use uint since we'll never have negative nanoseconds. Also disabled the nanoseconds because it should only be written after ORC-135; we don't write the version so readers get confused if nanoseconds are there. Planning to re-enable once we start writing the version. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14367 --- cpp/include/cudf/io/orc_metadata.hpp | 8 +++---- cpp/src/io/orc/orc.cpp | 13 +++++++++++ cpp/src/io/orc/orc.hpp | 6 +++++ cpp/src/io/orc/stats_enc.cu | 35 +++++++++++++++++++++------- cpp/tests/io/orc_test.cpp | 16 +++++++++---- 5 files changed, 61 insertions(+), 17 deletions(-) diff --git a/cpp/include/cudf/io/orc_metadata.hpp b/cpp/include/cudf/io/orc_metadata.hpp index 82d59803c25..9531a012e49 100644 --- a/cpp/include/cudf/io/orc_metadata.hpp +++ b/cpp/include/cudf/io/orc_metadata.hpp @@ -141,10 +141,10 @@ using binary_statistics = sum_statistics; * the UNIX epoch. The `minimum_utc` and `maximum_utc` are the same values adjusted to UTC. */ struct timestamp_statistics : minmax_statistics { - std::optional minimum_utc; ///< minimum in milliseconds - std::optional maximum_utc; ///< maximum in milliseconds - std::optional minimum_nanos; ///< nanoseconds part of the minimum - std::optional maximum_nanos; ///< nanoseconds part of the maximum + std::optional minimum_utc; ///< minimum in milliseconds + std::optional maximum_utc; ///< maximum in milliseconds + std::optional minimum_nanos; ///< nanoseconds part of the minimum + std::optional maximum_nanos; ///< nanoseconds part of the maximum }; namespace orc { diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp index bc399b75ef9..ee5fa4e8b5a 100644 --- a/cpp/src/io/orc/orc.cpp +++ b/cpp/src/io/orc/orc.cpp @@ -182,6 +182,19 @@ void ProtobufReader::read(timestamp_statistics& s, size_t maxlen) field_reader(5, s.minimum_nanos), field_reader(6, s.maximum_nanos)); function_builder(s, maxlen, op); + + // Adjust nanoseconds because they are encoded as (value + 1) + // Range [1, 1000'000] is translated here to [0, 999'999] + if (s.minimum_nanos.has_value()) { + auto& min_nanos = s.minimum_nanos.value(); + CUDF_EXPECTS(min_nanos >= 1 and min_nanos <= 1000'000, "Invalid minimum nanoseconds"); + --min_nanos; + } + if (s.maximum_nanos.has_value()) { + auto& max_nanos = s.maximum_nanos.value(); + CUDF_EXPECTS(max_nanos >= 1 and max_nanos <= 1000'000, "Invalid maximum nanoseconds"); + --max_nanos; + } } void ProtobufReader::read(column_statistics& s, size_t maxlen) diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index 6f65e384d2d..783ed4206b6 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -41,6 +41,12 @@ static constexpr uint32_t block_header_size = 3; // Seconds from January 1st, 1970 to January 1st, 2015 static constexpr int64_t orc_utc_epoch = 1420070400; +// Used for the nanosecond remainder in timestamp statistics when the actual nanoseconds of min/max +// are not included.
As the timestamp statistics are stored as milliseconds + nanosecond remainder, +// the maximum nanosecond remainder is 999,999 (nanoseconds in a millisecond - 1). +static constexpr int32_t DEFAULT_MIN_NANOS = 0; +static constexpr int32_t DEFAULT_MAX_NANOS = 999'999; + struct PostScript { uint64_t footerLength = 0; // the length of the footer section in bytes CompressionKind compression = NONE; // the kind of generic compression used diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index 479a2dfada3..429fd5b929d 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -27,6 +27,10 @@ namespace cudf::io::orc::gpu { using strings::detail::fixed_point_string_size; +// Nanosecond statistics should not be enabled until the spec version is set correctly in the output +// files. See https://github.com/rapidsai/cudf/issues/14325 for more details +constexpr bool enable_nanosecond_statistics = false; + constexpr unsigned int init_threads_per_group = 32; constexpr unsigned int init_groups_per_block = 4; constexpr unsigned int init_threads_per_block = init_threads_per_group * init_groups_per_block; @@ -96,8 +100,10 @@ __global__ void __launch_bounds__(block_size, 1) stats_len = pb_fldlen_common + pb_fld_hdrlen + 2 * (pb_fld_hdrlen + pb_fldlen_int64); break; case dtype_timestamp64: - stats_len = pb_fldlen_common + pb_fld_hdrlen + 4 * (pb_fld_hdrlen + pb_fldlen_int64) + - 2 * (pb_fld_hdrlen + pb_fldlen_int32); + stats_len = pb_fldlen_common + pb_fld_hdrlen + 4 * (pb_fld_hdrlen + pb_fldlen_int64); + if constexpr (enable_nanosecond_statistics) { + stats_len += 2 * (pb_fld_hdrlen + pb_fldlen_int32); + } break; case dtype_float32: case dtype_float64: @@ -405,7 +411,8 @@ __global__ void __launch_bounds__(encode_threads_per_block) // optional sint64 minimumUtc = 3; // min,max values saved as milliseconds since UNIX epoch // optional sint64 maximumUtc = 4; // optional int32 minimumNanos = 5; // lower 6 TS digits for min/max to achieve nanosecond - // precision optional int32 maximumNanos = 6; + // precision + // optional int32 maximumNanos = 6; // } if (s->chunk.has_minmax) { cur[0] = 9 * 8 + ProtofType::FIXEDLEN; @@ -416,12 +423,22 @@ __global__ void __launch_bounds__(encode_threads_per_block) split_nanosecond_timestamp(s->chunk.max_value.i_val); // minimum/maximum are the same as minimumUtc/maximumUtc as we always write files in UTC - cur = pb_put_int(cur, 1, min_ms); // minimum - cur = pb_put_int(cur, 2, max_ms); // maximum - cur = pb_put_int(cur, 3, min_ms); // minimumUtc - cur = pb_put_int(cur, 4, max_ms); // maximumUtc - cur = pb_put_int(cur, 5, min_ns_remainder); // minimumNanos - cur = pb_put_int(cur, 6, max_ns_remainder); // maximumNanos + cur = pb_put_int(cur, 1, min_ms); // minimum + cur = pb_put_int(cur, 2, max_ms); // maximum + cur = pb_put_int(cur, 3, min_ms); // minimumUtc + cur = pb_put_int(cur, 4, max_ms); // maximumUtc + + if constexpr (enable_nanosecond_statistics) { + if (min_ns_remainder != DEFAULT_MIN_NANOS) { + // using uint because positive values are not zigzag encoded + cur = pb_put_uint(cur, 5, min_ns_remainder + 1); // minimumNanos + } + if (max_ns_remainder != DEFAULT_MAX_NANOS) { + // using uint because positive values are not zigzag encoded + cur = pb_put_uint(cur, 6, max_ns_remainder + 1); // maximumNanos + } + } + fld_start[1] = cur - (fld_start + 2); } break; diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 234716749ff..dca3886db14 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ 
-1054,8 +1054,12 @@ TEST_F(OrcStatisticsTest, Basic) EXPECT_EQ(*ts4.maximum, 3); EXPECT_EQ(*ts4.minimum_utc, -4); EXPECT_EQ(*ts4.maximum_utc, 3); - EXPECT_EQ(*ts4.minimum_nanos, 999994); - EXPECT_EQ(*ts4.maximum_nanos, 6); + // nanosecond precision can't be included until we write a writer version that includes ORC-135 + // see https://github.com/rapidsai/cudf/issues/14325 + // EXPECT_EQ(*ts4.minimum_nanos, 999994); + EXPECT_FALSE(ts4.minimum_nanos.has_value()); + // EXPECT_EQ(*ts4.maximum_nanos, 6); + EXPECT_FALSE(ts4.maximum_nanos.has_value()); auto& s5 = stats[5]; EXPECT_EQ(*s5.number_of_values, 4ul); @@ -1065,8 +1069,12 @@ TEST_F(OrcStatisticsTest, Basic) EXPECT_EQ(*ts5.maximum, 3000); EXPECT_EQ(*ts5.minimum_utc, -3001); EXPECT_EQ(*ts5.maximum_utc, 3000); - EXPECT_EQ(*ts5.minimum_nanos, 994000); - EXPECT_EQ(*ts5.maximum_nanos, 6000); + // nanosecond precision can't be included until we write a writer version that includes ORC-135 + // see https://github.com/rapidsai/cudf/issues/14325 + // EXPECT_EQ(*ts5.minimum_nanos, 994000); + EXPECT_FALSE(ts5.minimum_nanos.has_value()); + // EXPECT_EQ(*ts5.maximum_nanos, 6000); + EXPECT_FALSE(ts5.maximum_nanos.has_value()); auto& s6 = stats[6]; EXPECT_EQ(*s6.number_of_values, 4ul); From 8deb3dd7573000e7d87f18a9e2bbe39cf2932e10 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 15 Nov 2023 07:56:37 -0600 Subject: [PATCH 103/118] Raise error in `reindex` when `index` is not unique (#14400) Fixes: #14398 This PR raises an error in `reindex` API when reindexing is performed on a non-unique index column. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/14400 --- python/cudf/cudf/core/indexed_frame.py | 4 ++++ python/cudf/cudf/tests/test_dataframe.py | 12 ++++++++++++ python/cudf/cudf/tests/test_series.py | 12 ++++++++++++ python/dask_cudf/dask_cudf/backends.py | 13 ++++--------- 4 files changed, 32 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 376bef6d0b2..4211a8c24bf 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -2607,6 +2607,10 @@ def _reindex( df = self if index is not None: + if not df._index.is_unique: + raise ValueError( + "cannot reindex on an axis with duplicate labels" + ) index = cudf.core.index.as_index( index, name=getattr(index, "name", self._index.name) ) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index d44cf594e8b..5677f97408a 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10723,3 +10723,15 @@ def test_dataframe_series_dot(): expected = gser @ [12, 13] assert_eq(expected, actual) + + +def test_dataframe_duplicate_index_reindex(): + gdf = cudf.DataFrame({"a": [0, 1, 2, 3]}, index=[0, 0, 1, 1]) + pdf = gdf.to_pandas() + + assert_exceptions_equal( + gdf.reindex, + pdf.reindex, + lfunc_args_and_kwargs=([10, 11, 12, 13], {}), + rfunc_args_and_kwargs=([10, 11, 12, 13], {}), + ) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 8f8f87c20e0..c15a797713f 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2638,3 +2638,15 @@ def test_series_setitem_mixed_bool_dtype(): s = cudf.Series([True, False, 
True]) with pytest.raises(TypeError): s[0] = 10 + + +def test_series_duplicate_index_reindex(): + gs = cudf.Series([0, 1, 2, 3], index=[0, 0, 1, 1]) + ps = gs.to_pandas() + + assert_exceptions_equal( + gs.reindex, + ps.reindex, + lfunc_args_and_kwargs=([10, 11, 12, 13], {}), + rfunc_args_and_kwargs=([10, 11, 12, 13], {}), + ) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index b0da82eaeee..387643587d1 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -427,17 +427,12 @@ def union_categoricals_cudf( ) -@_dask_cudf_nvtx_annotate -def safe_hash(frame): - return cudf.Series(frame.hash_values(), index=frame.index) - - @hash_object_dispatch.register((cudf.DataFrame, cudf.Series)) @_dask_cudf_nvtx_annotate def hash_object_cudf(frame, index=True): if index: - return safe_hash(frame.reset_index()) - return safe_hash(frame) + frame = frame.reset_index() + return frame.hash_values() @hash_object_dispatch.register(cudf.BaseIndex) @@ -445,10 +440,10 @@ def hash_object_cudf(frame, index=True): def hash_object_cudf_index(ind, index=None): if isinstance(ind, cudf.MultiIndex): - return safe_hash(ind.to_frame(index=False)) + return ind.to_frame(index=False).hash_values() col = cudf.core.column.as_column(ind) - return safe_hash(cudf.Series(col)) + return cudf.Series(col).hash_values() @group_split_dispatch.register((cudf.Series, cudf.DataFrame)) From 9e7f8a5fdd03d6a24630687621d0ee14c2db26d7 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 15 Nov 2023 15:27:28 -0800 Subject: [PATCH 104/118] Fix dask dependency in custreamz (#14420) #14407 added a dask dependency to custreamz, but it added too tight of a pinning by requiring the exact same version. This is not valid because rapids-dask-dependency won't release a new version corresponding to each new cudf release, so pinning to the exact same version up to the alpha creates an unsatisfiable constraint. 
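To make the constraint concrete: in RAPIDS conda recipes, `{{ version }}` renders the full per-package version string, while `{{ minor_version }}` renders only the `major.minor` release, which any `rapids-dask-dependency` build from the same release series can satisfy. A minimal illustration follows; the rendered version strings are hypothetical, not taken from this patch.

```
# Illustrative rendering of the custreamz run requirement, assuming a cudf
# version of 23.12.00a:
#   with ={{ version }}:        rapids-dask-dependency =23.12.00a   (exact pin; never published)
#   with ={{ minor_version }}:  rapids-dask-dependency =23.12       (prefix match; any 23.12.* release)
```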
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Ray Douglass (https://github.com/raydouglass) - Bradley Dice (https://github.com/bdice) - GALI PREM SAGAR (https://github.com/galipremsagar) --- conda/recipes/custreamz/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index b8c5918ea60..755394e3936 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -45,7 +45,7 @@ requirements: - streamz - cudf ={{ version }} - cudf_kafka ={{ version }} - - rapids-dask-dependency ={{ version }} + - rapids-dask-dependency ={{ minor_version }} - python-confluent-kafka >=1.9.0,<1.10.0a0 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} From f9c586d48aa2a879b2267318088d3cc38f398662 Mon Sep 17 00:00:00 2001 From: Ferdinand Xu Date: Thu, 16 Nov 2023 10:14:19 +0800 Subject: [PATCH 105/118] Support java AST String literal with desired encoding (#14402) Authors: - Ferdinand Xu (https://github.com/winningsix) - Jason Lowe (https://github.com/jlowe) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/14402 --- java/src/main/java/ai/rapids/cudf/ast/Literal.java | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/java/src/main/java/ai/rapids/cudf/ast/Literal.java b/java/src/main/java/ai/rapids/cudf/ast/Literal.java index 427dd286b0c..4e1e886c282 100644 --- a/java/src/main/java/ai/rapids/cudf/ast/Literal.java +++ b/java/src/main/java/ai/rapids/cudf/ast/Literal.java @@ -20,6 +20,7 @@ import java.nio.ByteBuffer; import java.nio.ByteOrder; +import java.nio.charset.StandardCharsets; /** A literal value in an AST expression. */ public final class Literal extends AstExpression { @@ -205,7 +206,14 @@ public static Literal ofString(String value) { if (value == null) { return ofNull(DType.STRING); } - byte[] stringBytes = value.getBytes(); + return ofUTF8String(value.getBytes(StandardCharsets.UTF_8)); + } + + /** Construct a string literal directly with byte array to skip transcoding. */ + public static Literal ofUTF8String(byte[] stringBytes) { + if (stringBytes == null) { + return ofNull(DType.STRING); + } byte[] serializedValue = new byte[stringBytes.length + Integer.BYTES]; ByteBuffer.wrap(serializedValue).order(ByteOrder.nativeOrder()).putInt(stringBytes.length); System.arraycopy(stringBytes, 0, serializedValue, Integer.BYTES, stringBytes.length); From afd7d189b83cbcccba783877f42bb153b5cf315e Mon Sep 17 00:00:00 2001 From: Divye Gala Date: Wed, 15 Nov 2023 23:33:28 -0500 Subject: [PATCH 106/118] Example code for blog on new row comparators (#13795) Example code using a few libcudf APIs to demonstrate nested-type usage. 
Authors: - Divye Gala (https://github.com/divyegala) - Karthikeyan (https://github.com/karthikeyann) - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) - Karthikeyan (https://github.com/karthikeyann) - Yunsong Wang (https://github.com/PointKernel) - Nghia Truong (https://github.com/ttnghia) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/13795 --- ci/release/update-version.sh | 3 +- cpp/examples/README.md | 1 + cpp/examples/basic/CMakeLists.txt | 18 +- cpp/examples/build.sh | 31 ++- cpp/examples/fetch_dependencies.cmake | 30 +++ cpp/examples/nested_types/CMakeLists.txt | 16 ++ cpp/examples/nested_types/deduplication.cpp | 209 ++++++++++++++++++++ cpp/examples/nested_types/example.json | 5 + cpp/examples/strings/CMakeLists.txt | 18 +- 9 files changed, 279 insertions(+), 52 deletions(-) create mode 100644 cpp/examples/fetch_dependencies.cmake create mode 100644 cpp/examples/nested_types/CMakeLists.txt create mode 100644 cpp/examples/nested_types/deduplication.cpp create mode 100644 cpp/examples/nested_types/example.json diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 4f1cbc47d1d..16742465c32 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -101,8 +101,7 @@ sed_runner "s/version == ${CURRENT_SHORT_TAG}/version == ${NEXT_SHORT_TAG}/g" RE sed_runner "s/cudf=${CURRENT_SHORT_TAG}/cudf=${NEXT_SHORT_TAG}/g" README.md # Libcudf examples update -sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/basic/CMakeLists.txt -sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/strings/CMakeLists.txt +sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/fetch_dependencies.cmake # CI files for FILE in .github/workflows/*.yaml; do diff --git a/cpp/examples/README.md b/cpp/examples/README.md index b2e8dd399d0..7f2b769f4a5 100644 --- a/cpp/examples/README.md +++ b/cpp/examples/README.md @@ -7,3 +7,4 @@ Current examples: - Basic: demonstrates a basic use case with libcudf and building a custom application with libcudf - Strings: demonstrates using libcudf for accessing and creating strings columns and for building custom kernels for strings +- Nested Types: demonstrates using libcudf for some operations on nested types diff --git a/cpp/examples/basic/CMakeLists.txt b/cpp/examples/basic/CMakeLists.txt index 9ff716f41e4..759a43b5627 100644 --- a/cpp/examples/basic/CMakeLists.txt +++ b/cpp/examples/basic/CMakeLists.txt @@ -8,23 +8,7 @@ project( LANGUAGES CXX CUDA ) -set(CPM_DOWNLOAD_VERSION v0.35.3) -file( - DOWNLOAD - https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake - ${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake -) -include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) - -set(CUDF_TAG branch-23.12) -CPMFindPackage( - NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf - GIT_TAG ${CUDF_TAG} - GIT_SHALLOW - TRUE - SOURCE_SUBDIR - cpp -) +include(../fetch_dependencies.cmake) # Configure your project here add_executable(basic_example src/process_csv.cpp) diff --git a/cpp/examples/build.sh b/cpp/examples/build.sh index 7d389cd318d..001cdeec694 100755 --- a/cpp/examples/build.sh +++ 
b/cpp/examples/build.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. # libcudf examples build script @@ -14,18 +14,17 @@ LIB_BUILD_DIR=${LIB_BUILD_DIR:-$(readlink -f "${EXAMPLES_DIR}/../build")} ################################################################################ # Add individual libcudf examples build scripts down below -# Basic example -BASIC_EXAMPLE_DIR=${EXAMPLES_DIR}/basic -BASIC_EXAMPLE_BUILD_DIR=${BASIC_EXAMPLE_DIR}/build -# Configure -cmake -S ${BASIC_EXAMPLE_DIR} -B ${BASIC_EXAMPLE_BUILD_DIR} -Dcudf_ROOT="${LIB_BUILD_DIR}" -# Build -cmake --build ${BASIC_EXAMPLE_BUILD_DIR} -j${PARALLEL_LEVEL} - -# Strings example -STRINGS_EXAMPLE_DIR=${EXAMPLES_DIR}/strings -STRINGS_EXAMPLE_BUILD_DIR=${STRINGS_EXAMPLE_DIR}/build -# Configure -cmake -S ${STRINGS_EXAMPLE_DIR} -B ${STRINGS_EXAMPLE_BUILD_DIR} -Dcudf_ROOT="${LIB_BUILD_DIR}" -# Build -cmake --build ${STRINGS_EXAMPLE_BUILD_DIR} -j${PARALLEL_LEVEL} +build_example() { + example_dir=${1} + example_dir="${EXAMPLES_DIR}/${example_dir}" + build_dir="${example_dir}/build" + + # Configure + cmake -S ${example_dir} -B ${build_dir} -Dcudf_ROOT="${LIB_BUILD_DIR}" + # Build + cmake --build ${build_dir} -j${PARALLEL_LEVEL} +} + +build_example basic +build_example strings +build_example nested_types diff --git a/cpp/examples/fetch_dependencies.cmake b/cpp/examples/fetch_dependencies.cmake new file mode 100644 index 00000000000..dc86c6a9aa5 --- /dev/null +++ b/cpp/examples/fetch_dependencies.cmake @@ -0,0 +1,30 @@ +# ============================================================================= +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= +set(CPM_DOWNLOAD_VERSION v0.35.3) +file( + DOWNLOAD + https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake + ${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake +) +include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) + +set(CUDF_TAG branch-23.12) +CPMFindPackage( + NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf + GIT_TAG ${CUDF_TAG} + GIT_SHALLOW + TRUE + SOURCE_SUBDIR + cpp +) diff --git a/cpp/examples/nested_types/CMakeLists.txt b/cpp/examples/nested_types/CMakeLists.txt new file mode 100644 index 00000000000..cb9430db237 --- /dev/null +++ b/cpp/examples/nested_types/CMakeLists.txt @@ -0,0 +1,16 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. 
+ +cmake_minimum_required(VERSION 3.26.4) + +project( + nested_types + VERSION 0.0.1 + LANGUAGES CXX CUDA +) + +include(../fetch_dependencies.cmake) + +# Configure your project here +add_executable(deduplication deduplication.cpp) +target_link_libraries(deduplication PRIVATE cudf::cudf) +target_compile_features(deduplication PRIVATE cxx_std_17) diff --git a/cpp/examples/nested_types/deduplication.cpp b/cpp/examples/nested_types/deduplication.cpp new file mode 100644 index 00000000000..5969985cc72 --- /dev/null +++ b/cpp/examples/nested_types/deduplication.cpp @@ -0,0 +1,209 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +/** + * @file deduplication.cpp + * @brief Demonstrates usage of the libcudf APIs to perform operations on nested-type tables. + * + * The algorithms chosen to be demonstrated are to showcase nested-type row operators of three + * kinds: + * 1. hashing: Used by functions `count_aggregate` and `join_count` to hash inputs of any type + * 2. equality: Used by functions `count_aggregate` and `join_count` in conjunction with hashing + * to determine equality for nested types + * 3. lexicographic: Used by function `sort_keys` to create a lexicographical order for nested-types + * so as to enable sorting + * + */ + +/** + * @brief Create memory resource for libcudf functions + * + * @param pool Whether to use a pool memory resource. 
+ * @return Memory resource instance + */ +std::shared_ptr create_memory_resource(bool pool) +{ + auto cuda_mr = std::make_shared(); + if (pool) { return rmm::mr::make_owning_wrapper(cuda_mr); } + return cuda_mr; +} + +/** + * @brief Read JSON input from file + * + * @param filepath path to input JSON file + * @return cudf::io::table_with_metadata + */ +cudf::io::table_with_metadata read_json(std::string filepath) +{ + auto source_info = cudf::io::source_info(filepath); + auto builder = cudf::io::json_reader_options::builder(source_info).lines(true); + auto options = builder.build(); + return cudf::io::read_json(options); +} + +/** + * @brief Write JSON output to file + * + * @param input table to write + * @param metadata metadata of input table read by JSON reader + * @param filepath path to output JSON file + */ +void write_json(cudf::table_view input, cudf::io::table_metadata metadata, std::string filepath) +{ + // write the data for inspection + auto sink_info = cudf::io::sink_info(filepath); + auto builder = cudf::io::json_writer_options::builder(sink_info, input).lines(true); + builder.metadata(metadata); + auto options = builder.build(); + cudf::io::write_json(options); +} + +/** + * @brief Aggregate count of duplicate rows in nested-type column + * + * @param input table to aggregate + * @return std::unique_ptr + */ +std::unique_ptr count_aggregate(cudf::table_view input) +{ + // Get count for each key + auto keys = cudf::table_view{{input.column(0)}}; + auto val = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32}, keys.num_rows()); + + cudf::groupby::groupby grpby_obj(keys); + std::vector requests; + requests.emplace_back(cudf::groupby::aggregation_request()); + auto agg = cudf::make_count_aggregation(); + requests[0].aggregations.push_back(std::move(agg)); + requests[0].values = *val; + auto agg_results = grpby_obj.aggregate(requests); + auto result_key = std::move(agg_results.first); + auto result_val = std::move(agg_results.second[0].results[0]); + + auto left_cols = result_key->release(); + left_cols.push_back(std::move(result_val)); + + return std::make_unique(std::move(left_cols)); +} + +/** + * @brief Join each row with its duplicate counts + * + * @param left left table + * @param right right table + * @return std::unique_ptr + */ +std::unique_ptr join_count(cudf::table_view left, cudf::table_view right) +{ + auto [left_indices, right_indices] = + cudf::inner_join(cudf::table_view{{left.column(0)}}, cudf::table_view{{right.column(0)}}); + auto new_left = cudf::gather(left, cudf::device_span{*left_indices}); + auto new_right = cudf::gather(right, cudf::device_span{*right_indices}); + + auto left_cols = new_left->release(); + auto right_cols = new_right->release(); + left_cols.push_back(std::move(right_cols[1])); + + return std::make_unique(std::move(left_cols)); +} + +/** + * @brief Sort nested-type column + * + * @param input table to sort + * @return std::unique_ptr + * + * @note if stability is desired, use `cudf::stable_sorted_order` + */ +std::unique_ptr sort_keys(cudf::table_view input) +{ + auto sort_order = cudf::sorted_order(cudf::table_view{{input.column(0)}}); + return cudf::gather(input, *sort_order); +} + +/** + * @brief Main for nested_types examples + * + * Command line parameters: + * 1. JSON input file name/path (default: "example.json") + * 2. JSON output file name/path (default: "output.json") + * 3. 
Memory resource (optional): "pool" or "cuda" (default: "pool") + * + * Example invocation from directory `cudf/cpp/examples/nested_types`: + * ./build/deduplication example.json output.json pool + * + */ +int main(int argc, char const** argv) +{ + std::string input_filepath; + std::string output_filepath; + std::string mr_name; + if (argc != 4 && argc != 1) { + std::cout << "Either provide all command-line arguments, or none to use defaults" << std::endl; + return 1; + } + if (argc == 1) { + input_filepath = "example.json"; + output_filepath = "output.json"; + mr_name = "pool"; + } else { + input_filepath = argv[1]; + output_filepath = argv[2]; + mr_name = argv[3]; + } + + auto pool = mr_name == "pool"; + auto resource = create_memory_resource(pool); + rmm::mr::set_current_device_resource(resource.get()); + + std::cout << "Reading " << input_filepath << "..." << std::endl; + // read input file + auto [input, metadata] = read_json(input_filepath); + + auto count = count_aggregate(input->view()); + + auto combined = join_count(input->view(), count->view()); + + auto sorted = sort_keys(combined->view()); + + metadata.schema_info.emplace_back("count"); + + std::cout << "Writing " << output_filepath << "..." << std::endl; + write_json(sorted->view(), metadata, output_filepath); + + return 0; +} diff --git a/cpp/examples/nested_types/example.json b/cpp/examples/nested_types/example.json new file mode 100644 index 00000000000..efaa37817d6 --- /dev/null +++ b/cpp/examples/nested_types/example.json @@ -0,0 +1,5 @@ +{"features": {"key": "a1", "values": [{"info": "message_1", "type": "device_a", "dt": 1688750001}]}, "source": "network_a", "quality": 0.7} +{"features": {"key": "a2", "values": [{"info": "message_2", "type": "device_a", "dt": 1688750002}]}, "source": "network_a", "quality": 0.7} +{"features": {"key": "a3", "values": [{"info": "message_3", "type": "device_a", "dt": 1688750003}]}, "source": "network_b", "quality": 0.8} +{"features": {"key": "a1", "values": [{"info": "message_1", "type": "device_a", "dt": 1688750001}]}, "source": "network_b", "quality": 0.9} +{"features": {"key": "a4", "values": [{"info": "message_4", "type": "device_a", "dt": 1688750004}]}, "source": "network_b", "quality": 0.9} diff --git a/cpp/examples/strings/CMakeLists.txt b/cpp/examples/strings/CMakeLists.txt index 4b500d3a92e..c90fa9dde16 100644 --- a/cpp/examples/strings/CMakeLists.txt +++ b/cpp/examples/strings/CMakeLists.txt @@ -8,23 +8,7 @@ project( LANGUAGES CXX CUDA ) -set(CPM_DOWNLOAD_VERSION v0.35.3) -file( - DOWNLOAD - https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake - ${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake -) -include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) - -set(CUDF_TAG branch-23.12) -CPMFindPackage( - NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf - GIT_TAG ${CUDF_TAG} - GIT_SHALLOW - TRUE - SOURCE_SUBDIR - cpp -) +include(../fetch_dependencies.cmake) list(APPEND CUDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr) From 53127de4d9e06f9fa172ac34952f85104eb7bac9 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 16 Nov 2023 08:28:44 -0600 Subject: [PATCH 107/118] Remove needs: wheel-build-cudf. (#14427) This PR fixes a nightly test failure due to an extraneous `needs:` entry in `test.yaml`. ``` Invalid workflow file: .github/workflows/test.yaml#L100 The workflow is not valid. .github/workflows/test.yaml (Line: 100, Col: 12): Job 'unit-tests-cudf-pandas' depends on unknown job 'wheel-build-cudf'. 
``` Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/14427 --- .github/workflows/test.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 0b6b55069f6..0d4401160e1 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -97,7 +97,6 @@ jobs: sha: ${{ inputs.sha }} script: ci/test_wheel_dask_cudf.sh unit-tests-cudf-pandas: - needs: wheel-build-cudf secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: From 8e1ef05b2b96775ce7e1a2f22894ec7a8ebb65a4 Mon Sep 17 00:00:00 2001 From: Divye Gala Date: Thu, 16 Nov 2023 16:43:29 -0500 Subject: [PATCH 108/118] Change `nullable()` to `has_nulls()` in `cudf::detail::gather` (#14363) In https://github.com/rapidsai/cudf/pull/13795, we found out that `nullable()` causes severe perf degradation for the nested-type case when the input is read from file via `cudf::io::read_json`. This is because the JSON reader adds a null mask for columns that don't have NULLs. This change is a no-overhead replacement that checks the actual null count instead of checking if a null mask is present. This PR also solves a bug in quantile/median groupby where NULLs were being [set](https://github.com/rapidsai/cudf/blob/8deb3dd7573000e7d87f18a9e2bbe39cf2932e10/cpp/src/groupby/sort/group_quantiles.cu#L73) but the null count was not updated. Authors: - Divye Gala (https://github.com/divyegala) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) - David Wendt (https://github.com/davidwendt) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14363 --- cpp/include/cudf/detail/gather.cuh | 22 ++++++---- cpp/include/cudf/detail/null_mask.hpp | 17 ++++++++ cpp/include/cudf/table/table_view.hpp | 17 ++++++++ cpp/src/bitmask/null_mask.cu | 15 +++++++ cpp/src/groupby/sort/group_quantiles.cu | 17 +++++--- cpp/tests/join/join_tests.cpp | 56 ------------------------- 6 files changed, 75 insertions(+), 69 deletions(-) diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 955f9914632..c9975ef2199 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -673,14 +673,20 @@ std::unique_ptr
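For background on the change below: `cudf::column_view::nullable()` reports only whether a validity bitmask is allocated, while null-count based checks such as `cudf::has_nested_nulls()` report whether any row is actually null. The following is a minimal sketch of that distinction, not code from this PR; the free-function names are invented for illustration, and it assumes only the public `table_view` iteration API plus the `has_nested_nulls` helper visible in the diff.

```
// Illustrative sketch only -- contrasts the two predicates this PR swaps.
#include <algorithm>
#include <cudf/table/table_view.hpp>

// Old-style check: true whenever any column merely owns a bitmask, even an
// all-valid one such as cudf::io::read_json produces, so columns with zero
// nulls still paid for the gather_bitmask pass.
bool any_column_nullable(cudf::table_view const& t)
{
  return std::any_of(t.begin(), t.end(), [](cudf::column_view const& col) {
    return col.nullable();
  });
}

// New-style check: true only when some value is actually null; an all-valid
// mask can instead be recreated cheaply.
bool any_column_has_nulls(cudf::table_view const& t) { return cudf::has_nested_nulls(t); }
```

In the diff, the cheap mask-presence check (`has_nested_nullable_columns`) gates the more precise null-count check, and columns whose masks contain no nulls are handled by the new `set_all_valid_null_masks` helper instead of `gather_bitmask`.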
gather(table_view const& source_table, mr)); } - auto const nullable = bounds_policy == out_of_bounds_policy::NULLIFY || - std::any_of(source_table.begin(), source_table.end(), [](auto const& col) { - return col.nullable(); - }); - if (nullable) { - auto const op = bounds_policy == out_of_bounds_policy::NULLIFY ? gather_bitmask_op::NULLIFY - : gather_bitmask_op::DONT_CHECK; - gather_bitmask(source_table, gather_map_begin, destination_columns, op, stream, mr); + auto needs_new_bitmask = bounds_policy == out_of_bounds_policy::NULLIFY || + cudf::has_nested_nullable_columns(source_table); + if (needs_new_bitmask) { + needs_new_bitmask = needs_new_bitmask || cudf::has_nested_nulls(source_table); + if (needs_new_bitmask) { + auto const op = bounds_policy == out_of_bounds_policy::NULLIFY + ? gather_bitmask_op::NULLIFY + : gather_bitmask_op::DONT_CHECK; + gather_bitmask(source_table, gather_map_begin, destination_columns, op, stream, mr); + } else { + for (size_type i = 0; i < source_table.num_columns(); ++i) { + set_all_valid_null_masks(source_table.column(i), *destination_columns[i], stream, mr); + } + } } return std::make_unique
(std::move(destination_columns)); diff --git a/cpp/include/cudf/detail/null_mask.hpp b/cpp/include/cudf/detail/null_mask.hpp index 8c10bbe416f..74e2ccd2ea1 100644 --- a/cpp/include/cudf/detail/null_mask.hpp +++ b/cpp/include/cudf/detail/null_mask.hpp @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include #include @@ -259,6 +260,22 @@ cudf::size_type inplace_bitmask_and(device_span dest_mask, size_type mask_size_bits, rmm::cuda_stream_view stream); +/** + * @brief Recursively set valid null masks for all children. + * + * This function applies all valid null masks to the output column if input column satisfies + * `nullable() == true` condition + * + * @param input input column to check for nullability + * @param output output column to mirror nullability of input + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +void set_all_valid_null_masks(column_view const& input, + column& output, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/table/table_view.hpp b/cpp/include/cudf/table/table_view.hpp index b90b2dac012..5d9c930d137 100644 --- a/cpp/include/cudf/table/table_view.hpp +++ b/cpp/include/cudf/table/table_view.hpp @@ -336,6 +336,23 @@ inline bool has_nested_nulls(table_view const& input) }); } +/** + * @brief Returns True if the table has a nullable column at any level of the column hierarchy + * + * @param input The table to check for nullable columns + * @return True if the table has nullable columns at any level of the column hierarchy, false + * otherwise + */ +inline bool has_nested_nullable_columns(table_view const& input) +{ + return std::any_of(input.begin(), input.end(), [](auto const& col) { + return col.nullable() || + std::any_of(col.child_begin(), col.child_end(), [](auto const& child_col) { + return has_nested_nullable_columns(table_view{{child_col}}); + }); + }); +} + /** * @brief The function to collect all nullable columns at all nested levels in a given table. 
* diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 3ff56eabe1e..1a1cbb17d15 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -510,6 +510,21 @@ std::pair bitmask_or(table_view const& view, return std::pair(std::move(null_mask), 0); } +void set_all_valid_null_masks(column_view const& input, + column& output, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (input.nullable()) { + auto mask = detail::create_null_mask(output.size(), mask_state::ALL_VALID, stream, mr); + output.set_null_mask(std::move(mask), 0); + + for (size_type i = 0; i < input.num_children(); ++i) { + set_all_valid_null_masks(input.child(i), output.child(i), stream, mr); + } + } +} + } // namespace detail // Create a bitmask from a specific range diff --git a/cpp/src/groupby/sort/group_quantiles.cu b/cpp/src/groupby/sort/group_quantiles.cu index a9edcfecbf7..a456d4b5964 100644 --- a/cpp/src/groupby/sort/group_quantiles.cu +++ b/cpp/src/groupby/sort/group_quantiles.cu @@ -49,6 +49,7 @@ struct calculate_quantile_fn { double const* d_quantiles; size_type num_quantiles; interpolation interpolation; + size_type* null_count; __device__ void operator()(size_type i) { @@ -68,11 +69,13 @@ struct calculate_quantile_fn { thrust::for_each_n(thrust::seq, thrust::make_counting_iterator(0), num_quantiles, - [d_result = d_result, segment_size, offset](size_type j) { - if (segment_size == 0) + [d_result = d_result, segment_size, offset, this](size_type j) { + if (segment_size == 0) { d_result.set_null(offset + j); - else + atomicAdd(this->null_count, 1); + } else { d_result.set_valid(offset + j); + } }); } }; @@ -104,6 +107,7 @@ struct quantiles_functor { auto values_view = column_device_view::create(values, stream); auto group_size_view = column_device_view::create(group_sizes, stream); auto result_view = mutable_column_device_view::create(result->mutable_view(), stream); + auto null_count = rmm::device_scalar(0, stream, mr); // For each group, calculate quantile if (!cudf::is_dictionary(values.type())) { @@ -118,7 +122,8 @@ struct quantiles_functor { group_offsets.data(), quantile.data(), static_cast(quantile.size()), - interpolation}); + interpolation, + null_count.data()}); } else { auto values_iter = cudf::dictionary::detail::make_dictionary_iterator(*values_view); thrust::for_each_n(rmm::exec_policy(stream), @@ -131,9 +136,11 @@ struct quantiles_functor { group_offsets.data(), quantile.data(), static_cast(quantile.size()), - interpolation}); + interpolation, + null_count.data()}); } + result->set_null_count(null_count.value(stream)); return result; } diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index 089db315748..a416df0c7c3 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -1941,62 +1941,6 @@ TEST_F(JoinTest, FullJoinWithStructsAndNulls) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } -TEST_F(JoinTest, Repro_StructsWithoutNullsPushedDown) -{ - // When joining on a STRUCT column, if the parent nulls are not reflected in - // the children, the join might produce incorrect results. - // - // In this test, a fact table of structs is joined against a dimension table. - // Both tables must match (only) on the NULL row. This will fail if the fact table's - // nulls are not pushed down into its children. 
- using ints = column_wrapper; - using structs = cudf::test::structs_column_wrapper; - using namespace cudf::test::iterators; - - auto make_table = [](auto&& col) { - auto columns = CVector{}; - columns.push_back(std::move(col)); - return cudf::table{std::move(columns)}; - }; - - auto const fact_table = [make_table] { - auto fact_ints = ints{0, 1, 2, 3, 4}; - auto fact_structs = structs{{fact_ints}, no_nulls()}.release(); - // Now set struct validity to invalidate index#3. - cudf::detail::set_null_mask( - fact_structs->mutable_view().null_mask(), 3, 4, false, cudf::get_default_stream()); - // Struct row#3 is null, but Struct.child has a non-null value. - return make_table(std::move(fact_structs)); - }(); - - auto const dimension_table = [make_table] { - auto dim_ints = ints{999}; - auto dim_structs = structs{{dim_ints}, null_at(0)}; - return make_table(dim_structs.release()); - }(); - - auto const result = inner_join(fact_table.view(), dimension_table.view(), {0}, {0}); - EXPECT_EQ(result->num_rows(), 1); // The null STRUCT rows should match. - - // Note: Join result might not have nulls pushed down, since it's an output of gather(). - // Must superimpose parent nulls before comparisons. - auto [superimposed_results, _] = cudf::structs::detail::push_down_nulls( - *result, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); - - auto const expected = [] { - auto fact_ints = ints{0}; - auto fact_structs = structs{{fact_ints}, null_at(0)}; - auto dim_ints = ints{0}; - auto dim_structs = structs{{dim_ints}, null_at(0)}; - auto columns = CVector{}; - columns.push_back(fact_structs.release()); - columns.push_back(dim_structs.release()); - return cudf::table{std::move(columns)}; - }(); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(superimposed_results, expected); -} - using lcw = cudf::test::lists_column_wrapper; using cudf::test::iterators::null_at; From bf63d1049db70c28ea961b677ad5f207aa648860 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 16 Nov 2023 14:47:36 -0800 Subject: [PATCH 109/118] Add decoder for DELTA_BYTE_ARRAY to Parquet reader (#14101) Part of #13501. Adds ability to decode DELTA_BYTE_ARRAY encoded pages. Authors: - Ed Seidl (https://github.com/etseidl) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - https://github.com/nvdbaranec - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/14101 --- cpp/src/io/parquet/delta_binary.cuh | 58 ++- cpp/src/io/parquet/page_data.cu | 12 +- cpp/src/io/parquet/page_decode.cuh | 12 +- cpp/src/io/parquet/page_delta_decode.cu | 490 +++++++++++++++++- cpp/src/io/parquet/page_hdr.cu | 17 +- cpp/src/io/parquet/page_string_decode.cu | 344 ++++++++++-- cpp/src/io/parquet/parquet_gpu.hpp | 79 ++- cpp/src/io/parquet/reader_impl.cpp | 38 +- cpp/src/io/parquet/reader_impl_preprocess.cu | 4 +- .../tests/data/parquet/delta_byte_arr.parquet | Bin 0 -> 5783 bytes python/cudf/cudf/tests/test_parquet.py | 104 ++++ 11 files changed, 1044 insertions(+), 114 deletions(-) create mode 100644 python/cudf/cudf/tests/data/parquet/delta_byte_arr.parquet diff --git a/cpp/src/io/parquet/delta_binary.cuh b/cpp/src/io/parquet/delta_binary.cuh index e3b23f4c0a0..ccc28791071 100644 --- a/cpp/src/io/parquet/delta_binary.cuh +++ b/cpp/src/io/parquet/delta_binary.cuh @@ -39,15 +39,15 @@ namespace cudf::io::parquet::detail { // per mini-block. 
While encoding, the lowest delta value is subtracted from all the deltas in the // block to ensure that all encoded values are positive. The deltas for each mini-block are bit // packed using the same encoding as the RLE/Bit-Packing Hybrid encoder. -// -// DELTA_BYTE_ARRAY encoding (incremental encoding or front compression), is used for BYTE_ARRAY -// columns. For each element in a sequence of strings, a prefix length from the preceding string -// and a suffix is stored. The prefix lengths are DELTA_BINARY_PACKED encoded. The suffixes are -// encoded with DELTA_LENGTH_BYTE_ARRAY encoding, which is a DELTA_BINARY_PACKED list of suffix -// lengths, followed by the concatenated suffix data. -// we decode one mini-block at a time. max mini-block size seen is 64. -constexpr int delta_rolling_buf_size = 128; +// The largest mini-block size we can currently support. +constexpr int max_delta_mini_block_size = 64; + +// The first pass decodes `values_per_mb` values, and then the second pass does another +// batch of size `values_per_mb`. The largest value for values_per_miniblock among the +// major writers seems to be 64, so 2 * 64 should be good. We save the first value separately +// since it is not encoded in the first mini-block. +constexpr int delta_rolling_buf_size = 2 * max_delta_mini_block_size; /** * @brief Read a ULEB128 varint integer @@ -90,7 +90,8 @@ struct delta_binary_decoder { uleb128_t mini_block_count; // usually 4, chosen such that block_size/mini_block_count is a // multiple of 32 uleb128_t value_count; // total values encoded in the block - zigzag128_t last_value; // last value decoded, initialized to first_value from header + zigzag128_t first_value; // initial value, stored in the header + zigzag128_t last_value; // last value decoded uint32_t values_per_mb; // block_size / mini_block_count, must be multiple of 32 uint32_t current_value_idx; // current value index, initialized to 0 at start of block @@ -102,6 +103,13 @@ struct delta_binary_decoder { uleb128_t value[delta_rolling_buf_size]; // circular buffer of delta values + // returns the value stored in the `value` array at index + // `rolling_index(idx)`. If `idx` is `0`, then return `first_value`. + constexpr zigzag128_t value_at(size_type idx) + { + return idx == 0 ? first_value : value[rolling_index(idx)]; + } + // returns the number of values encoded in the block data. when all_values is true, // account for the first value in the header. otherwise just count the values encoded // in the mini-block data. @@ -145,7 +153,8 @@ struct delta_binary_decoder { block_size = get_uleb128(d_start, d_end); mini_block_count = get_uleb128(d_start, d_end); value_count = get_uleb128(d_start, d_end); - last_value = get_zz128(d_start, d_end); + first_value = get_zz128(d_start, d_end); + last_value = first_value; current_value_idx = 0; values_per_mb = block_size / mini_block_count; @@ -179,6 +188,28 @@ struct delta_binary_decoder { } } + // given start/end pointers in the data, find the end of the binary encoded block. when done, + // `this` will be initialized with the correct start and end positions. returns the end, which is + // start of data/next block. should only be called from thread 0. + inline __device__ uint8_t const* find_end_of_block(uint8_t const* start, uint8_t const* end) + { + // read block header + init_binary_block(start, end); + + // test for no encoded values. a single value will be in the block header. 
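For reference, a host-side sketch of the ULEB128 and zigzag decoding that the block-header parsing above performs; these standalone helpers are illustrative rather than the device routines themselves:

#include <cstdint>

// Decode an unsigned LEB128 varint: 7 payload bits per byte, a set high bit
// means "more bytes follow".
inline uint64_t read_uleb128(uint8_t const*& p)
{
  uint64_t v = 0;
  int shift  = 0;
  uint8_t b;
  do {
    b = *p++;
    v |= static_cast<uint64_t>(b & 0x7f) << shift;
    shift += 7;
  } while (b & 0x80);
  return v;
}

// Undo zigzag encoding, which interleaves signed values as 0,-1,1,-2 -> 0,1,2,3.
inline int64_t unzigzag(uint64_t u)
{
  return static_cast<int64_t>(u >> 1) ^ -static_cast<int64_t>(u & 1);
}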
+ if (value_count <= 1) { return block_start; } + + // read mini-block headers and skip over data + while (current_value_idx < num_encoded_values(false)) { + setup_next_mini_block(false); + } + // calculate the correct end of the block + auto const* const new_end = cur_mb == 0 ? block_start : cur_mb_start; + // re-init block with correct end + init_binary_block(start, new_end); + return new_end; + } + // decode the current mini-batch of deltas, and convert to values. // called by all threads in a warp, currently only one warp supported. inline __device__ void calc_mini_block_values(int lane_id) @@ -186,12 +217,9 @@ struct delta_binary_decoder { using cudf::detail::warp_size; if (current_value_idx >= value_count) { return; } - // need to save first value from header on first pass + // need to account for the first value from header on first pass if (current_value_idx == 0) { - if (lane_id == 0) { - current_value_idx++; - value[0] = last_value; - } + if (lane_id == 0) { current_value_idx++; } __syncwarp(); if (current_value_idx >= value_count) { return; } } diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index a783b489c02..0c53877f7c7 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -449,8 +449,13 @@ __global__ void __launch_bounds__(decode_block_size) int out_thread0; [[maybe_unused]] null_count_back_copier _{s, t}; - if (!setupLocalPageInfo( - s, &pages[page_idx], chunks, min_row, num_rows, mask_filter{KERNEL_MASK_GENERAL}, true)) { + if (!setupLocalPageInfo(s, + &pages[page_idx], + chunks, + min_row, + num_rows, + mask_filter{decode_kernel_mask::GENERAL}, + true)) { return; } @@ -486,6 +491,7 @@ __global__ void __launch_bounds__(decode_block_size) target_pos = min(s->nz_count, src_pos + decode_block_size - out_thread0); if (out_thread0 > 32) { target_pos = min(target_pos, s->dict_pos); } } + // TODO(ets): see if this sync can be removed __syncthreads(); if (t < 32) { // decode repetition and definition levels. @@ -603,7 +609,7 @@ __global__ void __launch_bounds__(decode_block_size) } struct mask_tform { - __device__ uint32_t operator()(PageInfo const& p) { return p.kernel_mask; } + __device__ uint32_t operator()(PageInfo const& p) { return static_cast(p.kernel_mask); } }; } // anonymous namespace diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index ab1cc68923d..4db9bd3904b 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -991,8 +991,15 @@ struct all_types_filter { * @brief Functor for setupLocalPageInfo that takes a mask of allowed types. 
*/ struct mask_filter { - int mask; - __device__ inline bool operator()(PageInfo const& page) { return (page.kernel_mask & mask) != 0; } + uint32_t mask; + + __device__ mask_filter(uint32_t m) : mask(m) {} + __device__ mask_filter(decode_kernel_mask m) : mask(static_cast(m)) {} + + __device__ inline bool operator()(PageInfo const& page) + { + return BitAnd(mask, page.kernel_mask) != 0; + } }; /** @@ -1306,6 +1313,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, s->dict_run = 0; } break; case Encoding::DELTA_BINARY_PACKED: + case Encoding::DELTA_BYTE_ARRAY: // nothing to do, just don't error break; default: { diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu index bb5e5066b69..bc025c6fc3e 100644 --- a/cpp/src/io/parquet/page_delta_decode.cu +++ b/cpp/src/io/parquet/page_delta_decode.cu @@ -27,6 +27,277 @@ namespace cudf::io::parquet::detail { namespace { +constexpr int decode_block_size = 128; + +// DELTA_BYTE_ARRAY encoding (incremental encoding or front compression), is used for BYTE_ARRAY +// columns. For each element in a sequence of strings, a prefix length from the preceding string +// and a suffix is stored. The prefix lengths are DELTA_BINARY_PACKED encoded. The suffixes are +// encoded with DELTA_LENGTH_BYTE_ARRAY encoding, which is a DELTA_BINARY_PACKED list of suffix +// lengths, followed by the concatenated suffix data. +struct delta_byte_array_decoder { + uint8_t const* last_string; // pointer to last decoded string...needed for its prefix + uint8_t const* suffix_char_data; // pointer to the start of character data + + uint8_t* temp_buf; // buffer used when skipping values + uint32_t start_val; // decoded strings up to this index will be dumped to temp_buf + uint32_t last_string_len; // length of the last decoded string + + delta_binary_decoder prefixes; // state of decoder for prefix lengths + delta_binary_decoder suffixes; // state of decoder for suffix lengths + + // initialize the prefixes and suffixes blocks + __device__ void init(uint8_t const* start, uint8_t const* end, uint32_t start_idx, uint8_t* temp) + { + auto const* suffix_start = prefixes.find_end_of_block(start, end); + suffix_char_data = suffixes.find_end_of_block(suffix_start, end); + last_string = nullptr; + temp_buf = temp; + start_val = start_idx; + } + + // kind of like an inclusive scan for strings. takes prefix_len bytes from preceding + // string and prepends to the suffix we've already copied into place. called from + // within loop over values_in_mb, so this only needs to handle a single warp worth of data + // at a time. + __device__ void string_scan(uint8_t* strings_out, + uint8_t const* last_string, + uint32_t start_idx, + uint32_t end_idx, + uint32_t offset, + uint32_t lane_id) + { + using cudf::detail::warp_size; + + // let p(n) === length(prefix(string_n)) + // + // if p(n-1) > p(n), then string_n can be completed when string_n-2 is completed. likewise if + // p(m) > p(n), then string_n can be completed with string_m-1. however, if p(m) < p(n), then m + // is a "blocker" for string_n; string_n can be completed only after string_m is. + // + // we will calculate the nearest blocking position for each lane, and then fill in string_0. we + // then iterate, finding all lanes that have had their "blocker" filled in and completing them. + // when all lanes are filled in, we return. 
this will still hit the worst case if p(n-1) < p(n) + // for all n + __shared__ __align__(8) int64_t prefix_lens[warp_size]; + __shared__ __align__(8) uint8_t const* offsets[warp_size]; + + uint32_t const ln_idx = start_idx + lane_id; + uint64_t prefix_len = ln_idx < end_idx ? prefixes.value_at(ln_idx) : 0; + uint8_t* const lane_out = ln_idx < end_idx ? strings_out + offset : nullptr; + + prefix_lens[lane_id] = prefix_len; + offsets[lane_id] = lane_out; + + // if all prefix_len's are zero, then there's nothing to do + if (__all_sync(0xffff'ffff, prefix_len == 0)) { return; } + + // find a neighbor to the left that has a prefix length less than this lane. once that + // neighbor is complete, this lane can be completed. + int blocker = lane_id - 1; + while (blocker > 0 && prefix_lens[blocker] != 0 && prefix_len <= prefix_lens[blocker]) { + blocker--; + } + + // fill in lane 0 (if necessary) + if (lane_id == 0 && prefix_len > 0) { + memcpy(lane_out, last_string, prefix_len); + prefix_lens[0] = prefix_len = 0; + } + __syncwarp(); + + // now fill in blockers until done + for (uint32_t i = 1; i < warp_size && i + start_idx < end_idx; i++) { + if (prefix_len != 0 && prefix_lens[blocker] == 0 && lane_out != nullptr) { + memcpy(lane_out, offsets[blocker], prefix_len); + prefix_lens[lane_id] = prefix_len = 0; + } + + // check for finished + if (__all_sync(0xffff'ffff, prefix_len == 0)) { return; } + } + } + + // calculate a mini-batch of string values, writing the results to + // `strings_out`. starting at global index `start_idx` and decoding + // up to `num_values` strings. + // called by all threads in a warp. used for strings <= 32 chars. + // returns number of bytes written + __device__ size_t calculate_string_values(uint8_t* strings_out, + uint32_t start_idx, + uint32_t num_values, + uint32_t lane_id) + { + using cudf::detail::warp_size; + using WarpScan = cub::WarpScan; + __shared__ WarpScan::TempStorage scan_temp; + + if (start_idx >= suffixes.value_count) { return 0; } + auto end_idx = start_idx + min(suffixes.values_per_mb, num_values); + end_idx = min(end_idx, static_cast(suffixes.value_count)); + + auto p_strings_out = strings_out; + auto p_temp_out = temp_buf; + + auto copy_batch = [&](uint8_t* out, uint32_t idx, uint32_t end) { + uint32_t const ln_idx = idx + lane_id; + + // calculate offsets into suffix data + uint64_t const suffix_len = ln_idx < end ? suffixes.value_at(ln_idx) : 0; + uint64_t suffix_off = 0; + WarpScan(scan_temp).ExclusiveSum(suffix_len, suffix_off); + + // calculate offsets into string data + uint64_t const prefix_len = ln_idx < end ? prefixes.value_at(ln_idx) : 0; + uint64_t const string_len = prefix_len + suffix_len; + + // get offset into output for each lane + uint64_t string_off, warp_total; + WarpScan(scan_temp).ExclusiveSum(string_len, string_off, warp_total); + auto const so_ptr = out + string_off; + + // copy suffixes into string data + if (ln_idx < end) { memcpy(so_ptr + prefix_len, suffix_char_data + suffix_off, suffix_len); } + __syncwarp(); + + // copy prefixes into string data. + string_scan(out, last_string, idx, end, string_off, lane_id); + + // save the position of the last computed string. this will be used in + // the next iteration to reconstruct the string in lane 0. 
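To make the blocker rule in `string_scan` concrete: with hypothetical prefix lengths p = {0, 3, 5, 2}, lane 3 needs only 2 leading bytes; because p(1) and p(2) are both at least 2, those bytes are already present in string 0, so lane 3's blocker search walks back to lane 0 and lane 3 completes as soon as lane 0 does. A host-side sketch of the same search:

#include <cstdint>
#include <vector>

// Host analog of the blocker search: lane `lane` can finish once string `b` is done.
int find_blocker(std::vector<uint64_t> const& prefix_lens, int lane)
{
  int b = lane - 1;
  while (b > 0 && prefix_lens[b] != 0 && prefix_lens[lane] <= prefix_lens[b]) { --b; }
  return b;
}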
+ if (ln_idx == end - 1 || (ln_idx < end && lane_id == 31)) {
+ // set last_string to this lane's string
+ last_string = out + string_off;
+ last_string_len = string_len;
+ // and consume used suffix_char_data
+ suffix_char_data += suffix_off + suffix_len;
+ }
+
+ return warp_total;
+ };
+
+ uint64_t string_total = 0;
+ for (int idx = start_idx; idx < end_idx; idx += warp_size) {
+ auto const n_in_batch = min(warp_size, end_idx - idx);
+ // account for the case where start_val occurs in the middle of this batch
+ if (idx < start_val && idx + n_in_batch > start_val) {
+ // dump idx...start_val into temp_buf
+ copy_batch(p_temp_out, idx, start_val);
+ __syncwarp();
+
+ // start_val...idx + n_in_batch into strings_out
+ auto nbytes = copy_batch(p_strings_out, start_val, idx + n_in_batch);
+ p_strings_out += nbytes;
+ string_total = nbytes;
+ } else {
+ if (idx < start_val) {
+ p_temp_out += copy_batch(p_temp_out, idx, end_idx);
+ } else {
+ auto nbytes = copy_batch(p_strings_out, idx, end_idx);
+ p_strings_out += nbytes;
+ string_total += nbytes;
+ }
+ }
+ __syncwarp();
+ }
+
+ return string_total;
+ }
+
+ // character parallel version of calculate_string_values(). This is faster for strings longer than
+ // 32 chars.
+ __device__ size_t calculate_string_values_cp(uint8_t* strings_out,
+ uint32_t start_idx,
+ uint32_t num_values,
+ uint32_t lane_id)
+ {
+ using cudf::detail::warp_size;
+ __shared__ __align__(8) uint8_t* so_ptr;
+
+ if (start_idx >= suffixes.value_count) { return 0; }
+ auto end_idx = start_idx + min(suffixes.values_per_mb, num_values);
+ end_idx = min(end_idx, static_cast<uint32_t>(suffixes.value_count));
+
+ if (lane_id == 0) { so_ptr = start_idx < start_val ? temp_buf : strings_out; }
+ __syncwarp();
+
+ uint64_t string_total = 0;
+ for (int idx = start_idx; idx < end_idx; idx++) {
+ uint64_t const suffix_len = suffixes.value_at(idx);
+ uint64_t const prefix_len = prefixes.value_at(idx);
+ uint64_t const string_len = prefix_len + suffix_len;
+
+ // copy prefix and suffix data into current strings_out position
+ // for longer strings use a 4-byte version stolen from gather_chars_fn_string_parallel.
+ if (string_len > 64) {
+ if (prefix_len > 0) { wideStrcpy(so_ptr, last_string, prefix_len, lane_id); }
+ if (suffix_len > 0) {
+ wideStrcpy(so_ptr + prefix_len, suffix_char_data, suffix_len, lane_id);
+ }
+ } else {
+ for (int i = lane_id; i < string_len; i += warp_size) {
+ so_ptr[i] = i < prefix_len ? last_string[i] : suffix_char_data[i - prefix_len];
+ }
+ }
+ __syncwarp();
+
+ if (idx >= start_val) { string_total += string_len; }
+
+ if (lane_id == 0) {
+ last_string = so_ptr;
+ last_string_len = string_len;
+ suffix_char_data += suffix_len;
+ if (idx == start_val - 1) {
+ so_ptr = strings_out;
+ } else {
+ so_ptr += string_len;
+ }
+ }
+ __syncwarp();
+ }
+
+ return string_total;
+ }
+
+ // dump strings before start_val to temp buf
+ __device__ void skip(bool use_char_ll)
+ {
+ using cudf::detail::warp_size;
+ int const t = threadIdx.x;
+ int const lane_id = t % warp_size;
+
+ // is this even necessary? return if asking to skip the whole block.
+ if (start_val >= prefixes.num_encoded_values(true)) { return; }
+
+ // prefixes and suffixes will have the same parameters (it's checked earlier)
+ auto const batch_size = prefixes.values_per_mb;
+
+ uint32_t skip_pos = 0;
+ while (prefixes.current_value_idx < start_val) {
+ // warp 0 gets prefixes and warp 1 gets suffixes
+ auto* const db = t < 32 ?
&prefixes : &suffixes; + + // this will potentially decode past start_val, but that's ok + if (t < 64) { db->decode_batch(); } + __syncthreads(); + + // warp 0 decodes the batch. + if (t < 32) { + auto const num_to_decode = min(batch_size, start_val - skip_pos); + auto const bytes_written = + use_char_ll ? calculate_string_values_cp(temp_buf, skip_pos, num_to_decode, lane_id) + : calculate_string_values(temp_buf, skip_pos, num_to_decode, lane_id); + // store last_string someplace safe in temp buffer + if (t == 0) { + memcpy(temp_buf + bytes_written, last_string, last_string_len); + last_string = temp_buf + bytes_written; + } + } + skip_pos += prefixes.values_per_mb; + __syncthreads(); + } + } +}; + // Decode page data that is DELTA_BINARY_PACKED encoded. This encoding is // only used for int32 and int64 physical types (and appears to only be used // with V2 page headers; see https://www.mail-archive.com/dev@parquet.apache.org/msg11826.html). @@ -52,13 +323,9 @@ __global__ void __launch_bounds__(96) auto* const db = &db_state; [[maybe_unused]] null_count_back_copier _{s, t}; - if (!setupLocalPageInfo(s, - &pages[page_idx], - chunks, - min_row, - num_rows, - mask_filter{KERNEL_MASK_DELTA_BINARY}, - true)) { + auto const mask = decode_kernel_mask::DELTA_BINARY; + if (!setupLocalPageInfo( + s, &pages[page_idx], chunks, min_row, num_rows, mask_filter{mask}, true)) { return; } @@ -78,6 +345,10 @@ __global__ void __launch_bounds__(96) __syncthreads(); auto const batch_size = db->values_per_mb; + if (batch_size > max_delta_mini_block_size) { + set_error(static_cast(decode_error::DELTA_PARAMS_UNSUPPORTED), error_code); + return; + } // if skipped_leaf_values is non-zero, then we need to decode up to the first mini-block // that has a value we need. @@ -93,6 +364,7 @@ __global__ void __launch_bounds__(96) } else { // warp2 target_pos = min(s->nz_count, src_pos + batch_size); } + // TODO(ets): see if this sync can be removed __syncthreads(); // warp0 will decode the rep/def levels, warp1 will unpack a mini-batch of deltas. @@ -125,23 +397,12 @@ __global__ void __launch_bounds__(96) // place value for this thread if (dst_pos >= 0 && sp < target_pos) { void* const dst = nesting_info_base[leaf_level_index].data_out + dst_pos * s->dtype_len; + auto const val = db->value_at(sp + skipped_leaf_values); switch (s->dtype_len) { - case 1: - *static_cast(dst) = - db->value[rolling_index(sp + skipped_leaf_values)]; - break; - case 2: - *static_cast(dst) = - db->value[rolling_index(sp + skipped_leaf_values)]; - break; - case 4: - *static_cast(dst) = - db->value[rolling_index(sp + skipped_leaf_values)]; - break; - case 8: - *static_cast(dst) = - db->value[rolling_index(sp + skipped_leaf_values)]; - break; + case 1: *static_cast(dst) = val; break; + case 2: *static_cast(dst) = val; break; + case 4: *static_cast(dst) = val; break; + case 8: *static_cast(dst) = val; break; } } } @@ -154,6 +415,164 @@ __global__ void __launch_bounds__(96) if (t == 0 and s->error != 0) { set_error(s->error, error_code); } } +// Decode page data that is DELTA_BYTE_ARRAY packed. This encoding consists of a DELTA_BINARY_PACKED +// array of prefix lengths, followed by a DELTA_BINARY_PACKED array of suffix lengths, followed by +// the suffixes (technically the suffixes are DELTA_LENGTH_BYTE_ARRAY encoded). The latter two can +// be used to create an offsets array for the suffix data, but then this needs to be combined with +// the prefix lengths to do the final decode for each value. 
Because the lengths of the prefixes and +// suffixes are not encoded in the header, we're going to have to first do a quick pass through them +// to find the start/end of each structure. +template +__global__ void __launch_bounds__(decode_block_size) + gpuDecodeDeltaByteArray(PageInfo* pages, + device_span chunks, + size_t min_row, + size_t num_rows, + int32_t* error_code) +{ + using cudf::detail::warp_size; + __shared__ __align__(16) delta_byte_array_decoder db_state; + __shared__ __align__(16) page_state_s state_g; + __shared__ __align__(16) page_state_buffers_s state_buffers; + + page_state_s* const s = &state_g; + auto* const sb = &state_buffers; + int const page_idx = blockIdx.x; + int const t = threadIdx.x; + int const lane_id = t % warp_size; + auto* const prefix_db = &db_state.prefixes; + auto* const suffix_db = &db_state.suffixes; + auto* const dba = &db_state; + [[maybe_unused]] null_count_back_copier _{s, t}; + + auto const mask = decode_kernel_mask::DELTA_BYTE_ARRAY; + if (!setupLocalPageInfo( + s, &pages[page_idx], chunks, min_row, num_rows, mask_filter{mask}, true)) { + return; + } + + bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; + + // choose a character parallel string copy when the average string is longer than a warp + auto const use_char_ll = (s->page.str_bytes / s->page.num_valids) > cudf::detail::warp_size; + + // copying logic from gpuDecodePageData. + PageNestingDecodeInfo const* nesting_info_base = s->nesting_info; + + __shared__ level_t rep[delta_rolling_buf_size]; // circular buffer of repetition level values + __shared__ level_t def[delta_rolling_buf_size]; // circular buffer of definition level values + + // skipped_leaf_values will always be 0 for flat hierarchies. + uint32_t const skipped_leaf_values = s->page.skipped_leaf_values; + + if (t == 0) { + // initialize the prefixes and suffixes blocks + dba->init(s->data_start, s->data_end, s->page.start_val, s->page.temp_string_buf); + } + __syncthreads(); + + // assert that prefix and suffix have same mini-block size + if (prefix_db->values_per_mb != suffix_db->values_per_mb or + prefix_db->block_size != suffix_db->block_size or + prefix_db->value_count != suffix_db->value_count) { + set_error(static_cast(decode_error::DELTA_PARAM_MISMATCH), error_code); + return; + } + + // pointer to location to output final strings + int const leaf_level_index = s->col.max_nesting_depth - 1; + auto strings_data = nesting_info_base[leaf_level_index].string_out; + + auto const batch_size = prefix_db->values_per_mb; + if (batch_size > max_delta_mini_block_size) { + set_error(static_cast(decode_error::DELTA_PARAMS_UNSUPPORTED), error_code); + return; + } + + // if this is a bounds page and nested, then we need to skip up front. non-nested will work + // its way through the page. + int string_pos = has_repetition ? 
s->page.start_val : 0; + auto const is_bounds_pg = is_bounds_page(s, min_row, num_rows, has_repetition); + if (is_bounds_pg && string_pos > 0) { dba->skip(use_char_ll); } + + while (!s->error && (s->input_value_count < s->num_input_values || s->src_pos < s->nz_count)) { + uint32_t target_pos; + uint32_t const src_pos = s->src_pos; + + if (t < 3 * warp_size) { // warp 0..2 + target_pos = min(src_pos + 2 * batch_size, s->nz_count + s->first_row + batch_size); + } else { // warp 3 + target_pos = min(s->nz_count, src_pos + batch_size); + } + // TODO(ets): see if this sync can be removed + __syncthreads(); + + // warp0 will decode the rep/def levels, warp1 will unpack a mini-batch of prefixes, warp 2 will + // unpack a mini-batch of suffixes. warp3 waits one cycle for warps 0-2 to produce a batch, and + // then stuffs values into the proper location in the output. + if (t < warp_size) { + // decode repetition and definition levels. + // - update validity vectors + // - updates offsets (for nested columns) + // - produces non-NULL value indices in s->nz_idx for subsequent decoding + gpuDecodeLevels(s, sb, target_pos, rep, def, t); + + } else if (t < 2 * warp_size) { + // warp 1 + prefix_db->decode_batch(); + + } else if (t < 3 * warp_size) { + // warp 2 + suffix_db->decode_batch(); + + } else if (src_pos < target_pos) { + // warp 3 + + int const nproc = min(batch_size, s->page.end_val - string_pos); + strings_data += use_char_ll + ? dba->calculate_string_values_cp(strings_data, string_pos, nproc, lane_id) + : dba->calculate_string_values(strings_data, string_pos, nproc, lane_id); + string_pos += nproc; + + // process the mini-block in batches of 32 + for (uint32_t sp = src_pos + lane_id; sp < src_pos + batch_size; sp += 32) { + // the position in the output column/buffer + int dst_pos = sb->nz_idx[rolling_index(sp)]; + + // handle skip_rows here. flat hierarchies can just skip up to first_row. + if (!has_repetition) { dst_pos -= s->first_row; } + + if (dst_pos >= 0 && sp < target_pos) { + auto const offptr = + reinterpret_cast(nesting_info_base[leaf_level_index].data_out) + dst_pos; + auto const src_idx = sp + skipped_leaf_values; + *offptr = prefix_db->value_at(src_idx) + suffix_db->value_at(src_idx); + } + __syncwarp(); + } + + if (lane_id == 0) { s->src_pos = src_pos + batch_size; } + } + + __syncthreads(); + } + + // now turn array of lengths into offsets + int value_count = nesting_info_base[leaf_level_index].value_count; + + // if no repetition we haven't calculated start/end bounds and instead just skipped + // values until we reach first_row. account for that here. 
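The epilogue that follows converts per-row string lengths into offsets with an exclusive prefix sum (`block_excl_sum`); a host-side equivalent of that final step, for reference:

#include <numeric>
#include <vector>

// lengths {5, 3, 4} with base 0 -> offsets {0, 5, 8}; row i's characters then
// occupy [offsets[i], offsets[i] + lengths[i]) in the output buffer.
std::vector<int> to_offsets(std::vector<int> const& lengths, int base)
{
  std::vector<int> offsets(lengths.size());
  std::exclusive_scan(lengths.begin(), lengths.end(), offsets.begin(), base);
  return offsets;
}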
+ if (!has_repetition) { value_count -= s->first_row; } + + auto const offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out); + block_excl_sum(offptr, value_count, s->page.str_offset); + + if (t == 0 and s->error != 0) { + cuda::atomic_ref ref{*error_code}; + ref.fetch_or(s->error, cuda::std::memory_order_relaxed); + } +} + } // anonymous namespace /** @@ -181,4 +600,29 @@ void __host__ DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages } } +/** + * @copydoc cudf::io::parquet::gpu::DecodeDeltaByteArray + */ +void __host__ DecodeDeltaByteArray(cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector const& chunks, + size_t num_rows, + size_t min_row, + int level_type_size, + int32_t* error_code, + rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); + + dim3 const dim_block(decode_block_size, 1); + dim3 const dim_grid(pages.size(), 1); // 1 threadblock per page + + if (level_type_size == 1) { + gpuDecodeDeltaByteArray<<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } else { + gpuDecodeDeltaByteArray<<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } +} + } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 22add2fffc6..595dd40cdc2 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -146,18 +146,21 @@ __device__ void skip_struct_field(byte_stream_s* bs, int field_type) * @param chunk Column chunk the page belongs to * @return `kernel_mask_bits` value for the given page */ -__device__ uint32_t kernel_mask_for_page(PageInfo const& page, ColumnChunkDesc const& chunk) +__device__ decode_kernel_mask kernel_mask_for_page(PageInfo const& page, + ColumnChunkDesc const& chunk) { - if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { return 0; } + if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { return decode_kernel_mask::NONE; } if (page.encoding == Encoding::DELTA_BINARY_PACKED) { - return KERNEL_MASK_DELTA_BINARY; + return decode_kernel_mask::DELTA_BINARY; + } else if (page.encoding == Encoding::DELTA_BYTE_ARRAY) { + return decode_kernel_mask::DELTA_BYTE_ARRAY; } else if (is_string_col(chunk)) { - return KERNEL_MASK_STRING; + return decode_kernel_mask::STRING; } // non-string, non-delta - return KERNEL_MASK_GENERAL; + return decode_kernel_mask::GENERAL; } /** @@ -380,7 +383,9 @@ __global__ void __launch_bounds__(128) bs->page.skipped_values = -1; bs->page.skipped_leaf_values = 0; bs->page.str_bytes = 0; - bs->page.kernel_mask = 0; + bs->page.temp_string_size = 0; + bs->page.temp_string_buf = nullptr; + bs->page.kernel_mask = decode_kernel_mask::NONE; } num_values = bs->ck.num_values; page_info = bs->ck.page_info; diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 4c7d8e3c20a..e29db042401 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -14,20 +14,28 @@ * limitations under the License. 
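Before the preprocessing changes below, some intuition: a serial host-side sketch of the DELTA_BYTE_ARRAY reconstruction that the warp routines above parallelize, assuming the prefix and suffix lengths have already been delta-decoded:

#include <string>
#include <vector>

// Serial reference: output[i] = first prefix_len[i] bytes of output[i-1]
// followed by the next suffix_len[i] bytes of the concatenated suffix stream.
std::vector<std::string> decode_delta_byte_array(std::vector<size_t> const& prefix_len,
                                                 std::vector<size_t> const& suffix_len,
                                                 std::string const& suffix_data)
{
  std::vector<std::string> out;
  size_t pos = 0;
  for (size_t i = 0; i < prefix_len.size(); ++i) {
    std::string s = out.empty() ? std::string{} : out.back().substr(0, prefix_len[i]);
    s += suffix_data.substr(pos, suffix_len[i]);
    pos += suffix_len[i];
    out.push_back(std::move(s));
  }
  return out;
}
// e.g. prefixes {0, 5, 5}, suffixes {5, 3, 4}, suffix data "HelloKitwork"
//      decodes to "Hello", "HelloKit", "Hellowork".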
*/ +#include "delta_binary.cuh" #include "page_decode.cuh" #include "page_string_utils.cuh" #include +#include #include +#include +#include + +#include + namespace cudf::io::parquet::detail { namespace { -constexpr int preprocess_block_size = 512; -constexpr int decode_block_size = 128; -constexpr int rolling_buf_size = decode_block_size * 2; -constexpr int preproc_buf_size = LEVEL_DECODE_BUF_SIZE; +constexpr int preprocess_block_size = 512; +constexpr int decode_block_size = 128; +constexpr int delta_preproc_block_size = 64; +constexpr int rolling_buf_size = decode_block_size * 2; +constexpr int preproc_buf_size = LEVEL_DECODE_BUF_SIZE; /** * @brief Compute the start and end page value bounds for this page @@ -450,12 +458,107 @@ __device__ size_t totalPlainEntriesSize(uint8_t const* data, } /** - * @brief Kernel for computing string page output size information. + * @brief Compute string size information for DELTA_BYTE_ARRAY encoded strings. + * + * This traverses the packed prefix and suffix lengths, summing them to obtain the total + * number of bytes needed for the decoded string data. It also calculates an upper bound + * for the largest string length to obtain an upper bound on temporary space needed if + * rows will be skipped. + * + * Called with 64 threads. + * + * @param data Pointer to the start of the page data stream + * @param end Pointer to the end of the page data stream + * @param start_value Do not count values that occur before this index + * @param end_value Do not count values that occur after this index + * @return A pair of `size_t` values representing the total string size and temp buffer size + * required for decoding + */ +__device__ thrust::pair totalDeltaByteArraySize(uint8_t const* data, + uint8_t const* end, + int start_value, + int end_value) +{ + using cudf::detail::warp_size; + using WarpReduce = cub::WarpReduce; + __shared__ typename WarpReduce::TempStorage temp_storage[2]; + + __shared__ __align__(16) delta_binary_decoder prefixes; + __shared__ __align__(16) delta_binary_decoder suffixes; + + int const t = threadIdx.x; + int const lane_id = t % warp_size; + int const warp_id = t / warp_size; + + if (t == 0) { + auto const* suffix_start = prefixes.find_end_of_block(data, end); + suffixes.init_binary_block(suffix_start, end); + } + __syncthreads(); + + // two warps will traverse the prefixes and suffixes and sum them up + auto const db = t < warp_size ? &prefixes : t < 2 * warp_size ? &suffixes : nullptr; + + size_t total_bytes = 0; + uleb128_t max_len = 0; + + if (db != nullptr) { + // initialize with first value (which is stored in last_value) + if (lane_id == 0 && start_value == 0) { total_bytes = db->last_value; } + + uleb128_t lane_sum = 0; + uleb128_t lane_max = 0; + while (db->current_value_idx < end_value && + db->current_value_idx < db->num_encoded_values(true)) { + // calculate values for current mini-block + db->calc_mini_block_values(lane_id); + + // get per lane sum for mini-block + for (uint32_t i = 0; i < db->values_per_mb; i += 32) { + uint32_t const idx = db->current_value_idx + i + lane_id; + if (idx >= start_value && idx < end_value && idx < db->value_count) { + lane_sum += db->value[rolling_index(idx)]; + lane_max = max(lane_max, db->value[rolling_index(idx)]); + } + } + + if (lane_id == 0) { db->setup_next_mini_block(true); } + __syncwarp(); + } + + // get sum for warp. + // note: warp_sum will only be valid on lane 0. 
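A brief note on the reduction semantics here: with `cub::WarpReduce`, only lane 0 receives the aggregate, which is why only lane 0 folds the results into the totals below. A minimal single-warp demo kernel (hypothetical, not part of this file):

#include <cub/cub.cuh>

// Launch with a single warp (blockDim.x == 32) for this sketch.
__global__ void warp_sum_demo(int const* in, int* out)
{
  using WarpReduce = cub::WarpReduce<int>;
  __shared__ WarpReduce::TempStorage temp;
  int const sum = WarpReduce(temp).Sum(in[threadIdx.x]);
  // Only lane 0 holds the aggregate; other lanes see an unspecified value.
  if (threadIdx.x == 0) { *out = sum; }
}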
+ auto const warp_sum = WarpReduce(temp_storage[warp_id]).Sum(lane_sum); + auto const warp_max = WarpReduce(temp_storage[warp_id]).Reduce(lane_max, cub::Max()); + + if (lane_id == 0) { + total_bytes += warp_sum; + max_len = warp_max; + } + } + __syncthreads(); + + // now sum up total_bytes from the two warps + auto const final_bytes = + cudf::detail::single_lane_block_sum_reduce(total_bytes); + + // Sum up prefix and suffix max lengths to get a max possible string length. Multiply that + // by the number of strings in a mini-block, plus one to save the last string. + auto const temp_bytes = + cudf::detail::single_lane_block_sum_reduce(max_len) * + (db->values_per_mb + 1); + + return {final_bytes, temp_bytes}; +} + +/** + * @brief Kernel for computing string page bounds information. * - * String columns need accurate data size information to preallocate memory in the column buffer to - * store the char data. This calls a kernel to calculate information needed by the string decoding - * kernel. On exit, the `str_bytes`, `num_nulls`, and `num_valids` fields of the PageInfo struct - * are updated. This call ignores non-string columns. + * This kernel traverses the repetition and definition level data to determine start and end values + * for pages with string-like data. Also calculates the number of null and valid values in the + * page. Does nothing if the page mask is neither `STRING` nor `DELTA_BYTE_ARRAY`. On exit the + * `num_nulls`, `num_valids`, `start_val` and `end_val` fields of the `PageInfo` struct will be + * populated. * * @param pages All pages to be decoded * @param chunks All chunks to be decoded @@ -464,7 +567,7 @@ __device__ size_t totalPlainEntriesSize(uint8_t const* data, * @tparam level_t Type used to store decoded repetition and definition levels */ template -__global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSizes( +__global__ void __launch_bounds__(preprocess_block_size) gpuComputeStringPageBounds( PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) { __shared__ __align__(16) page_state_s state_g; @@ -474,8 +577,13 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz int const t = threadIdx.x; PageInfo* const pp = &pages[page_idx]; - // reset str_bytes to 0 in case it's already been calculated - if (t == 0) { pp->str_bytes = 0; } + if (t == 0) { + s->page.num_nulls = 0; + s->page.num_valids = 0; + // reset str_bytes to 0 in case it's already been calculated (esp needed for chunked reads). 
+ // TODO: need to rethink this once str_bytes is in the statistics + pp->str_bytes = 0; + } // whether or not we have repetition levels (lists) bool const has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0; @@ -491,23 +599,11 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz {rep_runs}}; // setup page info - if (!setupLocalPageInfo( - s, pp, chunks, min_row, num_rows, mask_filter{KERNEL_MASK_STRING}, false)) { - return; - } - - if (!t) { - s->page.num_nulls = 0; - s->page.num_valids = 0; - s->page.str_bytes = 0; - } - __syncthreads(); + auto const mask = BitOr(decode_kernel_mask::STRING, decode_kernel_mask::DELTA_BYTE_ARRAY); + if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, mask_filter{mask}, true)) { return; } bool const is_bounds_pg = is_bounds_page(s, min_row, num_rows, has_repetition); - // if we're skipping this page anyway, no need to count it - if (!is_bounds_pg && !is_page_contained(s, min_row, num_rows)) { return; } - // find start/end value indices auto const [start_value, end_value] = page_bounds(s, min_row, num_rows, is_bounds_pg, has_repetition, decoders); @@ -516,7 +612,106 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz if (t == 0) { pp->num_nulls = s->page.num_nulls; pp->num_valids = s->page.num_valids; + pp->start_val = start_value; + pp->end_val = end_value; } +} + +/** + * @brief Kernel for computing string page output size information for delta_byte_array encoding. + * + * This call ignores columns that are not DELTA_BYTE_ARRAY encoded. On exit the `str_bytes` field + * of the `PageInfo` struct will be populated. Also fills in the `temp_string_size` field if rows + * are to be skipped. + * + * @param pages All pages to be decoded + * @param chunks All chunks to be decoded + * @param min_rows crop all rows below min_row + * @param num_rows Maximum number of rows to read + */ +__global__ void __launch_bounds__(delta_preproc_block_size) gpuComputeDeltaPageStringSizes( + PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) +{ + __shared__ __align__(16) page_state_s state_g; + + page_state_s* const s = &state_g; + int const page_idx = blockIdx.x; + int const t = threadIdx.x; + PageInfo* const pp = &pages[page_idx]; + + // whether or not we have repetition levels (lists) + bool const has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0; + + // setup page info + auto const mask = decode_kernel_mask::DELTA_BYTE_ARRAY; + if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, mask_filter{mask}, true)) { return; } + + auto const start_value = pp->start_val; + + // if data size is known, can short circuit here + if ((chunks[pp->chunk_idx].data_type & 7) == FIXED_LEN_BYTE_ARRAY) { + if (t == 0) { + pp->str_bytes = pp->num_valids * s->dtype_len_in; + + // only need temp space if we're skipping values + if (start_value > 0) { + // just need to parse the header of the first delta binary block to get values_per_mb + delta_binary_decoder db; + db.init_binary_block(s->data_start, s->data_end); + // save enough for one mini-block plus some extra to save the last_string + pp->temp_string_size = s->dtype_len_in * (db.values_per_mb + 1); + } + } + } else { + // now process string info in the range [start_value, end_value) + // set up for decoding strings...can be either plain or dictionary + uint8_t const* data = s->data_start; + uint8_t const* const end = s->data_end; + auto const end_value = pp->end_val; + + auto const [len, temp_bytes] = 
totalDeltaByteArraySize(data, end, start_value, end_value); + + if (t == 0) { + // TODO check for overflow + pp->str_bytes = len; + + // only need temp space if we're skipping values + if (start_value > 0) { pp->temp_string_size = temp_bytes; } + } + } +} + +/** + * @brief Kernel for computing string page output size information. + * + * This call ignores non-string columns. On exit the `str_bytes` field of the `PageInfo` struct will + * be populated. + * + * @param pages All pages to be decoded + * @param chunks All chunks to be decoded + * @param min_rows crop all rows below min_row + * @param num_rows Maximum number of rows to read + */ +__global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSizes( + PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) +{ + __shared__ __align__(16) page_state_s state_g; + + page_state_s* const s = &state_g; + int const page_idx = blockIdx.x; + int const t = threadIdx.x; + PageInfo* const pp = &pages[page_idx]; + + // whether or not we have repetition levels (lists) + bool const has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0; + + // setup page info + if (!setupLocalPageInfo( + s, pp, chunks, min_row, num_rows, mask_filter{decode_kernel_mask::STRING}, true)) { + return; + } + + bool const is_bounds_pg = is_bounds_page(s, min_row, num_rows, has_repetition); auto const& col = s->col; size_t str_bytes = 0; @@ -530,6 +725,8 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz uint8_t const* const end = s->data_end; uint8_t const* dict_base = nullptr; int dict_size = 0; + auto const start_value = pp->start_val; + auto const end_value = pp->end_val; switch (pp->encoding) { case Encoding::PLAIN_DICTIONARY: @@ -561,6 +758,9 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz if (t == 0) { // TODO check for overflow pp->str_bytes = str_bytes; + + // only need temp space for delta + pp->temp_string_size = 0; } } @@ -586,6 +786,7 @@ __global__ void __launch_bounds__(decode_block_size) size_t num_rows, int32_t* error_code) { + using cudf::detail::warp_size; __shared__ __align__(16) page_state_s state_g; __shared__ __align__(4) size_type last_offset; __shared__ __align__(16) @@ -596,10 +797,12 @@ __global__ void __launch_bounds__(decode_block_size) auto* const sb = &state_buffers; int const page_idx = blockIdx.x; int const t = threadIdx.x; + int const lane_id = t % warp_size; [[maybe_unused]] null_count_back_copier _{s, t}; + auto const mask = decode_kernel_mask::STRING; if (!setupLocalPageInfo( - s, &pages[page_idx], chunks, min_row, num_rows, mask_filter{KERNEL_MASK_STRING}, true)) { + s, &pages[page_idx], chunks, min_row, num_rows, mask_filter{mask}, true)) { return; } @@ -630,6 +833,7 @@ __global__ void __launch_bounds__(decode_block_size) target_pos = min(s->nz_count, src_pos + decode_block_size - out_thread0); if (out_thread0 > 32) { target_pos = min(target_pos, s->dict_pos); } } + // TODO(ets): see if this sync can be removed __syncthreads(); if (t < 32) { // decode repetition and definition levels. 
@@ -643,9 +847,9 @@ __global__ void __launch_bounds__(decode_block_size) // WARP1: Decode dictionary indices, booleans or string positions if (s->dict_base) { - src_target_pos = gpuDecodeDictionaryIndices(s, sb, src_target_pos, t & 0x1f).first; + src_target_pos = gpuDecodeDictionaryIndices(s, sb, src_target_pos, lane_id).first; } else { - gpuInitStringDescriptors(s, sb, src_target_pos, t & 0x1f); + gpuInitStringDescriptors(s, sb, src_target_pos, lane_id); } if (t == 32) { *(volatile int32_t*)&s->dict_pos = src_target_pos; } } else { @@ -748,6 +952,19 @@ __global__ void __launch_bounds__(decode_block_size) if (t == 0 and s->error != 0) { set_error(s->error, error_code); } } +// Functor used to set the `temp_string_buf` pointer for each page. `data` points to a buffer +// to be used when skipping rows in the delta_byte_array decoder. Given a page and an offset, +// set the page's `temp_string_buf` to be `data + offset`. +struct page_tform_functor { + uint8_t* const data; + + __device__ PageInfo operator()(PageInfo& page, int64_t offset) + { + if (page.temp_string_size != 0) { page.temp_string_buf = data + offset; } + return page; + } +}; + } // anonymous namespace /** @@ -755,20 +972,81 @@ __global__ void __launch_bounds__(decode_block_size) */ void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, + rmm::device_uvector& temp_string_buf, size_t min_row, size_t num_rows, int level_type_size, + uint32_t kernel_mask, rmm::cuda_stream_view stream) { - dim3 dim_block(preprocess_block_size, 1); - dim3 dim_grid(pages.size(), 1); // 1 threadblock per page + dim3 const dim_block(preprocess_block_size, 1); + dim3 const dim_grid(pages.size(), 1); // 1 threadblock per page if (level_type_size == 1) { - gpuComputePageStringSizes + gpuComputeStringPageBounds <<>>(pages.device_ptr(), chunks, min_row, num_rows); } else { - gpuComputePageStringSizes + gpuComputeStringPageBounds <<>>(pages.device_ptr(), chunks, min_row, num_rows); } + + // kernel mask may contain other kernels we don't need to count + int const count_mask = + kernel_mask & BitOr(decode_kernel_mask::DELTA_BYTE_ARRAY, decode_kernel_mask::STRING); + int const nkernels = std::bitset<32>(count_mask).count(); + auto const streams = cudf::detail::fork_streams(stream, nkernels); + + int s_idx = 0; + if (BitAnd(kernel_mask, decode_kernel_mask::DELTA_BYTE_ARRAY) != 0) { + dim3 dim_delta(delta_preproc_block_size, 1); + gpuComputeDeltaPageStringSizes<<>>( + pages.device_ptr(), chunks, min_row, num_rows); + } + if (BitAnd(kernel_mask, decode_kernel_mask::STRING) != 0) { + gpuComputePageStringSizes<<>>( + pages.device_ptr(), chunks, min_row, num_rows); + } + + // synchronize the streams + cudf::detail::join_streams(streams, stream); + + // check for needed temp space for DELTA_BYTE_ARRAY + auto const need_sizes = thrust::any_of( + rmm::exec_policy(stream), pages.d_begin(), pages.d_end(), [] __device__(auto& page) { + return page.temp_string_size != 0; + }); + + if (need_sizes) { + // sum up all of the temp_string_sizes + auto const page_sizes = [] __device__(PageInfo const& page) { return page.temp_string_size; }; + auto const total_size = thrust::transform_reduce(rmm::exec_policy(stream), + pages.d_begin(), + pages.d_end(), + page_sizes, + 0L, + thrust::plus{}); + + // now do an exclusive scan over the temp_string_sizes to get offsets for each + // page's chunk of the temp buffer + rmm::device_uvector page_string_offsets(pages.size(), stream); + 
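The surrounding `fork_streams`/`join_streams` calls overlap the independent preprocessing kernels on separate streams; a schematic sketch of the pattern, with `pass_a`/`pass_b` as hypothetical stand-ins for the real kernels:

#include <cudf/detail/utilities/stream_pool.hpp>  // assumed header for the fork/join helpers

__global__ void pass_a(int* d) { d[0] = 1; }
__global__ void pass_b(int* d) { d[1] = 2; }

void run_both(rmm::cuda_stream_view stream, int* d)
{
  // children are synchronized with the parent stream before launch...
  auto const streams = cudf::detail::fork_streams(stream, 2);
  pass_a<<<1, 1, 0, streams[0].value()>>>(d);
  pass_b<<<1, 1, 0, streams[1].value()>>>(d);
  // ...and the parent waits on all children before continuing.
  cudf::detail::join_streams(streams, stream);
}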
thrust::transform_exclusive_scan(rmm::exec_policy_nosync(stream), + pages.d_begin(), + pages.d_end(), + page_string_offsets.begin(), + page_sizes, + 0L, + thrust::plus{}); + + // allocate the temp space + temp_string_buf.resize(total_size, stream); + + // now use the offsets array to set each page's temp_string_buf pointers + thrust::transform(rmm::exec_policy_nosync(stream), + pages.d_begin(), + pages.d_end(), + page_string_offsets.begin(), + pages.d_begin(), + page_tform_functor{temp_string_buf.data()}); + } } /** diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 68851e72663..129d4e4d28c 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -35,6 +35,7 @@ #include +#include #include namespace cudf::io::parquet::detail { @@ -64,7 +65,8 @@ constexpr bool is_supported_encoding(Encoding enc) case Encoding::PLAIN_DICTIONARY: case Encoding::RLE: case Encoding::RLE_DICTIONARY: - case Encoding::DELTA_BINARY_PACKED: return true; + case Encoding::DELTA_BINARY_PACKED: + case Encoding::DELTA_BYTE_ARRAY: return true; default: return false; } } @@ -86,13 +88,15 @@ constexpr void set_error(int32_t error, int32_t* error_code) * These values are used as bitmasks, so they must be powers of 2. */ enum class decode_error : int32_t { - DATA_STREAM_OVERRUN = 0x1, - LEVEL_STREAM_OVERRUN = 0x2, - UNSUPPORTED_ENCODING = 0x4, - INVALID_LEVEL_RUN = 0x8, - INVALID_DATA_TYPE = 0x10, - EMPTY_PAGE = 0x20, - INVALID_DICT_WIDTH = 0x40, + DATA_STREAM_OVERRUN = 0x1, + LEVEL_STREAM_OVERRUN = 0x2, + UNSUPPORTED_ENCODING = 0x4, + INVALID_LEVEL_RUN = 0x8, + INVALID_DATA_TYPE = 0x10, + EMPTY_PAGE = 0x20, + INVALID_DICT_WIDTH = 0x40, + DELTA_PARAM_MISMATCH = 0x80, + DELTA_PARAMS_UNSUPPORTED = 0x100, }; /** @@ -145,6 +149,17 @@ constexpr uint32_t BitAnd(T1 a, T2 b) return static_cast(a) & static_cast(b); } +template ::value and std::is_same_v) or + (is_scoped_enum::value and std::is_same_v) or + (is_scoped_enum::value and std::is_same_v)>* = + nullptr> +constexpr uint32_t BitOr(T1 a, T2 b) +{ + return static_cast(a) | static_cast(b); +} + /** * @brief Enums for the flags in the page header */ @@ -168,10 +183,12 @@ enum level_type { * * Used to control which decode kernels to run. 
*/ -enum kernel_mask_bits { - KERNEL_MASK_GENERAL = (1 << 0), // Run catch-all decode kernel - KERNEL_MASK_STRING = (1 << 1), // Run decode kernel for string data - KERNEL_MASK_DELTA_BINARY = (1 << 2) // Run decode kernel for DELTA_BINARY_PACKED data +enum class decode_kernel_mask { + NONE = 0, + GENERAL = (1 << 0), // Run catch-all decode kernel + STRING = (1 << 1), // Run decode kernel for string data + DELTA_BINARY = (1 << 2), // Run decode kernel for DELTA_BINARY_PACKED data + DELTA_BYTE_ARRAY = (1 << 3) // Run decode kernel for DELTA_BYTE_ARRAY encoded data }; /** @@ -252,9 +269,11 @@ struct PageInfo { int32_t num_input_values; int32_t chunk_row; // starting row of this page relative to the start of the chunk int32_t num_rows; // number of rows in this page - // the next two are calculated in gpuComputePageStringSizes + // the next four are calculated in gpuComputePageStringSizes int32_t num_nulls; // number of null values (V2 header), but recalculated for string cols int32_t num_valids; // number of non-null values, taking into account skip_rows/num_rows + int32_t start_val; // index of first value of the string data stream to use + int32_t end_val; // index of last value in string data stream int32_t chunk_idx; // column chunk this page belongs to int32_t src_col_schema; // schema index of this column uint8_t flags; // PAGEINFO_FLAGS_XXX @@ -291,7 +310,11 @@ struct PageInfo { // level decode buffers uint8_t* lvl_decode_buf[level_type::NUM_LEVEL_TYPES]; - uint32_t kernel_mask; + // temporary space for decoding DELTA_BYTE_ARRAY encoded strings + int64_t temp_string_size; + uint8_t* temp_string_buf; + + decode_kernel_mask kernel_mask; }; /** @@ -597,16 +620,20 @@ void ComputePageSizes(cudf::detail::hostdevice_vector& pages, * * @param[in,out] pages All pages to be decoded * @param[in] chunks All chunks to be decoded + * @param[out] temp_string_buf Temporary space needed for decoding DELTA_BYTE_ARRAY strings * @param[in] min_rows crop all rows below min_row * @param[in] num_rows Maximum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding + * @param[in] kernel_mask Mask of kernels to run * @param[in] stream CUDA stream to use */ void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, + rmm::device_uvector& temp_string_buf, size_t min_row, size_t num_rows, int level_type_size, + uint32_t kernel_mask, rmm::cuda_stream_view stream); /** @@ -665,7 +692,7 @@ void DecodeStringPageData(cudf::detail::hostdevice_vector& pages, * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding * @param[out] error_code Error code for kernel failures - * @param[in] stream CUDA stream to use, default 0 + * @param[in] stream CUDA stream to use */ void DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -675,6 +702,28 @@ void DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages, int32_t* error_code, rmm::cuda_stream_view stream); +/** + * @brief Launches kernel for reading the DELTA_BYTE_ARRAY column data stored in the pages + * + * The page data will be written to the output pointed to in the page's + * associated column chunk. 
+ * + * @param[in,out] pages All pages to be decoded + * @param[in] chunks All chunks to be decoded + * @param[in] num_rows Total number of rows to read + * @param[in] min_row Minimum number of rows to read + * @param[in] level_type_size Size in bytes of the type for level decoding + * @param[out] error_code Error code for kernel failures + * @param[in] stream CUDA stream to use + */ +void DecodeDeltaByteArray(cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector const& chunks, + size_t num_rows, + size_t min_row, + int level_type_size, + int32_t* error_code, + rmm::cuda_stream_view stream); + /** * @brief Launches kernel for initializing encoder row group fragments * diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 11c20d0e540..6e799424d01 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -30,10 +29,15 @@ namespace cudf::io::parquet::detail { void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) { - auto& chunks = _pass_itm_data->chunks; - auto& pages = _pass_itm_data->pages_info; - auto& page_nesting = _pass_itm_data->page_nesting_info; - auto& page_nesting_decode = _pass_itm_data->page_nesting_decode_info; + auto& chunks = _pass_itm_data->chunks; + auto& pages = _pass_itm_data->pages_info; + auto& page_nesting = _pass_itm_data->page_nesting_info; + auto& page_nesting_decode = _pass_itm_data->page_nesting_decode_info; + auto const level_type_size = _pass_itm_data->level_type_size; + + // temporary space for DELTA_BYTE_ARRAY decoding. this only needs to live until + // gpu::DecodeDeltaByteArray returns. + rmm::device_uvector delta_temp_buf(0, _stream); // Should not reach here if there is no page data. CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); @@ -52,11 +56,12 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) // doing a gather operation later on. // TODO: This step is somewhat redundant if size info has already been calculated (nested schema, // chunked reader). 
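One consequence of making `decode_kernel_mask` a scoped enum, visible in the hunk below: plain `&`/`|` no longer apply to it directly, so the `BitAnd`/`BitOr` helpers added in parquet_gpu.hpp perform the casts. A sketch of the intended usage:

// Compose and test kernel masks without leaving the scoped-enum world.
uint32_t const mask = BitOr(decode_kernel_mask::STRING, decode_kernel_mask::DELTA_BYTE_ARRAY);
if (BitAnd(mask, decode_kernel_mask::DELTA_BYTE_ARRAY) != 0) {
  // schedule the DELTA_BYTE_ARRAY decode kernel
}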
- auto const has_strings = (kernel_mask & KERNEL_MASK_STRING) != 0; + auto const has_strings = + (kernel_mask & BitOr(decode_kernel_mask::STRING, decode_kernel_mask::DELTA_BYTE_ARRAY)) != 0; std::vector col_sizes(_input_columns.size(), 0L); if (has_strings) { ComputePageStringSizes( - pages, chunks, skip_rows, num_rows, _pass_itm_data->level_type_size, _stream); + pages, chunks, delta_temp_buf, skip_rows, num_rows, level_type_size, kernel_mask, _stream); col_sizes = calculate_page_string_offsets(); @@ -163,6 +168,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) chunks.host_to_device_async(_stream); chunk_nested_valids.host_to_device_async(_stream); chunk_nested_data.host_to_device_async(_stream); + if (has_strings) { chunk_nested_str_data.host_to_device_async(_stream); } // create this before we fork streams kernel_error error_code(_stream); @@ -171,25 +177,27 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) int const nkernels = std::bitset<32>(kernel_mask).count(); auto streams = cudf::detail::fork_streams(_stream, nkernels); - auto const level_type_size = _pass_itm_data->level_type_size; - // launch string decoder int s_idx = 0; - if (has_strings) { - auto& stream = streams[s_idx++]; - chunk_nested_str_data.host_to_device_async(stream); + if (BitAnd(kernel_mask, decode_kernel_mask::STRING) != 0) { DecodeStringPageData( - pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), stream); + pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); + } + + // launch delta byte array decoder + if (BitAnd(kernel_mask, decode_kernel_mask::DELTA_BYTE_ARRAY) != 0) { + DecodeDeltaByteArray( + pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); } // launch delta binary decoder - if ((kernel_mask & KERNEL_MASK_DELTA_BINARY) != 0) { + if (BitAnd(kernel_mask, decode_kernel_mask::DELTA_BINARY) != 0) { DecodeDeltaBinary( pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); } // launch the catch-all page decoder - if ((kernel_mask & KERNEL_MASK_GENERAL) != 0) { + if (BitAnd(kernel_mask, decode_kernel_mask::GENERAL) != 0) { DecodePageData( pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); } diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 80a4d00a5a2..0bc492546e9 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -1416,7 +1416,7 @@ std::vector reader::impl::calculate_page_string_offsets() page_index.begin(), page_to_string_size{pages.device_ptr(), chunks.device_ptr()}); // do scan by key to calculate string offsets for each page - thrust::exclusive_scan_by_key(rmm::exec_policy(_stream), + thrust::exclusive_scan_by_key(rmm::exec_policy_nosync(_stream), page_keys.begin(), page_keys.end(), val_iter, @@ -1424,7 +1424,7 @@ std::vector reader::impl::calculate_page_string_offsets() // now sum up page sizes rmm::device_uvector reduce_keys(col_sizes.size(), _stream); - thrust::reduce_by_key(rmm::exec_policy(_stream), + thrust::reduce_by_key(rmm::exec_policy_nosync(_stream), page_keys.begin(), page_keys.end(), val_iter, diff --git a/python/cudf/cudf/tests/data/parquet/delta_byte_arr.parquet b/python/cudf/cudf/tests/data/parquet/delta_byte_arr.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7f6006a75bff0498e373d61f1265ff2e8458c917 GIT binary patch 
literal 5783
[5783 bytes of base85-encoded GIT binary patch data for delta_byte_arr.parquet omitted]
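The omitted payload is the Parquet test asset exercised by the hunk below. For readers who want a comparable file, here is a minimal, hedged sketch of how one can be generated with pandas + pyarrow, mirroring the writer options used in the test (the file path and column name are illustrative, not the committed asset):

```python
# Sketch only: produces a small DELTA_BYTE_ARRAY-encoded Parquet file.
import pandas as pd

df = pd.DataFrame({"str_col": ["apple", "apricot", "banana", "bandana"]})
df.to_parquet(
    "delta_byte_arr_example.parquet",  # illustrative path
    engine="pyarrow",
    version="2.6",  # the V2 format is required for the DELTA_* encodings
    data_page_version="2.0",
    use_dictionary=False,  # dictionary encoding would otherwise take precedence
    column_encoding={"str_col": "DELTA_BYTE_ARRAY"},
)
```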
[remaining binary payload omitted; the diff header and opening lines of the accompanying Python test hunk were lost in the binary region, so the surviving added lines follow]
+    lists_per_row = 3
+    list_size = 4
+    num_rows = nrows
+    include_validity = add_nulls
+
+    def list_gen_wrapped(x, y):
+        return list_row_gen(
+            int_gen, x * list_size * lists_per_row, list_size, lists_per_row
+        )
+
+    def string_list_gen_wrapped(x, y):
+        return list_row_gen(
+            string_gen,
+            x * list_size * lists_per_row,
+            list_size,
+            lists_per_row,
+            include_validity,
+        )
+
+    data = struct_gen(
+        [int_gen, string_gen, list_gen_wrapped, string_list_gen_wrapped],
+        0,
+        num_rows,
+        include_validity,
+    )
+    test_pdf = pa.Table.from_pydict({"sol": data}).to_pandas()
+    pdf_fname = tmpdir.join("pdfdeltaba.parquet")
+    test_pdf.to_parquet(
+        pdf_fname,
+        version="2.6",
+        column_encoding={
+            "sol.col0": "DELTA_BINARY_PACKED",
+            "sol.col1": str_encoding,
+            "sol.col2.list.element.list.element": "DELTA_BINARY_PACKED",
+            "sol.col3.list.element.list.element": str_encoding,
+        },
+        data_page_version="2.0",
+        data_page_size=64 * 1024,
+        engine="pyarrow",
+        use_dictionary=False,
+    )
+    # sanity check to verify file is written properly
+    assert_eq(test_pdf, pd.read_parquet(pdf_fname))
+    cdf = cudf.read_parquet(pdf_fname)
+    assert_eq(cdf, cudf.from_pandas(test_pdf))
+
+
 @pytest.mark.parametrize(
     "data",
     [

From ba5ec4080be38b795053d11bf46cb3688c201893 Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Fri, 17 Nov 2023 10:36:30 -0600
Subject: [PATCH 110/118] Enable build concurrency for nightly and merge triggers. (#14441)

---
 .github/workflows/build.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 2539057c105..e27361ab263 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -22,7 +22,7 @@ on:
         default: nightly
 
 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
   cancel-in-progress: true
 
 jobs:

From 6c2e972cefff05f6ffbba4fd9ba894e6849b041e Mon Sep 17 00:00:00 2001
From: Trent Nelson
Date: Fri, 17 Nov 2023 13:29:23 -0800
Subject: [PATCH 111/118] Implement user_datasource_wrapper is_empty() and is_device_read_preferred(). (#14357)

These two routines are missing from the current `user_datasource_wrapper` impl.

Authors: - Trent Nelson (https://github.com/tpn) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/14357 --- cpp/src/io/utilities/datasource.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 5cdd92ce3b7..a466ef84133 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -360,6 +360,11 @@ class user_datasource_wrapper : public datasource { return source->supports_device_read(); } + [[nodiscard]] bool is_device_read_preferred(size_t size) const override + { + return source->is_device_read_preferred(size); + } + size_t device_read(size_t offset, size_t size, uint8_t* dst, @@ -385,6 +390,8 @@ class user_datasource_wrapper : public datasource { [[nodiscard]] size_t size() const override { return source->size(); } + [[nodiscard]] bool is_empty() const override { return source->is_empty(); } + private: datasource* const source; ///< A non-owning pointer to the user-implemented datasource }; From 723c565f7a03e3e9a842526cd4cc94bcf6f582e5 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 17 Nov 2023 17:37:47 -0800 Subject: [PATCH 112/118] Fix intermediate type checking in expression parsing (#14445) When parsing expressions, device data references are reused if there are multiple that are identical. Equality is determined by comparing the fields of the reference, but previously the data type was omitted. For column and literal references, this is OK because the `data_index` uniquely identifies the reference. For intermediates, however, the index is not sufficient to disambiguate because an expression could reuse a given location even if the operation produces a different data type. Therefore, the data type must be part of the equality operator. 
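To make the failure mode concrete, here is a small Python analogy (not the patch's C++; the field names mirror `device_data_reference`, everything else is illustrative): dropping `data_type` from the comparison lets two intermediates that reuse the same device slot but hold different types compare equal, so one is wrongly deduplicated away.

```python
from dataclasses import dataclass


@dataclass(frozen=True)  # the generated __eq__ compares all fields, like the fixed operator==
class DeviceDataReference:
    data_index: int
    data_type: str
    reference_type: str
    table_source: str

    def equals_before_fix(self, other):
        # Pre-fix comparison: data_type is omitted.
        return (self.data_index, self.reference_type, self.table_source) == (
            other.data_index,
            other.reference_type,
            other.table_source,
        )


# Two intermediates reusing device slot 0 but producing different types:
a = DeviceDataReference(0, "BOOL8", "INTERMEDIATE", "LEFT")
b = DeviceDataReference(0, "INT32", "INTERMEDIATE", "LEFT")
assert a.equals_before_fix(b)  # wrongly treated as the same reference
assert a != b  # including data_type disambiguates them
```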
Resolves #14409

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/14445
---
 .../cudf/ast/detail/expression_parser.hpp |  4 +--
 cpp/tests/ast/transform_tests.cpp         | 27 +++++++++++++++++++
 2 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/cpp/include/cudf/ast/detail/expression_parser.hpp b/cpp/include/cudf/ast/detail/expression_parser.hpp
index db0abe435b0..a36a831a7aa 100644
--- a/cpp/include/cudf/ast/detail/expression_parser.hpp
+++ b/cpp/include/cudf/ast/detail/expression_parser.hpp
@@ -67,8 +67,8 @@ struct alignas(8) device_data_reference {
 
   bool operator==(device_data_reference const& rhs) const
   {
-    return std::tie(data_index, reference_type, table_source) ==
-           std::tie(rhs.data_index, rhs.reference_type, rhs.table_source);
+    return std::tie(data_index, data_type, reference_type, table_source) ==
+           std::tie(rhs.data_index, rhs.data_type, rhs.reference_type, rhs.table_source);
   }
 };
 
diff --git a/cpp/tests/ast/transform_tests.cpp b/cpp/tests/ast/transform_tests.cpp
index c0109a40cec..624a781c5b9 100644
--- a/cpp/tests/ast/transform_tests.cpp
+++ b/cpp/tests/ast/transform_tests.cpp
@@ -316,6 +316,33 @@ TEST_F(TransformTest, ImbalancedTreeArithmetic)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view(), verbosity);
 }
 
+TEST_F(TransformTest, ImbalancedTreeArithmeticDeep)
+{
+  auto c_0   = column_wrapper<int32_t>{4, 5, 6};
+  auto table = cudf::table_view{{c_0}};
+
+  auto col_ref_0 = cudf::ast::column_reference(0);
+
+  // expression: (c0 < c0) == (c0 < (c0 + c0))
+  // {false, false, false} == (c0 < {8, 10, 12})
+  // {false, false, false} == {true, true, true}
+  // {false, false, false}
+  auto expression_left_subtree =
+    cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, col_ref_0);
+  auto expression_right_inner_subtree =
+    cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_0);
+  auto expression_right_subtree =
+    cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, expression_right_inner_subtree);
+
+  auto expression_tree = cudf::ast::operation(
+    cudf::ast::ast_operator::EQUAL, expression_left_subtree, expression_right_subtree);
+
+  auto result   = cudf::compute_column(table, expression_tree);
+  auto expected = column_wrapper<bool>{false, false, false};
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view(), verbosity);
+}
+
 TEST_F(TransformTest, MultiLevelTreeComparator)
 {
   auto c_0 = column_wrapper<int32_t>{3, 20, 1, 50};

From 3ef13d07057e87cff1cad4e0aa9460b3b5c45459 Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Mon, 20 Nov 2023 13:02:29 -0600
Subject: [PATCH 113/118] Fix io reference in docs. (#14452)

cuDF CI is failing to build docs due to an ambiguous reference `io`. This PR makes that reference unambiguous.

```
/__w/cudf/cudf/docs/cudf/source/user_guide/data-types.md:139: WARNING: Multiple matches found for 'io': pandas:std:label:io, pyarrow:std:label:io, python:py:module:io [myst.iref_ambiguous]
```

I used this output to help me find the object inventory that lists this.

```bash
python -m sphinx.ext.intersphinx https://docs.rapids.ai/api/cudf/stable/objects.inv
```

I also looked at the MyST docs on external references.
https://mystmd.org/guide/external-references

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/14452
---
 docs/cudf/source/user_guide/data-types.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/cudf/source/user_guide/data-types.md b/docs/cudf/source/user_guide/data-types.md
index 1f4cfbc7366..e6fe3109c57 100644
--- a/docs/cudf/source/user_guide/data-types.md
+++ b/docs/cudf/source/user_guide/data-types.md
@@ -136,7 +136,7 @@ dtype: struct
 StructDtype({'a': dtype('int64'), 'b': dtype('int64')})
 ```
 
-Or by reading them from disk, using a [file format that supports nested data](io).
+Or by reading them from disk, using a [file format that supports nested data](/user_guide/io/index.md).
 
 ```python
 >>> pdf = pd.DataFrame({"a": [[1, 2], [3, 4, 5], [6, 7, 8]]})

From 823d3214a9489e3c496aa31041b5d29f650e94b3 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Mon, 20 Nov 2023 16:33:28 -0600
Subject: [PATCH 114/118] Use `pynvjitlink` for CUDA 12+ MVC (#13650)

Fixes https://github.com/rapidsai/cudf/issues/12822

This PR provides minor version compatibility in the CUDA 12.x range through `nvjitlink` via the preliminary [nvjitlink python binding](https://github.com/gmarkall/nvjitlink). Thus far this PR merely leverages a local installation of the library and should not be merged until `nvjitlink` is hosted on `conda-forge` and cuDF's dependencies are adjusted accordingly, likely as part of this PR.

Authors:
  - https://github.com/brandon-b-miller
  - Ashwin Srinath (https://github.com/shwina)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Ashwin Srinath (https://github.com/shwina)

URL: https://github.com/rapidsai/cudf/pull/13650
---
 python/cudf/cudf/tests/test_mvc.py          | 99 +++++++++++++++++++++
 python/cudf/cudf/tests/test_numba_import.py | 48 ----------
 python/cudf/cudf/utils/_numba.py            | 53 ++++++-----
 3 files changed, 128 insertions(+), 72 deletions(-)
 create mode 100644 python/cudf/cudf/tests/test_mvc.py
 delete mode 100644 python/cudf/cudf/tests/test_numba_import.py

diff --git a/python/cudf/cudf/tests/test_mvc.py b/python/cudf/cudf/tests/test_mvc.py
new file mode 100644
index 00000000000..7dd25ebc500
--- /dev/null
+++ b/python/cudf/cudf/tests/test_mvc.py
@@ -0,0 +1,99 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+import subprocess +import sys + +import pytest + +IS_CUDA_11 = False +IS_CUDA_12 = False +try: + from ptxcompiler.patch import safe_get_versions +except ModuleNotFoundError: + from cudf.utils._ptxcompiler import safe_get_versions + +# do not test cuda 12 if pynvjitlink isn't present +HAVE_PYNVJITLINK = False +try: + import pynvjitlink # noqa: F401 + + HAVE_PYNVJITLINK = True +except ModuleNotFoundError: + pass + + +versions = safe_get_versions() +driver_version, runtime_version = versions + +if (11, 0) <= driver_version < (12, 0): + IS_CUDA_11 = True +if (12, 0) <= driver_version < (13, 0): + IS_CUDA_12 = True + + +TEST_BODY = """ +@numba.cuda.jit +def test_kernel(x): + id = numba.cuda.grid(1) + if id < len(x): + x[id] += 1 + +s = cudf.Series([1, 2, 3]) +with _CUDFNumbaConfig(): + test_kernel.forall(len(s))(s) +""" + +CUDA_11_TEST = ( + """ +import numba.cuda +import cudf +from cudf.utils._numba import _CUDFNumbaConfig, patch_numba_linker_cuda_11 + + +patch_numba_linker_cuda_11() +""" + + TEST_BODY +) + + +CUDA_12_TEST = ( + """ +import numba.cuda +import cudf +from cudf.utils._numba import _CUDFNumbaConfig +from pynvjitlink.patch import ( + patch_numba_linker as patch_numba_linker_pynvjitlink, +) + +patch_numba_linker_pynvjitlink() +""" + + TEST_BODY +) + + +@pytest.mark.parametrize( + "test", + [ + pytest.param( + CUDA_11_TEST, + marks=pytest.mark.skipif( + not IS_CUDA_11, + reason="Minor Version Compatibility test for CUDA 11", + ), + ), + pytest.param( + CUDA_12_TEST, + marks=pytest.mark.skipif( + not IS_CUDA_12 or not HAVE_PYNVJITLINK, + reason="Minor Version Compatibility test for CUDA 12", + ), + ), + ], +) +def test_numba_mvc(test): + cp = subprocess.run( + [sys.executable, "-c", test], + capture_output=True, + cwd="/", + ) + + assert cp.returncode == 0 diff --git a/python/cudf/cudf/tests/test_numba_import.py b/python/cudf/cudf/tests/test_numba_import.py deleted file mode 100644 index 238a32a94fa..00000000000 --- a/python/cudf/cudf/tests/test_numba_import.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. -import subprocess -import sys - -import pytest - -IS_CUDA_11 = False -try: - from ptxcompiler.patch import NO_DRIVER, safe_get_versions - - versions = safe_get_versions() - if versions != NO_DRIVER: - driver_version, runtime_version = versions - if driver_version < (12, 0): - IS_CUDA_11 = True -except ModuleNotFoundError: - pass - -TEST_NUMBA_MVC_ENABLED = """ -import numba.cuda -import cudf -from cudf.utils._numba import _CUDFNumbaConfig, _patch_numba_mvc - - -_patch_numba_mvc() - -@numba.cuda.jit -def test_kernel(x): - id = numba.cuda.grid(1) - if id < len(x): - x[id] += 1 - -s = cudf.Series([1, 2, 3]) -with _CUDFNumbaConfig(): - test_kernel.forall(len(s))(s) -""" - - -@pytest.mark.skipif( - not IS_CUDA_11, reason="Minor Version Compatibility test for CUDA 11" -) -def test_numba_mvc_enabled_cuda_11(): - cp = subprocess.run( - [sys.executable, "-c", TEST_NUMBA_MVC_ENABLED], - capture_output=True, - cwd="/", - ) - assert cp.returncode == 0 diff --git a/python/cudf/cudf/utils/_numba.py b/python/cudf/cudf/utils/_numba.py index 09afb5680bd..bc0d6f37d89 100644 --- a/python/cudf/cudf/utils/_numba.py +++ b/python/cudf/cudf/utils/_numba.py @@ -7,6 +7,19 @@ from numba import config as numba_config +try: + from pynvjitlink.patch import ( + patch_numba_linker as patch_numba_linker_pynvjitlink, + ) +except ImportError: + + def patch_numba_linker_pynvjitlink(): + warnings.warn( + "CUDA Toolkit is newer than CUDA driver. 
" + "Numba features will not work in this configuration. " + ) + + CC_60_PTX_FILE = os.path.join( os.path.dirname(__file__), "../core/udf/shim_60.ptx" ) @@ -65,7 +78,7 @@ def _get_ptx_file(path, prefix): return regular_result[1] -def _patch_numba_mvc(): +def patch_numba_linker_cuda_11(): # Enable the config option for minor version compatibility numba_config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY = 1 @@ -106,29 +119,19 @@ def _setup_numba(): versions = safe_get_versions() if versions != NO_DRIVER: driver_version, runtime_version = versions - if driver_version >= (12, 0) and runtime_version > driver_version: - warnings.warn( - f"Using CUDA toolkit version {runtime_version} with CUDA " - f"driver version {driver_version} requires minor version " - "compatibility, which is not yet supported for CUDA " - "driver versions 12.0 and above. It is likely that many " - "cuDF operations will not work in this state. Please " - f"install CUDA toolkit version {driver_version} to " - "continue using cuDF." - ) - else: - # Support MVC for all CUDA versions in the 11.x range - ptx_toolkit_version = _get_cuda_version_from_ptx_file( - CC_60_PTX_FILE - ) - # Numba thinks cubinlinker is only needed if the driver is older - # than the CUDA runtime, but when PTX files are present, it might - # also need to patch because those PTX files may be compiled by - # a CUDA version that is newer than the driver as well - if (driver_version < ptx_toolkit_version) or ( - driver_version < runtime_version - ): - _patch_numba_mvc() + ptx_toolkit_version = _get_cuda_version_from_ptx_file(CC_60_PTX_FILE) + + # MVC is required whenever any PTX is newer than the driver + # This could be the shipped PTX file or the PTX emitted by + # the version of NVVM on the user system, the latter aligning + # with the runtime version + if (driver_version < ptx_toolkit_version) or ( + driver_version < runtime_version + ): + if driver_version < (12, 0): + patch_numba_linker_cuda_11() + else: + patch_numba_linker_pynvjitlink() def _get_cuda_version_from_ptx_file(path): @@ -171,6 +174,8 @@ def _get_cuda_version_from_ptx_file(path): "7.8": (11, 8), "8.0": (12, 0), "8.1": (12, 1), + "8.2": (12, 2), + "8.3": (12, 3), } cuda_ver = ver_map.get(version) From fc8c81f3d4bde674d4123ae4848c578bcc7158b6 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 28 Nov 2023 14:13:20 -0600 Subject: [PATCH 115/118] Fix function name typo in `cudf.pandas` profiler (#14514) Fixes: #14512 This PR fixes a function name typo in `cudf.pandas` profiler. 
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) --- python/cudf/cudf/pandas/__main__.py | 2 +- .../cudf_pandas_tests/data/profile_basic.py | 13 ++++++ .../cudf/cudf_pandas_tests/test_profiler.py | 41 +++++++++++++++++++ 3 files changed, 55 insertions(+), 1 deletion(-) create mode 100644 python/cudf/cudf_pandas_tests/data/profile_basic.py diff --git a/python/cudf/cudf/pandas/__main__.py b/python/cudf/cudf/pandas/__main__.py index 02e8e960678..fb8569fa1d0 100644 --- a/python/cudf/cudf/pandas/__main__.py +++ b/python/cudf/cudf/pandas/__main__.py @@ -33,7 +33,7 @@ def profile(function_profile, line_profile, fn): elif function_profile: with Profiler() as profiler: yield fn - profiler.print_per_func_stats() + profiler.print_per_function_stats() else: yield fn diff --git a/python/cudf/cudf_pandas_tests/data/profile_basic.py b/python/cudf/cudf_pandas_tests/data/profile_basic.py new file mode 100644 index 00000000000..f7b4ba89ce7 --- /dev/null +++ b/python/cudf/cudf_pandas_tests/data/profile_basic.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +import pandas as pd + +df = pd.DataFrame( + { + "size": [10, 11, 12, 10, 11, 12, 10, 6, 11, 10], + "total_bill": [100, 200, 100, 200, 100, 100, 200, 50, 10, 560], + } +) +df["size"].value_counts() +df.groupby("size").total_bill.mean() +df.apply(list, axis=1) diff --git a/python/cudf/cudf_pandas_tests/test_profiler.py b/python/cudf/cudf_pandas_tests/test_profiler.py index a947d67b724..4921446ab6b 100644 --- a/python/cudf/cudf_pandas_tests/test_profiler.py +++ b/python/cudf/cudf_pandas_tests/test_profiler.py @@ -2,6 +2,9 @@ # All rights reserved. # SPDX-License-Identifier: Apache-2.0 +import os +import subprocess + from cudf.pandas import LOADED, Profiler if not LOADED: @@ -68,3 +71,41 @@ def test_profiler_fast_slow_name_mismatch(): with Profiler(): df = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}) df.iloc[0, 1] = "foo" + + +def test_profiler_commandline(): + data_directory = os.path.dirname(os.path.abspath(__file__)) + # Create a copy of the current environment variables + env = os.environ.copy() + # Setting the 'COLUMNS' environment variable to a large number + # because the terminal output shouldn't be compressed for + # text validations below. + env["COLUMNS"] = "10000" + + sp_completed = subprocess.run( + [ + "python", + "-m", + "cudf.pandas", + "--profile", + data_directory + "/data/profile_basic.py", + ], + capture_output=True, + text=True, + env=env, + ) + assert sp_completed.returncode == 0 + output = sp_completed.stdout + + for string in [ + "Total time", + "Stats", + "Function", + "GPU ncalls", + "GPU cumtime", + "GPU percall", + "CPU ncalls", + "CPU cumtime", + "CPU percall", + ]: + assert string in output From 0a56305696a37870495867cb76941699c3b53fe6 Mon Sep 17 00:00:00 2001 From: Ray Douglass <3107146+raydouglass@users.noreply.github.com> Date: Mon, 4 Dec 2023 14:11:11 -0500 Subject: [PATCH 116/118] Pin actions/labeler to v4 [skip ci] (#14562) RAPIDS repos are using the `main` branch of https://github.com/actions/labeler which recently introduced [breaking changes](https://github.com/actions/labeler/releases/tag/v5.0.0). This PR pins to the latest v4 release of the labeler action until we can evaluate the changes required for v5. 
Authors: - Ray Douglass (https://github.com/raydouglass) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) --- .github/workflows/labeler.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index 23956a02fbd..31e78f82a62 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -6,6 +6,6 @@ jobs: triage: runs-on: ubuntu-latest steps: - - uses: actions/labeler@main + - uses: actions/labeler@v4 with: repo-token: "${{ secrets.GITHUB_TOKEN }}" From 31aedf2ddcd99cb4b572f8685f7790b743500149 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Tue, 5 Dec 2023 09:02:32 -0800 Subject: [PATCH 117/118] fix for skip_rows on with page-spanning rows (#14557) Fixes an issue detected in Spark where string data was being corrupted due to an incorrect page size calculation. Closes #14560 Authors: - Ed Seidl (https://github.com/etseidl) Approvers: - Alessandro Bellina (https://github.com/abellina) - Yunsong Wang (https://github.com/PointKernel) - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) - Mike Wilson (https://github.com/hyperbolic2346) --- cpp/src/io/parquet/page_string_decode.cu | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index e29db042401..e9ac3657e36 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -141,6 +141,25 @@ __device__ thrust::pair page_bounds(page_state_s* const s, bool skipped_values_set = false; bool end_value_set = false; + // If page_start_row >= min_row, then skipped_values is 0 and we don't have to search for + // start_value. If there's repetition then we've already calculated + // skipped_values/skipped_leaf_values. + // TODO(ets): If we hit this condition, and end_row > last row in page, then we can skip + // more of the processing below. 
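Before the diff, it may help to state the precondition the fix exploits; below is a hedged Python model of the new early-out (names simplified from the CUDA kernel, not the actual device code):

```python
def initial_skip(has_repetition, page_start_row, min_row, precomputed=(0, 0)):
    """Return (skipped_values, skipped_leaf_values), or None when the page
    spans min_row and must be scanned to locate the first value to read."""
    if has_repetition:
        return precomputed  # already calculated during preprocessing
    if page_start_row >= min_row:
        return (0, 0)  # page starts at/after min_row: nothing to skip
    return None  # page-spanning row: search for the start value


assert initial_skip(False, page_start_row=100, min_row=50) == (0, 0)
assert initial_skip(False, page_start_row=0, min_row=50) is None
```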
+ if (has_repetition or page_start_row >= min_row) { + if (t == 0) { + if (has_repetition) { + skipped_values = pp->skipped_values; + skipped_leaf_values = pp->skipped_leaf_values; + } else { + skipped_values = 0; + skipped_leaf_values = 0; + } + } + skipped_values_set = true; + __syncthreads(); + } + while (processed < s->page.num_input_values) { thread_index_type start_val = processed; @@ -150,11 +169,6 @@ __device__ thrust::pair page_bounds(page_state_s* const s, // special case where page does not begin at a row boundary if (processed == 0 && rep_decode[0] != 0) { - if (t == 0) { - skipped_values = 0; - skipped_leaf_values = 0; - } - skipped_values_set = true; end_row++; // need to finish off the previous row row_fudge = 0; } From 8eacf8f2ecb70eedf917fec2dfca4403810399d1 Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Wed, 6 Dec 2023 09:59:02 -0500 Subject: [PATCH 118/118] Update Changelog [skip ci] --- CHANGELOG.md | 137 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 137 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ecd547ab5b3..3cb6caa25ee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,140 @@ +# cuDF 23.12.00 (6 Dec 2023) + +## 🚨 Breaking Changes + +- Raise error in `reindex` when `index` is not unique ([#14400](https://github.com/rapidsai/cudf/pull/14400)) [@galipremsagar](https://github.com/galipremsagar) +- Expose stream parameter to get_json_object API ([#14297](https://github.com/rapidsai/cudf/pull/14297)) [@davidwendt](https://github.com/davidwendt) +- Refactor cudf_kafka to use skbuild ([#14292](https://github.com/rapidsai/cudf/pull/14292)) [@jdye64](https://github.com/jdye64) +- Expose stream parameter in public strings convert APIs ([#14255](https://github.com/rapidsai/cudf/pull/14255)) [@davidwendt](https://github.com/davidwendt) +- Upgrade to nvCOMP 3.0.4 ([#13815](https://github.com/rapidsai/cudf/pull/13815)) [@vuule](https://github.com/vuule) + +## 🐛 Bug Fixes + +- Update actions/labeler to v4 ([#14562](https://github.com/rapidsai/cudf/pull/14562)) [@raydouglass](https://github.com/raydouglass) +- Fix data corruption when skipping rows ([#14557](https://github.com/rapidsai/cudf/pull/14557)) [@etseidl](https://github.com/etseidl) +- Fix function name typo in `cudf.pandas` profiler ([#14514](https://github.com/rapidsai/cudf/pull/14514)) [@galipremsagar](https://github.com/galipremsagar) +- Fix intermediate type checking in expression parsing ([#14445](https://github.com/rapidsai/cudf/pull/14445)) [@vyasr](https://github.com/vyasr) +- Forward merge `branch-23.10` into `branch-23.12` ([#14435](https://github.com/rapidsai/cudf/pull/14435)) [@raydouglass](https://github.com/raydouglass) +- Remove needs: wheel-build-cudf. 
([#14427](https://github.com/rapidsai/cudf/pull/14427)) [@bdice](https://github.com/bdice) +- Fix dask dependency in custreamz ([#14420](https://github.com/rapidsai/cudf/pull/14420)) [@vyasr](https://github.com/vyasr) +- Ensure nvbench initializes nvml context when built statically ([#14411](https://github.com/rapidsai/cudf/pull/14411)) [@robertmaynard](https://github.com/robertmaynard) +- Support java AST String literal with desired encoding ([#14402](https://github.com/rapidsai/cudf/pull/14402)) [@winningsix](https://github.com/winningsix) +- Raise error in `reindex` when `index` is not unique ([#14400](https://github.com/rapidsai/cudf/pull/14400)) [@galipremsagar](https://github.com/galipremsagar) +- Always build nvbench statically so we don't need to package it ([#14399](https://github.com/rapidsai/cudf/pull/14399)) [@robertmaynard](https://github.com/robertmaynard) +- Fix token-count logic in nvtext::tokenize_with_vocabulary ([#14393](https://github.com/rapidsai/cudf/pull/14393)) [@davidwendt](https://github.com/davidwendt) +- Fix as_column(pd.Timestamp/Timedelta, length=) not respecting length ([#14390](https://github.com/rapidsai/cudf/pull/14390)) [@mroeschke](https://github.com/mroeschke) +- cudf.pandas: cuDF subpath checking in module `__getattr__` ([#14388](https://github.com/rapidsai/cudf/pull/14388)) [@shwina](https://github.com/shwina) +- Fix and disable encoding for nanosecond statistics in ORC writer ([#14367](https://github.com/rapidsai/cudf/pull/14367)) [@vuule](https://github.com/vuule) +- Add the new manylinux builds to the build job ([#14351](https://github.com/rapidsai/cudf/pull/14351)) [@vyasr](https://github.com/vyasr) +- cudf jit parser now supports .pragma instructions with quotes ([#14348](https://github.com/rapidsai/cudf/pull/14348)) [@robertmaynard](https://github.com/robertmaynard) +- Fix overflow check in `cudf::merge` ([#14345](https://github.com/rapidsai/cudf/pull/14345)) [@divyegala](https://github.com/divyegala) +- Add cramjam ([#14344](https://github.com/rapidsai/cudf/pull/14344)) [@vyasr](https://github.com/vyasr) +- Enable `dask_cudf/io` pytests in CI ([#14338](https://github.com/rapidsai/cudf/pull/14338)) [@galipremsagar](https://github.com/galipremsagar) +- Temporarily avoid the current build of pydata-sphinx-theme ([#14332](https://github.com/rapidsai/cudf/pull/14332)) [@vyasr](https://github.com/vyasr) +- Fix host buffer access from device function in the Parquet reader ([#14328](https://github.com/rapidsai/cudf/pull/14328)) [@vuule](https://github.com/vuule) +- Run IO tests for Dask-cuDF ([#14327](https://github.com/rapidsai/cudf/pull/14327)) [@rjzamora](https://github.com/rjzamora) +- Fix logical type issues in the Parquet writer ([#14322](https://github.com/rapidsai/cudf/pull/14322)) [@vuule](https://github.com/vuule) +- Remove aws-sdk-pinning and revert to arrow 12.0.1 ([#14319](https://github.com/rapidsai/cudf/pull/14319)) [@vyasr](https://github.com/vyasr) +- test is_valid before reading column data ([#14318](https://github.com/rapidsai/cudf/pull/14318)) [@etseidl](https://github.com/etseidl) +- Fix gtest validity setting for TextTokenizeTest.Vocabulary 
([#14312](https://github.com/rapidsai/cudf/pull/14312)) [@davidwendt](https://github.com/davidwendt) +- Fixes stack context for json lines format that recovers from invalid JSON lines ([#14309](https://github.com/rapidsai/cudf/pull/14309)) [@elstehle](https://github.com/elstehle) +- Downgrade to Arrow 12.0.0 for aws-sdk-cpp and fix cudf_kafka builds for new CI containers ([#14296](https://github.com/rapidsai/cudf/pull/14296)) [@vyasr](https://github.com/vyasr) +- fixing thread index overflow issue ([#14290](https://github.com/rapidsai/cudf/pull/14290)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Fix memset error in nvtext::edit_distance_matrix ([#14283](https://github.com/rapidsai/cudf/pull/14283)) [@davidwendt](https://github.com/davidwendt) +- Changes JSON reader's recovery option's behaviour to ignore all characters after a valid JSON record ([#14279](https://github.com/rapidsai/cudf/pull/14279)) [@elstehle](https://github.com/elstehle) +- Handle empty string correctly in Parquet statistics ([#14257](https://github.com/rapidsai/cudf/pull/14257)) [@etseidl](https://github.com/etseidl) +- Fixes behaviour for incomplete lines when `recover_with_nulls` is enabled ([#14252](https://github.com/rapidsai/cudf/pull/14252)) [@elstehle](https://github.com/elstehle) +- cudf::detail::pinned_allocator doesn't throw from `deallocate` ([#14251](https://github.com/rapidsai/cudf/pull/14251)) [@robertmaynard](https://github.com/robertmaynard) +- Fix strings replace for adjacent, identical multi-byte UTF-8 character targets ([#14235](https://github.com/rapidsai/cudf/pull/14235)) [@davidwendt](https://github.com/davidwendt) +- Fix the precision when converting a decimal128 column to an arrow array ([#14230](https://github.com/rapidsai/cudf/pull/14230)) [@jihoonson](https://github.com/jihoonson) +- Fixing parquet list of struct interpretation ([#13715](https://github.com/rapidsai/cudf/pull/13715)) [@hyperbolic2346](https://github.com/hyperbolic2346) + +## 📖 Documentation + +- Fix io reference in docs. 
([#14452](https://github.com/rapidsai/cudf/pull/14452)) [@bdice](https://github.com/bdice) +- Update README ([#14374](https://github.com/rapidsai/cudf/pull/14374)) [@shwina](https://github.com/shwina) +- Example code for blog on new row comparators ([#13795](https://github.com/rapidsai/cudf/pull/13795)) [@divyegala](https://github.com/divyegala) + +## 🚀 New Features + +- Expose streams in public unary APIs ([#14342](https://github.com/rapidsai/cudf/pull/14342)) [@vyasr](https://github.com/vyasr) +- Add python tests for Parquet DELTA_BINARY_PACKED encoder ([#14316](https://github.com/rapidsai/cudf/pull/14316)) [@etseidl](https://github.com/etseidl) +- Update rapids-cmake functions to non-deprecated signatures ([#14265](https://github.com/rapidsai/cudf/pull/14265)) [@robertmaynard](https://github.com/robertmaynard) +- Expose streams in public null mask APIs ([#14263](https://github.com/rapidsai/cudf/pull/14263)) [@vyasr](https://github.com/vyasr) +- Expose streams in binaryop APIs ([#14187](https://github.com/rapidsai/cudf/pull/14187)) [@vyasr](https://github.com/vyasr) +- Add pylibcudf.Scalar that interoperates with Arrow scalars ([#14133](https://github.com/rapidsai/cudf/pull/14133)) [@vyasr](https://github.com/vyasr) +- Add decoder for DELTA_BYTE_ARRAY to Parquet reader ([#14101](https://github.com/rapidsai/cudf/pull/14101)) [@etseidl](https://github.com/etseidl) +- Add DELTA_BINARY_PACKED encoder for Parquet writer ([#14100](https://github.com/rapidsai/cudf/pull/14100)) [@etseidl](https://github.com/etseidl) +- Add BytePairEncoder class to cuDF ([#13891](https://github.com/rapidsai/cudf/pull/13891)) [@davidwendt](https://github.com/davidwendt) +- Upgrade to nvCOMP 3.0.4 ([#13815](https://github.com/rapidsai/cudf/pull/13815)) [@vuule](https://github.com/vuule) +- Use `pynvjitlink` for CUDA 12+ MVC ([#13650](https://github.com/rapidsai/cudf/pull/13650)) [@brandon-b-miller](https://github.com/brandon-b-miller) + +## 🛠️ Improvements + +- Build concurrency for nightly and merge triggers ([#14441](https://github.com/rapidsai/cudf/pull/14441)) [@bdice](https://github.com/bdice) +- Cleanup remaining usages of dask dependencies ([#14407](https://github.com/rapidsai/cudf/pull/14407)) [@galipremsagar](https://github.com/galipremsagar) +- Update to Arrow 14.0.1. 
([#14387](https://github.com/rapidsai/cudf/pull/14387)) [@bdice](https://github.com/bdice) +- Remove Cython libcpp wrappers ([#14382](https://github.com/rapidsai/cudf/pull/14382)) [@vyasr](https://github.com/vyasr) +- Forward-merge branch-23.10 to branch-23.12 ([#14372](https://github.com/rapidsai/cudf/pull/14372)) [@bdice](https://github.com/bdice) +- Upgrade to arrow 14 ([#14371](https://github.com/rapidsai/cudf/pull/14371)) [@galipremsagar](https://github.com/galipremsagar) +- Fix a pytest typo in `test_kurt_skew_error` ([#14368](https://github.com/rapidsai/cudf/pull/14368)) [@galipremsagar](https://github.com/galipremsagar) +- Use new rapids-dask-dependency metapackage for managing dask versions ([#14364](https://github.com/rapidsai/cudf/pull/14364)) [@vyasr](https://github.com/vyasr) +- Change `nullable()` to `has_nulls()` in `cudf::detail::gather` ([#14363](https://github.com/rapidsai/cudf/pull/14363)) [@divyegala](https://github.com/divyegala) +- Split up scan_inclusive.cu to improve its compile time ([#14358](https://github.com/rapidsai/cudf/pull/14358)) [@davidwendt](https://github.com/davidwendt) +- Implement user_datasource_wrapper is_empty() and is_device_read_preferred(). ([#14357](https://github.com/rapidsai/cudf/pull/14357)) [@tpn](https://github.com/tpn) +- Added streams to CSV reader and writer api ([#14340](https://github.com/rapidsai/cudf/pull/14340)) [@shrshi](https://github.com/shrshi) +- Upgrade wheels to use arrow 13 ([#14339](https://github.com/rapidsai/cudf/pull/14339)) [@vyasr](https://github.com/vyasr) +- Rework nvtext::byte_pair_encoding API ([#14337](https://github.com/rapidsai/cudf/pull/14337)) [@davidwendt](https://github.com/davidwendt) +- Improve performance of nvtext::tokenize_with_vocabulary for long strings ([#14336](https://github.com/rapidsai/cudf/pull/14336)) [@davidwendt](https://github.com/davidwendt) +- Upgrade `arrow` to `13` ([#14330](https://github.com/rapidsai/cudf/pull/14330)) [@galipremsagar](https://github.com/galipremsagar) +- Expose stream parameter in public nvtext replace APIs ([#14329](https://github.com/rapidsai/cudf/pull/14329)) [@davidwendt](https://github.com/davidwendt) +- Drop `pyorc` dependency and use `pandas`/`pyarrow` instead ([#14323](https://github.com/rapidsai/cudf/pull/14323)) [@galipremsagar](https://github.com/galipremsagar) +- Avoid `pyarrow.fs` import for local storage ([#14321](https://github.com/rapidsai/cudf/pull/14321)) [@rjzamora](https://github.com/rjzamora) +- Unpin `dask` and `distributed` for `23.12` development ([#14320](https://github.com/rapidsai/cudf/pull/14320)) [@galipremsagar](https://github.com/galipremsagar) +- Expose stream parameter in public nvtext tokenize APIs ([#14317](https://github.com/rapidsai/cudf/pull/14317)) [@davidwendt](https://github.com/davidwendt) +- Added streams to JSON reader and writer api ([#14313](https://github.com/rapidsai/cudf/pull/14313)) [@shrshi](https://github.com/shrshi) +- Minor improvements in `source_info` ([#14308](https://github.com/rapidsai/cudf/pull/14308)) [@vuule](https://github.com/vuule) +- Forward-merge branch-23.10 to branch-23.12 ([#14307](https://github.com/rapidsai/cudf/pull/14307)) 
[@bdice](https://github.com/bdice) +- Add stream parameter to Set Operations (Public List APIs) ([#14305](https://github.com/rapidsai/cudf/pull/14305)) [@SurajAralihalli](https://github.com/SurajAralihalli) +- Expose stream parameter to get_json_object API ([#14297](https://github.com/rapidsai/cudf/pull/14297)) [@davidwendt](https://github.com/davidwendt) +- Sort dictionary data alphabetically in the ORC writer ([#14295](https://github.com/rapidsai/cudf/pull/14295)) [@vuule](https://github.com/vuule) +- Expose stream parameter in public strings filter APIs ([#14293](https://github.com/rapidsai/cudf/pull/14293)) [@davidwendt](https://github.com/davidwendt) +- Refactor cudf_kafka to use skbuild ([#14292](https://github.com/rapidsai/cudf/pull/14292)) [@jdye64](https://github.com/jdye64) +- Update `shared-action-workflows` references ([#14289](https://github.com/rapidsai/cudf/pull/14289)) [@AyodeAwe](https://github.com/AyodeAwe) +- Register ``partd`` encode dispatch in ``dask_cudf`` ([#14287](https://github.com/rapidsai/cudf/pull/14287)) [@rjzamora](https://github.com/rjzamora) +- Update versioning strategy ([#14285](https://github.com/rapidsai/cudf/pull/14285)) [@vyasr](https://github.com/vyasr) +- Move and rename byte-pair-encoding source files ([#14284](https://github.com/rapidsai/cudf/pull/14284)) [@davidwendt](https://github.com/davidwendt) +- Expose stream parameter in public strings combine APIs ([#14281](https://github.com/rapidsai/cudf/pull/14281)) [@davidwendt](https://github.com/davidwendt) +- Expose stream parameter in public strings contains APIs ([#14280](https://github.com/rapidsai/cudf/pull/14280)) [@davidwendt](https://github.com/davidwendt) +- Add stream parameter to List Sort and Filter APIs ([#14272](https://github.com/rapidsai/cudf/pull/14272)) [@SurajAralihalli](https://github.com/SurajAralihalli) +- Use branch-23.12 workflows. ([#14271](https://github.com/rapidsai/cudf/pull/14271)) [@bdice](https://github.com/bdice) +- Refactor LogicalType for Parquet ([#14264](https://github.com/rapidsai/cudf/pull/14264)) [@etseidl](https://github.com/etseidl) +- Centralize chunked reading code in the parquet reader to reader_impl_chunking.cu ([#14262](https://github.com/rapidsai/cudf/pull/14262)) [@nvdbaranec](https://github.com/nvdbaranec) +- Expose stream parameter in public strings replace APIs ([#14261](https://github.com/rapidsai/cudf/pull/14261)) [@davidwendt](https://github.com/davidwendt) +- Expose stream parameter in public strings APIs ([#14260](https://github.com/rapidsai/cudf/pull/14260)) [@davidwendt](https://github.com/davidwendt) +- Cleanup of namespaces in parquet code. 
([#14259](https://github.com/rapidsai/cudf/pull/14259)) [@nvdbaranec](https://github.com/nvdbaranec) +- Make parquet schema index type consistent ([#14256](https://github.com/rapidsai/cudf/pull/14256)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Expose stream parameter in public strings convert APIs ([#14255](https://github.com/rapidsai/cudf/pull/14255)) [@davidwendt](https://github.com/davidwendt) +- Add in java bindings for DataSource ([#14254](https://github.com/rapidsai/cudf/pull/14254)) [@revans2](https://github.com/revans2) +- Reimplement `cudf::merge` for nested types without using comparators ([#14250](https://github.com/rapidsai/cudf/pull/14250)) [@divyegala](https://github.com/divyegala) +- Add stream parameter to List Manipulation and Operations APIs ([#14248](https://github.com/rapidsai/cudf/pull/14248)) [@SurajAralihalli](https://github.com/SurajAralihalli) +- Expose stream parameter in public strings split/partition APIs ([#14247](https://github.com/rapidsai/cudf/pull/14247)) [@davidwendt](https://github.com/davidwendt) +- Improve `contains_column` by invoking `contains_table` ([#14238](https://github.com/rapidsai/cudf/pull/14238)) [@PointKernel](https://github.com/PointKernel) +- Detect and report errors in Parquet header parsing ([#14237](https://github.com/rapidsai/cudf/pull/14237)) [@etseidl](https://github.com/etseidl) +- Normalizing offsets iterator ([#14234](https://github.com/rapidsai/cudf/pull/14234)) [@davidwendt](https://github.com/davidwendt) +- Forward merge `23.10` into `23.12` ([#14231](https://github.com/rapidsai/cudf/pull/14231)) [@galipremsagar](https://github.com/galipremsagar) +- Return error if BOOL8 column-type is used with integers-to-hex ([#14208](https://github.com/rapidsai/cudf/pull/14208)) [@davidwendt](https://github.com/davidwendt) +- Enable indexalator for device code ([#14206](https://github.com/rapidsai/cudf/pull/14206)) [@davidwendt](https://github.com/davidwendt) +- Marginally reduce memory footprint of joins ([#14197](https://github.com/rapidsai/cudf/pull/14197)) [@wence-](https://github.com/wence-) +- Add nvtx annotations to spilling-based data movement ([#14196](https://github.com/rapidsai/cudf/pull/14196)) [@wence-](https://github.com/wence-) +- Optimize ORC writer for decimal columns ([#14190](https://github.com/rapidsai/cudf/pull/14190)) [@vuule](https://github.com/vuule) +- Remove the use of volatile in ORC ([#14175](https://github.com/rapidsai/cudf/pull/14175)) [@vuule](https://github.com/vuule) +- Add `bytes_per_second` to distinct_count of stream_compaction nvbench. ([#14172](https://github.com/rapidsai/cudf/pull/14172)) [@Blonck](https://github.com/Blonck) +- Add `bytes_per_second` to transpose benchmark ([#14170](https://github.com/rapidsai/cudf/pull/14170)) [@Blonck](https://github.com/Blonck) +- cuDF: Build CUDA 12.0 ARM conda packages. 
([#14112](https://github.com/rapidsai/cudf/pull/14112)) [@bdice](https://github.com/bdice) +- Add `bytes_per_second` to shift benchmark ([#13950](https://github.com/rapidsai/cudf/pull/13950)) [@Blonck](https://github.com/Blonck) +- Extract `debug_utilities.hpp/cu` from `column_utilities.hpp/cu` ([#13720](https://github.com/rapidsai/cudf/pull/13720)) [@ttnghia](https://github.com/ttnghia) + # cuDF 23.10.00 (11 Oct 2023) ## 🚨 Breaking Changes