From 9be4de53268d49665bc0d700f12f1192207fff79 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 9 Nov 2023 16:00:43 -0800 Subject: [PATCH 01/36] Upgrade to nvCOMP 3.0.4 (#13815) Update the nvCOMP version used for cuIO compression/decompression to 3.0.4. Authors: - Vukasin Milovanovic (https://github.com/vuule) - Bradley Dice (https://github.com/bdice) Approvers: - Bradley Dice (https://github.com/bdice) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/13815 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-120_arch-x86_64.yaml | 2 +- conda/recipes/libcudf/conda_build_config.yaml | 2 +- dependencies.yaml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 4d5c56e4a7d..a479d517c24 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -60,7 +60,7 @@ dependencies: - numpy>=1.21,<1.25 - numpydoc - nvcc_linux-64=11.8 -- nvcomp==2.6.1 +- nvcomp==3.0.4 - nvtx>=0.2.1 - packaging - pandas>=1.3,<1.6.0dev0 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 8606932dae4..d1779aaeeac 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -59,7 +59,7 @@ dependencies: - numba>=0.57,<0.58 - numpy>=1.21,<1.25 - numpydoc -- nvcomp==2.6.1 +- nvcomp==3.0.4 - nvtx>=0.2.1 - packaging - pandas>=1.3,<1.6.0dev0 diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index 05b2135184b..fa06ed048b7 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -38,7 +38,7 @@ spdlog_version: - ">=1.11.0,<1.12" nvcomp_version: - - "=2.6.1" + - "=3.0.4" zlib_version: - ">=1.2.13" diff --git a/dependencies.yaml b/dependencies.yaml index 35d08239a4c..3850347aa63 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -244,7 +244,7 @@ dependencies: - libarrow-all==14.0.1.* - librdkafka>=1.9.0,<1.10.0a0 # Align nvcomp version with rapids-cmake - - nvcomp==2.6.1 + - nvcomp==3.0.4 - spdlog>=1.11.0,<1.12 build_wheels: common: From 87d2a36f04f431a8c5236d2aee723ec79b9dc5f9 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 9 Nov 2023 16:54:25 -0800 Subject: [PATCH 02/36] Remove Cython libcpp wrappers (#14382) All of these wrappers have now been upstreamed into Cython as of Cython 3.0.3. 
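For illustration, a minimal sketch (not part of this PR; the function and its body are hypothetical) of the pattern this change relies on — cimporting the wrappers straight from Cython's bundled `libcpp` packages instead of the removed `cudf._lib.cpp.libcpp` shims:

```cython
# distutils: language = c++
# Hypothetical example: with Cython >= 3.0.3 these declarations ship with
# Cython itself, so the local shim modules deleted in this patch are no
# longer needed.
from libcpp.memory cimport make_unique, unique_ptr
from libcpp.optional cimport make_optional, optional
from libcpp.utility cimport move


def boxed_sum(int left, int right):
    # make_unique and make_optional here are the upstream Cython
    # declarations that previously had to be re-declared under
    # cudf._lib.cpp.libcpp.
    cdef unique_ptr[int] boxed = move(make_unique[int](left))
    cdef optional[int] maybe = make_optional[int](right)
    return boxed.get()[0] + maybe.value()
```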
Contributes to #14023 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) - Jake Awe (https://github.com/AyodeAwe) URL: https://github.com/rapidsai/cudf/pull/14382 --- .../all_cuda-118_arch-x86_64.yaml | 2 +- .../all_cuda-120_arch-x86_64.yaml | 2 +- conda/recipes/cudf/meta.yaml | 2 +- conda/recipes/cudf_kafka/meta.yaml | 2 +- dependencies.yaml | 2 +- python/cudf/cudf/_lib/column.pyx | 3 +- python/cudf/cudf/_lib/concat.pyx | 3 +- python/cudf/cudf/_lib/copying.pyx | 5 +- python/cudf/cudf/_lib/cpp/copying.pxd | 2 +- python/cudf/cudf/_lib/cpp/groupby.pxd | 4 +- python/cudf/cudf/_lib/cpp/io/orc.pxd | 2 +- python/cudf/cudf/_lib/cpp/io/parquet.pxd | 4 +- python/cudf/cudf/_lib/cpp/io/timezone.pxd | 2 +- python/cudf/cudf/_lib/cpp/libcpp/__init__.pxd | 0 python/cudf/cudf/_lib/cpp/libcpp/__init__.py | 0 .../cudf/cudf/_lib/cpp/libcpp/functional.pxd | 7 --- python/cudf/cudf/_lib/cpp/libcpp/memory.pxd | 12 ----- python/cudf/cudf/_lib/cpp/libcpp/optional.pxd | 50 ------------------- python/cudf/cudf/_lib/expressions.pyx | 3 +- python/cudf/cudf/_lib/groupby.pyx | 3 +- python/cudf/cudf/_lib/join.pyx | 3 +- python/cudf/cudf/_lib/null_mask.pyx | 3 +- python/cudf/cudf/_lib/parquet.pyx | 3 +- python/cudf/cudf/_lib/timezone.pyx | 2 +- python/cudf/pyproject.toml | 2 +- python/cudf_kafka/cudf_kafka/_lib/kafka.pyx | 3 +- python/cudf_kafka/pyproject.toml | 2 +- 27 files changed, 27 insertions(+), 101 deletions(-) delete mode 100644 python/cudf/cudf/_lib/cpp/libcpp/__init__.pxd delete mode 100644 python/cudf/cudf/_lib/cpp/libcpp/__init__.py delete mode 100644 python/cudf/cudf/_lib/cpp/libcpp/functional.pxd delete mode 100644 python/cudf/cudf/_lib/cpp/libcpp/memory.pxd delete mode 100644 python/cudf/cudf/_lib/cpp/libcpp/optional.pxd diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index a479d517c24..adf4fcad32d 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -24,7 +24,7 @@ dependencies: - cudatoolkit - cupy>=12.0.0 - cxx-compiler -- cython>=3.0.0 +- cython>=3.0.3 - dask-core>=2023.9.2 - dask-cuda==23.12.* - dask>=2023.9.2 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index d1779aaeeac..a69ef587570 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -26,7 +26,7 @@ dependencies: - cuda-version=12.0 - cupy>=12.0.0 - cxx-compiler -- cython>=3.0.0 +- cython>=3.0.3 - dask-core>=2023.9.2 - dask-cuda==23.12.* - dask>=2023.9.2 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 9b5c5f3d14b..27edde1c98a 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -57,7 +57,7 @@ requirements: host: - protobuf ==4.24.* - python - - cython >=3.0.0 + - cython >=3.0.3 - scikit-build >=0.13.1 - setuptools - dlpack >=0.5,<0.6.0a0 diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index cdc547b4d68..9440f8bf124 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -51,7 +51,7 @@ requirements: {% endif %} host: - python - - cython >=3.0.0 + - cython >=3.0.3 - cuda-version ={{ cuda_version }} - cudf ={{ version }} - libcudf_kafka ={{ version }} diff --git a/dependencies.yaml b/dependencies.yaml index 
3850347aa63..a16b51f4483 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -256,7 +256,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - cython>=3.0.0 + - cython>=3.0.3 # TODO: Pin to numpy<1.25 until cudf requires pandas 2 - &numpy numpy>=1.21,<1.25 - output_types: [conda, requirements, pyproject] diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index f751d73b142..0edf9f8aa95 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -24,7 +24,7 @@ from cudf.utils.dtypes import _get_base_dtype from cpython.buffer cimport PyObject_CheckBuffer from libc.stdint cimport uintptr_t -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector @@ -47,7 +47,6 @@ from cudf._lib.cpp.column.column_factories cimport ( make_numeric_column, ) from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.null_mask cimport null_count as cpp_null_count from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.scalar cimport DeviceScalar diff --git a/python/cudf/cudf/_lib/concat.pyx b/python/cudf/cudf/_lib/concat.pyx index feaf75ef237..1ec4719631e 100644 --- a/python/cudf/cudf/_lib/concat.pyx +++ b/python/cudf/cudf/_lib/concat.pyx @@ -1,7 +1,7 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp cimport bool -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector @@ -12,7 +12,6 @@ from cudf._lib.cpp.concatenate cimport ( concatenate_masks as libcudf_concatenate_masks, concatenate_tables as libcudf_concatenate_tables, ) -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.table.table cimport table, table_view from cudf._lib.utils cimport ( data_from_unique_ptr, diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index f57bc15ed57..ea6ee76c14a 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -24,12 +24,13 @@ from cudf._lib.utils cimport table_view_from_columns, table_view_from_table from cudf._lib.reduce import minmax from cudf.core.abc import Serializable +from libcpp.functional cimport reference_wrapper +from libcpp.memory cimport make_unique + cimport cudf._lib.cpp.contiguous_split as cpp_contiguous_split cimport cudf._lib.cpp.copying as cpp_copying from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view, mutable_column_view -from cudf._lib.cpp.libcpp.functional cimport reference_wrapper -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.lists.gather cimport ( segmented_gather as cpp_segmented_gather, ) diff --git a/python/cudf/cudf/_lib/cpp/copying.pxd b/python/cudf/cudf/_lib/cpp/copying.pxd index 20725c252fc..5637b55ac1c 100644 --- a/python/cudf/cudf/_lib/cpp/copying.pxd +++ b/python/cudf/cudf/_lib/cpp/copying.pxd @@ -2,6 +2,7 @@ from libc.stdint cimport int32_t, int64_t, uint8_t from libcpp cimport bool +from libcpp.functional cimport reference_wrapper from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector @@ -9,7 +10,6 @@ from rmm._lib.device_buffer cimport device_buffer from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view, mutable_column_view -from cudf._lib.cpp.libcpp.functional cimport 
reference_wrapper from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view diff --git a/python/cudf/cudf/_lib/cpp/groupby.pxd b/python/cudf/cudf/_lib/cpp/groupby.pxd index 2ecdf76842f..0266404fc50 100644 --- a/python/cudf/cudf/_lib/cpp/groupby.pxd +++ b/python/cudf/cudf/_lib/cpp/groupby.pxd @@ -1,6 +1,7 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp cimport bool +from libcpp.functional cimport reference_wrapper from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair from libcpp.vector cimport vector @@ -11,7 +12,6 @@ from cudf._lib.cpp.aggregation cimport ( ) from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.libcpp.functional cimport reference_wrapper from cudf._lib.cpp.replace cimport replace_policy from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd index dd6f919a74d..d5ac8574fe4 100644 --- a/python/cudf/cudf/_lib/cpp/io/orc.pxd +++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd @@ -4,12 +4,12 @@ from libc.stdint cimport uint8_t from libcpp cimport bool from libcpp.map cimport map from libcpp.memory cimport shared_ptr, unique_ptr +from libcpp.optional cimport optional from libcpp.string cimport string from libcpp.vector cimport vector cimport cudf._lib.cpp.io.types as cudf_io_types cimport cudf._lib.cpp.table.table_view as cudf_table_view -from cudf._lib.cpp.libcpp.optional cimport optional from cudf._lib.cpp.types cimport data_type, size_type diff --git a/python/cudf/cudf/_lib/cpp/io/parquet.pxd b/python/cudf/cudf/_lib/cpp/io/parquet.pxd index a6a7ba034aa..cdd1bde0274 100644 --- a/python/cudf/cudf/_lib/cpp/io/parquet.pxd +++ b/python/cudf/cudf/_lib/cpp/io/parquet.pxd @@ -2,16 +2,16 @@ from libc.stdint cimport uint8_t from libcpp cimport bool +from libcpp.functional cimport reference_wrapper from libcpp.map cimport map from libcpp.memory cimport shared_ptr, unique_ptr +from libcpp.optional cimport optional from libcpp.string cimport string from libcpp.vector cimport vector cimport cudf._lib.cpp.io.types as cudf_io_types cimport cudf._lib.cpp.table.table_view as cudf_table_view from cudf._lib.cpp.expressions cimport expression -from cudf._lib.cpp.libcpp.functional cimport reference_wrapper -from cudf._lib.cpp.libcpp.optional cimport optional from cudf._lib.cpp.types cimport data_type, size_type diff --git a/python/cudf/cudf/_lib/cpp/io/timezone.pxd b/python/cudf/cudf/_lib/cpp/io/timezone.pxd index ba481d9a1d3..927c2118473 100644 --- a/python/cudf/cudf/_lib/cpp/io/timezone.pxd +++ b/python/cudf/cudf/_lib/cpp/io/timezone.pxd @@ -2,9 +2,9 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr +from libcpp.optional cimport optional from libcpp.string cimport string -from cudf._lib.cpp.libcpp.optional cimport optional from cudf._lib.cpp.table.table cimport table diff --git a/python/cudf/cudf/_lib/cpp/libcpp/__init__.pxd b/python/cudf/cudf/_lib/cpp/libcpp/__init__.pxd deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/cpp/libcpp/__init__.py b/python/cudf/cudf/_lib/cpp/libcpp/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/cpp/libcpp/functional.pxd b/python/cudf/cudf/_lib/cpp/libcpp/functional.pxd deleted file mode 100644 index 
f3e2d6d0878..00000000000 --- a/python/cudf/cudf/_lib/cpp/libcpp/functional.pxd +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. - - -cdef extern from "" namespace "std" nogil: - cdef cppclass reference_wrapper[T]: - reference_wrapper() - reference_wrapper(T) diff --git a/python/cudf/cudf/_lib/cpp/libcpp/memory.pxd b/python/cudf/cudf/_lib/cpp/libcpp/memory.pxd deleted file mode 100644 index 2178f1a940c..00000000000 --- a/python/cudf/cudf/_lib/cpp/libcpp/memory.pxd +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. - -from libcpp.memory cimport unique_ptr - - -cdef extern from "" namespace "std" nogil: - # The Cython standard header does not have except +, so C++ - # exceptions from make_unique are not caught and translated to - # Python ones. This is not perfectly ergonomic, we always have to - # wrap make_unique in move, but at least we can catch exceptions. - # See https://github.com/cython/cython/issues/5560 - unique_ptr[T] make_unique[T](...) except + diff --git a/python/cudf/cudf/_lib/cpp/libcpp/optional.pxd b/python/cudf/cudf/_lib/cpp/libcpp/optional.pxd deleted file mode 100644 index a78c18f3f7a..00000000000 --- a/python/cudf/cudf/_lib/cpp/libcpp/optional.pxd +++ /dev/null @@ -1,50 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION & -# AFFILIATES. All rights reserved. SPDX-License-Identifier: -# Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from libcpp cimport bool - - -cdef extern from "" namespace "std" nogil: - cdef cppclass nullopt_t: - nullopt_t() - - cdef nullopt_t nullopt - - cdef cppclass optional[T]: - ctypedef T value_type - optional() - optional(nullopt_t) - optional(optional&) except + - optional(T&) except + - bool has_value() - T& value() - T& value_or[U](U& default_value) - void swap(optional&) - void reset() - T& emplace(...) - T& operator*() - optional& operator=(optional&) - optional& operator=[U](U&) - bool operator bool() - bool operator!() - bool operator==[U](optional&, U&) - bool operator!=[U](optional&, U&) - bool operator<[U](optional&, U&) - bool operator>[U](optional&, U&) - bool operator<=[U](optional&, U&) - bool operator>=[U](optional&, U&) - - optional[T] make_optional[T](...) except + diff --git a/python/cudf/cudf/_lib/expressions.pyx b/python/cudf/cudf/_lib/expressions.pyx index 8d7545ffe15..01a080f635f 100644 --- a/python/cudf/cudf/_lib/expressions.pyx +++ b/python/cudf/cudf/_lib/expressions.pyx @@ -4,12 +4,11 @@ from enum import Enum from cython.operator cimport dereference from libc.stdint cimport int64_t -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.string cimport string from libcpp.utility cimport move from cudf._lib.cpp cimport expressions as libcudf_exp -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.types cimport size_type # Necessary for proper casting, see below. 
diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index a26d820de6f..b3778e45cde 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -24,6 +24,8 @@ from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns from cudf._lib.scalar import as_device_scalar +from libcpp.functional cimport reference_wrapper + cimport cudf._lib.cpp.groupby as libcudf_groupby cimport cudf._lib.cpp.types as libcudf_types from cudf._lib.aggregation cimport ( @@ -33,7 +35,6 @@ from cudf._lib.aggregation cimport ( make_groupby_scan_aggregation, ) from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.libcpp.functional cimport reference_wrapper from cudf._lib.cpp.replace cimport replace_policy from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table, table_view diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx index 416680aae24..378be978cc0 100644 --- a/python/cudf/cudf/_lib/join.pyx +++ b/python/cudf/cudf/_lib/join.pyx @@ -2,7 +2,7 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.pair cimport pair from libcpp.utility cimport move @@ -11,7 +11,6 @@ from rmm._lib.device_buffer cimport device_buffer cimport cudf._lib.cpp.join as cpp_join from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport data_type, size_type, type_id from cudf._lib.utils cimport table_view_from_columns diff --git a/python/cudf/cudf/_lib/null_mask.pyx b/python/cudf/cudf/_lib/null_mask.pyx index 5b4538629f6..1f98140d9e4 100644 --- a/python/cudf/cudf/_lib/null_mask.pyx +++ b/python/cudf/cudf/_lib/null_mask.pyx @@ -6,13 +6,12 @@ from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer from cudf.core.buffer import acquire_spill_lock, as_buffer -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.pair cimport pair from libcpp.utility cimport move from cudf._lib.column cimport Column from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.null_mask cimport ( bitmask_allocation_size_bytes as cpp_bitmask_allocation_size_bytes, bitmask_and as cpp_bitmask_and, diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index d8d363686cc..4acb1ce10b1 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -32,7 +32,7 @@ from cudf._lib.utils import _index_level_name, generate_pandas_metadata from libc.stdint cimport uint8_t from libcpp cimport bool from libcpp.map cimport map -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.string cimport string from libcpp.unordered_map cimport unordered_map from libcpp.utility cimport move @@ -52,7 +52,6 @@ from cudf._lib.cpp.io.parquet cimport ( write_parquet as parquet_writer, ) from cudf._lib.cpp.io.types cimport column_in_metadata, table_input_metadata -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport data_type, size_type from cudf._lib.io.datasource cimport NativeFileDatasource diff --git a/python/cudf/cudf/_lib/timezone.pyx 
b/python/cudf/cudf/_lib/timezone.pyx index 4d76cbfcdb5..808d1321b0b 100644 --- a/python/cudf/cudf/_lib/timezone.pyx +++ b/python/cudf/cudf/_lib/timezone.pyx @@ -1,13 +1,13 @@ # Copyright (c) 2023, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr +from libcpp.optional cimport make_optional from libcpp.string cimport string from libcpp.utility cimport move from cudf._lib.cpp.io.timezone cimport ( make_timezone_transition_table as cpp_make_timezone_transition_table, ) -from cudf._lib.cpp.libcpp.optional cimport make_optional from cudf._lib.cpp.table.table cimport table from cudf._lib.utils cimport columns_from_unique_ptr diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 1c687269e55..b38970271d7 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" requires = [ "cmake>=3.26.4", - "cython>=3.0.0", + "cython>=3.0.3", "ninja", "numpy>=1.21,<1.25", "protoc-wheel", diff --git a/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx b/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx index 4d732478723..2fbaacff7c6 100644 --- a/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx +++ b/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx @@ -3,12 +3,11 @@ from libc.stdint cimport int32_t, int64_t from libcpp cimport bool, nullptr from libcpp.map cimport map -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.string cimport string from libcpp.utility cimport move from cudf._lib.cpp.io.datasource cimport datasource -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf_kafka._lib.kafka cimport kafka_consumer diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index f5cbd480e9c..4829f06ab09 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -3,7 +3,7 @@ [build-system] requires = [ - "cython>=3.0.0", + "cython>=3.0.3", "numpy>=1.21,<1.25", "pyarrow==14.0.1.*", "setuptools", From 04d13d81b0bb4c2b3db2bfc9d9e28432e0a73c44 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 13 Nov 2023 09:05:19 -0500 Subject: [PATCH 03/36] Normalizing offsets iterator (#14234) Creates a normalizing offsets iterator that returns an int64 value given either a int32 or int64 column data. 
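As a usage sketch (assumed, not taken from the patch: the function name, the `thrust::reduce` call, and the stream parameter are illustrative; only `offsetalator_factory` and `input_offsetalator` come from this PR), the new factory lets the same device code consume offsets of either width:

```cpp
// Hypothetical illustration of the normalizing offsets iterator added below.
#include <cudf/column/column_view.hpp>
#include <cudf/detail/offsets_iterator_factory.cuh>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/reduce.h>

int64_t sum_offsets(cudf::column_view const& offsets, rmm::cuda_stream_view stream)
{
  // One iterator type serves INT32 and INT64 offsets columns alike; each
  // device-side dereference is normalized to an int64_t value.
  auto const begin = cudf::detail::offsetalator_factory::make_input_iterator(offsets);
  return thrust::reduce(rmm::exec_policy(stream), begin, begin + offsets.size(), int64_t{0});
}
```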
Depends on #14206 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Divye Gala (https://github.com/divyegala) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/14234 --- .../cudf/column/column_device_view.cuh | 8 +- cpp/include/cudf/detail/indexalator.cuh | 151 ++++++++++++++-- .../cudf/detail/normalizing_iterator.cuh | 160 +---------------- cpp/include/cudf/detail/offsets_iterator.cuh | 165 ++++++++++++++++++ .../cudf/detail/offsets_iterator_factory.cuh | 47 +++++ cpp/tests/CMakeLists.txt | 1 + cpp/tests/iterator/indexalator_test.cu | 37 ---- cpp/tests/iterator/offsetalator_test.cu | 140 +++++++++++++++ 8 files changed, 502 insertions(+), 207 deletions(-) create mode 100644 cpp/include/cudf/detail/offsets_iterator.cuh create mode 100644 cpp/include/cudf/detail/offsets_iterator_factory.cuh create mode 100644 cpp/tests/iterator/offsetalator_test.cu diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 35851a99822..b1ff0bbaea7 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include #include @@ -442,10 +443,11 @@ class alignas(16) column_device_view : public detail::column_device_view_base { __device__ T element(size_type element_index) const noexcept { size_type index = element_index + offset(); // account for this view's _offset - auto const* d_offsets = d_children[strings_column_view::offsets_column_index].data(); char const* d_strings = d_children[strings_column_view::chars_column_index].data(); - size_type offset = d_offsets[index]; - return string_view{d_strings + offset, d_offsets[index + 1] - offset}; + auto const offsets = d_children[strings_column_view::offsets_column_index]; + auto const itr = cudf::detail::input_offsetalator(offsets.head(), offsets.type()); + auto const offset = itr[index]; + return string_view{d_strings + offset, static_cast(itr[index + 1] - offset)}; } private: diff --git a/cpp/include/cudf/detail/indexalator.cuh b/cpp/include/cudf/detail/indexalator.cuh index 6532dae3695..4d261c54b29 100644 --- a/cpp/include/cudf/detail/indexalator.cuh +++ b/cpp/include/cudf/detail/indexalator.cuh @@ -56,10 +56,69 @@ namespace detail { * auto result = thrust::find(thrust::device, begin, end, size_type{12} ); * @endcode */ -using input_indexalator = input_normalator; +struct input_indexalator : base_normalator { + friend struct base_normalator; // for CRTP + + using reference = cudf::size_type const; // this keeps STL and thrust happy + + input_indexalator() = default; + input_indexalator(input_indexalator const&) = default; + input_indexalator(input_indexalator&&) = default; + input_indexalator& operator=(input_indexalator const&) = default; + input_indexalator& operator=(input_indexalator&&) = default; + + /** + * @brief Indirection operator returns the value at the current iterator position + */ + __device__ inline cudf::size_type operator*() const { return operator[](0); } + + /** + * @brief Dispatch functor for resolving a Integer value from any integer type + */ + struct normalize_type { + template ())> + __device__ cudf::size_type operator()(void const* tp) + { + return static_cast(*static_cast(tp)); + } + template ())> + __device__ cudf::size_type operator()(void const*) + { + CUDF_UNREACHABLE("only integral types are supported"); + } + }; + + /** + * @brief Array subscript operator returns a value 
at the input + * `idx` position as a `Integer` value. + */ + __device__ inline cudf::size_type operator[](size_type idx) const + { + void const* tp = p_ + (idx * this->width_); + return type_dispatcher(this->dtype_, normalize_type{}, tp); + } + + /** + * @brief Create an input index normalizing iterator + * + * Use the indexalator_factory to create an iterator instance. + * + * @param data Pointer to an integer array in device memory. + * @param dtype Type of data in data + * @param offset Applied to the data pointer per size of the type + */ + CUDF_HOST_DEVICE input_indexalator(void const* data, data_type dtype, cudf::size_type offset = 0) + : base_normalator(dtype), p_{static_cast(data)} + { + p_ += offset * this->width_; + } + + protected: + char const* p_; /// pointer to the integer data in device memory +}; /** - * @brief The index normalizing output iterator. + * @brief The index normalizing output iterator * * This is an iterator that can be used for index types (integers) without * requiring a type-specific instance. It can be used for any iterator @@ -82,7 +141,75 @@ using input_indexalator = input_normalator; * thrust::less()); * @endcode */ -using output_indexalator = output_normalator; +struct output_indexalator : base_normalator { + friend struct base_normalator; // for CRTP + + using reference = output_indexalator const&; // required for output iterators + + output_indexalator() = default; + output_indexalator(output_indexalator const&) = default; + output_indexalator(output_indexalator&&) = default; + output_indexalator& operator=(output_indexalator const&) = default; + output_indexalator& operator=(output_indexalator&&) = default; + + /** + * @brief Indirection operator returns this iterator instance in order + * to capture the `operator=(Integer)` calls. + */ + __device__ inline reference operator*() const { return *this; } + + /** + * @brief Array subscript operator returns an iterator instance at the specified `idx` position. + * + * This allows capturing the subsequent `operator=(Integer)` call in this class. + */ + __device__ inline output_indexalator const operator[](size_type idx) const + { + output_indexalator tmp{*this}; + tmp.p_ += (idx * this->width_); + return tmp; + } + + /** + * @brief Dispatch functor for setting the index value from a size_type value. + */ + struct normalize_type { + template ())> + __device__ void operator()(void* tp, cudf::size_type const value) + { + (*static_cast(tp)) = static_cast(value); + } + template ())> + __device__ void operator()(void*, cudf::size_type const) + { + CUDF_UNREACHABLE("only index types are supported"); + } + }; + + /** + * @brief Assign an Integer value to the current iterator position + */ + __device__ inline reference operator=(cudf::size_type const value) const + { + void* tp = p_; + type_dispatcher(this->dtype_, normalize_type{}, tp, value); + return *this; + } + + /** + * @brief Create an output normalizing iterator + * + * @param data Pointer to an integer array in device memory. + * @param dtype Type of data in data + */ + CUDF_HOST_DEVICE output_indexalator(void* data, data_type dtype) + : base_normalator(dtype), p_{static_cast(data)} + { + } + + protected: + char* p_; /// pointer to the integer data in device memory +}; /** * @brief Use this class to create an indexalator instance. @@ -92,14 +219,12 @@ struct indexalator_factory { * @brief A type_dispatcher functor to create an input iterator from an indices column. 
*/ struct input_indexalator_fn { - template ()>* = nullptr> + template ())> input_indexalator operator()(column_view const& indices) { return input_indexalator(indices.data(), indices.type()); } - template ()>* = nullptr> + template ())> input_indexalator operator()(Args&&... args) { CUDF_FAIL("indices must be an index type"); @@ -110,16 +235,14 @@ struct indexalator_factory { * @brief Use this class to create an indexalator to a scalar index. */ struct input_indexalator_scalar_fn { - template ()>* = nullptr> + template ())> input_indexalator operator()(scalar const& index) { // note: using static_cast const&>(index) creates a copy auto const scalar_impl = static_cast const*>(&index); return input_indexalator(scalar_impl->data(), index.type()); } - template ()>* = nullptr> + template ())> input_indexalator operator()(Args&&... args) { CUDF_FAIL("scalar must be an index type"); @@ -130,14 +253,12 @@ struct indexalator_factory { * @brief A type_dispatcher functor to create an output iterator from an indices column. */ struct output_indexalator_fn { - template ()>* = nullptr> + template ())> output_indexalator operator()(mutable_column_view const& indices) { return output_indexalator(indices.data(), indices.type()); } - template ()>* = nullptr> + template ())> output_indexalator operator()(Args&&... args) { CUDF_FAIL("indices must be an index type"); diff --git a/cpp/include/cudf/detail/normalizing_iterator.cuh b/cpp/include/cudf/detail/normalizing_iterator.cuh index 35a695d47df..8f90afc3e57 100644 --- a/cpp/include/cudf/detail/normalizing_iterator.cuh +++ b/cpp/include/cudf/detail/normalizing_iterator.cuh @@ -33,7 +33,7 @@ namespace detail { * @tparam Integer The type the iterator normalizes to */ template -struct base_normalator { +struct alignas(16) base_normalator { static_assert(cudf::is_index_type()); using difference_type = std::ptrdiff_t; using value_type = Integer; @@ -204,7 +204,7 @@ struct base_normalator { private: struct integer_sizeof_fn { - template ()>* = nullptr> + template ())> CUDF_HOST_DEVICE constexpr std::size_t operator()() const { #ifndef __CUDA_ARCH__ @@ -213,7 +213,7 @@ struct base_normalator { CUDF_UNREACHABLE("only integral types are supported"); #endif } - template ()>* = nullptr> + template ())> CUDF_HOST_DEVICE constexpr std::size_t operator()() const noexcept { return sizeof(T); @@ -229,160 +229,16 @@ struct base_normalator { width_ = static_cast(type_dispatcher(dtype, integer_sizeof_fn{})); } - int32_t width_; /// integer type width = 1,2,4, or 8 - data_type dtype_; /// for type-dispatcher calls -}; - -/** - * @brief The integer normalizing input iterator - * - * This is an iterator that can be used for index types (integers) without - * requiring a type-specific instance. It can be used for any iterator - * interface for reading an array of integer values of type - * int8, int16, int32, int64, uint8, uint16, uint32, or uint64. 
- * Reading specific elements always return a type of `Integer` - * - * @tparam Integer Type returned by all read functions - */ -template -struct input_normalator : base_normalator, Integer> { - friend struct base_normalator, Integer>; // for CRTP - - using reference = Integer const; // this keeps STL and thrust happy - - input_normalator() = default; - input_normalator(input_normalator const&) = default; - input_normalator(input_normalator&&) = default; - input_normalator& operator=(input_normalator const&) = default; - input_normalator& operator=(input_normalator&&) = default; - - /** - * @brief Indirection operator returns the value at the current iterator position - */ - __device__ inline Integer operator*() const { return operator[](0); } - - /** - * @brief Dispatch functor for resolving a Integer value from any integer type - */ - struct normalize_type { - template ()>* = nullptr> - __device__ Integer operator()(void const* tp) - { - return static_cast(*static_cast(tp)); - } - template ()>* = nullptr> - __device__ Integer operator()(void const*) - { - CUDF_UNREACHABLE("only integral types are supported"); - } - }; - /** - * @brief Array subscript operator returns a value at the input - * `idx` position as a `Integer` value. - */ - __device__ inline Integer operator[](size_type idx) const - { - void const* tp = p_ + (idx * this->width_); - return type_dispatcher(this->dtype_, normalize_type{}, tp); - } - - /** - * @brief Create an input index normalizing iterator. - * - * Use the indexalator_factory to create an iterator instance. - * - * @param data Pointer to an integer array in device memory. - * @param data_type Type of data in data - */ - CUDF_HOST_DEVICE input_normalator(void const* data, data_type dtype, cudf::size_type offset = 0) - : base_normalator, Integer>(dtype), p_{static_cast(data)} - { - p_ += offset * this->width_; - } - - char const* p_; /// pointer to the integer data in device memory -}; - -/** - * @brief The integer normalizing output iterator - * - * This is an iterator that can be used for index types (integers) without - * requiring a type-specific instance. It can be used for any iterator - * interface for writing an array of integer values of type - * int8, int16, int32, int64, uint8, uint16, uint32, or uint64. - * Setting specific elements always accept the `Integer` type values. - * - * @tparam Integer The type used for all write functions - */ -template -struct output_normalator : base_normalator, Integer> { - friend struct base_normalator, Integer>; // for CRTP - - using reference = output_normalator const&; // required for output iterators - - output_normalator() = default; - output_normalator(output_normalator const&) = default; - output_normalator(output_normalator&&) = default; - output_normalator& operator=(output_normalator const&) = default; - output_normalator& operator=(output_normalator&&) = default; - - /** - * @brief Indirection operator returns this iterator instance in order - * to capture the `operator=(Integer)` calls. - */ - __device__ inline output_normalator const& operator*() const { return *this; } - - /** - * @brief Array subscript operator returns an iterator instance at the specified `idx` position. - * - * This allows capturing the subsequent `operator=(Integer)` call in this class. 
- */ - __device__ inline output_normalator const operator[](size_type idx) const - { - output_normalator tmp{*this}; - tmp.p_ += (idx * this->width_); - return tmp; - } - - /** - * @brief Dispatch functor for setting the index value from a size_type value. - */ - struct normalize_type { - template ()>* = nullptr> - __device__ void operator()(void* tp, Integer const value) - { - (*static_cast(tp)) = static_cast(value); - } - template ()>* = nullptr> - __device__ void operator()(void*, Integer const) - { - CUDF_UNREACHABLE("only index types are supported"); - } - }; - - /** - * @brief Assign an Integer value to the current iterator position - */ - __device__ inline output_normalator const& operator=(Integer const value) const - { - void* tp = p_; - type_dispatcher(this->dtype_, normalize_type{}, tp, value); - return *this; - } - - /** - * @brief Create an output normalizing iterator - * - * @param data Pointer to an integer array in device memory. - * @param data_type Type of data in data + * @brief Constructor assigns width and type member variables for base class. */ - CUDF_HOST_DEVICE output_normalator(void* data, data_type dtype) - : base_normalator, Integer>(dtype), p_{static_cast(data)} + explicit CUDF_HOST_DEVICE base_normalator(data_type dtype, int32_t width) + : width_(width), dtype_(dtype) { } - char* p_; /// pointer to the integer data in device memory + int32_t width_; /// integer type width = 1,2,4, or 8 + data_type dtype_; /// for type-dispatcher calls }; } // namespace detail diff --git a/cpp/include/cudf/detail/offsets_iterator.cuh b/cpp/include/cudf/detail/offsets_iterator.cuh new file mode 100644 index 00000000000..3eb77b32353 --- /dev/null +++ b/cpp/include/cudf/detail/offsets_iterator.cuh @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cudf { +namespace detail { + +/** + * @brief The offsets normalizing input iterator + * + * This is an iterator that can be used for offsets where the underlying + * type may be int32_t or int64_t. + * + * Use the offsetalator_factory to create an appropriate input iterator + * from an offsets column_view. + */ +struct input_offsetalator : base_normalator { + friend struct base_normalator; // for CRTP + + using reference = int64_t const; // this keeps STL and thrust happy + + input_offsetalator() = default; + input_offsetalator(input_offsetalator const&) = default; + input_offsetalator(input_offsetalator&&) = default; + input_offsetalator& operator=(input_offsetalator const&) = default; + input_offsetalator& operator=(input_offsetalator&&) = default; + + /** + * @brief Indirection operator returns the value at the current iterator position + */ + __device__ inline int64_t operator*() const { return operator[](0); } + + /** + * @brief Array subscript operator returns a value at the input + * `idx` position as a int64_t value. 
+ */ + __device__ inline int64_t operator[](size_type idx) const + { + void const* tp = p_ + (idx * this->width_); + return this->width_ == sizeof(int32_t) ? static_cast(*static_cast(tp)) + : *static_cast(tp); + } + + /** + * @brief Create an input index normalizing iterator. + * + * Use the indexalator_factory to create an iterator instance. + * + * @param data Pointer to an integer array in device memory. + * @param dtype Type of data in data + */ + CUDF_HOST_DEVICE input_offsetalator(void const* data, data_type dtype) + : base_normalator( + dtype, dtype.id() == type_id::INT32 ? sizeof(int32_t) : sizeof(int64_t)), + p_{static_cast(data)} + { +#ifndef __CUDA_ARCH__ + CUDF_EXPECTS(dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64, + "Unexpected offsets type"); +#else + cudf_assert((dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64) && + "Unexpected offsets type"); +#endif + } + + protected: + char const* p_; /// pointer to the integer data in device memory +}; + +/** + * @brief The offsets normalizing output iterator + * + * This is an iterator that can be used for storing offsets values + * where the underlying type may be either int32_t or int64_t. + * + * Use the offsetalator_factory to create an appropriate output iterator + * from a mutable_column_view. + * + */ +struct output_offsetalator : base_normalator { + friend struct base_normalator; // for CRTP + + using reference = output_offsetalator const&; // required for output iterators + + output_offsetalator() = default; + output_offsetalator(output_offsetalator const&) = default; + output_offsetalator(output_offsetalator&&) = default; + output_offsetalator& operator=(output_offsetalator const&) = default; + output_offsetalator& operator=(output_offsetalator&&) = default; + + /** + * @brief Indirection operator returns this iterator instance in order + * to capture the `operator=(int64)` calls. + */ + __device__ inline output_offsetalator const& operator*() const { return *this; } + + /** + * @brief Array subscript operator returns an iterator instance at the specified `idx` position. + * + * This allows capturing the subsequent `operator=(int64)` call in this class. + */ + __device__ inline output_offsetalator const operator[](size_type idx) const + { + output_offsetalator tmp{*this}; + tmp.p_ += (idx * this->width_); + return tmp; + } + + /** + * @brief Assign an offset value to the current iterator position + */ + __device__ inline output_offsetalator const& operator=(int64_t const value) const + { + void* tp = p_; + if (this->width_ == sizeof(int32_t)) { + (*static_cast(tp)) = static_cast(value); + } else { + (*static_cast(tp)) = value; + } + return *this; + } + + /** + * @brief Create an output offsets iterator + * + * @param data Pointer to an integer array in device memory. + * @param dtype Type of data in data + */ + CUDF_HOST_DEVICE output_offsetalator(void* data, data_type dtype) + : base_normalator( + dtype, dtype.id() == type_id::INT32 ? 
sizeof(int32_t) : sizeof(int64_t)), + p_{static_cast(data)} + { +#ifndef __CUDA_ARCH__ + CUDF_EXPECTS(dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64, + "Unexpected offsets type"); +#else + cudf_assert((dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64) && + "Unexpected offsets type"); +#endif + } + + protected: + char* p_; /// pointer to the integer data in device memory +}; + +} // namespace detail +} // namespace cudf diff --git a/cpp/include/cudf/detail/offsets_iterator_factory.cuh b/cpp/include/cudf/detail/offsets_iterator_factory.cuh new file mode 100644 index 00000000000..5b4c6b825d2 --- /dev/null +++ b/cpp/include/cudf/detail/offsets_iterator_factory.cuh @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cudf { +namespace detail { + +/** + * @brief Use this class to create an offsetalator instance. + */ +struct offsetalator_factory { + /** + * @brief Create an input offsetalator instance from an offsets column + */ + static input_offsetalator make_input_iterator(column_view const& offsets) + { + return input_offsetalator(offsets.head(), offsets.type()); + } + + /** + * @brief Create an output offsetalator instance from an offsets column + */ + static output_offsetalator make_output_iterator(mutable_column_view const& offsets) + { + return output_offsetalator(offsets.head(), offsets.type()); + } +}; + +} // namespace detail +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index b0382d15807..7b628649051 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -393,6 +393,7 @@ set_tests_properties( ConfigureTest( ITERATOR_TEST iterator/indexalator_test.cu + iterator/offsetalator_test.cu iterator/optional_iterator_test_chrono.cu iterator/optional_iterator_test_numeric.cu iterator/pair_iterator_test_chrono.cu diff --git a/cpp/tests/iterator/indexalator_test.cu b/cpp/tests/iterator/indexalator_test.cu index 3e8bcd5cb0d..0c10853ec02 100644 --- a/cpp/tests/iterator/indexalator_test.cu +++ b/cpp/tests/iterator/indexalator_test.cu @@ -157,40 +157,3 @@ TYPED_TEST(IndexalatorTest, output_iterator) expected = cudf::test::fixed_width_column_wrapper({0, 1, 1, 2, 3, 4, 5, 5, 7}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); } - -/** - * For testing creating and using the indexalator in device code. 
- */ -struct device_functor_fn { - cudf::column_device_view const d_col; - __device__ cudf::size_type operator()(cudf::size_type idx) - { - auto itr = cudf::detail::input_indexalator(d_col.head(), d_col.type()); - return itr[idx] * 3; - } -}; - -TYPED_TEST(IndexalatorTest, device_indexalator) -{ - using T = TypeParam; - - auto d_col1 = - cudf::test::fixed_width_column_wrapper({0, 6, 7, 14, 23, 33, 43, 45, 63}); - auto d_col2 = - cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 0, 0, 0, 0}); - auto input = cudf::column_view(d_col1); - auto output = cudf::mutable_column_view(d_col2); - auto stream = cudf::get_default_stream(); - - auto d_input = cudf::column_device_view::create(input, stream); - - thrust::transform(rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(input.size()), - output.begin(), - device_functor_fn{*d_input}); - - auto expected = - cudf::test::fixed_width_column_wrapper({0, 18, 21, 42, 69, 99, 129, 135, 189}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); -} diff --git a/cpp/tests/iterator/offsetalator_test.cu b/cpp/tests/iterator/offsetalator_test.cu new file mode 100644 index 00000000000..e569e58f42a --- /dev/null +++ b/cpp/tests/iterator/offsetalator_test.cu @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
+ */ + +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +using TestingTypes = cudf::test::Types; + +template +struct OffsetalatorTest : public IteratorTest {}; + +TYPED_TEST_SUITE(OffsetalatorTest, TestingTypes); + +TYPED_TEST(OffsetalatorTest, input_iterator) +{ + using T = TypeParam; + + auto host_values = cudf::test::make_type_param_vector({0, 6, 0, -14, 13, 64, -13, -20, 45}); + + auto d_col = cudf::test::fixed_width_column_wrapper(host_values.begin(), host_values.end()); + + auto expected_values = thrust::host_vector(host_values.size()); + std::transform(host_values.begin(), host_values.end(), expected_values.begin(), [](auto v) { + return static_cast(v); + }); + + auto it_dev = cudf::detail::offsetalator_factory::make_input_iterator(d_col); + this->iterator_test_thrust(expected_values, it_dev, host_values.size()); +} + +TYPED_TEST(OffsetalatorTest, output_iterator) +{ + using T = TypeParam; + + auto d_col1 = cudf::test::fixed_width_column_wrapper({0, 6, 7, 14, 23, 33, 43, 45, 63}); + auto d_col2 = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 0, 0, 0, 0}); + auto itr = cudf::detail::offsetalator_factory::make_output_iterator(d_col2); + auto input = cudf::column_view(d_col1); + auto stream = cudf::get_default_stream(); + + auto map = cudf::test::fixed_width_column_wrapper({0, 2, 4, 6, 8, 1, 3, 5, 7}); + auto d_map = cudf::column_view(map); + thrust::gather(rmm::exec_policy_nosync(stream), + d_map.begin(), + d_map.end(), + input.begin(), + itr); + auto expected = cudf::test::fixed_width_column_wrapper({0, 7, 23, 43, 63, 6, 14, 33, 45}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); + + thrust::scatter(rmm::exec_policy_nosync(stream), + input.begin(), + input.end(), + d_map.begin(), + itr); + expected = cudf::test::fixed_width_column_wrapper({0, 33, 6, 43, 7, 45, 14, 63, 23}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); + + thrust::fill(rmm::exec_policy(stream), itr, itr + input.size(), 77); + expected = cudf::test::fixed_width_column_wrapper({77, 77, 77, 77, 77, 77, 77, 77, 77}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); + + thrust::sequence(rmm::exec_policy(stream), itr, itr + input.size()); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 2, 3, 4, 5, 6, 7, 8}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); + + auto offsets = + cudf::test::fixed_width_column_wrapper({0, 10, 20, 30, 40, 50, 60, 70, 80}); + auto d_offsets = cudf::column_view(offsets); + thrust::lower_bound(rmm::exec_policy(stream), + d_offsets.begin(), + d_offsets.end(), + input.begin(), + input.end(), + itr); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 1, 2, 3, 4, 5, 5, 7}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); +} + +namespace { +/** + * For testing creating and using the offsetalator in device code. 
+ */ +struct device_functor_fn { + cudf::column_device_view const d_col; + __device__ int32_t operator()(int idx) + { + auto const itr = cudf::detail::input_offsetalator(d_col.head(), d_col.type()); + return static_cast(itr[idx] * 3); + } +}; +} // namespace + +TYPED_TEST(OffsetalatorTest, device_offsetalator) +{ + using T = TypeParam; + + auto d_col1 = cudf::test::fixed_width_column_wrapper({0, 6, 7, 14, 23, 33, 43, 45, 63}); + auto d_col2 = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 0, 0, 0, 0}); + auto input = cudf::column_view(d_col1); + auto output = cudf::mutable_column_view(d_col2); + auto stream = cudf::get_default_stream(); + + auto d_input = cudf::column_device_view::create(input, stream); + + thrust::transform(rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(input.size()), + output.begin(), + device_functor_fn{*d_input}); + + auto expected = + cudf::test::fixed_width_column_wrapper({0, 18, 21, 42, 69, 99, 129, 135, 189}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); +} From 4313cfa9b3fcff41f67b48ac8797dc015d441ecc Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 13 Nov 2023 11:40:36 -0800 Subject: [PATCH 04/36] Use new rapids-dask-dependency metapackage for managing dask versions (#14364) * Update dependency lists * Update wheel building to stop needing manual installations * Update wheel dependency with alpha spec * Rename the package * Update update-version.sh * Update conda/recipes/dask-cudf/meta.yaml Co-authored-by: GALI PREM SAGAR * Make pip/conda dependencies consistent and fix recipe * dfg * Apply suggestions from code review --------- Co-authored-by: GALI PREM SAGAR --- ci/build_wheel.sh | 2 ++ ci/release/update-version.sh | 1 + ci/test_wheel_dask_cudf.sh | 3 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 4 +--- conda/environments/all_cuda-120_arch-x86_64.yaml | 4 +--- conda/recipes/dask-cudf/meta.yaml | 8 +------- dependencies.yaml | 4 +--- python/dask_cudf/pyproject.toml | 3 +-- 8 files changed, 8 insertions(+), 21 deletions(-) diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 08716cdb3d9..ae1d9c3fb1a 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -36,6 +36,8 @@ fi if [[ ${package_name} == "dask_cudf" ]]; then sed -r -i "s/cudf==(.*)\"/cudf${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file} + sed -r -i "s/dask-cuda==(.*)\"/dask-cuda==\1${alpha_spec}\"/g" ${pyproject_file} + sed -r -i "s/rapids-dask-dependency==(.*)\"/rapids-dask-dependency==\1${alpha_spec}\"/g" ${pyproject_file} else sed -r -i "s/rmm(.*)\"/rmm${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" ${pyproject_file} # ptxcompiler and cubinlinker aren't version constrained diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 7574b4174e9..843abd3c3c1 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -80,6 +80,7 @@ DEPENDENCIES=( kvikio libkvikio librmm + rapids-dask-dependency rmm ) for DEP in "${DEPENDENCIES[@]}"; do diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index 118bea753d0..e9162b816aa 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -23,9 +23,6 @@ manylinux="manylinux_${manylinux_version}" RAPIDS_PY_WHEEL_NAME="cudf_${manylinux}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep python -m pip install --no-deps ./local-cudf-dep/cudf*.whl -# Always install latest dask for testing -python -m pip install git+https://github.com/dask/dask.git@main 
git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.12 - # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install $(echo ./dist/dask_cudf*.whl)[test] diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index adf4fcad32d..9b85888a7b3 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -25,10 +25,7 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-core>=2023.9.2 - dask-cuda==23.12.* -- dask>=2023.9.2 -- distributed>=2023.9.2 - dlpack>=0.5,<0.6.0a0 - doxygen=1.9.1 - fastavro>=0.22.9 @@ -80,6 +77,7 @@ dependencies: - python-snappy>=0.6.0 - python>=3.9,<3.11 - pytorch<1.12.0 +- rapids-dask-dependency==23.12.* - rich - rmm==23.12.* - s3fs>=2022.3.0 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index a69ef587570..da2b4e109b3 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -27,10 +27,7 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-core>=2023.9.2 - dask-cuda==23.12.* -- dask>=2023.9.2 -- distributed>=2023.9.2 - dlpack>=0.5,<0.6.0a0 - doxygen=1.9.1 - fastavro>=0.22.9 @@ -78,6 +75,7 @@ dependencies: - python-snappy>=0.6.0 - python>=3.9,<3.11 - pytorch<1.12.0 +- rapids-dask-dependency==23.12.* - rich - rmm==23.12.* - s3fs>=2022.3.0 diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index 9dc9f76d9f5..16638926492 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -37,17 +37,11 @@ build: requirements: host: - python - - cudf ={{ version }} - - dask >=2023.9.2 - - dask-core >=2023.9.2 - - distributed >=2023.9.2 - cuda-version ={{ cuda_version }} run: - python - cudf ={{ version }} - - dask >=2023.9.2 - - dask-core >=2023.9.2 - - distributed >=2023.9.2 + - rapids-dask-dependency ={{ minor_version }} - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} test: diff --git a/dependencies.yaml b/dependencies.yaml index a16b51f4483..b971a682571 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -500,12 +500,10 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - dask>=2023.9.2 - - distributed>=2023.9.2 + - rapids-dask-dependency==23.12.* - output_types: conda packages: - cupy>=12.0.0 - - dask-core>=2023.9.2 # dask-core in conda is the actual package & dask is the meta package - output_types: pyproject packages: - &cudf cudf==23.12.* diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 32c7bb9fd15..0306da3de46 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -20,11 +20,10 @@ requires-python = ">=3.9" dependencies = [ "cudf==23.12.*", "cupy-cuda11x>=12.0.0", - "dask>=2023.9.2", - "distributed>=2023.9.2", "fsspec>=0.6.0", "numpy>=1.21,<1.25", "pandas>=1.3,<1.6.0dev0", + "rapids-dask-dependency==23.12.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [ "Intended Audience :: Developers", From 5d09d38bc8ea44e1bdf1fa29e11a820c7417bac5 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 14 Nov 2023 00:51:42 -0500 Subject: [PATCH 05/36] Always build nvbench statically so we don't need to package it (#14399) Corrects failures seen in C++ CI where libnvbench.so can't be found Authors: - Robert Maynard (https://github.com/robertmaynard) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14399 --- cpp/cmake/thirdparty/get_nvbench.cmake | 2 +- cpp/cmake/thirdparty/patches/nvbench_override.json | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/cpp/cmake/thirdparty/get_nvbench.cmake b/cpp/cmake/thirdparty/get_nvbench.cmake index f0642145fa0..bbd22693ba4 100644 --- a/cpp/cmake/thirdparty/get_nvbench.cmake +++ b/cpp/cmake/thirdparty/get_nvbench.cmake @@ -21,7 +21,7 @@ function(find_and_configure_nvbench) set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches") rapids_cpm_package_override("${cudf_patch_dir}/nvbench_override.json") - rapids_cpm_nvbench() + rapids_cpm_nvbench(BUILD_STATIC) endfunction() diff --git a/cpp/cmake/thirdparty/patches/nvbench_override.json b/cpp/cmake/thirdparty/patches/nvbench_override.json index 7be868081b6..ad9b19c29c1 100644 --- a/cpp/cmake/thirdparty/patches/nvbench_override.json +++ b/cpp/cmake/thirdparty/patches/nvbench_override.json @@ -7,11 +7,6 @@ "file" : "${current_json_dir}/nvbench_global_setup.diff", "issue" : "Fix add support for global setup to initialize RMM in nvbench [https://github.com/NVIDIA/nvbench/pull/123]", "fixed_in" : "" - }, - { - "file" : "nvbench/use_existing_fmt.diff", - "issue" : "Fix add support for using an existing fmt [https://github.com/NVIDIA/nvbench/pull/125]", - "fixed_in" : "" } ] } From e982d3736f095e680298af85bde732d9b5a73122 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Tue, 14 Nov 2023 09:51:02 -0500 Subject: [PATCH 06/36] cudf.pandas: cuDF subpath checking in module `__getattr__` (#14388) Closes https://github.com/rapidsai/cudf/issues/14384. `x.startswith(y)` is not a good enough check for if `x` is a subdirectory of `y`. It causes `pandasai` to be reported as a sub-package of `pandas`. Authors: - Ashwin Srinath (https://github.com/shwina) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/14388 --- python/cudf/cudf/pandas/module_accelerator.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/pandas/module_accelerator.py b/python/cudf/cudf/pandas/module_accelerator.py index eb35c4adaaf..180d75d96e8 100644 --- a/python/cudf/cudf/pandas/module_accelerator.py +++ b/python/cudf/cudf/pandas/module_accelerator.py @@ -10,6 +10,7 @@ import importlib.abc import importlib.machinery import os +import pathlib import sys import threading import warnings @@ -554,9 +555,10 @@ def getattr_real_or_wrapped( frame = sys._getframe() # We cannot possibly be at the top level. 
assert frame.f_back - calling_module = frame.f_back.f_code.co_filename + calling_module = pathlib.PurePath(frame.f_back.f_code.co_filename) use_real = any( - calling_module.startswith(path) for path in loader._denylist + calling_module.is_relative_to(path) + for path in loader._denylist ) try: if use_real: From 7f3fba164c4dd28c701ea2941d0525fc782a639c Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Tue, 14 Nov 2023 12:02:10 -0500 Subject: [PATCH 07/36] Refactor cudf_kafka to use skbuild (#14292) Refactor the currently outdated cudf_kafka build setup to use skbuild instead. Authors: - Jeremy Dyer (https://github.com/jdye64) - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/14292 --- build.sh | 2 +- ci/release/update-version.sh | 1 + .../all_cuda-120_arch-x86_64.yaml | 1 - conda/recipes/cudf_kafka/build.sh | 13 --- .../cudf_kafka/conda_build_config.yaml | 6 ++ conda/recipes/cudf_kafka/meta.yaml | 21 ++-- cpp/libcudf_kafka/CMakeLists.txt | 8 +- .../cmake/thirdparty/get_cudf.cmake | 16 +-- cpp/libcudf_kafka/tests/CMakeLists.txt | 2 +- dependencies.yaml | 13 +-- python/cudf/cudf/_lib/CMakeLists.txt | 6 -- python/cudf_kafka/CMakeLists.txt | 47 +++++++++ python/cudf_kafka/LICENSE | 1 + python/cudf_kafka/README.md | 1 + .../cudf_kafka/cudf_kafka/_lib/CMakeLists.txt | 62 ++++++++++++ python/cudf_kafka/cudf_kafka/_lib/kafka.pxd | 4 +- python/cudf_kafka/pyproject.toml | 1 + python/cudf_kafka/setup.py | 97 ++----------------- 18 files changed, 160 insertions(+), 142 deletions(-) create mode 100644 python/cudf_kafka/CMakeLists.txt create mode 120000 python/cudf_kafka/LICENSE create mode 120000 python/cudf_kafka/README.md create mode 100644 python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt diff --git a/build.sh b/build.sh index 2ad69712e5d..e5beb51dedf 100755 --- a/build.sh +++ b/build.sh @@ -369,7 +369,7 @@ fi # build cudf_kafka Python package if hasArg cudf_kafka; then cd ${REPODIR}/python/cudf_kafka - SKBUILD_CONFIGURE_OPTIONS="-DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR}" \ + SKBUILD_CONFIGURE_OPTIONS="-DCMAKE_PREFIX_PATH=${INSTALL_PREFIX} -DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR} ${EXTRA_CMAKE_ARGS}" \ SKBUILD_BUILD_OPTIONS="-j${PARALLEL_LEVEL:-1}" \ python -m pip install --no-build-isolation --no-deps . 
fi diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 843abd3c3c1..4f1cbc47d1d 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -43,6 +43,7 @@ sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' # Python CMakeLists updates sed_runner 's/'"cudf_version .*)"'/'"cudf_version ${NEXT_FULL_TAG})"'/g' python/cudf/CMakeLists.txt +sed_runner 's/'"cudf_kafka_version .*)"'/'"cudf_kafka_version ${NEXT_FULL_TAG})"'/g' python/cudf_kafka/CMakeLists.txt # cpp libcudf_kafka update sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' cpp/libcudf_kafka/CMakeLists.txt diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index da2b4e109b3..a3eeb3dd99f 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -17,7 +17,6 @@ dependencies: - cmake>=3.26.4 - cramjam - cuda-cudart-dev -- cuda-gdb - cuda-nvcc - cuda-nvrtc-dev - cuda-nvtx-dev diff --git a/conda/recipes/cudf_kafka/build.sh b/conda/recipes/cudf_kafka/build.sh index f4bb6e1bc91..9458349d101 100644 --- a/conda/recipes/cudf_kafka/build.sh +++ b/conda/recipes/cudf_kafka/build.sh @@ -1,16 +1,3 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. -# This assumes the script is executed from the root of the repo directory -# Need to set CUDA_HOME inside conda environments because the hacked together -# setup.py for cudf-kafka searches that way. -# TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates -# cudf_kafka to use scikit-build -CUDA_MAJOR=${RAPIDS_CUDA_VERSION%%.*} -if [[ ${CUDA_MAJOR} == "12" ]]; then - target_name="x86_64-linux" - if [[ ! $(arch) == "x86_64" ]]; then - target_name="sbsa-linux" - fi - export CUDA_HOME="${PREFIX}/targets/${target_name}/" -fi ./build.sh -v cudf_kafka diff --git a/conda/recipes/cudf_kafka/conda_build_config.yaml b/conda/recipes/cudf_kafka/conda_build_config.yaml index b63a136ad2d..c98c2701653 100644 --- a/conda/recipes/cudf_kafka/conda_build_config.yaml +++ b/conda/recipes/cudf_kafka/conda_build_config.yaml @@ -9,3 +9,9 @@ sysroot_version: cmake_version: - ">=3.26.4" + +cuda_compiler: + - cuda-nvcc + +cuda11_compiler: + - nvcc diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index 9440f8bf124..343ec2519f1 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -33,28 +33,31 @@ build: - SCCACHE_S3_KEY_PREFIX=cudf-kafka-linux64 # [linux64] - SCCACHE_S3_USE_SSL - SCCACHE_S3_NO_CREDENTIALS - # TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates - # cudf_kafka to use scikit-build - - RAPIDS_CUDA_VERSION + ignore_run_exports_from: + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} + {% endif %} requirements: build: - cmake {{ cmake_version }} + - ninja - {{ compiler('c') }} - {{ compiler('cxx') }} - - ninja - - sysroot_{{ target_platform }} {{ sysroot_version }} - # TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates - # cudf_kafka to use scikit-build - {% if cuda_major == "12" %} - - cuda-gdb + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} ={{ cuda_version }} + {% else %} + - {{ compiler('cuda') }} {% endif %} + - cuda-version ={{ cuda_version }} + - sysroot_{{ target_platform }} {{ sysroot_version }} host: - python - cython >=3.0.3 - cuda-version ={{ cuda_version }} - cudf ={{ version }} - libcudf_kafka ={{ version }} + - 
scikit-build >=0.13.1 - setuptools {% if cuda_major == "12" %} - cuda-cudart-dev diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt index 1a15a3ec2cd..4128afa3935 100644 --- a/cpp/libcudf_kafka/CMakeLists.txt +++ b/cpp/libcudf_kafka/CMakeLists.txt @@ -21,7 +21,7 @@ include(rapids-export) include(rapids-find) project( - CUDA_KAFKA + CUDF_KAFKA VERSION 23.12.00 LANGUAGES CXX ) @@ -64,7 +64,7 @@ add_library(cudf_kafka SHARED src/kafka_consumer.cpp src/kafka_callback.cpp) # ################################################################################################## # * include paths --------------------------------------------------------------------------------- target_include_directories( - cudf_kafka PUBLIC "$" + cudf_kafka PUBLIC "$" "$" ) @@ -85,6 +85,8 @@ set_target_properties( CXX_STANDARD_REQUIRED ON ) +add_library(cudf_kafka::cudf_kafka ALIAS cudf_kafka) + # ################################################################################################## # * cudf_kafka Install ---------------------------------------------------------------------------- rapids_cmake_install_lib_dir(lib_dir) @@ -94,7 +96,7 @@ install( EXPORT cudf_kafka-exports ) -install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include DESTINATION include) +install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) rapids_export( INSTALL cudf_kafka diff --git a/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake b/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake index aa4c5b60e7a..20aa9873f43 100644 --- a/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake +++ b/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -35,21 +35,21 @@ function(find_and_configure_cudf VERSION) endif() endfunction() -set(CUDA_KAFKA_MIN_VERSION_cudf - "${CUDA_KAFKA_VERSION_MAJOR}.${CUDA_KAFKA_VERSION_MINOR}.${CUDA_KAFKA_VERSION_PATCH}" +set(CUDF_KAFKA_MIN_VERSION + "${CUDF_KAFKA_VERSION_MAJOR}.${CUDF_KAFKA_VERSION_MINOR}.${CUDF_KAFKA_VERSION_PATCH}" ) -find_and_configure_cudf(${CUDA_KAFKA_MIN_VERSION_cudf}) +find_and_configure_cudf(${CUDF_KAFKA_MIN_VERSION}) if(cudf_REQUIRES_CUDA) - rapids_cuda_init_architectures(CUDA_KAFKA) + rapids_cuda_init_architectures(CUDF_KAFKA) # Since we are building cudf as part of ourselves we need to enable the CUDA language in the # top-most scope enable_language(CUDA) - # Since CUDA_KAFKA only enables CUDA optionally we need to manually include the file that + # Since CUDF_KAFKA only enables CUDA optionally we need to manually include the file that # rapids_cuda_init_architectures relies on `project` calling - if(DEFINED CMAKE_PROJECT_CUDA_KAFKA_INCLUDE) - include("${CMAKE_PROJECT_CUDA_KAFKA_INCLUDE}") + if(DEFINED CMAKE_PROJECT_CUDF_KAFKA_INCLUDE) + include("${CMAKE_PROJECT_CUDF_KAFKA_INCLUDE}") endif() endif() diff --git a/cpp/libcudf_kafka/tests/CMakeLists.txt b/cpp/libcudf_kafka/tests/CMakeLists.txt index 68a5327b455..b819cb6fc3b 100644 --- a/cpp/libcudf_kafka/tests/CMakeLists.txt +++ b/cpp/libcudf_kafka/tests/CMakeLists.txt @@ -26,7 +26,7 @@ function(ConfigureTest test_name) add_executable(${test_name} ${ARGN}) set_target_properties( ${test_name} - PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" INSTALL_RPATH "\$ORIGIN/../../../lib" ) target_link_libraries( diff --git a/dependencies.yaml b/dependencies.yaml index b971a682571..97149a5e2ba 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -9,8 +9,8 @@ files: - build_all - build_cpp - build_wheels - - build_python - build_python_common + - build_python_cudf - cudatoolkit - develop - docs @@ -71,8 +71,8 @@ files: table: build-system includes: - build_all - - build_python - build_python_common + - build_python_cudf - build_wheels py_run_cudf: output: pyproject @@ -138,8 +138,8 @@ files: extras: table: build-system includes: - - build_wheels - build_python_common + - build_wheels py_run_cudf_kafka: output: pyproject pyproject_dir: python/cudf_kafka @@ -259,16 +259,16 @@ dependencies: - cython>=3.0.3 # TODO: Pin to numpy<1.25 until cudf requires pandas 2 - &numpy numpy>=1.21,<1.25 + - scikit-build>=0.13.1 - output_types: [conda, requirements, pyproject] packages: # Hard pin the patch version used during the build. This must be kept # in sync with the version pinned in get_arrow.cmake. - pyarrow==14.0.1.* - build_python: + build_python_cudf: common: - output_types: [conda, requirements, pyproject] packages: - - scikit-build>=0.13.1 - rmm==23.12.* - output_types: conda packages: @@ -302,9 +302,6 @@ dependencies: - cuda-nvrtc-dev - cuda-nvtx-dev - libcurand-dev - # TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates - # cudf_kafka to use scikit-build - - cuda-gdb - matrix: cuda: "11.8" packages: diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 1b543b94589..c041c7f4842 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -81,12 +81,6 @@ target_link_libraries(strings_udf cudf_strings_udf) # necessary. 
The relevant command is tar -xf /opt/_internal/static-libs-for-embedding-only.tar.xz -C # /opt/_internal" find_package(NumPy REQUIRED) -set(targets_using_numpy interop avro csv orc json parquet) -foreach(target IN LISTS targets_using_numpy) - target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") - # Switch to the line below when we switch back to FindPython.cmake in CMake 3.24. - # target_include_directories(${target} PRIVATE "${Python_NumPy_INCLUDE_DIRS}") -endforeach() set(targets_using_dlpack interop) foreach(target IN LISTS targets_using_dlpack) diff --git a/python/cudf_kafka/CMakeLists.txt b/python/cudf_kafka/CMakeLists.txt new file mode 100644 index 00000000000..d55c3fdc076 --- /dev/null +++ b/python/cudf_kafka/CMakeLists.txt @@ -0,0 +1,47 @@ +# ============================================================================= +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) + +set(cudf_kafka_version 23.12.00) + +include(../../fetch_rapids.cmake) + +project( + cudf-kafka-python + VERSION ${cudf_kafka_version} + LANGUAGES # TODO: Building Python extension modules via the python_extension_module requires the C + # language to be enabled here. The test project that is built in scikit-build to verify + # various linking options for the python library is hardcoded to build with C, so until + # that is fixed we need to keep C. + C CXX +) + +find_package(cudf_kafka ${cudf_kafka_version} REQUIRED) + +if(NOT cudf_kafka_FOUND) + message( + FATAL_ERROR + "cudf_kafka package not found. cudf_kafka C++ is required to build this Python package." + ) +endif() + +include(rapids-cython) +rapids_cython_init() + +add_subdirectory(cudf_kafka/_lib) + +if(DEFINED cython_lib_dir) + rapids_cython_add_rpath_entries(TARGET cudf_kafka PATHS "${cython_lib_dir}") +endif() diff --git a/python/cudf_kafka/LICENSE b/python/cudf_kafka/LICENSE new file mode 120000 index 00000000000..30cff7403da --- /dev/null +++ b/python/cudf_kafka/LICENSE @@ -0,0 +1 @@ +../../LICENSE \ No newline at end of file diff --git a/python/cudf_kafka/README.md b/python/cudf_kafka/README.md new file mode 120000 index 00000000000..fe840054137 --- /dev/null +++ b/python/cudf_kafka/README.md @@ -0,0 +1 @@ +../../README.md \ No newline at end of file diff --git a/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt b/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt new file mode 100644 index 00000000000..3262b7d5ebe --- /dev/null +++ b/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt @@ -0,0 +1,62 @@ +# ============================================================================= +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +set(cython_sources kafka.pyx) +set(linked_libraries cudf_kafka::cudf_kafka) + +rapids_cython_create_modules( + CXX ASSOCIATED_TARGETS cudf_kafka + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" +) + +# TODO: Finding NumPy currently requires finding Development due to a bug in CMake. This bug was +# fixed in https://gitlab.kitware.com/cmake/cmake/-/merge_requests/7410 and will be available in +# CMake 3.24, so we can remove the Development component once we upgrade to CMake 3.24. +# find_package(Python REQUIRED COMPONENTS Development NumPy) + +# Note: The bug noted above prevents us from finding NumPy successfully using FindPython.cmake +# inside the manylinux images used to build wheels because manylinux images do not contain +# libpython.so and therefore Development cannot be found. Until we upgrade to CMake 3.24, we should +# use FindNumpy.cmake instead (provided by scikit-build). When we switch to 3.24 we can try +# switching back, but it may not work if that implicitly still requires Python libraries. In that +# case we'll need to follow up with the CMake team to remove that dependency. The stopgap solution +# is to unpack the static lib tarballs in the wheel building jobs so that there are at least static +# libs to be found, but that should be a last resort since it implies a dependency that isn't really +# necessary. The relevant command is tar -xf /opt/_internal/static-libs-for-embedding-only.tar.xz -C +# /opt/_internal" +find_package(NumPy REQUIRED) + +find_package(Python 3.9 REQUIRED COMPONENTS Interpreter) + +execute_process( + COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_include())" + OUTPUT_VARIABLE PYARROW_INCLUDE_DIR + ERROR_VARIABLE PYARROW_ERROR + RESULT_VARIABLE PYARROW_RESULT + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +if(${PYARROW_RESULT}) + message(FATAL_ERROR "Error while trying to obtain pyarrow include directory:\n${PYARROW_ERROR}") +endif() + +# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts +# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the +# requirement for arrow headers infects all of cudf. That in turn requires including numpy headers. +# These requirements will go away once all scalar-related Cython code is removed from cudf. 
+foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") + target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") +endforeach() diff --git a/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd b/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd index ca729c62512..068837d04ee 100644 --- a/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd +++ b/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd @@ -11,12 +11,12 @@ from cudf._lib.cpp.io.datasource cimport datasource from cudf._lib.io.datasource cimport Datasource -cdef extern from "kafka_callback.hpp" \ +cdef extern from "cudf_kafka/kafka_callback.hpp" \ namespace "cudf::io::external::kafka" nogil: ctypedef object (*python_callable_type)() -cdef extern from "kafka_consumer.hpp" \ +cdef extern from "cudf_kafka/kafka_consumer.hpp" \ namespace "cudf::io::external::kafka" nogil: cpdef cppclass kafka_consumer: diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 4829f06ab09..15431161d75 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -6,6 +6,7 @@ requires = [ "cython>=3.0.3", "numpy>=1.21,<1.25", "pyarrow==14.0.1.*", + "scikit-build>=0.13.1", "setuptools", "wheel", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/cudf_kafka/setup.py b/python/cudf_kafka/setup.py index 6f3909d4528..6a99e9ed968 100644 --- a/python/cudf_kafka/setup.py +++ b/python/cudf_kafka/setup.py @@ -1,96 +1,13 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. -import os -import shutil -import sysconfig -from distutils.sysconfig import get_python_lib - -import numpy as np -import pyarrow as pa -from Cython.Build import cythonize -from setuptools import find_packages, setup -from setuptools.extension import Extension - -cython_files = ["cudf_kafka/_lib/*.pyx"] - -CUDA_HOME = os.environ.get("CUDA_HOME", False) -if not CUDA_HOME: - path_to_cuda_gdb = shutil.which("cuda-gdb") - if path_to_cuda_gdb is None: - raise OSError( - "Could not locate CUDA. " - "Please set the environment variable " - "CUDA_HOME to the path to the CUDA installation " - "and try again." 
- ) - CUDA_HOME = os.path.dirname(os.path.dirname(path_to_cuda_gdb)) - -if not os.path.isdir(CUDA_HOME): - raise OSError(f"Invalid CUDA_HOME: directory does not exist: {CUDA_HOME}") - -cuda_include_dir = os.path.join(CUDA_HOME, "include") - -CUDF_ROOT = os.environ.get( - "CUDF_ROOT", - os.path.abspath( - os.path.join( - os.path.dirname(os.path.abspath(__file__)), "../../cpp/build/" - ) - ), -) -CUDF_KAFKA_ROOT = os.environ.get( - "CUDF_KAFKA_ROOT", "../../cpp/libcudf_kafka/build" -) - -try: - nthreads = int(os.environ.get("PARALLEL_LEVEL", "0") or "0") -except Exception: - nthreads = 0 - -extensions = [ - Extension( - "*", - sources=cython_files, - include_dirs=[ - os.path.abspath(os.path.join(CUDF_ROOT, "../include/cudf")), - os.path.abspath(os.path.join(CUDF_ROOT, "../include")), - os.path.abspath( - os.path.join(CUDF_ROOT, "../libcudf_kafka/include/cudf_kafka") - ), - os.path.join(CUDF_ROOT, "include"), - os.path.join(CUDF_ROOT, "_deps/libcudacxx-src/include"), - os.path.join( - os.path.dirname(sysconfig.get_path("include")), - "rapids/libcudacxx", - ), - os.path.dirname(sysconfig.get_path("include")), - np.get_include(), - pa.get_include(), - cuda_include_dir, - ], - library_dirs=( - [ - get_python_lib(), - os.path.join(os.sys.prefix, "lib"), - CUDF_KAFKA_ROOT, - ] - ), - libraries=["cudf", "cudf_kafka"], - language="c++", - extra_compile_args=["-std=c++17", "-DFMT_HEADER_ONLY=1"], - ) -] +# Copyright (c) 2018-2023, NVIDIA CORPORATION. +from setuptools import find_packages +from skbuild import setup packages = find_packages(include=["cudf_kafka*"]) + setup( - # Include the separately-compiled shared library - ext_modules=cythonize( - extensions, - nthreads=nthreads, - compiler_directives=dict( - profile=False, language_level=3, embedsignature=True - ), - ), packages=packages, - package_data={key: ["VERSION", "*.pxd"] for key in packages}, + package_data={ + key: ["VERSION", "*.pxd", "*.hpp", "*.cuh"] for key in packages + }, zip_safe=False, ) From b0c1b7b82ccdf1a7e4159cb3bffa1984092440d4 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 14 Nov 2023 12:48:32 -0500 Subject: [PATCH 08/36] Add BytePairEncoder class to cuDF (#13891) Adds a new BytePairEncoding class to cuDF ``` >>> import cudf >>> from cudf.core.byte_pair_encoding import BytePairEncoder >>> mps = cudf.read_text('merges.txt', delimiter='\n', strip_delimiters=True) >>> bpe = BytePairEncoder(mps) >>> str_series = cudf.Series(['This is a sentence', 'thisisit']) >>> bpe(str_series) 0 This is a sent ence 1 this is it dtype: object ``` This class wraps the existing `nvtext::byte_pair_encoding` APIs to load the merge-pairs data and encode a column of strings. 
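For reference, the separator between the encoded pieces is configurable as well. A small usage sketch of the new `BytePairEncoder`; the merge pairs, input, and expected result here are taken from the pytest added in this change, so only the doctest formatting is assumed:
```
>>> import cudf
>>> from cudf.core.byte_pair_encoding import BytePairEncoder
>>> pairs = cudf.Series(["t he", "h e", "e n", "i t", "i s", "e s", "en t",
...                      "c e", "es t", "en ce", "t h", "h i", "th is",
...                      "t est", "s i", "s ent"])
>>> bpe = BytePairEncoder(pairs)
>>> bpe(cudf.Series(["sentenceistest"]), separator="_")
0    sent_ence_is_test
dtype: object
```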
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13891 --- .../cudf/_lib/cpp/nvtext/byte_pair_encode.pxd | 24 ++++++++ python/cudf/cudf/_lib/nvtext/CMakeLists.txt | 4 +- .../cudf/_lib/nvtext/byte_pair_encode.pyx | 50 ++++++++++++++++ python/cudf/cudf/core/byte_pair_encoding.py | 59 +++++++++++++++++++ .../cudf/cudf/tests/text/test_text_methods.py | 41 +++++++++++++ 5 files changed, 176 insertions(+), 2 deletions(-) create mode 100644 python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd create mode 100644 python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx create mode 100644 python/cudf/cudf/core/byte_pair_encoding.py diff --git a/python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd b/python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd new file mode 100644 index 00000000000..e678e4e84db --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd @@ -0,0 +1,24 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string + +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.scalar.scalar cimport string_scalar + + +cdef extern from "nvtext/byte_pair_encoding.hpp" namespace "nvtext" nogil: + + cdef struct bpe_merge_pairs "nvtext::bpe_merge_pairs": + pass + + cdef unique_ptr[bpe_merge_pairs] load_merge_pairs( + const column_view &merge_pairs + ) except + + + cdef unique_ptr[column] byte_pair_encoding( + const column_view &strings, + const bpe_merge_pairs &merge_pairs, + const string_scalar &separator + ) except + diff --git a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt index d4e2392ee04..d7cbdeb5bda 100644 --- a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt +++ b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt @@ -13,8 +13,8 @@ # ============================================================================= set(cython_sources - edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx ngrams_tokenize.pyx normalize.pyx - replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx + byte_pair_encode.pyx edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx + ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx ) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx new file mode 100644 index 00000000000..cfc76afa8a5 --- /dev/null +++ b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx @@ -0,0 +1,50 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. 
+ + +from cudf.core.buffer import acquire_spill_lock + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.column cimport Column +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.nvtext.byte_pair_encode cimport ( + bpe_merge_pairs as cpp_bpe_merge_pairs, + byte_pair_encoding as cpp_byte_pair_encoding, + load_merge_pairs as cpp_load_merge_pairs, +) +from cudf._lib.cpp.scalar.scalar cimport string_scalar +from cudf._lib.scalar cimport DeviceScalar + + +cdef class BPEMergePairs: + cdef unique_ptr[cpp_bpe_merge_pairs] c_obj + + def __cinit__(self, Column merge_pairs): + cdef column_view c_pairs = merge_pairs.view() + with nogil: + self.c_obj = move(cpp_load_merge_pairs(c_pairs)) + + +@acquire_spill_lock() +def byte_pair_encoding( + Column strings, + BPEMergePairs merge_pairs, + object separator +): + cdef column_view c_strings = strings.view() + cdef DeviceScalar d_separator = separator.device_value + cdef const string_scalar* c_separator = d_separator\ + .get_raw_ptr() + cdef unique_ptr[column] c_result + with nogil: + c_result = move( + cpp_byte_pair_encoding( + c_strings, + merge_pairs.c_obj.get()[0], + c_separator[0] + ) + ) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py new file mode 100644 index 00000000000..4c881022ecf --- /dev/null +++ b/python/cudf/cudf/core/byte_pair_encoding.py @@ -0,0 +1,59 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from __future__ import annotations + +import cudf +from cudf._lib.nvtext.byte_pair_encode import ( + BPEMergePairs as cpp_merge_pairs, + byte_pair_encoding as cpp_byte_pair_encoding, +) + + +class BytePairEncoder: + """ + Given a merge pairs strings series, performs byte pair encoding on + a strings series using the provided separator. + + Parameters + ---------- + merges_pairs : str + Strings column of merge pairs + + Returns + ------- + BytePairEncoder + """ + + def __init__(self, merges_pair: "cudf.Series"): + self.merge_pairs = cpp_merge_pairs(merges_pair._column) + + def __call__(self, text, separator: str = " "): + """ + + Parameters + ---------- + text : cudf string series + The strings to be encoded. + + Returns + ------- + Encoded strings + + Examples + -------- + >>> import cudf + >>> from cudf.core.byte_pair_encoding import BytePairEncoder + >>> mps = cudf.Series(["e n", "i t", "i s", "e s", "en t", + ... "c e", "es t", "en ce", "T h", "Th is", + ... 
"t est", "s ent", "t h", "th is"]) + >>> bpe = BytePairEncoder(mps) + >>> str_series = cudf.Series(['This is the sentence', 'thisisit']) + >>> bpe(str_series) + 0 This is a sent ence + 1 this is it + dtype: object + """ + sep = cudf.Scalar(separator, dtype="str") + result = cpp_byte_pair_encoding(text._column, self.merge_pairs, sep) + + return cudf.Series(result) diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index e565df8f3da..2dccd583b23 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -7,6 +7,7 @@ import pytest import cudf +from cudf.core.byte_pair_encoding import BytePairEncoder from cudf.core.tokenize_vocabulary import TokenizeVocabulary from cudf.testing._utils import assert_eq @@ -1024,3 +1025,43 @@ def test_jaccard_index_random_strings(): actual = str1.str.jaccard_index(str2, jaccard_width) assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "separator, input, results", + [ + (" ", "thetestsentence", "the test sent ence"), + ("_", "sentenceistest", "sent_ence_is_test"), + ("$", "istestsentencehere", "is$test$sent$ence$he$r$e"), + ], +) +def test_byte_pair_encoding(separator, input, results): + pairs_table = cudf.Series( + [ + "t he", + "h e", + "e n", + "i t", + "i s", + "e s", + "en t", + "c e", + "es t", + "en ce", + "t h", + "h i", + "th is", + "t est", + "s i", + "s ent", + ] + ) + encoder = BytePairEncoder(pairs_table) + + strings = cudf.Series([input, None, "", input]) + + expected = cudf.Series([results, None, "", results]) + + actual = encoder(strings, separator) + assert type(expected) == type(actual) + assert_eq(expected, actual) From b446a6f187241e765c925da1053ece2679313a06 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 14 Nov 2023 12:49:19 -0500 Subject: [PATCH 09/36] Fix token-count logic in nvtext::tokenize_with_vocabulary (#14393) Fixes a bug introduced in #14336 when trying to simplify the token-counting logic as per this discussion https://github.com/rapidsai/cudf/pull/14336#discussion_r1378173552 The simplification caused an error which was found when running the nvtext benchmarks. The appropriate gtest has been updated to cover this case now. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/14393 --- cpp/benchmarks/text/vocab.cpp | 2 +- cpp/src/text/vocabulary_tokenize.cu | 8 ++++++-- cpp/tests/text/tokenize_tests.cpp | 12 ++++++------ 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/cpp/benchmarks/text/vocab.cpp b/cpp/benchmarks/text/vocab.cpp index 6922b7214ff..80942e2697d 100644 --- a/cpp/benchmarks/text/vocab.cpp +++ b/cpp/benchmarks/text/vocab.cpp @@ -53,7 +53,7 @@ static void bench_vocab_tokenize(nvbench::state& state) auto const vocab_col = [] { data_profile const profile = data_profile_builder().no_validity().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, 5); + cudf::type_id::STRING, distribution_id::NORMAL, 0, 15); auto const col = create_random_column(cudf::type_id::STRING, row_count{100}, profile); return cudf::strings::filter_characters_of_type( cudf::strings_column_view(col->view()), diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu index 41f8c0a8731..511f1995374 100644 --- a/cpp/src/text/vocabulary_tokenize.cu +++ b/cpp/src/text/vocabulary_tokenize.cu @@ -276,8 +276,12 @@ __global__ void token_counts_fn(cudf::column_device_view const d_strings, __syncwarp(); for (auto itr = d_output + lane_idx + 1; itr < d_output_end; itr += cudf::detail::warp_size) { - // add one if at the edge of a token or at the string's end - count += ((*itr && !(*(itr - 1))) || (itr + 1 == d_output_end)); + // add one if at the edge of a token or if at the string's end + if (*itr) { + count += !(*(itr - 1)); + } else { + count += (itr + 1 == d_output_end); + } } __syncwarp(); diff --git a/cpp/tests/text/tokenize_tests.cpp b/cpp/tests/text/tokenize_tests.cpp index 8118183a458..ea36e13de6f 100644 --- a/cpp/tests/text/tokenize_tests.cpp +++ b/cpp/tests/text/tokenize_tests.cpp @@ -246,14 +246,14 @@ TEST_F(TextTokenizeTest, Vocabulary) TEST_F(TextTokenizeTest, VocabularyLongStrings) { - cudf::test::strings_column_wrapper vocabulary( // leaving out 'cat' on purpose + cudf::test::strings_column_wrapper vocabulary( {"ate", "chased", "cheese", "dog", "fox", "jumped", "mouse", "mousé", "over", "the"}); auto vocab = nvtext::load_vocabulary(cudf::strings_column_view(vocabulary)); std::vector h_strings( 4, "the fox jumped chased the dog cheese mouse at the over there dog mouse cat plus the horse " - "jumped over the mouse house with the dog"); + "jumped over the mousé house with the dog "); cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end()); auto input_view = cudf::strings_column_view(input); auto delimiter = cudf::string_scalar(" "); @@ -262,10 +262,10 @@ TEST_F(TextTokenizeTest, VocabularyLongStrings) using LCW = cudf::test::lists_column_wrapper; // clang-format off - LCW expected({LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3}, - LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3}, - LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3}, - LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3}}); + LCW expected({LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3}, + LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3}, + LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 
9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3}, + LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3}}); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); From 8106a0c3d2050786f42152a280bd9315b897379e Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 14 Nov 2023 16:03:54 -0600 Subject: [PATCH 10/36] Cleanup remaining usages of dask dependencies (#14407) This PR switches remaining usages of `dask` dependencies to use `rapids-dask-dependency` Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - Jake Awe (https://github.com/AyodeAwe) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14407 --- conda/recipes/custreamz/meta.yaml | 4 +--- conda/recipes/dask-cudf/run_test.sh | 36 ----------------------------- 2 files changed, 1 insertion(+), 39 deletions(-) delete mode 100644 conda/recipes/dask-cudf/run_test.sh diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index fb6efabffd4..b8c5918ea60 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -45,9 +45,7 @@ requirements: - streamz - cudf ={{ version }} - cudf_kafka ={{ version }} - - dask >=2023.9.2 - - dask-core >=2023.9.2 - - distributed >=2023.9.2 + - rapids-dask-dependency ={{ version }} - python-confluent-kafka >=1.9.0,<1.10.0a0 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} diff --git a/conda/recipes/dask-cudf/run_test.sh b/conda/recipes/dask-cudf/run_test.sh deleted file mode 100644 index e7238d00f2b..00000000000 --- a/conda/recipes/dask-cudf/run_test.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash -# Copyright (c) 2020-2023, NVIDIA CORPORATION. - -set -e - -# Logger function for build status output -function logger() { - echo -e "\n>>>> $@\n" -} - -# Importing cudf on arm64 CPU only nodes is currently not working due to a -# difference in reported gpu devices between arm64 and amd64 -ARCH=$(arch) - -if [ "${ARCH}" = "aarch64" ]; then - logger "Skipping tests on arm64" - exit 0 -fi - -# Dask & Distributed option to install main(nightly) or `conda-forge` packages. -export INSTALL_DASK_MAIN=1 - -# Dask version to install when `INSTALL_DASK_MAIN=0` -export DASK_STABLE_VERSION="2023.9.2" - -# Install the conda-forge or nightly version of dask and distributed -if [[ "${INSTALL_DASK_MAIN}" == 1 ]]; then - rapids-logger "rapids-mamba-retry install -c dask/label/dev 'dask/label/dev::dask' 'dask/label/dev::distributed'" - rapids-mamba-retry install -c dask/label/dev "dask/label/dev::dask" "dask/label/dev::distributed" -else - rapids-logger "rapids-mamba-retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall" - rapids-mamba-retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall -fi - -logger "python -c 'import dask_cudf'" -python -c "import dask_cudf" From 27b052d01ebdfd3690b90588971817423614acc0 Mon Sep 17 00:00:00 2001 From: shrshi Date: Tue, 14 Nov 2023 14:39:07 -0800 Subject: [PATCH 11/36] Added streams to CSV reader and writer api (#14340) This PR contributes to https://github.com/rapidsai/cudf/issues/13744. 
-Added stream parameters to public APIs `cudf::io::read_csv` `cudf::io::write_csv` -Added stream gtests Authors: - https://github.com/shrshi - Karthikeyan (https://github.com/karthikeyann) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Vukasin Milovanovic (https://github.com/vuule) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/14340 --- cpp/include/cudf/io/csv.hpp | 4 + cpp/include/cudf/io/detail/csv.hpp | 1 - cpp/include/cudf_test/column_wrapper.hpp | 16 ++-- cpp/src/io/csv/writer_impl.cu | 38 +++++---- cpp/src/io/functions.cpp | 12 ++- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/io/csv_test.cpp | 102 +++++++++++++++++++++++ 7 files changed, 150 insertions(+), 24 deletions(-) create mode 100644 cpp/tests/streams/io/csv_test.cpp diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index ac885c54356..435583e805d 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -1307,6 +1307,7 @@ class csv_reader_options_builder { * @endcode * * @param options Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the table in the returned * table_with_metadata * @@ -1314,6 +1315,7 @@ class csv_reader_options_builder { */ table_with_metadata read_csv( csv_reader_options options, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group @@ -1715,9 +1717,11 @@ class csv_writer_options_builder { * @endcode * * @param options Settings for controlling writing behavior + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ void write_csv(csv_writer_options const& options, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp index 9fdc7a47fb9..40ddcf385b0 100644 --- a/cpp/include/cudf/io/detail/csv.hpp +++ b/cpp/include/cudf/io/detail/csv.hpp @@ -17,7 +17,6 @@ #pragma once #include -#include #include diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index e94dfea9dcf..b9f2e0d9868 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -803,7 +803,8 @@ class strings_column_wrapper : public detail::column_wrapper { offsets, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_bitmask = cudf::detail::make_device_uvector_sync( null_mask, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); - wrapped = cudf::make_strings_column(d_chars, d_offsets, d_bitmask, null_count); + wrapped = cudf::make_strings_column( + d_chars, d_offsets, d_bitmask, null_count, cudf::test::get_default_stream()); } /** @@ -1846,7 +1847,8 @@ class structs_column_wrapper : public detail::column_wrapper { child_column_wrappers.end(), std::back_inserter(child_columns), [&](auto const& column_wrapper) { - return std::make_unique(column_wrapper.get()); + return std::make_unique(column_wrapper.get(), + cudf::test::get_default_stream()); }); init(std::move(child_columns), validity); } @@ -1882,7 +1884,8 @@ class 
structs_column_wrapper : public detail::column_wrapper { child_column_wrappers.end(), std::back_inserter(child_columns), [&](auto const& column_wrapper) { - return std::make_unique(column_wrapper.get()); + return std::make_unique(column_wrapper.get(), + cudf::test::get_default_stream()); }); init(std::move(child_columns), validity_iter); } @@ -1906,8 +1909,11 @@ class structs_column_wrapper : public detail::column_wrapper { return cudf::test::detail::make_null_mask(validity.begin(), validity.end()); }(); - wrapped = cudf::make_structs_column( - num_rows, std::move(child_columns), null_count, std::move(null_mask)); + wrapped = cudf::make_structs_column(num_rows, + std::move(child_columns), + null_count, + std::move(null_mask), + cudf::test::get_default_stream()); } template diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index 8c586306ad5..6e9c634804c 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -146,6 +146,12 @@ struct column_to_strings_fn { { } + ~column_to_strings_fn() = default; + column_to_strings_fn(column_to_strings_fn const&) = delete; + column_to_strings_fn& operator=(column_to_strings_fn const&) = delete; + column_to_strings_fn(column_to_strings_fn&&) = delete; + column_to_strings_fn& operator=(column_to_strings_fn&&) = delete; + // Note: `null` replacement with `na_rep` deferred to `concatenate()` // instead of column-wise; might be faster // @@ -160,8 +166,9 @@ struct column_to_strings_fn { std::enable_if_t, std::unique_ptr> operator()( column_view const& column) const { - return cudf::strings::detail::from_booleans( - column, options_.get_true_value(), options_.get_false_value(), stream_, mr_); + string_scalar true_string{options_.get_true_value(), true, stream_}; + string_scalar false_string{options_.get_false_value(), true, stream_}; + return cudf::strings::detail::from_booleans(column, true_string, false_string, stream_, mr_); } // strings: @@ -367,10 +374,10 @@ void write_chunked(data_sink* out_sink, CUDF_EXPECTS(str_column_view.size() > 0, "Unexpected empty strings column."); - cudf::string_scalar newline{options.get_line_terminator()}; + cudf::string_scalar newline{options.get_line_terminator(), true, stream}; auto p_str_col_w_nl = cudf::strings::detail::join_strings(str_column_view, newline, - string_scalar("", false), + string_scalar{"", false, stream}, stream, rmm::mr::get_current_device_resource()); strings_column_view strings_column{p_str_col_w_nl->view()}; @@ -455,12 +462,14 @@ void write_csv(data_sink* out_sink, // populate vector of string-converted columns: // - std::transform(sub_view.begin(), - sub_view.end(), - std::back_inserter(str_column_vec), - [converter](auto const& current_col) { - return cudf::type_dispatcher(current_col.type(), converter, current_col); - }); + std::transform( + sub_view.begin(), + sub_view.end(), + std::back_inserter(str_column_vec), + [&converter = std::as_const(converter)](auto const& current_col) { + return cudf::type_dispatcher( + current_col.type(), converter, current_col); + }); // create string table view from str_column_vec: // @@ -470,18 +479,19 @@ void write_csv(data_sink* out_sink, // concatenate columns in each row into one big string column // (using null representation and delimiter): // - std::string delimiter_str{options.get_inter_column_delimiter()}; auto str_concat_col = [&] { + cudf::string_scalar delimiter_str{ + std::string{options.get_inter_column_delimiter()}, true, stream}; + cudf::string_scalar options_narep{options.get_na_rep(), true, stream}; if 
(str_table_view.num_columns() > 1) return cudf::strings::detail::concatenate(str_table_view, delimiter_str, - options.get_na_rep(), + options_narep, strings::separator_on_nulls::YES, stream, rmm::mr::get_current_device_resource()); - cudf::string_scalar narep{options.get_na_rep()}; return cudf::strings::detail::replace_nulls( - str_table_view.column(0), narep, stream, rmm::mr::get_current_device_resource()); + str_table_view.column(0), options_narep, stream, rmm::mr::get_current_device_resource()); }(); write_chunked(out_sink, str_concat_col->view(), options, stream, mr); diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 00d56008611..964e40e36cd 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -230,7 +230,9 @@ void write_json(json_writer_options const& options, mr); } -table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_resource* mr) +table_with_metadata read_csv(csv_reader_options options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); @@ -245,12 +247,14 @@ table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_ return cudf::io::detail::csv::read_csv( // std::move(datasources[0]), options, - cudf::get_default_stream(), + stream, mr); } // Freeform API wraps the detail writer class API -void write_csv(csv_writer_options const& options, rmm::mr::device_memory_resource* mr) +void write_csv(csv_writer_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using namespace cudf::io::detail; @@ -262,7 +266,7 @@ void write_csv(csv_writer_options const& options, rmm::mr::device_memory_resourc options.get_table(), options.get_names(), options, - cudf::get_default_stream(), + stream, mr); } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 7b628649051..1be8566fb0f 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -635,6 +635,7 @@ ConfigureTest( ConfigureTest(STREAM_BINARYOP_TEST streams/binaryop_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_COPYING_TEST streams/copying_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_CSVIO_TEST streams/io/csv_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing) diff --git a/cpp/tests/streams/io/csv_test.cpp b/cpp/tests/streams/io/csv_test.cpp new file mode 100644 index 00000000000..88514fa412c --- /dev/null +++ b/cpp/tests/streams/io/csv_test.cpp @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +auto const temp_env = static_cast( + ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); + +class CSVTest : public cudf::test::BaseFixture {}; + +TEST_F(CSVTest, CSVWriter) +{ + constexpr auto num_rows = 10; + + std::vector zeros(num_rows, 0); + std::vector ones(num_rows, 1); + auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return numeric::decimal128{ones[i], numeric::scale_type{12}}; + }); + auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return numeric::decimal128{ones[i], numeric::scale_type{-12}}; + }); + + cudf::test::fixed_width_column_wrapper col0(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col1(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col2(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col3(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col4(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col5(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col6(col6_data, col6_data + num_rows); + cudf::test::fixed_width_column_wrapper col7(col7_data, col7_data + num_rows); + + std::vector col8_data(num_rows, "rapids"); + cudf::test::strings_column_wrapper col8(col8_data.begin(), col8_data.end()); + + cudf::table_view tab({col0, col1, col2, col3, col4, col5, col6, col7, col8}); + + auto const filepath = temp_env->get_temp_dir() + "multicolumn.csv"; + auto w_options = cudf::io::csv_writer_options::builder(cudf::io::sink_info{filepath}, tab) + .include_header(false) + .inter_column_delimiter(','); + cudf::io::write_csv(w_options.build(), cudf::test::get_default_stream()); +} + +TEST_F(CSVTest, CSVReader) +{ + constexpr auto num_rows = 10; + + std::vector zeros(num_rows, 0); + std::vector ones(num_rows, 1); + auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return numeric::decimal128{ones[i], numeric::scale_type{12}}; + }); + auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return numeric::decimal128{ones[i], numeric::scale_type{-12}}; + }); + + cudf::test::fixed_width_column_wrapper col0(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col1(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col2(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col3(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col4(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col5(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col6(col6_data, col6_data + num_rows); + cudf::test::fixed_width_column_wrapper col7(col7_data, col7_data + num_rows); + + std::vector col8_data(num_rows, "rapids"); + cudf::test::strings_column_wrapper col8(col8_data.begin(), col8_data.end()); + + cudf::table_view tab({col0, col1, col2, col3, col4, col5, col6, col7, col8}); + + auto const filepath = temp_env->get_temp_dir() + "multicolumn.csv"; + auto w_options = cudf::io::csv_writer_options::builder(cudf::io::sink_info{filepath}, tab) + .include_header(false) + .inter_column_delimiter(','); + cudf::io::write_csv(w_options.build(), cudf::test::get_default_stream()); +} From 330d389b26a05676d9f079503a3d96b571762337 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 14 Nov 2023 17:56:55 -0500 
Subject: [PATCH 12/36] Ensure nvbench initializes nvml context when built statically (#14411) Port https://github.com/NVIDIA/nvbench/pull/148 to cudf so that nvbench benchmarks work now that we always use a static version of nvbench. Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14411 --- cpp/cmake/thirdparty/patches/nvbench_override.json | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cpp/cmake/thirdparty/patches/nvbench_override.json b/cpp/cmake/thirdparty/patches/nvbench_override.json index ad9b19c29c1..f85bdb9486c 100644 --- a/cpp/cmake/thirdparty/patches/nvbench_override.json +++ b/cpp/cmake/thirdparty/patches/nvbench_override.json @@ -7,6 +7,11 @@ "file" : "${current_json_dir}/nvbench_global_setup.diff", "issue" : "Fix add support for global setup to initialize RMM in nvbench [https://github.com/NVIDIA/nvbench/pull/123]", "fixed_in" : "" + }, + { + "file" : "nvbench/nvml_with_static_builds.diff", + "issue" : "Add support for nvml with static nvbench [https://github.com/NVIDIA/nvbench/pull/148]", + "fixed_in" : "" } ] } From 8a0a08f34ff804a7329ea640aa1e0a9b188d2162 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 14 Nov 2023 17:55:16 -1000 Subject: [PATCH 13/36] Fix as_column(pd.Timestamp/Timedelta, length=) not respecting length (#14390) Noticed this while trying to clean up `as_column` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14390 --- python/cudf/cudf/core/column/column.py | 5 ++++- python/cudf/cudf/tests/test_column.py | 13 ++++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index a5e99abd79e..b4f65693d85 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2102,7 +2102,10 @@ def as_column( elif isinstance(arbitrary, (pd.Timestamp, pd.Timedelta)): # This will always treat NaTs as nulls since it's not technically a # discrete value like NaN - data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True)) + length = length or 1 + data = as_column( + pa.array(pd.Series([arbitrary] * length), from_pandas=True) + ) if dtype is not None: data = data.astype(dtype) diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index db0446d506c..0546638f388 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -193,12 +193,15 @@ def test_column_mixed_dtype(data, error): @pytest.mark.parametrize("nan_as_null", [True, False]) -def test_as_column_scalar_with_nan(nan_as_null): - size = 10 - scalar = np.nan - +@pytest.mark.parametrize( + "scalar", + [np.nan, pd.Timedelta(days=1), pd.Timestamp(2020, 1, 1)], + ids=repr, +) +@pytest.mark.parametrize("size", [1, 10]) +def test_as_column_scalar_with_nan(nan_as_null, scalar, size): expected = ( - cudf.Series([np.nan] * size, nan_as_null=nan_as_null) + cudf.Series([scalar] * size, nan_as_null=nan_as_null) .dropna() .to_numpy() ) From ab2248ea8e693143823d02bb8b806c65bfc3bf32 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 14 Nov 2023 23:30:51 -0800 Subject: [PATCH 14/36] Fix and disable encoding for nanosecond statistics in ORC writer (#14367) Issue 
https://github.com/rapidsai/cudf/issues/14325 Use uint when reading/writing nano stats because nanoseconds have int32 encoding (different from both unit32 and sint32, _obviously_), which does not use zigzag. sint32 uses zigzag, and unit32 does not allow negative numbers, so we can use uint since we'll never have negative nanoseconds. Also disabled the nanoseconds because it should only be written after ORC-135; we don't write the version so readers get confused if nanoseconds are there. Planning to re-enable once we start writing the version. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14367 --- cpp/include/cudf/io/orc_metadata.hpp | 8 +++---- cpp/src/io/orc/orc.cpp | 13 +++++++++++ cpp/src/io/orc/orc.hpp | 6 +++++ cpp/src/io/orc/stats_enc.cu | 35 +++++++++++++++++++++------- cpp/tests/io/orc_test.cpp | 16 +++++++++---- 5 files changed, 61 insertions(+), 17 deletions(-) diff --git a/cpp/include/cudf/io/orc_metadata.hpp b/cpp/include/cudf/io/orc_metadata.hpp index 82d59803c25..9531a012e49 100644 --- a/cpp/include/cudf/io/orc_metadata.hpp +++ b/cpp/include/cudf/io/orc_metadata.hpp @@ -141,10 +141,10 @@ using binary_statistics = sum_statistics; * the UNIX epoch. The `minimum_utc` and `maximum_utc` are the same values adjusted to UTC. */ struct timestamp_statistics : minmax_statistics { - std::optional minimum_utc; ///< minimum in milliseconds - std::optional maximum_utc; ///< maximum in milliseconds - std::optional minimum_nanos; ///< nanoseconds part of the minimum - std::optional maximum_nanos; ///< nanoseconds part of the maximum + std::optional minimum_utc; ///< minimum in milliseconds + std::optional maximum_utc; ///< maximum in milliseconds + std::optional minimum_nanos; ///< nanoseconds part of the minimum + std::optional maximum_nanos; ///< nanoseconds part of the maximum }; namespace orc { diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp index bc399b75ef9..ee5fa4e8b5a 100644 --- a/cpp/src/io/orc/orc.cpp +++ b/cpp/src/io/orc/orc.cpp @@ -182,6 +182,19 @@ void ProtobufReader::read(timestamp_statistics& s, size_t maxlen) field_reader(5, s.minimum_nanos), field_reader(6, s.maximum_nanos)); function_builder(s, maxlen, op); + + // Adjust nanoseconds because they are encoded as (value + 1) + // Range [1, 1000'000] is translated here to [0, 999'999] + if (s.minimum_nanos.has_value()) { + auto& min_nanos = s.minimum_nanos.value(); + CUDF_EXPECTS(min_nanos >= 1 and min_nanos <= 1000'000, "Invalid minimum nanoseconds"); + --min_nanos; + } + if (s.maximum_nanos.has_value()) { + auto& max_nanos = s.maximum_nanos.value(); + CUDF_EXPECTS(max_nanos >= 1 and max_nanos <= 1000'000, "Invalid maximum nanoseconds"); + --max_nanos; + } } void ProtobufReader::read(column_statistics& s, size_t maxlen) diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index 6f65e384d2d..783ed4206b6 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -41,6 +41,12 @@ static constexpr uint32_t block_header_size = 3; // Seconds from January 1st, 1970 to January 1st, 2015 static constexpr int64_t orc_utc_epoch = 1420070400; +// Used for the nanosecond remainder in timestamp statistics when the actual nanoseconds of min/max +// are not included. 
As the timestamp statistics are stored as milliseconds + nanosecond remainder, +// the maximum nanosecond remainder is 999,999 (nanoseconds in a millisecond - 1). +static constexpr int32_t DEFAULT_MIN_NANOS = 0; +static constexpr int32_t DEFAULT_MAX_NANOS = 999'999; + struct PostScript { uint64_t footerLength = 0; // the length of the footer section in bytes CompressionKind compression = NONE; // the kind of generic compression used diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index 479a2dfada3..429fd5b929d 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -27,6 +27,10 @@ namespace cudf::io::orc::gpu { using strings::detail::fixed_point_string_size; +// Nanosecond statistics should not be enabled until the spec version is set correctly in the output +// files. See https://github.com/rapidsai/cudf/issues/14325 for more details +constexpr bool enable_nanosecond_statistics = false; + constexpr unsigned int init_threads_per_group = 32; constexpr unsigned int init_groups_per_block = 4; constexpr unsigned int init_threads_per_block = init_threads_per_group * init_groups_per_block; @@ -96,8 +100,10 @@ __global__ void __launch_bounds__(block_size, 1) stats_len = pb_fldlen_common + pb_fld_hdrlen + 2 * (pb_fld_hdrlen + pb_fldlen_int64); break; case dtype_timestamp64: - stats_len = pb_fldlen_common + pb_fld_hdrlen + 4 * (pb_fld_hdrlen + pb_fldlen_int64) + - 2 * (pb_fld_hdrlen + pb_fldlen_int32); + stats_len = pb_fldlen_common + pb_fld_hdrlen + 4 * (pb_fld_hdrlen + pb_fldlen_int64); + if constexpr (enable_nanosecond_statistics) { + stats_len += 2 * (pb_fld_hdrlen + pb_fldlen_int32); + } break; case dtype_float32: case dtype_float64: @@ -405,7 +411,8 @@ __global__ void __launch_bounds__(encode_threads_per_block) // optional sint64 minimumUtc = 3; // min,max values saved as milliseconds since UNIX epoch // optional sint64 maximumUtc = 4; // optional int32 minimumNanos = 5; // lower 6 TS digits for min/max to achieve nanosecond - // precision optional int32 maximumNanos = 6; + // precision + // optional int32 maximumNanos = 6; // } if (s->chunk.has_minmax) { cur[0] = 9 * 8 + ProtofType::FIXEDLEN; @@ -416,12 +423,22 @@ __global__ void __launch_bounds__(encode_threads_per_block) split_nanosecond_timestamp(s->chunk.max_value.i_val); // minimum/maximum are the same as minimumUtc/maximumUtc as we always write files in UTC - cur = pb_put_int(cur, 1, min_ms); // minimum - cur = pb_put_int(cur, 2, max_ms); // maximum - cur = pb_put_int(cur, 3, min_ms); // minimumUtc - cur = pb_put_int(cur, 4, max_ms); // maximumUtc - cur = pb_put_int(cur, 5, min_ns_remainder); // minimumNanos - cur = pb_put_int(cur, 6, max_ns_remainder); // maximumNanos + cur = pb_put_int(cur, 1, min_ms); // minimum + cur = pb_put_int(cur, 2, max_ms); // maximum + cur = pb_put_int(cur, 3, min_ms); // minimumUtc + cur = pb_put_int(cur, 4, max_ms); // maximumUtc + + if constexpr (enable_nanosecond_statistics) { + if (min_ns_remainder != DEFAULT_MIN_NANOS) { + // using uint because positive values are not zigzag encoded + cur = pb_put_uint(cur, 5, min_ns_remainder + 1); // minimumNanos + } + if (max_ns_remainder != DEFAULT_MAX_NANOS) { + // using uint because positive values are not zigzag encoded + cur = pb_put_uint(cur, 6, max_ns_remainder + 1); // maximumNanos + } + } + fld_start[1] = cur - (fld_start + 2); } break; diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 234716749ff..dca3886db14 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ 
-1054,8 +1054,12 @@ TEST_F(OrcStatisticsTest, Basic) EXPECT_EQ(*ts4.maximum, 3); EXPECT_EQ(*ts4.minimum_utc, -4); EXPECT_EQ(*ts4.maximum_utc, 3); - EXPECT_EQ(*ts4.minimum_nanos, 999994); - EXPECT_EQ(*ts4.maximum_nanos, 6); + // nanosecond precision can't be included until we write a writer version that includes ORC-135 + // see https://github.com/rapidsai/cudf/issues/14325 + // EXPECT_EQ(*ts4.minimum_nanos, 999994); + EXPECT_FALSE(ts4.minimum_nanos.has_value()); + // EXPECT_EQ(*ts4.maximum_nanos, 6); + EXPECT_FALSE(ts4.maximum_nanos.has_value()); auto& s5 = stats[5]; EXPECT_EQ(*s5.number_of_values, 4ul); @@ -1065,8 +1069,12 @@ TEST_F(OrcStatisticsTest, Basic) EXPECT_EQ(*ts5.maximum, 3000); EXPECT_EQ(*ts5.minimum_utc, -3001); EXPECT_EQ(*ts5.maximum_utc, 3000); - EXPECT_EQ(*ts5.minimum_nanos, 994000); - EXPECT_EQ(*ts5.maximum_nanos, 6000); + // nanosecond precision can't be included until we write a writer version that includes ORC-135 + // see https://github.com/rapidsai/cudf/issues/14325 + // EXPECT_EQ(*ts5.minimum_nanos, 994000); + EXPECT_FALSE(ts5.minimum_nanos.has_value()); + // EXPECT_EQ(*ts5.maximum_nanos, 6000); + EXPECT_FALSE(ts5.maximum_nanos.has_value()); auto& s6 = stats[6]; EXPECT_EQ(*s6.number_of_values, 4ul); From 8deb3dd7573000e7d87f18a9e2bbe39cf2932e10 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 15 Nov 2023 07:56:37 -0600 Subject: [PATCH 15/36] Raise error in `reindex` when `index` is not unique (#14400) Fixes: #14398 This PR raises an error in `reindex` API when reindexing is performed on a non-unique index column. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/14400 --- python/cudf/cudf/core/indexed_frame.py | 4 ++++ python/cudf/cudf/tests/test_dataframe.py | 12 ++++++++++++ python/cudf/cudf/tests/test_series.py | 12 ++++++++++++ python/dask_cudf/dask_cudf/backends.py | 13 ++++--------- 4 files changed, 32 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 376bef6d0b2..4211a8c24bf 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -2607,6 +2607,10 @@ def _reindex( df = self if index is not None: + if not df._index.is_unique: + raise ValueError( + "cannot reindex on an axis with duplicate labels" + ) index = cudf.core.index.as_index( index, name=getattr(index, "name", self._index.name) ) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index d44cf594e8b..5677f97408a 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10723,3 +10723,15 @@ def test_dataframe_series_dot(): expected = gser @ [12, 13] assert_eq(expected, actual) + + +def test_dataframe_duplicate_index_reindex(): + gdf = cudf.DataFrame({"a": [0, 1, 2, 3]}, index=[0, 0, 1, 1]) + pdf = gdf.to_pandas() + + assert_exceptions_equal( + gdf.reindex, + pdf.reindex, + lfunc_args_and_kwargs=([10, 11, 12, 13], {}), + rfunc_args_and_kwargs=([10, 11, 12, 13], {}), + ) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 8f8f87c20e0..c15a797713f 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2638,3 +2638,15 @@ def test_series_setitem_mixed_bool_dtype(): s = cudf.Series([True, False, 
True]) with pytest.raises(TypeError): s[0] = 10 + + +def test_series_duplicate_index_reindex(): + gs = cudf.Series([0, 1, 2, 3], index=[0, 0, 1, 1]) + ps = gs.to_pandas() + + assert_exceptions_equal( + gs.reindex, + ps.reindex, + lfunc_args_and_kwargs=([10, 11, 12, 13], {}), + rfunc_args_and_kwargs=([10, 11, 12, 13], {}), + ) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index b0da82eaeee..387643587d1 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -427,17 +427,12 @@ def union_categoricals_cudf( ) -@_dask_cudf_nvtx_annotate -def safe_hash(frame): - return cudf.Series(frame.hash_values(), index=frame.index) - - @hash_object_dispatch.register((cudf.DataFrame, cudf.Series)) @_dask_cudf_nvtx_annotate def hash_object_cudf(frame, index=True): if index: - return safe_hash(frame.reset_index()) - return safe_hash(frame) + frame = frame.reset_index() + return frame.hash_values() @hash_object_dispatch.register(cudf.BaseIndex) @@ -445,10 +440,10 @@ def hash_object_cudf(frame, index=True): def hash_object_cudf_index(ind, index=None): if isinstance(ind, cudf.MultiIndex): - return safe_hash(ind.to_frame(index=False)) + return ind.to_frame(index=False).hash_values() col = cudf.core.column.as_column(ind) - return safe_hash(cudf.Series(col)) + return cudf.Series(col).hash_values() @group_split_dispatch.register((cudf.Series, cudf.DataFrame)) From 9e7f8a5fdd03d6a24630687621d0ee14c2db26d7 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 15 Nov 2023 15:27:28 -0800 Subject: [PATCH 16/36] Fix dask dependency in custreamz (#14420) #14407 added a dask dependency to custreamz, but it added too tight of a pinning by requiring the exact same version. This is not valid because rapids-dask-dependency won't release a new version corresponding to each new cudf release, so pinning to the exact same version up to the alpha creates an unsatisfiable constraint. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Ray Douglass (https://github.com/raydouglass) - Bradley Dice (https://github.com/bdice) - GALI PREM SAGAR (https://github.com/galipremsagar) --- conda/recipes/custreamz/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index b8c5918ea60..755394e3936 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -45,7 +45,7 @@ requirements: - streamz - cudf ={{ version }} - cudf_kafka ={{ version }} - - rapids-dask-dependency ={{ version }} + - rapids-dask-dependency ={{ minor_version }} - python-confluent-kafka >=1.9.0,<1.10.0a0 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} From e4e69757b340ce45e8ceca53047f079c8b3eb648 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 15 Nov 2023 16:31:03 -0800 Subject: [PATCH 17/36] Update cudf_kafka_version. 
--- python/cudf_kafka/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf_kafka/CMakeLists.txt b/python/cudf_kafka/CMakeLists.txt index d55c3fdc076..1e21c873585 100644 --- a/python/cudf_kafka/CMakeLists.txt +++ b/python/cudf_kafka/CMakeLists.txt @@ -14,7 +14,7 @@ cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) -set(cudf_kafka_version 23.12.00) +set(cudf_kafka_version 24.02.00) include(../../fetch_rapids.cmake) From f9c586d48aa2a879b2267318088d3cc38f398662 Mon Sep 17 00:00:00 2001 From: Ferdinand Xu Date: Thu, 16 Nov 2023 10:14:19 +0800 Subject: [PATCH 18/36] Support java AST String literal with desired encoding (#14402) Authors: - Ferdinand Xu (https://github.com/winningsix) - Jason Lowe (https://github.com/jlowe) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/14402 --- java/src/main/java/ai/rapids/cudf/ast/Literal.java | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/java/src/main/java/ai/rapids/cudf/ast/Literal.java b/java/src/main/java/ai/rapids/cudf/ast/Literal.java index 427dd286b0c..4e1e886c282 100644 --- a/java/src/main/java/ai/rapids/cudf/ast/Literal.java +++ b/java/src/main/java/ai/rapids/cudf/ast/Literal.java @@ -20,6 +20,7 @@ import java.nio.ByteBuffer; import java.nio.ByteOrder; +import java.nio.charset.StandardCharsets; /** A literal value in an AST expression. */ public final class Literal extends AstExpression { @@ -205,7 +206,14 @@ public static Literal ofString(String value) { if (value == null) { return ofNull(DType.STRING); } - byte[] stringBytes = value.getBytes(); + return ofUTF8String(value.getBytes(StandardCharsets.UTF_8)); + } + + /** Construct a string literal directly with byte array to skip transcoding. */ + public static Literal ofUTF8String(byte[] stringBytes) { + if (stringBytes == null) { + return ofNull(DType.STRING); + } byte[] serializedValue = new byte[stringBytes.length + Integer.BYTES]; ByteBuffer.wrap(serializedValue).order(ByteOrder.nativeOrder()).putInt(stringBytes.length); System.arraycopy(stringBytes, 0, serializedValue, Integer.BYTES, stringBytes.length); From afd7d189b83cbcccba783877f42bb153b5cf315e Mon Sep 17 00:00:00 2001 From: Divye Gala Date: Wed, 15 Nov 2023 23:33:28 -0500 Subject: [PATCH 19/36] Example code for blog on new row comparators (#13795) Example code using a few libcudf APIs to demonstrate nested-type usage. 
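At a high level, the example chains three row operations over a nested STRUCT column; a minimal sketch of the flow in `main()`, using the helper functions defined in `deduplication.cpp` below:

```cpp
// Sketch of the flow; read_json, count_aggregate, join_count and sort_keys are
// the helpers added in cpp/examples/nested_types/deduplication.cpp in this patch.
auto [input, metadata] = read_json("example.json");       // nested STRUCT column
auto counts = count_aggregate(input->view());             // hashing + equality (groupby count)
auto joined = join_count(input->view(), counts->view());  // hashing + equality (inner join)
auto sorted = sort_keys(joined->view());                  // lexicographic (sorted_order)
```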
Authors: - Divye Gala (https://github.com/divyegala) - Karthikeyan (https://github.com/karthikeyann) - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) - Karthikeyan (https://github.com/karthikeyann) - Yunsong Wang (https://github.com/PointKernel) - Nghia Truong (https://github.com/ttnghia) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/13795 --- ci/release/update-version.sh | 3 +- cpp/examples/README.md | 1 + cpp/examples/basic/CMakeLists.txt | 18 +- cpp/examples/build.sh | 31 ++- cpp/examples/fetch_dependencies.cmake | 30 +++ cpp/examples/nested_types/CMakeLists.txt | 16 ++ cpp/examples/nested_types/deduplication.cpp | 209 ++++++++++++++++++++ cpp/examples/nested_types/example.json | 5 + cpp/examples/strings/CMakeLists.txt | 18 +- 9 files changed, 279 insertions(+), 52 deletions(-) create mode 100644 cpp/examples/fetch_dependencies.cmake create mode 100644 cpp/examples/nested_types/CMakeLists.txt create mode 100644 cpp/examples/nested_types/deduplication.cpp create mode 100644 cpp/examples/nested_types/example.json diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 4f1cbc47d1d..16742465c32 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -101,8 +101,7 @@ sed_runner "s/version == ${CURRENT_SHORT_TAG}/version == ${NEXT_SHORT_TAG}/g" RE sed_runner "s/cudf=${CURRENT_SHORT_TAG}/cudf=${NEXT_SHORT_TAG}/g" README.md # Libcudf examples update -sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/basic/CMakeLists.txt -sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/strings/CMakeLists.txt +sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/fetch_dependencies.cmake # CI files for FILE in .github/workflows/*.yaml; do diff --git a/cpp/examples/README.md b/cpp/examples/README.md index b2e8dd399d0..7f2b769f4a5 100644 --- a/cpp/examples/README.md +++ b/cpp/examples/README.md @@ -7,3 +7,4 @@ Current examples: - Basic: demonstrates a basic use case with libcudf and building a custom application with libcudf - Strings: demonstrates using libcudf for accessing and creating strings columns and for building custom kernels for strings +- Nested Types: demonstrates using libcudf for some operations on nested types diff --git a/cpp/examples/basic/CMakeLists.txt b/cpp/examples/basic/CMakeLists.txt index 9ff716f41e4..759a43b5627 100644 --- a/cpp/examples/basic/CMakeLists.txt +++ b/cpp/examples/basic/CMakeLists.txt @@ -8,23 +8,7 @@ project( LANGUAGES CXX CUDA ) -set(CPM_DOWNLOAD_VERSION v0.35.3) -file( - DOWNLOAD - https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake - ${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake -) -include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) - -set(CUDF_TAG branch-23.12) -CPMFindPackage( - NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf - GIT_TAG ${CUDF_TAG} - GIT_SHALLOW - TRUE - SOURCE_SUBDIR - cpp -) +include(../fetch_dependencies.cmake) # Configure your project here add_executable(basic_example src/process_csv.cpp) diff --git a/cpp/examples/build.sh b/cpp/examples/build.sh index 7d389cd318d..001cdeec694 100755 --- a/cpp/examples/build.sh +++ 
b/cpp/examples/build.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. # libcudf examples build script @@ -14,18 +14,17 @@ LIB_BUILD_DIR=${LIB_BUILD_DIR:-$(readlink -f "${EXAMPLES_DIR}/../build")} ################################################################################ # Add individual libcudf examples build scripts down below -# Basic example -BASIC_EXAMPLE_DIR=${EXAMPLES_DIR}/basic -BASIC_EXAMPLE_BUILD_DIR=${BASIC_EXAMPLE_DIR}/build -# Configure -cmake -S ${BASIC_EXAMPLE_DIR} -B ${BASIC_EXAMPLE_BUILD_DIR} -Dcudf_ROOT="${LIB_BUILD_DIR}" -# Build -cmake --build ${BASIC_EXAMPLE_BUILD_DIR} -j${PARALLEL_LEVEL} - -# Strings example -STRINGS_EXAMPLE_DIR=${EXAMPLES_DIR}/strings -STRINGS_EXAMPLE_BUILD_DIR=${STRINGS_EXAMPLE_DIR}/build -# Configure -cmake -S ${STRINGS_EXAMPLE_DIR} -B ${STRINGS_EXAMPLE_BUILD_DIR} -Dcudf_ROOT="${LIB_BUILD_DIR}" -# Build -cmake --build ${STRINGS_EXAMPLE_BUILD_DIR} -j${PARALLEL_LEVEL} +build_example() { + example_dir=${1} + example_dir="${EXAMPLES_DIR}/${example_dir}" + build_dir="${example_dir}/build" + + # Configure + cmake -S ${example_dir} -B ${build_dir} -Dcudf_ROOT="${LIB_BUILD_DIR}" + # Build + cmake --build ${build_dir} -j${PARALLEL_LEVEL} +} + +build_example basic +build_example strings +build_example nested_types diff --git a/cpp/examples/fetch_dependencies.cmake b/cpp/examples/fetch_dependencies.cmake new file mode 100644 index 00000000000..dc86c6a9aa5 --- /dev/null +++ b/cpp/examples/fetch_dependencies.cmake @@ -0,0 +1,30 @@ +# ============================================================================= +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= +set(CPM_DOWNLOAD_VERSION v0.35.3) +file( + DOWNLOAD + https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake + ${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake +) +include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) + +set(CUDF_TAG branch-23.12) +CPMFindPackage( + NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf + GIT_TAG ${CUDF_TAG} + GIT_SHALLOW + TRUE + SOURCE_SUBDIR + cpp +) diff --git a/cpp/examples/nested_types/CMakeLists.txt b/cpp/examples/nested_types/CMakeLists.txt new file mode 100644 index 00000000000..cb9430db237 --- /dev/null +++ b/cpp/examples/nested_types/CMakeLists.txt @@ -0,0 +1,16 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. 
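+# (CPM setup and the cudf dependency are pulled in via ../fetch_dependencies.cmake,
+# which this patch factors out so that every example shares it)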
+ +cmake_minimum_required(VERSION 3.26.4) + +project( + nested_types + VERSION 0.0.1 + LANGUAGES CXX CUDA +) + +include(../fetch_dependencies.cmake) + +# Configure your project here +add_executable(deduplication deduplication.cpp) +target_link_libraries(deduplication PRIVATE cudf::cudf) +target_compile_features(deduplication PRIVATE cxx_std_17) diff --git a/cpp/examples/nested_types/deduplication.cpp b/cpp/examples/nested_types/deduplication.cpp new file mode 100644 index 00000000000..5969985cc72 --- /dev/null +++ b/cpp/examples/nested_types/deduplication.cpp @@ -0,0 +1,209 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +/** + * @file deduplication.cpp + * @brief Demonstrates usage of the libcudf APIs to perform operations on nested-type tables. + * + * The algorithms chosen to be demonstrated are to showcase nested-type row operators of three + * kinds: + * 1. hashing: Used by functions `count_aggregate` and `join_count` to hash inputs of any type + * 2. equality: Used by functions `count_aggregate` and `join_count` in conjunction with hashing + * to determine equality for nested types + * 3. lexicographic: Used by function `sort_keys` to create a lexicographical order for nested-types + * so as to enable sorting + * + */ + +/** + * @brief Create memory resource for libcudf functions + * + * @param pool Whether to use a pool memory resource. 
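+ * @note A pool resource reserves a large slab of device memory up front and sub-allocates
+ * from it, avoiding a separate CUDA allocation for every libcudf call.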
+ * @return Memory resource instance + */ +std::shared_ptr create_memory_resource(bool pool) +{ + auto cuda_mr = std::make_shared(); + if (pool) { return rmm::mr::make_owning_wrapper(cuda_mr); } + return cuda_mr; +} + +/** + * @brief Read JSON input from file + * + * @param filepath path to input JSON file + * @return cudf::io::table_with_metadata + */ +cudf::io::table_with_metadata read_json(std::string filepath) +{ + auto source_info = cudf::io::source_info(filepath); + auto builder = cudf::io::json_reader_options::builder(source_info).lines(true); + auto options = builder.build(); + return cudf::io::read_json(options); +} + +/** + * @brief Write JSON output to file + * + * @param input table to write + * @param metadata metadata of input table read by JSON reader + * @param filepath path to output JSON file + */ +void write_json(cudf::table_view input, cudf::io::table_metadata metadata, std::string filepath) +{ + // write the data for inspection + auto sink_info = cudf::io::sink_info(filepath); + auto builder = cudf::io::json_writer_options::builder(sink_info, input).lines(true); + builder.metadata(metadata); + auto options = builder.build(); + cudf::io::write_json(options); +} + +/** + * @brief Aggregate count of duplicate rows in nested-type column + * + * @param input table to aggregate + * @return std::unique_ptr + */ +std::unique_ptr count_aggregate(cudf::table_view input) +{ + // Get count for each key + auto keys = cudf::table_view{{input.column(0)}}; + auto val = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32}, keys.num_rows()); + + cudf::groupby::groupby grpby_obj(keys); + std::vector requests; + requests.emplace_back(cudf::groupby::aggregation_request()); + auto agg = cudf::make_count_aggregation(); + requests[0].aggregations.push_back(std::move(agg)); + requests[0].values = *val; + auto agg_results = grpby_obj.aggregate(requests); + auto result_key = std::move(agg_results.first); + auto result_val = std::move(agg_results.second[0].results[0]); + + auto left_cols = result_key->release(); + left_cols.push_back(std::move(result_val)); + + return std::make_unique(std::move(left_cols)); +} + +/** + * @brief Join each row with its duplicate counts + * + * @param left left table + * @param right right table + * @return std::unique_ptr + */ +std::unique_ptr join_count(cudf::table_view left, cudf::table_view right) +{ + auto [left_indices, right_indices] = + cudf::inner_join(cudf::table_view{{left.column(0)}}, cudf::table_view{{right.column(0)}}); + auto new_left = cudf::gather(left, cudf::device_span{*left_indices}); + auto new_right = cudf::gather(right, cudf::device_span{*right_indices}); + + auto left_cols = new_left->release(); + auto right_cols = new_right->release(); + left_cols.push_back(std::move(right_cols[1])); + + return std::make_unique(std::move(left_cols)); +} + +/** + * @brief Sort nested-type column + * + * @param input table to sort + * @return std::unique_ptr + * + * @note if stability is desired, use `cudf::stable_sorted_order` + */ +std::unique_ptr sort_keys(cudf::table_view input) +{ + auto sort_order = cudf::sorted_order(cudf::table_view{{input.column(0)}}); + return cudf::gather(input, *sort_order); +} + +/** + * @brief Main for nested_types examples + * + * Command line parameters: + * 1. JSON input file name/path (default: "example.json") + * 2. JSON output file name/path (default: "output.json") + * 3. 
Memory resource (optional): "pool" or "cuda" (default: "pool") + * + * Example invocation from directory `cudf/cpp/examples/nested_types`: + * ./build/deduplication example.json output.json pool + * + */ +int main(int argc, char const** argv) +{ + std::string input_filepath; + std::string output_filepath; + std::string mr_name; + if (argc != 4 && argc != 1) { + std::cout << "Either provide all command-line arguments, or none to use defaults" << std::endl; + return 1; + } + if (argc == 1) { + input_filepath = "example.json"; + output_filepath = "output.json"; + mr_name = "pool"; + } else { + input_filepath = argv[1]; + output_filepath = argv[2]; + mr_name = argv[3]; + } + + auto pool = mr_name == "pool"; + auto resource = create_memory_resource(pool); + rmm::mr::set_current_device_resource(resource.get()); + + std::cout << "Reading " << input_filepath << "..." << std::endl; + // read input file + auto [input, metadata] = read_json(input_filepath); + + auto count = count_aggregate(input->view()); + + auto combined = join_count(input->view(), count->view()); + + auto sorted = sort_keys(combined->view()); + + metadata.schema_info.emplace_back("count"); + + std::cout << "Writing " << output_filepath << "..." << std::endl; + write_json(sorted->view(), metadata, output_filepath); + + return 0; +} diff --git a/cpp/examples/nested_types/example.json b/cpp/examples/nested_types/example.json new file mode 100644 index 00000000000..efaa37817d6 --- /dev/null +++ b/cpp/examples/nested_types/example.json @@ -0,0 +1,5 @@ +{"features": {"key": "a1", "values": [{"info": "message_1", "type": "device_a", "dt": 1688750001}]}, "source": "network_a", "quality": 0.7} +{"features": {"key": "a2", "values": [{"info": "message_2", "type": "device_a", "dt": 1688750002}]}, "source": "network_a", "quality": 0.7} +{"features": {"key": "a3", "values": [{"info": "message_3", "type": "device_a", "dt": 1688750003}]}, "source": "network_b", "quality": 0.8} +{"features": {"key": "a1", "values": [{"info": "message_1", "type": "device_a", "dt": 1688750001}]}, "source": "network_b", "quality": 0.9} +{"features": {"key": "a4", "values": [{"info": "message_4", "type": "device_a", "dt": 1688750004}]}, "source": "network_b", "quality": 0.9} diff --git a/cpp/examples/strings/CMakeLists.txt b/cpp/examples/strings/CMakeLists.txt index 4b500d3a92e..c90fa9dde16 100644 --- a/cpp/examples/strings/CMakeLists.txt +++ b/cpp/examples/strings/CMakeLists.txt @@ -8,23 +8,7 @@ project( LANGUAGES CXX CUDA ) -set(CPM_DOWNLOAD_VERSION v0.35.3) -file( - DOWNLOAD - https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake - ${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake -) -include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) - -set(CUDF_TAG branch-23.12) -CPMFindPackage( - NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf - GIT_TAG ${CUDF_TAG} - GIT_SHALLOW - TRUE - SOURCE_SUBDIR - cpp -) +include(../fetch_dependencies.cmake) list(APPEND CUDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr) From 53127de4d9e06f9fa172ac34952f85104eb7bac9 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 16 Nov 2023 08:28:44 -0600 Subject: [PATCH 20/36] Remove needs: wheel-build-cudf. (#14427) This PR fixes a nightly test failure due to an extraneous `needs:` entry in `test.yaml`. ``` Invalid workflow file: .github/workflows/test.yaml#L100 The workflow is not valid. .github/workflows/test.yaml (Line: 100, Col: 12): Job 'unit-tests-cudf-pandas' depends on unknown job 'wheel-build-cudf'. 
``` Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/14427 --- .github/workflows/test.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 0b6b55069f6..0d4401160e1 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -97,7 +97,6 @@ jobs: sha: ${{ inputs.sha }} script: ci/test_wheel_dask_cudf.sh unit-tests-cudf-pandas: - needs: wheel-build-cudf secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: From 8e1ef05b2b96775ce7e1a2f22894ec7a8ebb65a4 Mon Sep 17 00:00:00 2001 From: Divye Gala Date: Thu, 16 Nov 2023 16:43:29 -0500 Subject: [PATCH 21/36] Change `nullable()` to `has_nulls()` in `cudf::detail::gather` (#14363) In https://github.com/rapidsai/cudf/pull/13795, we found out that `nullable()` causes severe perf degradation for the nested-type case when the input is read from file via `cudf::io::read_json`. This is because the JSON reader adds a null mask for columns that don't have NULLs. This change is a no-overhead replacement that checks the actual null count instead of checking if a null mask is present. This PR also solves a bug in quantile/median groupby where NULLs were being [set](https://github.com/rapidsai/cudf/blob/8deb3dd7573000e7d87f18a9e2bbe39cf2932e10/cpp/src/groupby/sort/group_quantiles.cu#L73) but the null count was not updated. Authors: - Divye Gala (https://github.com/divyegala) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) - David Wendt (https://github.com/davidwendt) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14363 --- cpp/include/cudf/detail/gather.cuh | 22 ++++++---- cpp/include/cudf/detail/null_mask.hpp | 17 ++++++++ cpp/include/cudf/table/table_view.hpp | 17 ++++++++ cpp/src/bitmask/null_mask.cu | 15 +++++++ cpp/src/groupby/sort/group_quantiles.cu | 17 +++++--- cpp/tests/join/join_tests.cpp | 56 ------------------------- 6 files changed, 75 insertions(+), 69 deletions(-) diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 955f9914632..c9975ef2199 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -673,14 +673,20 @@ std::unique_ptr gather(table_view const& source_table, mr)); } - auto const nullable = bounds_policy == out_of_bounds_policy::NULLIFY || - std::any_of(source_table.begin(), source_table.end(), [](auto const& col) { - return col.nullable(); - }); - if (nullable) { - auto const op = bounds_policy == out_of_bounds_policy::NULLIFY ? gather_bitmask_op::NULLIFY - : gather_bitmask_op::DONT_CHECK; - gather_bitmask(source_table, gather_map_begin, destination_columns, op, stream, mr); + auto needs_new_bitmask = bounds_policy == out_of_bounds_policy::NULLIFY || + cudf::has_nested_nullable_columns(source_table); + if (needs_new_bitmask) { + needs_new_bitmask = needs_new_bitmask || cudf::has_nested_nulls(source_table); + if (needs_new_bitmask) { + auto const op = bounds_policy == out_of_bounds_policy::NULLIFY + ? 
gather_bitmask_op::NULLIFY + : gather_bitmask_op::DONT_CHECK; + gather_bitmask(source_table, gather_map_begin, destination_columns, op, stream, mr); + } else { + for (size_type i = 0; i < source_table.num_columns(); ++i) { + set_all_valid_null_masks(source_table.column(i), *destination_columns[i], stream, mr); + } + } } return std::make_unique
(std::move(destination_columns)); diff --git a/cpp/include/cudf/detail/null_mask.hpp b/cpp/include/cudf/detail/null_mask.hpp index 8c10bbe416f..74e2ccd2ea1 100644 --- a/cpp/include/cudf/detail/null_mask.hpp +++ b/cpp/include/cudf/detail/null_mask.hpp @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include #include @@ -259,6 +260,22 @@ cudf::size_type inplace_bitmask_and(device_span dest_mask, size_type mask_size_bits, rmm::cuda_stream_view stream); +/** + * @brief Recursively set valid null masks for all children. + * + * This function applies all valid null masks to the output column if input column satisfies + * `nullable() == true` condition + * + * @param input input column to check for nullability + * @param output output column to mirror nullability of input + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +void set_all_valid_null_masks(column_view const& input, + column& output, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/table/table_view.hpp b/cpp/include/cudf/table/table_view.hpp index b90b2dac012..5d9c930d137 100644 --- a/cpp/include/cudf/table/table_view.hpp +++ b/cpp/include/cudf/table/table_view.hpp @@ -336,6 +336,23 @@ inline bool has_nested_nulls(table_view const& input) }); } +/** + * @brief Returns True if the table has a nullable column at any level of the column hierarchy + * + * @param input The table to check for nullable columns + * @return True if the table has nullable columns at any level of the column hierarchy, false + * otherwise + */ +inline bool has_nested_nullable_columns(table_view const& input) +{ + return std::any_of(input.begin(), input.end(), [](auto const& col) { + return col.nullable() || + std::any_of(col.child_begin(), col.child_end(), [](auto const& child_col) { + return has_nested_nullable_columns(table_view{{child_col}}); + }); + }); +} + /** * @brief The function to collect all nullable columns at all nested levels in a given table. 
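 * (Note that a nullable column merely owns a null mask and may still contain zero nulls;
 * `has_nested_nulls` above checks the actual null counts instead.)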
* diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 3ff56eabe1e..1a1cbb17d15 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -510,6 +510,21 @@ std::pair bitmask_or(table_view const& view, return std::pair(std::move(null_mask), 0); } +void set_all_valid_null_masks(column_view const& input, + column& output, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (input.nullable()) { + auto mask = detail::create_null_mask(output.size(), mask_state::ALL_VALID, stream, mr); + output.set_null_mask(std::move(mask), 0); + + for (size_type i = 0; i < input.num_children(); ++i) { + set_all_valid_null_masks(input.child(i), output.child(i), stream, mr); + } + } +} + } // namespace detail // Create a bitmask from a specific range diff --git a/cpp/src/groupby/sort/group_quantiles.cu b/cpp/src/groupby/sort/group_quantiles.cu index a9edcfecbf7..a456d4b5964 100644 --- a/cpp/src/groupby/sort/group_quantiles.cu +++ b/cpp/src/groupby/sort/group_quantiles.cu @@ -49,6 +49,7 @@ struct calculate_quantile_fn { double const* d_quantiles; size_type num_quantiles; interpolation interpolation; + size_type* null_count; __device__ void operator()(size_type i) { @@ -68,11 +69,13 @@ struct calculate_quantile_fn { thrust::for_each_n(thrust::seq, thrust::make_counting_iterator(0), num_quantiles, - [d_result = d_result, segment_size, offset](size_type j) { - if (segment_size == 0) + [d_result = d_result, segment_size, offset, this](size_type j) { + if (segment_size == 0) { d_result.set_null(offset + j); - else + atomicAdd(this->null_count, 1); + } else { d_result.set_valid(offset + j); + } }); } }; @@ -104,6 +107,7 @@ struct quantiles_functor { auto values_view = column_device_view::create(values, stream); auto group_size_view = column_device_view::create(group_sizes, stream); auto result_view = mutable_column_device_view::create(result->mutable_view(), stream); + auto null_count = rmm::device_scalar(0, stream, mr); // For each group, calculate quantile if (!cudf::is_dictionary(values.type())) { @@ -118,7 +122,8 @@ struct quantiles_functor { group_offsets.data(), quantile.data(), static_cast(quantile.size()), - interpolation}); + interpolation, + null_count.data()}); } else { auto values_iter = cudf::dictionary::detail::make_dictionary_iterator(*values_view); thrust::for_each_n(rmm::exec_policy(stream), @@ -131,9 +136,11 @@ struct quantiles_functor { group_offsets.data(), quantile.data(), static_cast(quantile.size()), - interpolation}); + interpolation, + null_count.data()}); } + result->set_null_count(null_count.value(stream)); return result; } diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index 089db315748..a416df0c7c3 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -1941,62 +1941,6 @@ TEST_F(JoinTest, FullJoinWithStructsAndNulls) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } -TEST_F(JoinTest, Repro_StructsWithoutNullsPushedDown) -{ - // When joining on a STRUCT column, if the parent nulls are not reflected in - // the children, the join might produce incorrect results. - // - // In this test, a fact table of structs is joined against a dimension table. - // Both tables must match (only) on the NULL row. This will fail if the fact table's - // nulls are not pushed down into its children. 
- using ints = column_wrapper; - using structs = cudf::test::structs_column_wrapper; - using namespace cudf::test::iterators; - - auto make_table = [](auto&& col) { - auto columns = CVector{}; - columns.push_back(std::move(col)); - return cudf::table{std::move(columns)}; - }; - - auto const fact_table = [make_table] { - auto fact_ints = ints{0, 1, 2, 3, 4}; - auto fact_structs = structs{{fact_ints}, no_nulls()}.release(); - // Now set struct validity to invalidate index#3. - cudf::detail::set_null_mask( - fact_structs->mutable_view().null_mask(), 3, 4, false, cudf::get_default_stream()); - // Struct row#3 is null, but Struct.child has a non-null value. - return make_table(std::move(fact_structs)); - }(); - - auto const dimension_table = [make_table] { - auto dim_ints = ints{999}; - auto dim_structs = structs{{dim_ints}, null_at(0)}; - return make_table(dim_structs.release()); - }(); - - auto const result = inner_join(fact_table.view(), dimension_table.view(), {0}, {0}); - EXPECT_EQ(result->num_rows(), 1); // The null STRUCT rows should match. - - // Note: Join result might not have nulls pushed down, since it's an output of gather(). - // Must superimpose parent nulls before comparisons. - auto [superimposed_results, _] = cudf::structs::detail::push_down_nulls( - *result, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); - - auto const expected = [] { - auto fact_ints = ints{0}; - auto fact_structs = structs{{fact_ints}, null_at(0)}; - auto dim_ints = ints{0}; - auto dim_structs = structs{{dim_ints}, null_at(0)}; - auto columns = CVector{}; - columns.push_back(fact_structs.release()); - columns.push_back(dim_structs.release()); - return cudf::table{std::move(columns)}; - }(); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(superimposed_results, expected); -} - using lcw = cudf::test::lists_column_wrapper; using cudf::test::iterators::null_at; From bf63d1049db70c28ea961b677ad5f207aa648860 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 16 Nov 2023 14:47:36 -0800 Subject: [PATCH 22/36] Add decoder for DELTA_BYTE_ARRAY to Parquet reader (#14101) Part of #13501. Adds ability to decode DELTA_BYTE_ARRAY encoded pages. Authors: - Ed Seidl (https://github.com/etseidl) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - https://github.com/nvdbaranec - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/14101 --- cpp/src/io/parquet/delta_binary.cuh | 58 ++- cpp/src/io/parquet/page_data.cu | 12 +- cpp/src/io/parquet/page_decode.cuh | 12 +- cpp/src/io/parquet/page_delta_decode.cu | 490 +++++++++++++++++- cpp/src/io/parquet/page_hdr.cu | 17 +- cpp/src/io/parquet/page_string_decode.cu | 344 ++++++++++-- cpp/src/io/parquet/parquet_gpu.hpp | 79 ++- cpp/src/io/parquet/reader_impl.cpp | 38 +- cpp/src/io/parquet/reader_impl_preprocess.cu | 4 +- .../tests/data/parquet/delta_byte_arr.parquet | Bin 0 -> 5783 bytes python/cudf/cudf/tests/test_parquet.py | 104 ++++ 11 files changed, 1044 insertions(+), 114 deletions(-) create mode 100644 python/cudf/cudf/tests/data/parquet/delta_byte_arr.parquet diff --git a/cpp/src/io/parquet/delta_binary.cuh b/cpp/src/io/parquet/delta_binary.cuh index e3b23f4c0a0..ccc28791071 100644 --- a/cpp/src/io/parquet/delta_binary.cuh +++ b/cpp/src/io/parquet/delta_binary.cuh @@ -39,15 +39,15 @@ namespace cudf::io::parquet::detail { // per mini-block. 
While encoding, the lowest delta value is subtracted from all the deltas in the // block to ensure that all encoded values are positive. The deltas for each mini-block are bit // packed using the same encoding as the RLE/Bit-Packing Hybrid encoder. -// -// DELTA_BYTE_ARRAY encoding (incremental encoding or front compression), is used for BYTE_ARRAY -// columns. For each element in a sequence of strings, a prefix length from the preceding string -// and a suffix is stored. The prefix lengths are DELTA_BINARY_PACKED encoded. The suffixes are -// encoded with DELTA_LENGTH_BYTE_ARRAY encoding, which is a DELTA_BINARY_PACKED list of suffix -// lengths, followed by the concatenated suffix data. -// we decode one mini-block at a time. max mini-block size seen is 64. -constexpr int delta_rolling_buf_size = 128; +// The largest mini-block size we can currently support. +constexpr int max_delta_mini_block_size = 64; + +// The first pass decodes `values_per_mb` values, and then the second pass does another +// batch of size `values_per_mb`. The largest value for values_per_miniblock among the +// major writers seems to be 64, so 2 * 64 should be good. We save the first value separately +// since it is not encoded in the first mini-block. +constexpr int delta_rolling_buf_size = 2 * max_delta_mini_block_size; /** * @brief Read a ULEB128 varint integer @@ -90,7 +90,8 @@ struct delta_binary_decoder { uleb128_t mini_block_count; // usually 4, chosen such that block_size/mini_block_count is a // multiple of 32 uleb128_t value_count; // total values encoded in the block - zigzag128_t last_value; // last value decoded, initialized to first_value from header + zigzag128_t first_value; // initial value, stored in the header + zigzag128_t last_value; // last value decoded uint32_t values_per_mb; // block_size / mini_block_count, must be multiple of 32 uint32_t current_value_idx; // current value index, initialized to 0 at start of block @@ -102,6 +103,13 @@ struct delta_binary_decoder { uleb128_t value[delta_rolling_buf_size]; // circular buffer of delta values + // returns the value stored in the `value` array at index + // `rolling_index(idx)`. If `idx` is `0`, then return `first_value`. + constexpr zigzag128_t value_at(size_type idx) + { + return idx == 0 ? first_value : value[rolling_index(idx)]; + } + // returns the number of values encoded in the block data. when all_values is true, // account for the first value in the header. otherwise just count the values encoded // in the mini-block data. @@ -145,7 +153,8 @@ struct delta_binary_decoder { block_size = get_uleb128(d_start, d_end); mini_block_count = get_uleb128(d_start, d_end); value_count = get_uleb128(d_start, d_end); - last_value = get_zz128(d_start, d_end); + first_value = get_zz128(d_start, d_end); + last_value = first_value; current_value_idx = 0; values_per_mb = block_size / mini_block_count; @@ -179,6 +188,28 @@ struct delta_binary_decoder { } } + // given start/end pointers in the data, find the end of the binary encoded block. when done, + // `this` will be initialized with the correct start and end positions. returns the end, which is + // start of data/next block. should only be called from thread 0. + inline __device__ uint8_t const* find_end_of_block(uint8_t const* start, uint8_t const* end) + { + // read block header + init_binary_block(start, end); + + // test for no encoded values. a single value will be in the block header. 
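+    // (that single value lives in the block header itself, not in mini-block data,
+    // so the block ends right where the encoded data would have begun)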
+ if (value_count <= 1) { return block_start; } + + // read mini-block headers and skip over data + while (current_value_idx < num_encoded_values(false)) { + setup_next_mini_block(false); + } + // calculate the correct end of the block + auto const* const new_end = cur_mb == 0 ? block_start : cur_mb_start; + // re-init block with correct end + init_binary_block(start, new_end); + return new_end; + } + // decode the current mini-batch of deltas, and convert to values. // called by all threads in a warp, currently only one warp supported. inline __device__ void calc_mini_block_values(int lane_id) @@ -186,12 +217,9 @@ struct delta_binary_decoder { using cudf::detail::warp_size; if (current_value_idx >= value_count) { return; } - // need to save first value from header on first pass + // need to account for the first value from header on first pass if (current_value_idx == 0) { - if (lane_id == 0) { - current_value_idx++; - value[0] = last_value; - } + if (lane_id == 0) { current_value_idx++; } __syncwarp(); if (current_value_idx >= value_count) { return; } } diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index a783b489c02..0c53877f7c7 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -449,8 +449,13 @@ __global__ void __launch_bounds__(decode_block_size) int out_thread0; [[maybe_unused]] null_count_back_copier _{s, t}; - if (!setupLocalPageInfo( - s, &pages[page_idx], chunks, min_row, num_rows, mask_filter{KERNEL_MASK_GENERAL}, true)) { + if (!setupLocalPageInfo(s, + &pages[page_idx], + chunks, + min_row, + num_rows, + mask_filter{decode_kernel_mask::GENERAL}, + true)) { return; } @@ -486,6 +491,7 @@ __global__ void __launch_bounds__(decode_block_size) target_pos = min(s->nz_count, src_pos + decode_block_size - out_thread0); if (out_thread0 > 32) { target_pos = min(target_pos, s->dict_pos); } } + // TODO(ets): see if this sync can be removed __syncthreads(); if (t < 32) { // decode repetition and definition levels. @@ -603,7 +609,7 @@ __global__ void __launch_bounds__(decode_block_size) } struct mask_tform { - __device__ uint32_t operator()(PageInfo const& p) { return p.kernel_mask; } + __device__ uint32_t operator()(PageInfo const& p) { return static_cast(p.kernel_mask); } }; } // anonymous namespace diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index ab1cc68923d..4db9bd3904b 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -991,8 +991,15 @@ struct all_types_filter { * @brief Functor for setupLocalPageInfo that takes a mask of allowed types. 
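 *
 * For example, the delta kernels in this patch pass `mask_filter{decode_kernel_mask::DELTA_BINARY}`
 * or `mask_filter{decode_kernel_mask::DELTA_BYTE_ARRAY}` so that each kernel only processes the
 * pages it owns.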
*/ struct mask_filter { - int mask; - __device__ inline bool operator()(PageInfo const& page) { return (page.kernel_mask & mask) != 0; } + uint32_t mask; + + __device__ mask_filter(uint32_t m) : mask(m) {} + __device__ mask_filter(decode_kernel_mask m) : mask(static_cast(m)) {} + + __device__ inline bool operator()(PageInfo const& page) + { + return BitAnd(mask, page.kernel_mask) != 0; + } }; /** @@ -1306,6 +1313,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, s->dict_run = 0; } break; case Encoding::DELTA_BINARY_PACKED: + case Encoding::DELTA_BYTE_ARRAY: // nothing to do, just don't error break; default: { diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu index bb5e5066b69..bc025c6fc3e 100644 --- a/cpp/src/io/parquet/page_delta_decode.cu +++ b/cpp/src/io/parquet/page_delta_decode.cu @@ -27,6 +27,277 @@ namespace cudf::io::parquet::detail { namespace { +constexpr int decode_block_size = 128; + +// DELTA_BYTE_ARRAY encoding (incremental encoding or front compression), is used for BYTE_ARRAY +// columns. For each element in a sequence of strings, a prefix length from the preceding string +// and a suffix is stored. The prefix lengths are DELTA_BINARY_PACKED encoded. The suffixes are +// encoded with DELTA_LENGTH_BYTE_ARRAY encoding, which is a DELTA_BINARY_PACKED list of suffix +// lengths, followed by the concatenated suffix data. +struct delta_byte_array_decoder { + uint8_t const* last_string; // pointer to last decoded string...needed for its prefix + uint8_t const* suffix_char_data; // pointer to the start of character data + + uint8_t* temp_buf; // buffer used when skipping values + uint32_t start_val; // decoded strings up to this index will be dumped to temp_buf + uint32_t last_string_len; // length of the last decoded string + + delta_binary_decoder prefixes; // state of decoder for prefix lengths + delta_binary_decoder suffixes; // state of decoder for suffix lengths + + // initialize the prefixes and suffixes blocks + __device__ void init(uint8_t const* start, uint8_t const* end, uint32_t start_idx, uint8_t* temp) + { + auto const* suffix_start = prefixes.find_end_of_block(start, end); + suffix_char_data = suffixes.find_end_of_block(suffix_start, end); + last_string = nullptr; + temp_buf = temp; + start_val = start_idx; + } + + // kind of like an inclusive scan for strings. takes prefix_len bytes from preceding + // string and prepends to the suffix we've already copied into place. called from + // within loop over values_in_mb, so this only needs to handle a single warp worth of data + // at a time. + __device__ void string_scan(uint8_t* strings_out, + uint8_t const* last_string, + uint32_t start_idx, + uint32_t end_idx, + uint32_t offset, + uint32_t lane_id) + { + using cudf::detail::warp_size; + + // let p(n) === length(prefix(string_n)) + // + // if p(n-1) > p(n), then string_n can be completed when string_n-2 is completed. likewise if + // p(m) > p(n), then string_n can be completed with string_m-1. however, if p(m) < p(n), then m + // is a "blocker" for string_n; string_n can be completed only after string_m is. + // + // we will calculate the nearest blocking position for each lane, and then fill in string_0. we + // then iterate, finding all lanes that have had their "blocker" filled in and completing them. + // when all lanes are filled in, we return. 
this will still hit the worst case if p(n-1) < p(n) + // for all n + __shared__ __align__(8) int64_t prefix_lens[warp_size]; + __shared__ __align__(8) uint8_t const* offsets[warp_size]; + + uint32_t const ln_idx = start_idx + lane_id; + uint64_t prefix_len = ln_idx < end_idx ? prefixes.value_at(ln_idx) : 0; + uint8_t* const lane_out = ln_idx < end_idx ? strings_out + offset : nullptr; + + prefix_lens[lane_id] = prefix_len; + offsets[lane_id] = lane_out; + + // if all prefix_len's are zero, then there's nothing to do + if (__all_sync(0xffff'ffff, prefix_len == 0)) { return; } + + // find a neighbor to the left that has a prefix length less than this lane. once that + // neighbor is complete, this lane can be completed. + int blocker = lane_id - 1; + while (blocker > 0 && prefix_lens[blocker] != 0 && prefix_len <= prefix_lens[blocker]) { + blocker--; + } + + // fill in lane 0 (if necessary) + if (lane_id == 0 && prefix_len > 0) { + memcpy(lane_out, last_string, prefix_len); + prefix_lens[0] = prefix_len = 0; + } + __syncwarp(); + + // now fill in blockers until done + for (uint32_t i = 1; i < warp_size && i + start_idx < end_idx; i++) { + if (prefix_len != 0 && prefix_lens[blocker] == 0 && lane_out != nullptr) { + memcpy(lane_out, offsets[blocker], prefix_len); + prefix_lens[lane_id] = prefix_len = 0; + } + + // check for finished + if (__all_sync(0xffff'ffff, prefix_len == 0)) { return; } + } + } + + // calculate a mini-batch of string values, writing the results to + // `strings_out`. starting at global index `start_idx` and decoding + // up to `num_values` strings. + // called by all threads in a warp. used for strings <= 32 chars. + // returns number of bytes written + __device__ size_t calculate_string_values(uint8_t* strings_out, + uint32_t start_idx, + uint32_t num_values, + uint32_t lane_id) + { + using cudf::detail::warp_size; + using WarpScan = cub::WarpScan; + __shared__ WarpScan::TempStorage scan_temp; + + if (start_idx >= suffixes.value_count) { return 0; } + auto end_idx = start_idx + min(suffixes.values_per_mb, num_values); + end_idx = min(end_idx, static_cast(suffixes.value_count)); + + auto p_strings_out = strings_out; + auto p_temp_out = temp_buf; + + auto copy_batch = [&](uint8_t* out, uint32_t idx, uint32_t end) { + uint32_t const ln_idx = idx + lane_id; + + // calculate offsets into suffix data + uint64_t const suffix_len = ln_idx < end ? suffixes.value_at(ln_idx) : 0; + uint64_t suffix_off = 0; + WarpScan(scan_temp).ExclusiveSum(suffix_len, suffix_off); + + // calculate offsets into string data + uint64_t const prefix_len = ln_idx < end ? prefixes.value_at(ln_idx) : 0; + uint64_t const string_len = prefix_len + suffix_len; + + // get offset into output for each lane + uint64_t string_off, warp_total; + WarpScan(scan_temp).ExclusiveSum(string_len, string_off, warp_total); + auto const so_ptr = out + string_off; + + // copy suffixes into string data + if (ln_idx < end) { memcpy(so_ptr + prefix_len, suffix_char_data + suffix_off, suffix_len); } + __syncwarp(); + + // copy prefixes into string data. + string_scan(out, last_string, idx, end, string_off, lane_id); + + // save the position of the last computed string. this will be used in + // the next iteration to reconstruct the string in lane 0. 
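+      // (true for the final string of this batch, or for the last lane of a full warp pass)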
+      if (ln_idx == end - 1 || (ln_idx < end && lane_id == 31)) {
+        // set last_string to this lane's string
+        last_string     = out + string_off;
+        last_string_len = string_len;
+        // and consume used suffix_char_data
+        suffix_char_data += suffix_off + suffix_len;
+      }
+
+      return warp_total;
+    };
+
+    uint64_t string_total = 0;
+    for (int idx = start_idx; idx < end_idx; idx += warp_size) {
+      auto const n_in_batch = min(warp_size, end_idx - idx);
+      // account for the case where start_val occurs in the middle of this batch
+      if (idx < start_val && idx + n_in_batch > start_val) {
+        // dump idx...start_val into temp_buf
+        copy_batch(p_temp_out, idx, start_val);
+        __syncwarp();
+
+        // start_val...idx + n_in_batch into strings_out
+        auto nbytes = copy_batch(p_strings_out, start_val, idx + n_in_batch);
+        p_strings_out += nbytes;
+        string_total = nbytes;
+      } else {
+        if (idx < start_val) {
+          p_temp_out += copy_batch(p_temp_out, idx, end_idx);
+        } else {
+          auto nbytes = copy_batch(p_strings_out, idx, end_idx);
+          p_strings_out += nbytes;
+          string_total += nbytes;
+        }
+      }
+      __syncwarp();
+    }
+
+    return string_total;
+  }
+
+  // character parallel version of CalculateStringValues(). This is faster for strings longer than
+  // 32 chars.
+  __device__ size_t calculate_string_values_cp(uint8_t* strings_out,
+                                               uint32_t start_idx,
+                                               uint32_t num_values,
+                                               uint32_t lane_id)
+  {
+    using cudf::detail::warp_size;
+    __shared__ __align__(8) uint8_t* so_ptr;
+
+    // nothing to write, so report zero bytes written (this function returns a size)
+    if (start_idx >= suffixes.value_count) { return 0; }
+    auto end_idx = start_idx + min(suffixes.values_per_mb, num_values);
+    end_idx      = min(end_idx, static_cast<uint32_t>(suffixes.value_count));
+
+    if (lane_id == 0) { so_ptr = start_idx < start_val ? temp_buf : strings_out; }
+    __syncwarp();
+
+    uint64_t string_total = 0;
+    for (int idx = start_idx; idx < end_idx; idx++) {
+      uint64_t const suffix_len = suffixes.value_at(idx);
+      uint64_t const prefix_len = prefixes.value_at(idx);
+      uint64_t const string_len = prefix_len + suffix_len;
+
+      // copy prefix and suffix data into current strings_out position
+      // for longer strings use a 4-byte version stolen from gather_chars_fn_string_parallel.
+      if (string_len > 64) {
+        if (prefix_len > 0) { wideStrcpy(so_ptr, last_string, prefix_len, lane_id); }
+        if (suffix_len > 0) {
+          wideStrcpy(so_ptr + prefix_len, suffix_char_data, suffix_len, lane_id);
+        }
+      } else {
+        for (int i = lane_id; i < string_len; i += warp_size) {
+          so_ptr[i] = i < prefix_len ? last_string[i] : suffix_char_data[i - prefix_len];
+        }
+      }
+      __syncwarp();
+
+      if (idx >= start_val) { string_total += string_len; }
+
+      if (lane_id == 0) {
+        last_string     = so_ptr;
+        last_string_len = string_len;
+        suffix_char_data += suffix_len;
+        if (idx == start_val - 1) {
+          so_ptr = strings_out;
+        } else {
+          so_ptr += string_len;
+        }
+      }
+      __syncwarp();
+    }
+
+    return string_total;
+  }
+
+  // dump strings before start_val to temp buf
+  __device__ void skip(bool use_char_ll)
+  {
+    using cudf::detail::warp_size;
+    int const t       = threadIdx.x;
+    int const lane_id = t % warp_size;
+
+    // is this even necessary? return if asking to skip the whole block.
+    if (start_val >= prefixes.num_encoded_values(true)) { return; }
+
+    // prefixes and suffixes will have the same parameters (it's checked earlier)
+    auto const batch_size = prefixes.values_per_mb;
+
+    uint32_t skip_pos = 0;
+    while (prefixes.current_value_idx < start_val) {
+      // warp 0 gets prefixes and warp 1 gets suffixes
+      auto* const db = t < 32 ?
&prefixes : &suffixes; + + // this will potentially decode past start_val, but that's ok + if (t < 64) { db->decode_batch(); } + __syncthreads(); + + // warp 0 decodes the batch. + if (t < 32) { + auto const num_to_decode = min(batch_size, start_val - skip_pos); + auto const bytes_written = + use_char_ll ? calculate_string_values_cp(temp_buf, skip_pos, num_to_decode, lane_id) + : calculate_string_values(temp_buf, skip_pos, num_to_decode, lane_id); + // store last_string someplace safe in temp buffer + if (t == 0) { + memcpy(temp_buf + bytes_written, last_string, last_string_len); + last_string = temp_buf + bytes_written; + } + } + skip_pos += prefixes.values_per_mb; + __syncthreads(); + } + } +}; + // Decode page data that is DELTA_BINARY_PACKED encoded. This encoding is // only used for int32 and int64 physical types (and appears to only be used // with V2 page headers; see https://www.mail-archive.com/dev@parquet.apache.org/msg11826.html). @@ -52,13 +323,9 @@ __global__ void __launch_bounds__(96) auto* const db = &db_state; [[maybe_unused]] null_count_back_copier _{s, t}; - if (!setupLocalPageInfo(s, - &pages[page_idx], - chunks, - min_row, - num_rows, - mask_filter{KERNEL_MASK_DELTA_BINARY}, - true)) { + auto const mask = decode_kernel_mask::DELTA_BINARY; + if (!setupLocalPageInfo( + s, &pages[page_idx], chunks, min_row, num_rows, mask_filter{mask}, true)) { return; } @@ -78,6 +345,10 @@ __global__ void __launch_bounds__(96) __syncthreads(); auto const batch_size = db->values_per_mb; + if (batch_size > max_delta_mini_block_size) { + set_error(static_cast(decode_error::DELTA_PARAMS_UNSUPPORTED), error_code); + return; + } // if skipped_leaf_values is non-zero, then we need to decode up to the first mini-block // that has a value we need. @@ -93,6 +364,7 @@ __global__ void __launch_bounds__(96) } else { // warp2 target_pos = min(s->nz_count, src_pos + batch_size); } + // TODO(ets): see if this sync can be removed __syncthreads(); // warp0 will decode the rep/def levels, warp1 will unpack a mini-batch of deltas. @@ -125,23 +397,12 @@ __global__ void __launch_bounds__(96) // place value for this thread if (dst_pos >= 0 && sp < target_pos) { void* const dst = nesting_info_base[leaf_level_index].data_out + dst_pos * s->dtype_len; + auto const val = db->value_at(sp + skipped_leaf_values); switch (s->dtype_len) { - case 1: - *static_cast(dst) = - db->value[rolling_index(sp + skipped_leaf_values)]; - break; - case 2: - *static_cast(dst) = - db->value[rolling_index(sp + skipped_leaf_values)]; - break; - case 4: - *static_cast(dst) = - db->value[rolling_index(sp + skipped_leaf_values)]; - break; - case 8: - *static_cast(dst) = - db->value[rolling_index(sp + skipped_leaf_values)]; - break; + case 1: *static_cast(dst) = val; break; + case 2: *static_cast(dst) = val; break; + case 4: *static_cast(dst) = val; break; + case 8: *static_cast(dst) = val; break; } } } @@ -154,6 +415,164 @@ __global__ void __launch_bounds__(96) if (t == 0 and s->error != 0) { set_error(s->error, error_code); } } +// Decode page data that is DELTA_BYTE_ARRAY packed. This encoding consists of a DELTA_BINARY_PACKED +// array of prefix lengths, followed by a DELTA_BINARY_PACKED array of suffix lengths, followed by +// the suffixes (technically the suffixes are DELTA_LENGTH_BYTE_ARRAY encoded). The latter two can +// be used to create an offsets array for the suffix data, but then this needs to be combined with +// the prefix lengths to do the final decode for each value. 
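+// For example, the strings ["apple", "applesauce", "applepie"] encode as
+// prefix lengths [0, 5, 5] with suffixes ["apple", "sauce", "pie"]: each value
+// is the stated number of leading characters of its predecessor plus its own
+// suffix, which is why the decode is inherently sequential across values.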
Because the lengths of the prefixes and +// suffixes are not encoded in the header, we're going to have to first do a quick pass through them +// to find the start/end of each structure. +template +__global__ void __launch_bounds__(decode_block_size) + gpuDecodeDeltaByteArray(PageInfo* pages, + device_span chunks, + size_t min_row, + size_t num_rows, + int32_t* error_code) +{ + using cudf::detail::warp_size; + __shared__ __align__(16) delta_byte_array_decoder db_state; + __shared__ __align__(16) page_state_s state_g; + __shared__ __align__(16) page_state_buffers_s state_buffers; + + page_state_s* const s = &state_g; + auto* const sb = &state_buffers; + int const page_idx = blockIdx.x; + int const t = threadIdx.x; + int const lane_id = t % warp_size; + auto* const prefix_db = &db_state.prefixes; + auto* const suffix_db = &db_state.suffixes; + auto* const dba = &db_state; + [[maybe_unused]] null_count_back_copier _{s, t}; + + auto const mask = decode_kernel_mask::DELTA_BYTE_ARRAY; + if (!setupLocalPageInfo( + s, &pages[page_idx], chunks, min_row, num_rows, mask_filter{mask}, true)) { + return; + } + + bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; + + // choose a character parallel string copy when the average string is longer than a warp + auto const use_char_ll = (s->page.str_bytes / s->page.num_valids) > cudf::detail::warp_size; + + // copying logic from gpuDecodePageData. + PageNestingDecodeInfo const* nesting_info_base = s->nesting_info; + + __shared__ level_t rep[delta_rolling_buf_size]; // circular buffer of repetition level values + __shared__ level_t def[delta_rolling_buf_size]; // circular buffer of definition level values + + // skipped_leaf_values will always be 0 for flat hierarchies. + uint32_t const skipped_leaf_values = s->page.skipped_leaf_values; + + if (t == 0) { + // initialize the prefixes and suffixes blocks + dba->init(s->data_start, s->data_end, s->page.start_val, s->page.temp_string_buf); + } + __syncthreads(); + + // assert that prefix and suffix have same mini-block size + if (prefix_db->values_per_mb != suffix_db->values_per_mb or + prefix_db->block_size != suffix_db->block_size or + prefix_db->value_count != suffix_db->value_count) { + set_error(static_cast(decode_error::DELTA_PARAM_MISMATCH), error_code); + return; + } + + // pointer to location to output final strings + int const leaf_level_index = s->col.max_nesting_depth - 1; + auto strings_data = nesting_info_base[leaf_level_index].string_out; + + auto const batch_size = prefix_db->values_per_mb; + if (batch_size > max_delta_mini_block_size) { + set_error(static_cast(decode_error::DELTA_PARAMS_UNSUPPORTED), error_code); + return; + } + + // if this is a bounds page and nested, then we need to skip up front. non-nested will work + // its way through the page. + int string_pos = has_repetition ? 
s->page.start_val : 0; + auto const is_bounds_pg = is_bounds_page(s, min_row, num_rows, has_repetition); + if (is_bounds_pg && string_pos > 0) { dba->skip(use_char_ll); } + + while (!s->error && (s->input_value_count < s->num_input_values || s->src_pos < s->nz_count)) { + uint32_t target_pos; + uint32_t const src_pos = s->src_pos; + + if (t < 3 * warp_size) { // warp 0..2 + target_pos = min(src_pos + 2 * batch_size, s->nz_count + s->first_row + batch_size); + } else { // warp 3 + target_pos = min(s->nz_count, src_pos + batch_size); + } + // TODO(ets): see if this sync can be removed + __syncthreads(); + + // warp0 will decode the rep/def levels, warp1 will unpack a mini-batch of prefixes, warp 2 will + // unpack a mini-batch of suffixes. warp3 waits one cycle for warps 0-2 to produce a batch, and + // then stuffs values into the proper location in the output. + if (t < warp_size) { + // decode repetition and definition levels. + // - update validity vectors + // - updates offsets (for nested columns) + // - produces non-NULL value indices in s->nz_idx for subsequent decoding + gpuDecodeLevels(s, sb, target_pos, rep, def, t); + + } else if (t < 2 * warp_size) { + // warp 1 + prefix_db->decode_batch(); + + } else if (t < 3 * warp_size) { + // warp 2 + suffix_db->decode_batch(); + + } else if (src_pos < target_pos) { + // warp 3 + + int const nproc = min(batch_size, s->page.end_val - string_pos); + strings_data += use_char_ll + ? dba->calculate_string_values_cp(strings_data, string_pos, nproc, lane_id) + : dba->calculate_string_values(strings_data, string_pos, nproc, lane_id); + string_pos += nproc; + + // process the mini-block in batches of 32 + for (uint32_t sp = src_pos + lane_id; sp < src_pos + batch_size; sp += 32) { + // the position in the output column/buffer + int dst_pos = sb->nz_idx[rolling_index(sp)]; + + // handle skip_rows here. flat hierarchies can just skip up to first_row. + if (!has_repetition) { dst_pos -= s->first_row; } + + if (dst_pos >= 0 && sp < target_pos) { + auto const offptr = + reinterpret_cast(nesting_info_base[leaf_level_index].data_out) + dst_pos; + auto const src_idx = sp + skipped_leaf_values; + *offptr = prefix_db->value_at(src_idx) + suffix_db->value_at(src_idx); + } + __syncwarp(); + } + + if (lane_id == 0) { s->src_pos = src_pos + batch_size; } + } + + __syncthreads(); + } + + // now turn array of lengths into offsets + int value_count = nesting_info_base[leaf_level_index].value_count; + + // if no repetition we haven't calculated start/end bounds and instead just skipped + // values until we reach first_row. account for that here. 
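+ // (block_excl_sum below converts the per-row lengths stored at offptr into
+ // offsets in place: e.g. lengths [3, 5, 2] with str_offset 10 become
+ // offsets [10, 13, 18])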
+ if (!has_repetition) { value_count -= s->first_row; } + + auto const offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out); + block_excl_sum(offptr, value_count, s->page.str_offset); + + if (t == 0 and s->error != 0) { + cuda::atomic_ref ref{*error_code}; + ref.fetch_or(s->error, cuda::std::memory_order_relaxed); + } +} + } // anonymous namespace /** @@ -181,4 +600,29 @@ void __host__ DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages } } +/** + * @copydoc cudf::io::parquet::gpu::DecodeDeltaByteArray + */ +void __host__ DecodeDeltaByteArray(cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector const& chunks, + size_t num_rows, + size_t min_row, + int level_type_size, + int32_t* error_code, + rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); + + dim3 const dim_block(decode_block_size, 1); + dim3 const dim_grid(pages.size(), 1); // 1 threadblock per page + + if (level_type_size == 1) { + gpuDecodeDeltaByteArray<<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } else { + gpuDecodeDeltaByteArray<<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } +} + } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 22add2fffc6..595dd40cdc2 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -146,18 +146,21 @@ __device__ void skip_struct_field(byte_stream_s* bs, int field_type) * @param chunk Column chunk the page belongs to * @return `kernel_mask_bits` value for the given page */ -__device__ uint32_t kernel_mask_for_page(PageInfo const& page, ColumnChunkDesc const& chunk) +__device__ decode_kernel_mask kernel_mask_for_page(PageInfo const& page, + ColumnChunkDesc const& chunk) { - if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { return 0; } + if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { return decode_kernel_mask::NONE; } if (page.encoding == Encoding::DELTA_BINARY_PACKED) { - return KERNEL_MASK_DELTA_BINARY; + return decode_kernel_mask::DELTA_BINARY; + } else if (page.encoding == Encoding::DELTA_BYTE_ARRAY) { + return decode_kernel_mask::DELTA_BYTE_ARRAY; } else if (is_string_col(chunk)) { - return KERNEL_MASK_STRING; + return decode_kernel_mask::STRING; } // non-string, non-delta - return KERNEL_MASK_GENERAL; + return decode_kernel_mask::GENERAL; } /** @@ -380,7 +383,9 @@ __global__ void __launch_bounds__(128) bs->page.skipped_values = -1; bs->page.skipped_leaf_values = 0; bs->page.str_bytes = 0; - bs->page.kernel_mask = 0; + bs->page.temp_string_size = 0; + bs->page.temp_string_buf = nullptr; + bs->page.kernel_mask = decode_kernel_mask::NONE; } num_values = bs->ck.num_values; page_info = bs->ck.page_info; diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 4c7d8e3c20a..e29db042401 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -14,20 +14,28 @@ * limitations under the License. 
*/ +#include "delta_binary.cuh" #include "page_decode.cuh" #include "page_string_utils.cuh" #include +#include #include +#include +#include + +#include + namespace cudf::io::parquet::detail { namespace { -constexpr int preprocess_block_size = 512; -constexpr int decode_block_size = 128; -constexpr int rolling_buf_size = decode_block_size * 2; -constexpr int preproc_buf_size = LEVEL_DECODE_BUF_SIZE; +constexpr int preprocess_block_size = 512; +constexpr int decode_block_size = 128; +constexpr int delta_preproc_block_size = 64; +constexpr int rolling_buf_size = decode_block_size * 2; +constexpr int preproc_buf_size = LEVEL_DECODE_BUF_SIZE; /** * @brief Compute the start and end page value bounds for this page @@ -450,12 +458,107 @@ __device__ size_t totalPlainEntriesSize(uint8_t const* data, } /** - * @brief Kernel for computing string page output size information. + * @brief Compute string size information for DELTA_BYTE_ARRAY encoded strings. + * + * This traverses the packed prefix and suffix lengths, summing them to obtain the total + * number of bytes needed for the decoded string data. It also calculates an upper bound + * for the largest string length to obtain an upper bound on temporary space needed if + * rows will be skipped. + * + * Called with 64 threads. + * + * @param data Pointer to the start of the page data stream + * @param end Pointer to the end of the page data stream + * @param start_value Do not count values that occur before this index + * @param end_value Do not count values that occur after this index + * @return A pair of `size_t` values representing the total string size and temp buffer size + * required for decoding + */ +__device__ thrust::pair totalDeltaByteArraySize(uint8_t const* data, + uint8_t const* end, + int start_value, + int end_value) +{ + using cudf::detail::warp_size; + using WarpReduce = cub::WarpReduce; + __shared__ typename WarpReduce::TempStorage temp_storage[2]; + + __shared__ __align__(16) delta_binary_decoder prefixes; + __shared__ __align__(16) delta_binary_decoder suffixes; + + int const t = threadIdx.x; + int const lane_id = t % warp_size; + int const warp_id = t / warp_size; + + if (t == 0) { + auto const* suffix_start = prefixes.find_end_of_block(data, end); + suffixes.init_binary_block(suffix_start, end); + } + __syncthreads(); + + // two warps will traverse the prefixes and suffixes and sum them up + auto const db = t < warp_size ? &prefixes : t < 2 * warp_size ? &suffixes : nullptr; + + size_t total_bytes = 0; + uleb128_t max_len = 0; + + if (db != nullptr) { + // initialize with first value (which is stored in last_value) + if (lane_id == 0 && start_value == 0) { total_bytes = db->last_value; } + + uleb128_t lane_sum = 0; + uleb128_t lane_max = 0; + while (db->current_value_idx < end_value && + db->current_value_idx < db->num_encoded_values(true)) { + // calculate values for current mini-block + db->calc_mini_block_values(lane_id); + + // get per lane sum for mini-block + for (uint32_t i = 0; i < db->values_per_mb; i += 32) { + uint32_t const idx = db->current_value_idx + i + lane_id; + if (idx >= start_value && idx < end_value && idx < db->value_count) { + lane_sum += db->value[rolling_index(idx)]; + lane_max = max(lane_max, db->value[rolling_index(idx)]); + } + } + + if (lane_id == 0) { db->setup_next_mini_block(true); } + __syncwarp(); + } + + // get sum for warp. + // note: warp_sum will only be valid on lane 0. 
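+ // (the same holds for warp_max from the Reduce below, hence the
+ // lane_id == 0 guard when accumulating into total_bytes and max_len)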
+ auto const warp_sum = WarpReduce(temp_storage[warp_id]).Sum(lane_sum); + auto const warp_max = WarpReduce(temp_storage[warp_id]).Reduce(lane_max, cub::Max()); + + if (lane_id == 0) { + total_bytes += warp_sum; + max_len = warp_max; + } + } + __syncthreads(); + + // now sum up total_bytes from the two warps + auto const final_bytes = + cudf::detail::single_lane_block_sum_reduce(total_bytes); + + // Sum up prefix and suffix max lengths to get a max possible string length. Multiply that + // by the number of strings in a mini-block, plus one to save the last string. + auto const temp_bytes = + cudf::detail::single_lane_block_sum_reduce(max_len) * + (db->values_per_mb + 1); + + return {final_bytes, temp_bytes}; +} + +/** + * @brief Kernel for computing string page bounds information. * - * String columns need accurate data size information to preallocate memory in the column buffer to - * store the char data. This calls a kernel to calculate information needed by the string decoding - * kernel. On exit, the `str_bytes`, `num_nulls`, and `num_valids` fields of the PageInfo struct - * are updated. This call ignores non-string columns. + * This kernel traverses the repetition and definition level data to determine start and end values + * for pages with string-like data. Also calculates the number of null and valid values in the + * page. Does nothing if the page mask is neither `STRING` nor `DELTA_BYTE_ARRAY`. On exit the + * `num_nulls`, `num_valids`, `start_val` and `end_val` fields of the `PageInfo` struct will be + * populated. * * @param pages All pages to be decoded * @param chunks All chunks to be decoded @@ -464,7 +567,7 @@ __device__ size_t totalPlainEntriesSize(uint8_t const* data, * @tparam level_t Type used to store decoded repetition and definition levels */ template -__global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSizes( +__global__ void __launch_bounds__(preprocess_block_size) gpuComputeStringPageBounds( PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) { __shared__ __align__(16) page_state_s state_g; @@ -474,8 +577,13 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz int const t = threadIdx.x; PageInfo* const pp = &pages[page_idx]; - // reset str_bytes to 0 in case it's already been calculated - if (t == 0) { pp->str_bytes = 0; } + if (t == 0) { + s->page.num_nulls = 0; + s->page.num_valids = 0; + // reset str_bytes to 0 in case it's already been calculated (esp needed for chunked reads). 
+ // TODO: need to rethink this once str_bytes is in the statistics + pp->str_bytes = 0; + } // whether or not we have repetition levels (lists) bool const has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0; @@ -491,23 +599,11 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz {rep_runs}}; // setup page info - if (!setupLocalPageInfo( - s, pp, chunks, min_row, num_rows, mask_filter{KERNEL_MASK_STRING}, false)) { - return; - } - - if (!t) { - s->page.num_nulls = 0; - s->page.num_valids = 0; - s->page.str_bytes = 0; - } - __syncthreads(); + auto const mask = BitOr(decode_kernel_mask::STRING, decode_kernel_mask::DELTA_BYTE_ARRAY); + if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, mask_filter{mask}, true)) { return; } bool const is_bounds_pg = is_bounds_page(s, min_row, num_rows, has_repetition); - // if we're skipping this page anyway, no need to count it - if (!is_bounds_pg && !is_page_contained(s, min_row, num_rows)) { return; } - // find start/end value indices auto const [start_value, end_value] = page_bounds(s, min_row, num_rows, is_bounds_pg, has_repetition, decoders); @@ -516,7 +612,106 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz if (t == 0) { pp->num_nulls = s->page.num_nulls; pp->num_valids = s->page.num_valids; + pp->start_val = start_value; + pp->end_val = end_value; } +} + +/** + * @brief Kernel for computing string page output size information for delta_byte_array encoding. + * + * This call ignores columns that are not DELTA_BYTE_ARRAY encoded. On exit the `str_bytes` field + * of the `PageInfo` struct will be populated. Also fills in the `temp_string_size` field if rows + * are to be skipped. + * + * @param pages All pages to be decoded + * @param chunks All chunks to be decoded + * @param min_rows crop all rows below min_row + * @param num_rows Maximum number of rows to read + */ +__global__ void __launch_bounds__(delta_preproc_block_size) gpuComputeDeltaPageStringSizes( + PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) +{ + __shared__ __align__(16) page_state_s state_g; + + page_state_s* const s = &state_g; + int const page_idx = blockIdx.x; + int const t = threadIdx.x; + PageInfo* const pp = &pages[page_idx]; + + // whether or not we have repetition levels (lists) + bool const has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0; + + // setup page info + auto const mask = decode_kernel_mask::DELTA_BYTE_ARRAY; + if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, mask_filter{mask}, true)) { return; } + + auto const start_value = pp->start_val; + + // if data size is known, can short circuit here + if ((chunks[pp->chunk_idx].data_type & 7) == FIXED_LEN_BYTE_ARRAY) { + if (t == 0) { + pp->str_bytes = pp->num_valids * s->dtype_len_in; + + // only need temp space if we're skipping values + if (start_value > 0) { + // just need to parse the header of the first delta binary block to get values_per_mb + delta_binary_decoder db; + db.init_binary_block(s->data_start, s->data_end); + // save enough for one mini-block plus some extra to save the last_string + pp->temp_string_size = s->dtype_len_in * (db.values_per_mb + 1); + } + } + } else { + // now process string info in the range [start_value, end_value) + // set up for decoding strings...can be either plain or dictionary + uint8_t const* data = s->data_start; + uint8_t const* const end = s->data_end; + auto const end_value = pp->end_val; + + auto const [len, temp_bytes] = 
totalDeltaByteArraySize(data, end, start_value, end_value); + + if (t == 0) { + // TODO check for overflow + pp->str_bytes = len; + + // only need temp space if we're skipping values + if (start_value > 0) { pp->temp_string_size = temp_bytes; } + } + } +} + +/** + * @brief Kernel for computing string page output size information. + * + * This call ignores non-string columns. On exit the `str_bytes` field of the `PageInfo` struct will + * be populated. + * + * @param pages All pages to be decoded + * @param chunks All chunks to be decoded + * @param min_rows crop all rows below min_row + * @param num_rows Maximum number of rows to read + */ +__global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSizes( + PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) +{ + __shared__ __align__(16) page_state_s state_g; + + page_state_s* const s = &state_g; + int const page_idx = blockIdx.x; + int const t = threadIdx.x; + PageInfo* const pp = &pages[page_idx]; + + // whether or not we have repetition levels (lists) + bool const has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0; + + // setup page info + if (!setupLocalPageInfo( + s, pp, chunks, min_row, num_rows, mask_filter{decode_kernel_mask::STRING}, true)) { + return; + } + + bool const is_bounds_pg = is_bounds_page(s, min_row, num_rows, has_repetition); auto const& col = s->col; size_t str_bytes = 0; @@ -530,6 +725,8 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz uint8_t const* const end = s->data_end; uint8_t const* dict_base = nullptr; int dict_size = 0; + auto const start_value = pp->start_val; + auto const end_value = pp->end_val; switch (pp->encoding) { case Encoding::PLAIN_DICTIONARY: @@ -561,6 +758,9 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz if (t == 0) { // TODO check for overflow pp->str_bytes = str_bytes; + + // only need temp space for delta + pp->temp_string_size = 0; } } @@ -586,6 +786,7 @@ __global__ void __launch_bounds__(decode_block_size) size_t num_rows, int32_t* error_code) { + using cudf::detail::warp_size; __shared__ __align__(16) page_state_s state_g; __shared__ __align__(4) size_type last_offset; __shared__ __align__(16) @@ -596,10 +797,12 @@ __global__ void __launch_bounds__(decode_block_size) auto* const sb = &state_buffers; int const page_idx = blockIdx.x; int const t = threadIdx.x; + int const lane_id = t % warp_size; [[maybe_unused]] null_count_back_copier _{s, t}; + auto const mask = decode_kernel_mask::STRING; if (!setupLocalPageInfo( - s, &pages[page_idx], chunks, min_row, num_rows, mask_filter{KERNEL_MASK_STRING}, true)) { + s, &pages[page_idx], chunks, min_row, num_rows, mask_filter{mask}, true)) { return; } @@ -630,6 +833,7 @@ __global__ void __launch_bounds__(decode_block_size) target_pos = min(s->nz_count, src_pos + decode_block_size - out_thread0); if (out_thread0 > 32) { target_pos = min(target_pos, s->dict_pos); } } + // TODO(ets): see if this sync can be removed __syncthreads(); if (t < 32) { // decode repetition and definition levels. 
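The temp-buffer sizing computed above by totalDeltaByteArraySize and gpuComputeDeltaPageStringSizes reduces to a simple rule. A minimal standalone sketch of that rule (a hypothetical helper, not part of this patch), assuming the two per-stream maxima have already been reduced:

#include <cstddef>

// Scratch needed when rows are skipped: one mini-block of worst-case strings,
// plus one extra slot so last_string survives across batches.
constexpr std::size_t delta_temp_size(std::size_t max_prefix_len,
                                      std::size_t max_suffix_len,
                                      std::size_t values_per_mb)
{
  return (max_prefix_len + max_suffix_len) * (values_per_mb + 1);
}
// e.g. maxima of 40 and 24 bytes with values_per_mb == 128 give 64 * 129 = 8256 bytes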
@@ -643,9 +847,9 @@ __global__ void __launch_bounds__(decode_block_size) // WARP1: Decode dictionary indices, booleans or string positions if (s->dict_base) { - src_target_pos = gpuDecodeDictionaryIndices(s, sb, src_target_pos, t & 0x1f).first; + src_target_pos = gpuDecodeDictionaryIndices(s, sb, src_target_pos, lane_id).first; } else { - gpuInitStringDescriptors(s, sb, src_target_pos, t & 0x1f); + gpuInitStringDescriptors(s, sb, src_target_pos, lane_id); } if (t == 32) { *(volatile int32_t*)&s->dict_pos = src_target_pos; } } else { @@ -748,6 +952,19 @@ __global__ void __launch_bounds__(decode_block_size) if (t == 0 and s->error != 0) { set_error(s->error, error_code); } } +// Functor used to set the `temp_string_buf` pointer for each page. `data` points to a buffer +// to be used when skipping rows in the delta_byte_array decoder. Given a page and an offset, +// set the page's `temp_string_buf` to be `data + offset`. +struct page_tform_functor { + uint8_t* const data; + + __device__ PageInfo operator()(PageInfo& page, int64_t offset) + { + if (page.temp_string_size != 0) { page.temp_string_buf = data + offset; } + return page; + } +}; + } // anonymous namespace /** @@ -755,20 +972,81 @@ __global__ void __launch_bounds__(decode_block_size) */ void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, + rmm::device_uvector& temp_string_buf, size_t min_row, size_t num_rows, int level_type_size, + uint32_t kernel_mask, rmm::cuda_stream_view stream) { - dim3 dim_block(preprocess_block_size, 1); - dim3 dim_grid(pages.size(), 1); // 1 threadblock per page + dim3 const dim_block(preprocess_block_size, 1); + dim3 const dim_grid(pages.size(), 1); // 1 threadblock per page if (level_type_size == 1) { - gpuComputePageStringSizes + gpuComputeStringPageBounds <<>>(pages.device_ptr(), chunks, min_row, num_rows); } else { - gpuComputePageStringSizes + gpuComputeStringPageBounds <<>>(pages.device_ptr(), chunks, min_row, num_rows); } + + // kernel mask may contain other kernels we don't need to count + int const count_mask = + kernel_mask & BitOr(decode_kernel_mask::DELTA_BYTE_ARRAY, decode_kernel_mask::STRING); + int const nkernels = std::bitset<32>(count_mask).count(); + auto const streams = cudf::detail::fork_streams(stream, nkernels); + + int s_idx = 0; + if (BitAnd(kernel_mask, decode_kernel_mask::DELTA_BYTE_ARRAY) != 0) { + dim3 dim_delta(delta_preproc_block_size, 1); + gpuComputeDeltaPageStringSizes<<>>( + pages.device_ptr(), chunks, min_row, num_rows); + } + if (BitAnd(kernel_mask, decode_kernel_mask::STRING) != 0) { + gpuComputePageStringSizes<<>>( + pages.device_ptr(), chunks, min_row, num_rows); + } + + // synchronize the streams + cudf::detail::join_streams(streams, stream); + + // check for needed temp space for DELTA_BYTE_ARRAY + auto const need_sizes = thrust::any_of( + rmm::exec_policy(stream), pages.d_begin(), pages.d_end(), [] __device__(auto& page) { + return page.temp_string_size != 0; + }); + + if (need_sizes) { + // sum up all of the temp_string_sizes + auto const page_sizes = [] __device__(PageInfo const& page) { return page.temp_string_size; }; + auto const total_size = thrust::transform_reduce(rmm::exec_policy(stream), + pages.d_begin(), + pages.d_end(), + page_sizes, + 0L, + thrust::plus{}); + + // now do an exclusive scan over the temp_string_sizes to get offsets for each + // page's chunk of the temp buffer + rmm::device_uvector page_string_offsets(pages.size(), stream); + 
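+      // e.g. per-page temp sizes [2064, 0, 1024] scan to offsets
+      // [0, 2064, 2064]; a page with temp_string_size == 0 keeps a null
+      // temp_string_buf and never dereferences it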
thrust::transform_exclusive_scan(rmm::exec_policy_nosync(stream), + pages.d_begin(), + pages.d_end(), + page_string_offsets.begin(), + page_sizes, + 0L, + thrust::plus{}); + + // allocate the temp space + temp_string_buf.resize(total_size, stream); + + // now use the offsets array to set each page's temp_string_buf pointers + thrust::transform(rmm::exec_policy_nosync(stream), + pages.d_begin(), + pages.d_end(), + page_string_offsets.begin(), + pages.d_begin(), + page_tform_functor{temp_string_buf.data()}); + } } /** diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 68851e72663..129d4e4d28c 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -35,6 +35,7 @@ #include +#include #include namespace cudf::io::parquet::detail { @@ -64,7 +65,8 @@ constexpr bool is_supported_encoding(Encoding enc) case Encoding::PLAIN_DICTIONARY: case Encoding::RLE: case Encoding::RLE_DICTIONARY: - case Encoding::DELTA_BINARY_PACKED: return true; + case Encoding::DELTA_BINARY_PACKED: + case Encoding::DELTA_BYTE_ARRAY: return true; default: return false; } } @@ -86,13 +88,15 @@ constexpr void set_error(int32_t error, int32_t* error_code) * These values are used as bitmasks, so they must be powers of 2. */ enum class decode_error : int32_t { - DATA_STREAM_OVERRUN = 0x1, - LEVEL_STREAM_OVERRUN = 0x2, - UNSUPPORTED_ENCODING = 0x4, - INVALID_LEVEL_RUN = 0x8, - INVALID_DATA_TYPE = 0x10, - EMPTY_PAGE = 0x20, - INVALID_DICT_WIDTH = 0x40, + DATA_STREAM_OVERRUN = 0x1, + LEVEL_STREAM_OVERRUN = 0x2, + UNSUPPORTED_ENCODING = 0x4, + INVALID_LEVEL_RUN = 0x8, + INVALID_DATA_TYPE = 0x10, + EMPTY_PAGE = 0x20, + INVALID_DICT_WIDTH = 0x40, + DELTA_PARAM_MISMATCH = 0x80, + DELTA_PARAMS_UNSUPPORTED = 0x100, }; /** @@ -145,6 +149,17 @@ constexpr uint32_t BitAnd(T1 a, T2 b) return static_cast(a) & static_cast(b); } +template ::value and std::is_same_v) or + (is_scoped_enum::value and std::is_same_v) or + (is_scoped_enum::value and std::is_same_v)>* = + nullptr> +constexpr uint32_t BitOr(T1 a, T2 b) +{ + return static_cast(a) | static_cast(b); +} + /** * @brief Enums for the flags in the page header */ @@ -168,10 +183,12 @@ enum level_type { * * Used to control which decode kernels to run. 
*/ -enum kernel_mask_bits { - KERNEL_MASK_GENERAL = (1 << 0), // Run catch-all decode kernel - KERNEL_MASK_STRING = (1 << 1), // Run decode kernel for string data - KERNEL_MASK_DELTA_BINARY = (1 << 2) // Run decode kernel for DELTA_BINARY_PACKED data +enum class decode_kernel_mask { + NONE = 0, + GENERAL = (1 << 0), // Run catch-all decode kernel + STRING = (1 << 1), // Run decode kernel for string data + DELTA_BINARY = (1 << 2), // Run decode kernel for DELTA_BINARY_PACKED data + DELTA_BYTE_ARRAY = (1 << 3) // Run decode kernel for DELTA_BYTE_ARRAY encoded data }; /** @@ -252,9 +269,11 @@ struct PageInfo { int32_t num_input_values; int32_t chunk_row; // starting row of this page relative to the start of the chunk int32_t num_rows; // number of rows in this page - // the next two are calculated in gpuComputePageStringSizes + // the next four are calculated in gpuComputePageStringSizes int32_t num_nulls; // number of null values (V2 header), but recalculated for string cols int32_t num_valids; // number of non-null values, taking into account skip_rows/num_rows + int32_t start_val; // index of first value of the string data stream to use + int32_t end_val; // index of last value in string data stream int32_t chunk_idx; // column chunk this page belongs to int32_t src_col_schema; // schema index of this column uint8_t flags; // PAGEINFO_FLAGS_XXX @@ -291,7 +310,11 @@ struct PageInfo { // level decode buffers uint8_t* lvl_decode_buf[level_type::NUM_LEVEL_TYPES]; - uint32_t kernel_mask; + // temporary space for decoding DELTA_BYTE_ARRAY encoded strings + int64_t temp_string_size; + uint8_t* temp_string_buf; + + decode_kernel_mask kernel_mask; }; /** @@ -597,16 +620,20 @@ void ComputePageSizes(cudf::detail::hostdevice_vector& pages, * * @param[in,out] pages All pages to be decoded * @param[in] chunks All chunks to be decoded + * @param[out] temp_string_buf Temporary space needed for decoding DELTA_BYTE_ARRAY strings * @param[in] min_rows crop all rows below min_row * @param[in] num_rows Maximum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding + * @param[in] kernel_mask Mask of kernels to run * @param[in] stream CUDA stream to use */ void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, + rmm::device_uvector& temp_string_buf, size_t min_row, size_t num_rows, int level_type_size, + uint32_t kernel_mask, rmm::cuda_stream_view stream); /** @@ -665,7 +692,7 @@ void DecodeStringPageData(cudf::detail::hostdevice_vector& pages, * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding * @param[out] error_code Error code for kernel failures - * @param[in] stream CUDA stream to use, default 0 + * @param[in] stream CUDA stream to use */ void DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -675,6 +702,28 @@ void DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages, int32_t* error_code, rmm::cuda_stream_view stream); +/** + * @brief Launches kernel for reading the DELTA_BYTE_ARRAY column data stored in the pages + * + * The page data will be written to the output pointed to in the page's + * associated column chunk. 
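+ * Launches one thread block per page; pages whose kernel_mask is not
+ * DELTA_BYTE_ARRAY are filtered out inside the kernel by setupLocalPageInfo.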
+ * + * @param[in,out] pages All pages to be decoded + * @param[in] chunks All chunks to be decoded + * @param[in] num_rows Total number of rows to read + * @param[in] min_row Minimum number of rows to read + * @param[in] level_type_size Size in bytes of the type for level decoding + * @param[out] error_code Error code for kernel failures + * @param[in] stream CUDA stream to use + */ +void DecodeDeltaByteArray(cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector const& chunks, + size_t num_rows, + size_t min_row, + int level_type_size, + int32_t* error_code, + rmm::cuda_stream_view stream); + /** * @brief Launches kernel for initializing encoder row group fragments * diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 11c20d0e540..6e799424d01 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -30,10 +29,15 @@ namespace cudf::io::parquet::detail { void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) { - auto& chunks = _pass_itm_data->chunks; - auto& pages = _pass_itm_data->pages_info; - auto& page_nesting = _pass_itm_data->page_nesting_info; - auto& page_nesting_decode = _pass_itm_data->page_nesting_decode_info; + auto& chunks = _pass_itm_data->chunks; + auto& pages = _pass_itm_data->pages_info; + auto& page_nesting = _pass_itm_data->page_nesting_info; + auto& page_nesting_decode = _pass_itm_data->page_nesting_decode_info; + auto const level_type_size = _pass_itm_data->level_type_size; + + // temporary space for DELTA_BYTE_ARRAY decoding. this only needs to live until + // gpu::DecodeDeltaByteArray returns. + rmm::device_uvector delta_temp_buf(0, _stream); // Should not reach here if there is no page data. CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); @@ -52,11 +56,12 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) // doing a gather operation later on. // TODO: This step is somewhat redundant if size info has already been calculated (nested schema, // chunked reader). 
- auto const has_strings = (kernel_mask & KERNEL_MASK_STRING) != 0; + auto const has_strings = + (kernel_mask & BitOr(decode_kernel_mask::STRING, decode_kernel_mask::DELTA_BYTE_ARRAY)) != 0; std::vector col_sizes(_input_columns.size(), 0L); if (has_strings) { ComputePageStringSizes( - pages, chunks, skip_rows, num_rows, _pass_itm_data->level_type_size, _stream); + pages, chunks, delta_temp_buf, skip_rows, num_rows, level_type_size, kernel_mask, _stream); col_sizes = calculate_page_string_offsets(); @@ -163,6 +168,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) chunks.host_to_device_async(_stream); chunk_nested_valids.host_to_device_async(_stream); chunk_nested_data.host_to_device_async(_stream); + if (has_strings) { chunk_nested_str_data.host_to_device_async(_stream); } // create this before we fork streams kernel_error error_code(_stream); @@ -171,25 +177,27 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) int const nkernels = std::bitset<32>(kernel_mask).count(); auto streams = cudf::detail::fork_streams(_stream, nkernels); - auto const level_type_size = _pass_itm_data->level_type_size; - // launch string decoder int s_idx = 0; - if (has_strings) { - auto& stream = streams[s_idx++]; - chunk_nested_str_data.host_to_device_async(stream); + if (BitAnd(kernel_mask, decode_kernel_mask::STRING) != 0) { DecodeStringPageData( - pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), stream); + pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); + } + + // launch delta byte array decoder + if (BitAnd(kernel_mask, decode_kernel_mask::DELTA_BYTE_ARRAY) != 0) { + DecodeDeltaByteArray( + pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); } // launch delta binary decoder - if ((kernel_mask & KERNEL_MASK_DELTA_BINARY) != 0) { + if (BitAnd(kernel_mask, decode_kernel_mask::DELTA_BINARY) != 0) { DecodeDeltaBinary( pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); } // launch the catch-all page decoder - if ((kernel_mask & KERNEL_MASK_GENERAL) != 0) { + if (BitAnd(kernel_mask, decode_kernel_mask::GENERAL) != 0) { DecodePageData( pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); } diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 80a4d00a5a2..0bc492546e9 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -1416,7 +1416,7 @@ std::vector reader::impl::calculate_page_string_offsets() page_index.begin(), page_to_string_size{pages.device_ptr(), chunks.device_ptr()}); // do scan by key to calculate string offsets for each page - thrust::exclusive_scan_by_key(rmm::exec_policy(_stream), + thrust::exclusive_scan_by_key(rmm::exec_policy_nosync(_stream), page_keys.begin(), page_keys.end(), val_iter, @@ -1424,7 +1424,7 @@ std::vector reader::impl::calculate_page_string_offsets() // now sum up page sizes rmm::device_uvector reduce_keys(col_sizes.size(), _stream); - thrust::reduce_by_key(rmm::exec_policy(_stream), + thrust::reduce_by_key(rmm::exec_policy_nosync(_stream), page_keys.begin(), page_keys.end(), val_iter, diff --git a/python/cudf/cudf/tests/data/parquet/delta_byte_arr.parquet b/python/cudf/cudf/tests/data/parquet/delta_byte_arr.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7f6006a75bff0498e373d61f1265ff2e8458c917 GIT binary patch 
literal 5783 zcmcII2V4``(;E^(DAHut1UN+xLQyH9NIw=L9WDW-1Uw8R0VI)_glYp74;un177(#e z&J(bKD5wY)5JcrpKtWMdEC^Ttm9zYJ7lNGU-TlAc_kEjRcIVB!nKy4{cHYb~U_UdE z2-e7fXJkS_4hSHBWsm>@YT7}n@c6~>rDG6z0t5SuVE_Q&;0!8EJ_7DL2*80fb*P}w zw)j)`;!Xt*u6&`|i}u2Fc{z85&!0o*d-R73?#eZWXB;>bf2^`g3SGt^N3An!Yfq@j z)~58>9NjXK(j#lBbH6?KH8)2uj2EobJjd%?^YKSb*rQ=*aLx)j9!43TfIaZ$P1zo)Qrj5hs?!3b>-MXbz|ZaiA&^REejO?EkxMA~wp$ zgH@P~2$+qOP#akh4DkNZA~K#XGDd8HKhMD7Wyb6wAONj$+PaxG-(C6Rw}fN2Z4%2R zAC|At>mA)(xoyga(l?J7-ZDlcH3d3xb>*Iqx4V|K-9v!kgzoCkZ{ozhmSW$PgS~kJ zS@!bIu1=BL&i2Oqjs-XcfX9Ei zZiOfJ{t4x(X$y!3%YzyPnRh8d8L26hBErOMew>6YiH=>0S>ib!T8F3JAlUzTH^`Rj zWWhSZ*I^+S7|rKLvjt>%n9ogQFXeDWlfs~a1W-Yj*^CW)lug;IR6_T<@{^*PK#T00 z$LcdLEiNzMOD7D!8pb`;kZ+Sd8pVr<+)As*Xv- z_{8d+j6CHBaqC>%&rVrRC>iCIQd2wg6dE)u)>F)1rJRbUWnEenW4|c&J}&kxlU_zy zz0>&0ju$U>CSMRYdcDX{t6SkhKP$WWS^9U1(+;Ry5y-b~iJWC#&(D20>eg`P*vVs0 zu2j89pVb<2qI!hfF8*o2knB7suUXhW{fS=O!!paQbMK5aZ=T(&9^6$yl>u-En^V*$ z*`F6J5=+9CvIShs{`o{GUzS>lgyJ_%i}>M;EZBqj;m@Hb8yFQW;gX3kKb$L)pz-(i zOZE@MNy%U83zos1X~ zg;M8}pL~dP#2*&GtGPEW#s2s+qr9v6aq$+t>2#rj6N6zHHHTqYkXlqIcw-vx1ds}x z$PhVVvs0npGV20+g6?$1$Hy3CFKg{xGVCDImL!vXx{W5B`KmJ{Tbmy@osZsHp7rk2 zN8p63=*;)V3*;*5xV)vapB~tEwKDJ6b&u=T@9$4tkrWeGJ-}X-k@)`ntpLZjosPSs zPHwM=+S;#8JL}gfHh9bP{H3!%&*q1-+wA@rQ12b-&=S9o-gaK0`N#_6?9w+OT?xFC zR}{|a`;`h>uh$MD%QzDQ4!#XLczWOB*RGP1ma8KhH)!Di6Tm>~6vc=zK6|oHGKo-? zEcLQ7`c8apX2c;SIB*vZeKG#*5L_@B=lfM2=*IyZd zP#2E+R0CCZTm?kOjU@1nEUF`MQobS@Xasopxq0~Nc=&t6zI6XNItz{cjmdJbw=?YP z0VC_;PkjlZVFe)P4udB2%2xiMr6$~QUlEWey(Hsm6?f%U`V7pSnU(3#hI+g*Ms>U2yUlYy_n_!xO`o=^Q>y=Z0+H&J zt{T1A;D(M2ZY@a;G!0EGxUgo;lqXJ3swAgZwdr|c<&<9P{%SeNTN`EN%`OOvaL4-B?tLHh z+oFxjx5X*i;(u&-x<$})%3lASKfkazPqzBd6Yd%xi)^P$?;7mu-+A5C&yW7NqF!Sq zjeaIZOnJ2Hb%o#1FI5KT*sA9gHafoQfnF4M9Nrge{4B{PZ7{VgY1FVX+4-l8U2RPr zhE`Lkv}tm5gWHuB1*#inj#O~AX1M`v$y(pw;^68gGQ*k(cjn+jvUUw*kQ!P z0&V;zT~iJT`q;?mu|t3k1yHkR*jsS`01?#&f_18>Fd0l32|Md#40Og*zcG^e1?;6x z5pEosh1C+4$2_`(W*iYKAkUgO6O6IsDSl7%_| zYoV^Hom0#$9DXD~pIN^EoR7;@_FYfBI}&i*&co2|YX3f69HBe4&nmQC5cFHku_zA$ zfAOZY2c{~!--i@=5O4!PAE2y0bzfSAVcP7x?Y^6yXPI0ax+Cv0gAxG0t|{x6XOYi^ z7%?4QA2jyV)w5VzaKt7lzV4c672CV7Pt(`x)Wd`OOODMR?%Qyw*mOaKzBJ7JU(=U|FI#`ujp&d*>v%%30_$DN6{B{$ow|35sYN^)cyQaGf#N*1gwkZ| z3ynOL$l(T4rgitJ{?Pi$gQeM3_inS}tkN6k589iiw<^@z+1&a**A_gj?2}fUy(jeZ znWCwzJckz+v|{{r(lqZ3Sp`#0CSA#zwm^6kUb9wajf^HK1@aai-J$NTxaD#RPEpgI z0HoZAve2Zc$M*D(@+z=!V}?i9Yqv z-NEyZ)TWLH1LxiH_?LCdJSu7iJ!i^x{__0VaEt2ufg=Oj>s99G@y`jJ&HQDO0`_XX zI#Z~9e-a( zQX#TRb)OFzq(S!Gh;hOek7hgFT>D(G_eUZ0V7MhdF1jS4+L?b568j7{DCg>BpRP?$ zA(YSS(@ZlfEsr_umeYoRI2A5n5{qUr9qtK6Yv&Gp(we$xu*P!7QQhm#N3^Dztu#Q~ z|Bz{`Pag5|dcb+}_^#Zi>%m=bY#L7AOL~}+zPiz>KSz5(R(t2gIqyPPyKZjxa}d#g ztBLJvmz}%IDlk`k_Khd_eW8-u*%oWXdWwF3({M|kQs0na>*nUr>!TIBViHfhoa<{9 z*xXRaSa#Fp!obD)Z7L_MU6l0Gw1yUJJ(5bz*?BrJsO3DrDEZMX*{2p4tZi+c55#p6 zJA3j6$trfYzW=OGuUm0QvQo$JM(-=5X?r(T8SVuG4qs`vJ9W$0Xi?9NH|<*3UK1^nYrz@zBYn+_bwPT@FW9 zI$Uh*Rmkl3{nZ`%HLRv{aHvjrHN)TGb#M4pp(a;7xve~)INfe^lE~JXHI`p7D(=?+~!x7 zkTavtH?)XrIZp;4C~e2-NmCS(7Lh~~%@1R9I3liCjHSgr)1dY0)Dj(gtN*i%grvMZ zc<{zwS|B-^g(c0(!kUZ$<}@=Tc}W|degbgZo2r*-_^djnvdD9lVtyXku{1H{*NnZNv*TCf?6wVgRm1Avb$cke zTX@f!Wi;PlRzO$%iBpjunRYiKROM(K^%v(XMh3Y zV2KX6UVR+SumDRiksyI`#9fIk62&Hji6f$rw**v2T&^ecRj;cDcxxltV3a?#eqRy3 z^dyrBlk4Ki_-_}ZlV^Ng&`hRD3y;aTuix~^v~NW4I2qzlv?+Pib9ilOwEwR=eJI}n zgrK9=;7V$VvPS^+CHcml~ZK{Is(*B zQzzt1_6R2CpE?!j5h`QE2DAXE7oiCm-}W9R?HjEG^Z^xe*cO}xP*bK)2>IH<82kUE zVuiLlB66E;!36;I>5K`v(%``4O{z+vBaVn)3hYod%4ieQzm6M>?f>C~fa*|f>+HY- zIDlF?Z({v##3rV~KW10x4l3-CJ!lI+n}0yFlr~uj_VfEO(01%yCU?l1pjTp~3-aZS 
zIsP5a3x*S5mq^$XMIlcZb{?lNkueN=pd$n&eIlb6^v6{9LAj92*_kPo2g5>?o?wllmjuxQB*B4vaS+WlhUMlgucv$()E!|CeKFb*j9thY@*pgo~CS9nl$A1~NE0JVbvvBy%B&S9Yb`}_T~PDc}d`?Cc6q%XYa%5;qW_x_{9lldcPu3}D* zUlhlUA0LkT6b%ArCK@e2!m#yth()4fL{)5p)Sk}HekgM6LIMk!g6-EA#US=s!U^&< z4X2s$BltE#7B> + lists_per_row = 3 + list_size = 4 + num_rows = nrows + include_validity = add_nulls + + def list_gen_wrapped(x, y): + return list_row_gen( + int_gen, x * list_size * lists_per_row, list_size, lists_per_row + ) + + def string_list_gen_wrapped(x, y): + return list_row_gen( + string_gen, + x * list_size * lists_per_row, + list_size, + lists_per_row, + include_validity, + ) + + data = struct_gen( + [int_gen, string_gen, list_gen_wrapped, string_list_gen_wrapped], + 0, + num_rows, + include_validity, + ) + test_pdf = pa.Table.from_pydict({"sol": data}).to_pandas() + pdf_fname = tmpdir.join("pdfdeltaba.parquet") + test_pdf.to_parquet( + pdf_fname, + version="2.6", + column_encoding={ + "sol.col0": "DELTA_BINARY_PACKED", + "sol.col1": str_encoding, + "sol.col2.list.element.list.element": "DELTA_BINARY_PACKED", + "sol.col3.list.element.list.element": str_encoding, + }, + data_page_version="2.0", + data_page_size=64 * 1024, + engine="pyarrow", + use_dictionary=False, + ) + # sanity check to verify file is written properly + assert_eq(test_pdf, pd.read_parquet(pdf_fname)) + cdf = cudf.read_parquet(pdf_fname) + assert_eq(cdf, cudf.from_pandas(test_pdf)) + + @pytest.mark.parametrize( "data", [ From d2069f45cc518c78ca901ea65370eff5c474cfd1 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 17 Nov 2023 16:11:22 +0000 Subject: [PATCH 23/36] Match pandas join ordering obligations in pandas-compatible mode (#14428) If we pass sort=True to merges we are on the hook to sort the result in order with respect to the key columns. If those key columns have repeated values there is still some space for ambiguity. Currently we get a result back whose order (for the repeated key values) is determined by the gather map that libcudf returns for the join. This does not come with any ordering guarantees. When sort=False, pandas has join-type dependent ordering guarantees which we also do not match. To fix this, in pandas-compatible mode only, reorder the gather maps according to the order of the input keys. When sort=False this means that our result matches pandas ordering. When sort=True, it ensures that (if we use a stable sort) the tie-break for equal sort keys is the input dataframe order. While we're here, switch from argsort + gather to sort_by_key when sorting results. 
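To make the reordering concrete, here is a minimal host-side sketch of the idea (illustrative only; the actual implementation permutes device gather maps with libcudf's sort_by_key): tag each joined row with its source positions and stable-sort on those positions, so that any remaining ties keep their original order.

#include <algorithm>
#include <cstddef>
#include <numeric>
#include <tuple>
#include <vector>

// Reorder the left/right gather maps returned by a join so that joined rows
// appear in input-table order (left position first, right position second).
void reorder_gather_maps(std::vector<int>& left_map, std::vector<int>& right_map)
{
  std::vector<std::size_t> perm(left_map.size());
  std::iota(perm.begin(), perm.end(), 0);
  std::stable_sort(perm.begin(), perm.end(), [&](std::size_t a, std::size_t b) {
    return std::tie(left_map[a], right_map[a]) < std::tie(left_map[b], right_map[b]);
  });
  std::vector<int> l(left_map.size()), r(right_map.size());
  for (std::size_t i = 0; i < perm.size(); ++i) {
    l[i] = left_map[perm[i]];
    r[i] = right_map[perm[i]];
  }
  left_map.swap(l);
  right_map.swap(r);
}

For a right join the primary sort key flips to the right map, mirroring the key_order reversal in the Python change below.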
- Closes #14001 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Ashwin Srinath (https://github.com/shwina) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14428 --- python/cudf/cudf/core/_compat.py | 1 + python/cudf/cudf/core/join/join.py | 125 +++++++++-- python/cudf/cudf/tests/test_join_order.py | 261 ++++++++++++++++++++++ 3 files changed, 373 insertions(+), 14 deletions(-) create mode 100644 python/cudf/cudf/tests/test_join_order.py diff --git a/python/cudf/cudf/core/_compat.py b/python/cudf/cudf/core/_compat.py index 888b94e070c..e257b7a1fa1 100644 --- a/python/cudf/cudf/core/_compat.py +++ b/python/cudf/cudf/core/_compat.py @@ -11,3 +11,4 @@ PANDAS_LT_153 = PANDAS_VERSION < version.parse("1.5.3") PANDAS_GE_200 = PANDAS_VERSION >= version.parse("2.0.0") PANDAS_GE_210 = PANDAS_VERSION >= version.parse("2.1.0") +PANDAS_GE_220 = PANDAS_VERSION >= version.parse("2.2.0") diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index b94f8f583f4..20f5b7989eb 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -1,11 +1,13 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. from __future__ import annotations +import itertools import warnings from typing import Any, ClassVar, List, Optional import cudf from cudf import _lib as libcudf +from cudf._lib.types import size_type_dtype from cudf.core.copy_types import GatherMap from cudf.core.join._join_helpers import ( _coerce_to_tuple, @@ -94,7 +96,44 @@ def __init__( self.lhs = lhs.copy(deep=False) self.rhs = rhs.copy(deep=False) self.how = how - self.sort = sort + # If the user requests that the result is sorted or we're in + # pandas-compatible mode we have various obligations on the + # output order: + # + # compat-> | False | True + # sort | | + # ---------+--------------------------+------------------------------- + # False| no obligation | ordering as per pandas docs(*) + # True | sorted lexicographically | sorted lexicographically(*) + # + # (*) If two keys are equal, tiebreak is to use input table order. + # + # In pandas-compat mode, we have obligations on the order to + # match pandas (even if sort=False), see + # pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html. + # The ordering requirements differ depending on which join + # type is specified: + # + # - left: preserve key order (only keeping left keys) + # - right: preserve key order (only keeping right keys) + # - inner: preserve key order (of left keys) + # - outer: sort keys lexicographically + # - cross (not supported): preserve key order (of left keys) + # + # Moreover, in all cases, whenever there is a tiebreak + # situation (for sorting or otherwise), the deciding order is + # "input table order" + self.sort = sort or ( + cudf.get_option("mode.pandas_compatible") and how == "outer" + ) + self.preserve_key_order = cudf.get_option( + "mode.pandas_compatible" + ) and how in { + "inner", + "outer", + "left", + "right", + } self.lsuffix, self.rsuffix = suffixes # At this point validation guarantees that if on is not None we @@ -160,6 +199,55 @@ def __init__( } ) + def _gather_maps(self, left_cols, right_cols): + # Produce gather maps for the join, optionally reordering to + # match pandas-order in compat mode. 
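+        # libcudf returns these maps with no ordering guarantee, so any
+        # pandas ordering obligation has to be imposed here after the fact.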
+ maps = self._joiner( + left_cols, + right_cols, + how=self.how, + ) + if not self.preserve_key_order: + return maps + # We should only get here if we're in a join on which + # pandas-compat places some ordering obligation (which + # precludes a semi-join) + # We must perform this reordering even if sort=True since the + # obligation to ensure tiebreaks appear in input table order + # means that the gather maps must be permuted into an original + # order. + assert self.how in {"inner", "outer", "left", "right"} + # And hence both maps returned from the libcudf join should be + # non-None. + assert all(m is not None for m in maps) + lengths = [len(left_cols[0]), len(right_cols[0])] + # Only nullify those maps that need it. + nullify = [ + self.how not in {"inner", "left"}, + self.how not in {"inner", "right"}, + ] + # To reorder maps so that they are in order of the input + # tables, we gather from iota on both right and left, and then + # sort the gather maps with those two columns as key. + key_order = list( + itertools.chain.from_iterable( + libcudf.copying.gather( + [cudf.core.column.arange(n, dtype=size_type_dtype)], + map_, + nullify=null, + ) + for map_, n, null in zip(maps, lengths, nullify) + ) + ) + return libcudf.sort.sort_by_key( + list(maps), + # If how is right, right map is primary sort key. + key_order[:: -1 if self.how == "right" else 1], + [True] * len(key_order), + ["last"] * len(key_order), + stable=True, + ) + def perform_merge(self) -> cudf.DataFrame: left_join_cols = [] right_join_cols = [] @@ -184,12 +272,9 @@ def perform_merge(self) -> cudf.DataFrame: left_key.set(self.lhs, lcol_casted, validate=False) right_key.set(self.rhs, rcol_casted, validate=False) - left_rows, right_rows = self._joiner( - left_join_cols, - right_join_cols, - how=self.how, + left_rows, right_rows = self._gather_maps( + left_join_cols, right_join_cols ) - gather_kwargs = { "keep_index": self._using_left_index or self._using_right_index, } @@ -305,6 +390,11 @@ def _sort_result(self, result: cudf.DataFrame) -> cudf.DataFrame: # same order as given in 'on'. If the indices are used as # keys, the index will be sorted. If one index is specified, # the key columns on the other side will be used to sort. + # In pandas-compatible mode, tie-breaking for multiple equal + # sort keys is to produce output in input dataframe order. + # This is taken care of by using a stable sort here, and (in + # pandas-compat mode) reordering the gather maps before + # producing the input result. 
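+        # e.g. a merge on ["a", "b"] sorts the result by the "a" key column,
+        # then "b", with nulls placed last.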
by: List[Any] = [] if self._using_left_index and self._using_right_index: by.extend(result._index._data.columns) @@ -313,15 +403,22 @@ def _sort_result(self, result: cudf.DataFrame) -> cudf.DataFrame: if not self._using_right_index: by.extend([result._data[col.name] for col in self._right_keys]) if by: - to_sort = cudf.DataFrame._from_data(dict(enumerate(by))) - sort_order = GatherMap.from_column_unchecked( - cudf.core.column.as_column(to_sort.argsort()), - len(result), - nullify=False, + keep_index = self._using_left_index or self._using_right_index + if keep_index: + to_sort = [*result._index._columns, *result._columns] + index_names = result._index.names + else: + to_sort = [*result._columns] + index_names = None + result_columns = libcudf.sort.sort_by_key( + to_sort, + by, + [True] * len(by), + ["last"] * len(by), + stable=True, ) - result = result._gather( - sort_order, - keep_index=self._using_left_index or self._using_right_index, + result = result._from_columns_like_self( + result_columns, result._column_names, index_names ) return result diff --git a/python/cudf/cudf/tests/test_join_order.py b/python/cudf/cudf/tests/test_join_order.py new file mode 100644 index 00000000000..61a2ed239cb --- /dev/null +++ b/python/cudf/cudf/tests/test_join_order.py @@ -0,0 +1,261 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +import itertools +import operator +import string +from collections import defaultdict + +import numpy as np +import pytest + +import cudf +from cudf.core._compat import PANDAS_GE_220 +from cudf.testing._utils import assert_eq + + +@pytest.fixture(params=[False, True], ids=["unsorted", "sorted"]) +def sort(request): + return request.param + + +@pytest.fixture +def left(): + left_key = [1, 3, 2, 1, 1, 2, 5, 1, 4, 5, 8, 12, 12312, 1] * 100 + left_val = list(range(len(left_key))) + return cudf.DataFrame({"key": left_key, "val": left_val}) + + +@pytest.fixture +def right(): + right_key = [12312, 12312, 3, 2, 1, 1, 5, 7, 2] * 200 + right_val = list( + itertools.islice(itertools.cycle(string.ascii_letters), len(right_key)) + ) + return cudf.DataFrame({"key": right_key, "val": right_val}) + + +if PANDAS_GE_220: + # Behaviour in sort=False case didn't match documentation in many + # cases prior to https://github.com/pandas-dev/pandas/pull/54611 + # (released as part of pandas 2.2) + def expected(left, right, sort, *, how): + left = left.to_pandas() + right = right.to_pandas() + return left.merge(right, on="key", how=how, sort=sort) + +else: + + def expect_inner(left, right, sort): + left_key = left.key.values_host.tolist() + left_val = left.val.values_host.tolist() + right_key = right.key.values_host.tolist() + right_val = right.val.values_host.tolist() + + right_have = defaultdict(list) + for i, k in enumerate(right_key): + right_have[k].append(i) + keys = [] + val_x = [] + val_y = [] + for k, v in zip(left_key, left_val): + if k not in right_have: + continue + for i in right_have[k]: + keys.append(k) + val_x.append(v) + val_y.append(right_val[i]) + + if sort: + # Python sort is stable, so this will preserve input order for + # equal items. 
+ keys, val_x, val_y = zip( + *sorted(zip(keys, val_x, val_y), key=operator.itemgetter(0)) + ) + return cudf.DataFrame({"key": keys, "val_x": val_x, "val_y": val_y}) + + def expect_left(left, right, sort): + left_key = left.key.values_host.tolist() + left_val = left.val.values_host.tolist() + right_key = right.key.values_host.tolist() + right_val = right.val.values_host.tolist() + + right_have = defaultdict(list) + for i, k in enumerate(right_key): + right_have[k].append(i) + keys = [] + val_x = [] + val_y = [] + for k, v in zip(left_key, left_val): + if k not in right_have: + right_vals = [None] + else: + right_vals = [right_val[i] for i in right_have[k]] + + for rv in right_vals: + keys.append(k) + val_x.append(v) + val_y.append(rv) + + if sort: + # Python sort is stable, so this will preserve input order for + # equal items. + keys, val_x, val_y = zip( + *sorted(zip(keys, val_x, val_y), key=operator.itemgetter(0)) + ) + return cudf.DataFrame({"key": keys, "val_x": val_x, "val_y": val_y}) + + def expect_outer(left, right, sort): + left_key = left.key.values_host.tolist() + left_val = left.val.values_host.tolist() + right_key = right.key.values_host.tolist() + right_val = right.val.values_host.tolist() + right_have = defaultdict(list) + for i, k in enumerate(right_key): + right_have[k].append(i) + keys = [] + val_x = [] + val_y = [] + for k, v in zip(left_key, left_val): + if k not in right_have: + right_vals = [None] + else: + right_vals = [right_val[i] for i in right_have[k]] + for rv in right_vals: + keys.append(k) + val_x.append(v) + val_y.append(rv) + left_have = set(left_key) + for k, v in zip(right_key, right_val): + if k not in left_have: + keys.append(k) + val_x.append(None) + val_y.append(v) + + # Python sort is stable, so this will preserve input order for + # equal items. 
+ # outer joins are always sorted, but we test both sort values + keys, val_x, val_y = zip( + *sorted(zip(keys, val_x, val_y), key=operator.itemgetter(0)) + ) + return cudf.DataFrame({"key": keys, "val_x": val_x, "val_y": val_y}) + + def expected(left, right, sort, *, how): + if how == "inner": + return expect_inner(left, right, sort) + elif how == "outer": + return expect_outer(left, right, sort) + elif how == "left": + return expect_left(left, right, sort) + elif how == "right": + return expect_left(right, left, sort).rename( + {"val_x": "val_y", "val_y": "val_x"}, axis=1 + ) + else: + raise NotImplementedError() + + +@pytest.mark.parametrize("how", ["inner", "left", "right", "outer"]) +def test_join_ordering_pandas_compat(left, right, sort, how): + with cudf.option_context("mode.pandas_compatible", True): + actual = left.merge(right, on="key", how=how, sort=sort) + expect = expected(left, right, sort, how=how) + assert_eq(expect, actual) + + +@pytest.mark.parametrize("how", ["left", "right", "inner", "outer"]) +@pytest.mark.parametrize("sort", [True, False]) +@pytest.mark.parametrize("on_index", [True, False]) +@pytest.mark.parametrize("left_unique", [True, False]) +@pytest.mark.parametrize("left_monotonic", [True, False]) +@pytest.mark.parametrize("right_unique", [True, False]) +@pytest.mark.parametrize("right_monotonic", [True, False]) +def test_merge_combinations( + request, + how, + sort, + on_index, + left_unique, + left_monotonic, + right_unique, + right_monotonic, +): + request.applymarker( + pytest.mark.xfail( + condition=how == "outer" + and on_index + and left_unique + and not left_monotonic + and right_unique + and not right_monotonic, + reason="https://github.com/pandas-dev/pandas/issues/55992", + ) + ) + left = [2, 3] + if left_unique: + left.append(4 if left_monotonic else 1) + else: + left.append(3 if left_monotonic else 2) + + right = [2, 3] + if right_unique: + right.append(4 if right_monotonic else 1) + else: + right.append(3 if right_monotonic else 2) + + left = cudf.DataFrame({"key": left}) + right = cudf.DataFrame({"key": right}) + + if on_index: + left = left.set_index("key") + right = right.set_index("key") + on_kwargs = {"left_index": True, "right_index": True} + else: + on_kwargs = {"on": "key"} + + with cudf.option_context("mode.pandas_compatible", True): + result = cudf.merge(left, right, how=how, sort=sort, **on_kwargs) + if on_index: + left = left.reset_index() + right = right.reset_index() + + if how in ["left", "right", "inner"]: + if how in ["left", "inner"]: + expected, other, other_unique = left, right, right_unique + else: + expected, other, other_unique = right, left, left_unique + if how == "inner": + keep_values = set(left["key"].values_host).intersection( + right["key"].values_host + ) + keep_mask = expected["key"].isin(keep_values) + expected = expected[keep_mask] + if sort: + expected = expected.sort_values("key") + if not other_unique: + other_value_counts = other["key"].value_counts() + repeats = other_value_counts.reindex( + expected["key"].values, fill_value=1 + ) + repeats = repeats.astype(np.intp) + expected = expected["key"].repeat(repeats.values) + expected = expected.to_frame() + elif how == "outer": + if on_index and left_unique and left["key"].equals(right["key"]): + expected = cudf.DataFrame({"key": left["key"]}) + else: + left_counts = left["key"].value_counts() + right_counts = right["key"].value_counts() + expected_counts = left_counts.mul(right_counts, fill_value=1) + expected_counts = expected_counts.astype(np.intp) + 
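+            # an outer join on non-unique keys yields, per key value, the
+            # product of its left and right multiplicities (fill_value=1
+            # keeps keys that appear on only one side)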
expected = expected_counts.index.values_host.repeat( + expected_counts.values_host + ) + expected = cudf.DataFrame({"key": expected}) + expected = expected.sort_values("key") + + if on_index: + expected = expected.set_index("key") + else: + expected = expected.reset_index(drop=True) + + assert_eq(result, expected) From ba5ec4080be38b795053d11bf46cb3688c201893 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 17 Nov 2023 10:36:30 -0600 Subject: [PATCH 24/36] Enable build concurrency for nightly and merge triggers. (#14441) --- .github/workflows/build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 2539057c105..e27361ab263 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -22,7 +22,7 @@ on: default: nightly concurrency: - group: ${{ github.workflow }}-${{ github.ref }} + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }} cancel-in-progress: true jobs: From 6c2e972cefff05f6ffbba4fd9ba894e6849b041e Mon Sep 17 00:00:00 2001 From: Trent Nelson Date: Fri, 17 Nov 2023 13:29:23 -0800 Subject: [PATCH 25/36] Implement user_datasource_wrapper is_empty() and is_device_read_preferred(). (#14357) These two routines are missing from the current `user_datasource_wrapper` impl. Authors: - Trent Nelson (https://github.com/tpn) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/14357 --- cpp/src/io/utilities/datasource.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 5cdd92ce3b7..a466ef84133 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -360,6 +360,11 @@ class user_datasource_wrapper : public datasource { return source->supports_device_read(); } + [[nodiscard]] bool is_device_read_preferred(size_t size) const override + { + return source->is_device_read_preferred(size); + } + size_t device_read(size_t offset, size_t size, uint8_t* dst, @@ -385,6 +390,8 @@ class user_datasource_wrapper : public datasource { [[nodiscard]] size_t size() const override { return source->size(); } + [[nodiscard]] bool is_empty() const override { return source->is_empty(); } + private: datasource* const source; ///< A non-owning pointer to the user-implemented datasource }; From 10218a972d3a4950d4639c41d9ae9116aff18c12 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 17 Nov 2023 14:14:55 -0800 Subject: [PATCH 26/36] JSON writer: avoid default stream use in `string_scalar` constructors (#14444) Added the true/false string scalars to `column_to_strings_fn` so they are created once, instead of creating new scalars for each boolean column (using default stream). 
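
The pattern applied here, sketched minimally (illustrative only — `bool_converter` is a made-up name, not the cuDF type): construct the scalars once, on an explicit stream, rather than once per boolean column on the default stream.

```cpp
#include <cudf/scalar/scalar.hpp>
#include <rmm/cuda_stream_view.hpp>

// Illustrative sketch, not the cuDF source.
struct bool_converter {
  // Constructed a single time, on the caller's stream, instead of being
  // re-created for every boolean column on the default stream.
  cudf::string_scalar true_value;
  cudf::string_scalar false_value;

  explicit bool_converter(rmm::cuda_stream_view stream)
    : true_value{"true", true, stream}, false_value{"false", true, stream}
  {
  }
};
```
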
Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - https://github.com/shrshi

URL: https://github.com/rapidsai/cudf/pull/14444
---
 cpp/src/io/json/write_json.cu | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu
index c211d17f13a..938f9728fe8 100644
--- a/cpp/src/io/json/write_json.cu
+++ b/cpp/src/io/json/write_json.cu
@@ -500,7 +500,9 @@ struct column_to_strings_fn {
       struct_row_end_wrap("}", true, stream),
       list_value_separator(",", true, stream),
       list_row_begin_wrap("[", true, stream),
-      list_row_end_wrap("]", true, stream)
+      list_row_end_wrap("]", true, stream),
+      true_value(options_.get_true_value(), true, stream),
+      false_value(options_.get_false_value(), true, stream)
   {
   }

@@ -526,8 +528,7 @@ struct column_to_strings_fn {
   std::enable_if_t<std::is_same_v<column_type, bool>, std::unique_ptr<column>> operator()(
     column_view const& column) const
   {
-    return cudf::strings::detail::from_booleans(
-      column, options_.get_true_value(), options_.get_false_value(), stream_, mr_);
+    return cudf::strings::detail::from_booleans(column, true_value, false_value, stream_, mr_);
   }

   // strings:
@@ -742,6 +743,9 @@ struct column_to_strings_fn {
   string_scalar const list_value_separator;  // ","
   string_scalar const list_row_begin_wrap;   // "["
   string_scalar const list_row_end_wrap;     // "]"
+  // bool converter constants
+  string_scalar const true_value;
+  string_scalar const false_value;
 };

 }  // namespace

From 723c565f7a03e3e9a842526cd4cc94bcf6f582e5 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Fri, 17 Nov 2023 17:37:47 -0800
Subject: [PATCH 27/36] Fix intermediate type checking in expression parsing (#14445)

When parsing expressions, device data references are reused if there are multiple that are identical. Equality is determined by comparing the fields of the reference, but previously the data type was omitted. For column and literal references, this is OK because the `data_index` uniquely identifies the reference. For intermediates, however, the index is not sufficient to disambiguate because an expression could reuse a given location even if the operation produces a different data type. Therefore, the data type must be part of the equality operator.
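
As a minimal standalone sketch of the failure mode (illustrative types, not the actual cuDF definitions): two intermediates can share a storage slot while holding different types, so an equality check that ignores the type would wrongly deduplicate them.

```cpp
#include <tuple>

// Illustrative stand-ins for the cudf AST types.
enum class reference_kind { COLUMN, LITERAL, INTERMEDIATE };
enum class type_id { INT32, BOOL8 };

struct reference {
  int data_index;     // storage slot; reused across intermediates
  type_id data_type;  // must participate in equality
  reference_kind kind;

  bool operator==(reference const& rhs) const
  {
    return std::tie(data_index, data_type, kind) ==
           std::tie(rhs.data_index, rhs.data_type, rhs.kind);
  }
};

// {0, INT32, INTERMEDIATE} and {0, BOOL8, INTERMEDIATE} now compare unequal,
// so the second intermediate is not incorrectly reused in place of the first.
```
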
Resolves #14409

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/14445
---
 cpp/include/cudf/ast/detail/expression_parser.hpp |  4 +--
 cpp/tests/ast/transform_tests.cpp                 | 27 +++++++++++++++++++
 2 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/cpp/include/cudf/ast/detail/expression_parser.hpp b/cpp/include/cudf/ast/detail/expression_parser.hpp
index db0abe435b0..a36a831a7aa 100644
--- a/cpp/include/cudf/ast/detail/expression_parser.hpp
+++ b/cpp/include/cudf/ast/detail/expression_parser.hpp
@@ -67,8 +67,8 @@ struct alignas(8) device_data_reference {

   bool operator==(device_data_reference const& rhs) const
   {
-    return std::tie(data_index, reference_type, table_source) ==
-           std::tie(rhs.data_index, rhs.reference_type, rhs.table_source);
+    return std::tie(data_index, data_type, reference_type, table_source) ==
+           std::tie(rhs.data_index, rhs.data_type, rhs.reference_type, rhs.table_source);
   }
 };

diff --git a/cpp/tests/ast/transform_tests.cpp b/cpp/tests/ast/transform_tests.cpp
index c0109a40cec..624a781c5b9 100644
--- a/cpp/tests/ast/transform_tests.cpp
+++ b/cpp/tests/ast/transform_tests.cpp
@@ -316,6 +316,33 @@ TEST_F(TransformTest, ImbalancedTreeArithmetic)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view(), verbosity);
 }

+TEST_F(TransformTest, ImbalancedTreeArithmeticDeep)
+{
+  auto c_0 = column_wrapper<int32_t>{4, 5, 6};
+  auto table = cudf::table_view{{c_0}};
+
+  auto col_ref_0 = cudf::ast::column_reference(0);
+
+  // expression: (c0 < c0) == (c0 < (c0 + c0))
+  // {false, false, false} == (c0 < {8, 10, 12})
+  // {false, false, false} == {true, true, true}
+  // {false, false, false}
+  auto expression_left_subtree =
+    cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, col_ref_0);
+  auto expression_right_inner_subtree =
+    cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_0);
+  auto expression_right_subtree =
+    cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, expression_right_inner_subtree);
+
+  auto expression_tree = cudf::ast::operation(
+    cudf::ast::ast_operator::EQUAL, expression_left_subtree, expression_right_subtree);
+
+  auto result = cudf::compute_column(table, expression_tree);
+  auto expected = column_wrapper<bool>{false, false, false};
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view(), verbosity);
+}
+
 TEST_F(TransformTest, MultiLevelTreeComparator)
 {
   auto c_0 = column_wrapper<int32_t>{3, 20, 1, 50};

From 2afb784a24b3ccb50e15f664117f87ec77aa5272 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 20 Nov 2023 06:32:30 -1000
Subject: [PATCH 28/36] REF: Remove instances of pd.core (#14421)

`pandas.core` is technically private and methods could be moved at any time.
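
For example, the public equivalents adopted in this change look like this (illustrative snippet, not part of the patch; `guess_datetime_format` is only public under `pd.tseries.api` from pandas 2.2, hence the version guard in the diff below):

```python
import pandas as pd

# Public since pandas 2.2; previously only reachable through the private
# pd.core.tools.datetimes module.
fmt = pd.tseries.api.guess_datetime_format("2023-11-20")

# Public rolling-window indexers, replacing pd.core.window / pd.core.indexers.
indexer = pd.api.indexers.FixedForwardWindowIndexer(window_size=2)
```
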
This change avoids uses of `pandas.core` in the codebase wherever they can be avoided.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/14421
---
 python/cudf/cudf/core/column/datetime.py |  6 +++++-
 python/cudf/cudf/core/dataframe.py       | 15 +++++++++++++--
 python/cudf/cudf/core/index.py           | 10 ++++------
 python/cudf/cudf/tests/test_rolling.py   |  9 ++++-----
 4 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index b03b3c905a4..33354e1c3bc 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -27,13 +27,17 @@
     is_scalar,
     is_timedelta64_dtype,
 )
+from cudf.core._compat import PANDAS_GE_220
 from cudf.core.buffer import Buffer, cuda_array_interface_wrapper
 from cudf.core.column import ColumnBase, as_column, column, string
 from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion
 from cudf.utils.dtypes import _get_base_dtype
 from cudf.utils.utils import _all_bools_with_nulls

-_guess_datetime_format = pd.core.tools.datetimes.guess_datetime_format
+if PANDAS_GE_220:
+    _guess_datetime_format = pd.tseries.api.guess_datetime_format
+else:
+    _guess_datetime_format = pd.core.tools.datetimes.guess_datetime_format

 # nanoseconds per time_unit
 _dtype_to_format_conversion = {
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 16eead6ea81..fd4a15a3391 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -2313,8 +2313,19 @@ def to_dict(
         if orient == "series":
             # Special case needed to avoid converting
             # cudf.Series objects into pd.Series
-            into_c = pd.core.common.standardize_mapping(into)
-            return into_c((k, v) for k, v in self.items())
+            if not inspect.isclass(into):
+                cons = type(into)  # type: ignore[assignment]
+                if isinstance(into, defaultdict):
+                    cons = functools.partial(cons, into.default_factory)
+            elif issubclass(into, abc.Mapping):
+                cons = into  # type: ignore[assignment]
+                if issubclass(into, defaultdict):
+                    raise TypeError(
+                        "to_dict() only accepts initialized defaultdicts"
+                    )
+            else:
+                raise TypeError(f"unsupported type: {into}")
+            return cons(self.items())  # type: ignore[misc]

         return self.to_pandas().to_dict(orient=orient, into=into)
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 9f0c66a5c74..277b5d3bb17 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -330,17 +330,15 @@ def _data(self):

     @_cudf_nvtx_annotate
     def __contains__(self, item):
-        if not isinstance(
+        if isinstance(item, bool) or not isinstance(
             item, tuple(np.sctypes["int"] + np.sctypes["float"] + [int, float])
         ):
             return False
         try:
-            item = pd.core.dtypes.common.ensure_python_int(item)
-        except TypeError:
-            return False
-        if not item % 1 == 0:
+            int_item = int(item)
+            return int_item == item and int_item in self._range
+        except (ValueError, OverflowError):
             return False
-        return item in range(self._start, self._stop, self._step)

     @_cudf_nvtx_annotate
     def copy(self, name=None, deep=False, dtype=None, names=None):
diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py
index 43fa83e1735..19714b7b9d3 100644
--- a/python/cudf/cudf/tests/test_rolling.py
+++ b/python/cudf/cudf/tests/test_rolling.py
@@ -8,7 +8,7 @@
 import pytest

 import cudf
-from 
cudf.core._compat import PANDAS_GE_150, PANDAS_LT_140 +from cudf.core._compat import PANDAS_GE_150 from cudf.testing._utils import ( _create_pandas_series_float64_default, assert_eq, @@ -536,10 +536,9 @@ def get_window_bounds( "indexer", [ pd.api.indexers.FixedForwardWindowIndexer(window_size=2), - pd.core.window.expanding.ExpandingIndexer(), - pd.core.window.indexers.FixedWindowIndexer(window_size=3) - if PANDAS_LT_140 - else pd.core.indexers.objects.FixedWindowIndexer(window_size=3), + pd.api.indexers.VariableOffsetWindowIndexer( + index=pd.date_range("2020", periods=5), offset=pd.offsets.BDay(1) + ), ], ) def test_rolling_indexer_support(indexer): From 58387ffee6970115b77793f3706478d41eab328d Mon Sep 17 00:00:00 2001 From: Paul Taylor <178183+trxcllnt@users.noreply.github.com> Date: Mon, 20 Nov 2023 08:51:10 -0800 Subject: [PATCH 29/36] Add cuDF devcontainers (#14015) * add devcontainers * fix tag for CUDA 12.0 * use CUDA 11.8 for now * default to CUDA 12.0 * install cuda-cupti-dev in conda environment * remove MODIFY_PREFIX_PATH so the driver is found * install cuda-nvtx-dev in conda environment * update conda env * add MODIFY_PREFIX_PATH back * temporarily default to my branch with the fix for MODIFY_PREFIX_PATH in conda envs * remove temporary rapids-cmake pin * build all RAPIDS archs to take maximum advantage of sccache * add clangd and nsight vscode customizations * copy in default clangd config * remove options for pip vs. conda unless using the launch script * fix unified mounts * ensure dirs exist before mounting * add compile_commands to .gitignore * allow defining cudf and cudf_kafka include dirs via envvars * add kvikio * use volumes for isolated devcontainer source dirs * update README.md * update to rapidsai/devcontainers 23.10 * update rapids-build-utils version to 23.10 * add .clangd config file * update RAPIDS versions in devcontainer files * ensure the directory for the generated jitify kernels is exists after configuring * add clang and clang-tools 16 * remove isolated and unified devcontainers, make single the default * separate CUDA 11.8 and 12.0 devcontainers * fix version string for requirements.txt * update conda envs * clean up envvars, mounts, and build args, add codespaces post-attach command workaround * consolidate common vscode customizations * enumerate CUDA 11 packages, include up to CUDA 12.2 * include protoc-wheel when generating requirements.txt * default to cuda-python for cu11 * separate devcontainer mounts by CUDA version * add devcontainer build jobs to PR workflow * use pypi.nvidia.com instead of pypi.ngc.nvidia.com * fix venvs mount path * fix lint * ensure rmm-cuXX is included in pip requirements * disable libcudf_kakfa build for now * build dask-cudf * be more explicit in update-versions.sh, make devcontainer build required in pr jobs * revert rename devcontainer job * install librdkafka-dev in pip containers so we can build libcudf_kafka and cudf_kafka * separate cupy, cudf, and cudf_kafka matrices for CUDA 11 and 12 * add fallback include path for RMM * fallback to CUDA_PATH if CUDA_HOME is not set * define envvars in dockerfile * define envvars for cudf_kafka * build verbose * include wheel and setuptools in requirements.txt * switch workflow to branch-23.10 * update clang-tools version to 16.0.6 * fix update-version.sh * Use 24.02 branches. * fix version numbers * Fix dependencies.yaml. 
* Update .devcontainer/Dockerfile --------- Co-authored-by: Bradley Dice --- .devcontainer/Dockerfile | 35 +++ .devcontainer/README.md | 35 +++ .../cuda11.8-conda/devcontainer.json | 37 ++++ .devcontainer/cuda11.8-pip/devcontainer.json | 36 ++++ .../cuda12.0-conda/devcontainer.json | 37 ++++ .devcontainer/cuda12.0-pip/devcontainer.json | 36 ++++ .github/workflows/pr.yaml | 9 + .gitignore | 4 + .pre-commit-config.yaml | 2 +- ci/release/update-version.sh | 6 + .../all_cuda-118_arch-x86_64.yaml | 2 + .../all_cuda-120_arch-x86_64.yaml | 2 + cpp/.clangd | 65 ++++++ .../Modules/JitifyPreprocessKernels.cmake | 6 +- dependencies.yaml | 203 +++++++++++++----- 15 files changed, 458 insertions(+), 57 deletions(-) create mode 100644 .devcontainer/Dockerfile create mode 100644 .devcontainer/README.md create mode 100644 .devcontainer/cuda11.8-conda/devcontainer.json create mode 100644 .devcontainer/cuda11.8-pip/devcontainer.json create mode 100644 .devcontainer/cuda12.0-conda/devcontainer.json create mode 100644 .devcontainer/cuda12.0-pip/devcontainer.json create mode 100644 cpp/.clangd diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile new file mode 100644 index 00000000000..c19bb68986f --- /dev/null +++ b/.devcontainer/Dockerfile @@ -0,0 +1,35 @@ +# syntax=docker/dockerfile:1.5 + +ARG BASE +ARG PYTHON_PACKAGE_MANAGER=conda + +FROM ${BASE} as pip-base + +RUN apt update -y \ + && DEBIAN_FRONTEND=noninteractive apt install -y \ + librdkafka-dev \ + && rm -rf /tmp/* /var/tmp/* /var/cache/apt/* /var/lib/apt/lists/*; + +ENV DEFAULT_VIRTUAL_ENV=rapids + +FROM ${BASE} as conda-base + +ENV DEFAULT_CONDA_ENV=rapids + +FROM ${PYTHON_PACKAGE_MANAGER}-base + +ARG CUDA +ENV CUDAARCHS="RAPIDS" +ENV CUDA_VERSION="${CUDA_VERSION:-${CUDA}}" + +ARG PYTHON_PACKAGE_MANAGER +ENV PYTHON_PACKAGE_MANAGER="${PYTHON_PACKAGE_MANAGER}" + +ENV PYTHONSAFEPATH="1" +ENV PYTHONUNBUFFERED="1" +ENV PYTHONDONTWRITEBYTECODE="1" + +ENV SCCACHE_REGION="us-east-2" +ENV SCCACHE_BUCKET="rapids-sccache-devs" +ENV VAULT_HOST="https://vault.ops.k8s.rapids.ai" +ENV HISTFILE="/home/coder/.cache/._bash_history" diff --git a/.devcontainer/README.md b/.devcontainer/README.md new file mode 100644 index 00000000000..91ee7ef85f7 --- /dev/null +++ b/.devcontainer/README.md @@ -0,0 +1,35 @@ +# cuDF Development Containers + +This directory contains [devcontainer configurations](https://containers.dev/implementors/json_reference/) for using VSCode to [develop in a container](https://code.visualstudio.com/docs/devcontainers/containers) via the `Remote Containers` [extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) or [GitHub Codespaces](https://github.com/codespaces). + +This container is a turnkey development environment for building and testing the cuDF C++ and Python libraries. 
+ +## Table of Contents + +* [Prerequisites](#prerequisites) +* [Host bind mounts](#host-bind-mounts) +* [Launch a Dev Container](#launch-a-dev-container) + +## Prerequisites + +* [VSCode](https://code.visualstudio.com/download) +* [VSCode Remote Containers extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) + +## Host bind mounts + +By default, the following directories are bind-mounted into the devcontainer: + +* `${repo}:/home/coder/cudf` +* `${repo}/../.aws:/home/coder/.aws` +* `${repo}/../.local:/home/coder/.local` +* `${repo}/../.cache:/home/coder/.cache` +* `${repo}/../.conda:/home/coder/.conda` +* `${repo}/../.config:/home/coder/.config` + +This ensures caches, configurations, dependencies, and your commits are persisted on the host across container runs. + +## Launch a Dev Container + +To launch a devcontainer from VSCode, open the cuDF repo and select the "Reopen in Container" button in the bottom right:
+ +Alternatively, open the VSCode command palette (typically `cmd/ctrl + shift + P`) and run the "Rebuild and Reopen in Container" command. diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json new file mode 100644 index 00000000000..6c3322dfc61 --- /dev/null +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -0,0 +1,37 @@ +{ + "build": { + "context": "${localWorkspaceFolder}/.devcontainer", + "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", + "args": { + "CUDA": "11.8", + "PYTHON_PACKAGE_MANAGER": "conda", + "BASE": "rapidsai/devcontainers:24.02-cpp-llvm16-cuda11.8-mambaforge-ubuntu22.04" + } + }, + "hostRequirements": {"gpu": "optional"}, + "features": { + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {} + }, + "overrideFeatureInstallOrder": [ + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" + ], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda11.8-envs}"], + "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"], + "workspaceFolder": "/home/coder", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cudf,type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda11.8-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.flake8", + "nvidia.nsight-vscode-edition" + ] + } + } +} diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json new file mode 100644 index 00000000000..d2afd3a497e --- /dev/null +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -0,0 +1,36 @@ +{ + "build": { + "context": "${localWorkspaceFolder}/.devcontainer", + "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", + "args": { + "CUDA": "11.8", + "PYTHON_PACKAGE_MANAGER": "pip", + "BASE": "rapidsai/devcontainers:24.02-cpp-llvm16-cuda11.8-ubuntu22.04" + } + }, + "hostRequirements": {"gpu": "optional"}, + "features": { + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {} + }, + "overrideFeatureInstallOrder": [ + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" + ], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda11.8-venvs}"], + "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], + "workspaceFolder": "/home/coder", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cudf,type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda11.8-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.flake8", + "nvidia.nsight-vscode-edition" + ] + } + } +} diff --git a/.devcontainer/cuda12.0-conda/devcontainer.json b/.devcontainer/cuda12.0-conda/devcontainer.json new file mode 100644 index 00000000000..42ed334fe03 --- /dev/null +++ b/.devcontainer/cuda12.0-conda/devcontainer.json @@ -0,0 +1,37 @@ +{ + "build": { + "context": "${localWorkspaceFolder}/.devcontainer", + "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", + "args": { + "CUDA": "12.0", + "PYTHON_PACKAGE_MANAGER": "conda", + "BASE": "rapidsai/devcontainers:24.02-cpp-mambaforge-ubuntu22.04" + } + }, + "hostRequirements": {"gpu": "optional"}, + "features": { + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {} + }, + "overrideFeatureInstallOrder": [ + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" + ], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.0-envs}"], + "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], + "workspaceFolder": "/home/coder", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cudf,type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.0-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.flake8", + "nvidia.nsight-vscode-edition" + ] + } + } +} diff --git a/.devcontainer/cuda12.0-pip/devcontainer.json b/.devcontainer/cuda12.0-pip/devcontainer.json new file mode 100644 index 00000000000..306a2065ef0 --- /dev/null +++ b/.devcontainer/cuda12.0-pip/devcontainer.json @@ -0,0 +1,36 @@ +{ + "build": { + "context": "${localWorkspaceFolder}/.devcontainer", + "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", + "args": { + "CUDA": "12.0", + "PYTHON_PACKAGE_MANAGER": "pip", + "BASE": "rapidsai/devcontainers:24.02-cpp-llvm16-cuda12.0-ubuntu22.04" + } + }, + "hostRequirements": {"gpu": "optional"}, + "features": { + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {} + }, + "overrideFeatureInstallOrder": [ + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" + ], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.0-venvs}"], + "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . 
rapids-post-attach-command; fi"], + "workspaceFolder": "/home/coder", + "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cudf,type=bind,consistency=consistent", + "mounts": [ + "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.0-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.flake8", + "nvidia.nsight-vscode-edition" + ] + } + } +} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 462fad6e938..5a77c6749fe 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -25,6 +25,7 @@ jobs: - wheel-tests-cudf - wheel-build-dask-cudf - wheel-tests-dask-cudf + - devcontainer - unit-tests-cudf-pandas - pandas-tests #- pandas-tests-diff @@ -130,6 +131,14 @@ jobs: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: pull-request script: ci/test_wheel_dask_cudf.sh + devcontainer: + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.02 + with: + build_command: | + sccache -z; + build-all -DBUILD_BENCHMARKS=ON -DNVBench_ENABLE_CUPTI=OFF --verbose; + sccache -s; unit-tests-cudf-pandas: needs: wheel-build-cudf secrets: inherit diff --git a/.gitignore b/.gitignore index 4a4a6a98efb..243ba73e5b0 100644 --- a/.gitignore +++ b/.gitignore @@ -173,3 +173,7 @@ jupyter_execute # cibuildwheel /wheelhouse + +# clang tooling +compile_commands.json +.clangd/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a62104722e8..7db8d9ab52f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -145,7 +145,7 @@ repos: ^CHANGELOG.md$ ) - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.5.1 + rev: v1.7.1 hooks: - id: rapids-dependency-file-generator args: ["--clean"] diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 16742465c32..f9b1436495a 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -115,3 +115,9 @@ NEXT_FULL_JAVA_TAG="${NEXT_SHORT_TAG}.${PATCH_PEP440}-SNAPSHOT" sed_runner "s|.*-SNAPSHOT|${NEXT_FULL_JAVA_TAG}|g" java/pom.xml sed_runner "s/branch-.*/branch-${NEXT_SHORT_TAG}/g" java/ci/README.md sed_runner "s/cudf-.*-SNAPSHOT/cudf-${NEXT_FULL_JAVA_TAG}/g" java/ci/README.md + +# .devcontainer files +find .devcontainer/ -type f -name devcontainer.json -print0 | while IFS= read -r -d '' filename; do + sed_runner "s@rapidsai/devcontainers:[0-9.]*@rapidsai/devcontainers:${NEXT_SHORT_TAG}@g" "${filename}" + sed_runner "s@rapidsai/devcontainers/features/rapids-build-utils:[0-9.]*@rapidsai/devcontainers/features/rapids-build-utils:${NEXT_SHORT_TAG_PEP440}@" "${filename}" +done diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 1a944fbdb21..0bafdd13af5 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -14,6 +14,8 @@ dependencies: - botocore>=1.24.21 - c-compiler - cachetools +- clang-tools=16.0.6 +- clang==16.0.6 - cmake>=3.26.4 - cramjam 
- cubinlinker diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 21837b652f4..27dede9b519 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -14,6 +14,8 @@ dependencies: - botocore>=1.24.21 - c-compiler - cachetools +- clang-tools=16.0.6 +- clang==16.0.6 - cmake>=3.26.4 - cramjam - cuda-cudart-dev diff --git a/cpp/.clangd b/cpp/.clangd new file mode 100644 index 00000000000..7c4fe036ddf --- /dev/null +++ b/cpp/.clangd @@ -0,0 +1,65 @@ +# https://clangd.llvm.org/config + +# Apply a config conditionally to all C files +If: + PathMatch: .*\.(c|h)$ + +--- + +# Apply a config conditionally to all C++ files +If: + PathMatch: .*\.(c|h)pp + +--- + +# Apply a config conditionally to all CUDA files +If: + PathMatch: .*\.cuh? +CompileFlags: + Add: + - "-x" + - "cuda" + # No error on unknown CUDA versions + - "-Wno-unknown-cuda-version" + # Allow variadic CUDA functions + - "-Xclang=-fcuda-allow-variadic-functions" +Diagnostics: + Suppress: + - "variadic_device_fn" + - "attributes_not_allowed" + +--- + +# Tweak the clangd parse settings for all files +CompileFlags: + Add: + # report all errors + - "-ferror-limit=0" + - "-fmacro-backtrace-limit=0" + - "-ftemplate-backtrace-limit=0" + # Skip the CUDA version check + - "--no-cuda-version-check" + Remove: + # remove gcc's -fcoroutines + - -fcoroutines + # remove nvc++ flags unknown to clang + - "-gpu=*" + - "-stdpar*" + # remove nvcc flags unknown to clang + - "-arch*" + - "-gencode*" + - "--generate-code*" + - "-ccbin*" + - "-t=*" + - "--threads*" + - "-Xptxas*" + - "-Xcudafe*" + - "-Xfatbin*" + - "-Xcompiler*" + - "--diag-suppress*" + - "--diag_suppress*" + - "--compiler-options*" + - "--expt-extended-lambda" + - "--expt-relaxed-constexpr" + - "-forward-unknown-to-host-compiler" + - "-Werror=cross-execution-space-call" diff --git a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake index df285bdea55..baabffceeac 100644 --- a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake +++ b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -54,6 +54,10 @@ function(jit_preprocess_files) ) endfunction() +if(NOT (EXISTS "${CUDF_GENERATED_INCLUDE_DIR}/include")) + make_directory("${CUDF_GENERATED_INCLUDE_DIR}/include") +endif() + jit_preprocess_files( SOURCE_DIRECTORY ${CUDF_SOURCE_DIR}/src FILES binaryop/jit/kernel.cu transform/jit/kernel.cu rolling/jit/kernel.cu diff --git a/dependencies.yaml b/dependencies.yaml index f00273a5db1..a84a8fa2bf2 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -19,12 +19,12 @@ files: - run_common - run_cudf - run_dask_cudf - - run_cudf_kafka - run_custreamz - test_cpp - test_python_common - test_python_cudf - test_python_dask_cudf + - depends_on_cupy test_cpp: output: none includes: @@ -83,6 +83,7 @@ files: - run_common - run_cudf - pyarrow_run + - depends_on_cupy py_test_cudf: output: pyproject pyproject_dir: python/cudf @@ -123,6 +124,8 @@ files: includes: - run_common - run_dask_cudf + - depends_on_cudf + - depends_on_cupy py_test_dask_cudf: output: pyproject pyproject_dir: python/dask_cudf @@ -146,7 +149,7 @@ files: extras: table: project includes: - - run_cudf_kafka + - depends_on_cudf py_test_cudf_kafka: output: pyproject pyproject_dir: python/cudf_kafka @@ -169,6 +172,8 @@ files: table: project includes: - run_custreamz + - depends_on_cudf + - depends_on_cudf_kafka py_test_custreamz: output: pyproject pyproject_dir: python/custreamz @@ -229,16 +234,14 @@ dependencies: - nvcc_linux-aarch64=11.8 build_cpp: common: - - output_types: [conda, requirements] - packages: - - librmm==24.2.* - - libkvikio==24.2.* - output_types: conda packages: - fmt>=9.1.0,<10 - &gbench benchmark==1.8.0 - >est gtest>=1.13.0 - &gmock gmock>=1.13.0 + - librmm==24.2.* + - libkvikio==24.2.* # Hard pin the patch version used during the build. This must be kept # in sync with the version pinned in get_arrow.cmake. - libarrow-all==14.0.1.* @@ -248,7 +251,7 @@ dependencies: - spdlog>=1.11.0,<1.12 build_wheels: common: - - output_types: pyproject + - output_types: [requirements, pyproject] packages: - wheel - setuptools @@ -267,15 +270,40 @@ dependencies: - pyarrow==14.0.1.* build_python_cudf: common: - - output_types: [conda, requirements, pyproject] - packages: - - rmm==24.2.* - output_types: conda packages: + - &rmm_conda rmm==24.2.* - &protobuf protobuf>=4.21,<5 - - output_types: pyproject + - pip + - pip: + - git+https://github.com/python-streamz/streamz.git@master + - output_types: [requirements, pyproject] packages: - protoc-wheel + - output_types: requirements + packages: + # pip recognizes the index as a global option for the requirements.txt file + # This index is needed for rmm-cu{11,12}. 
+ - --extra-index-url=https://pypi.nvidia.com + - git+https://github.com/python-streamz/streamz.git@master + specific: + - output_types: [requirements, pyproject] + matrices: + - matrix: {cuda: "12.2"} + packages: &build_python_packages_cu12 + - &rmm_cu12 rmm-cu12==24.2.* + - {matrix: {cuda: "12.1"}, packages: *build_python_packages_cu12} + - {matrix: {cuda: "12.0"}, packages: *build_python_packages_cu12} + - matrix: {cuda: "11.8"} + packages: &build_python_packages_cu11 + - &rmm_cu11 rmm-cu11==24.2.* + - {matrix: {cuda: "11.5"}, packages: *build_python_packages_cu11} + - {matrix: {cuda: "11.4"}, packages: *build_python_packages_cu11} + - {matrix: {cuda: "11.2"}, packages: *build_python_packages_cu11} + - {matrix: null, packages: null } + - output_types: pyproject + matrices: + - {matrix: null, packages: [*rmm_conda] } libarrow_run: common: - output_types: conda @@ -385,6 +413,8 @@ dependencies: - identify>=2.5.20 - output_types: conda packages: + - clang==16.0.6 + - clang-tools=16.0.6 - &doxygen doxygen=1.9.1 # pre-commit hook needs a specific version. docs: common: @@ -444,72 +474,65 @@ dependencies: - nvtx>=0.2.1 - packaging - rich - - rmm==24.2.* - typing_extensions>=4.0.0 - *protobuf - output_types: conda packages: - - cupy>=12.0.0 - - pip - - pip: - - git+https://github.com/python-streamz/streamz.git@master + - *rmm_conda - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file # This index is needed for rmm, cubinlinker, ptxcompiler. - - --extra-index-url=https://pypi.ngc.nvidia.com - - git+https://github.com/python-streamz/streamz.git@master - - &cupy_pip cupy-cuda11x>=12.0.0 - - output_types: pyproject - packages: - - cubinlinker - - *cupy_pip - - ptxcompiler + - --extra-index-url=https://pypi.nvidia.com specific: - output_types: [conda, requirements, pyproject] matrices: - - matrix: - cuda: "12.0" - packages: + - matrix: {cuda: "12.2"} + packages: &run_cudf_packages_all_cu12 - cuda-python>=12.0,<13.0a0 - - matrix: # All CUDA 11 versions - packages: + - {matrix: {cuda: "12.1"}, packages: *run_cudf_packages_all_cu12} + - {matrix: {cuda: "12.0"}, packages: *run_cudf_packages_all_cu12} + - matrix: {cuda: "11.8"} + packages: &run_cudf_packages_all_cu11 - cuda-python>=11.7.1,<12.0a0 - - output_types: [conda, pyproject] + - {matrix: {cuda: "11.5"}, packages: *run_cudf_packages_all_cu11} + - {matrix: {cuda: "11.4"}, packages: *run_cudf_packages_all_cu11} + - {matrix: {cuda: "11.2"}, packages: *run_cudf_packages_all_cu11} + - {matrix: null, packages: *run_cudf_packages_all_cu11} + - output_types: conda matrices: - - matrix: - cuda: "12.0" - packages: - - matrix: # All CUDA 11 versions - packages: + - matrix: {cuda: "11.8"} + packages: &run_cudf_packages_conda_cu11 - cubinlinker - ptxcompiler - - output_types: requirements + - {matrix: {cuda: "11.5"}, packages: *run_cudf_packages_conda_cu11} + - {matrix: {cuda: "11.4"}, packages: *run_cudf_packages_conda_cu11} + - {matrix: {cuda: "11.2"}, packages: *run_cudf_packages_conda_cu11} + - {matrix: null, packages: null} + - output_types: [requirements, pyproject] matrices: - - matrix: - cuda: "12.0" - packages: - - matrix: # All CUDA 11 versions - packages: + - matrix: {cuda: "12.2"} + packages: &run_cudf_packages_pip_cu12 + - rmm-cu12==24.2.* + - {matrix: {cuda: "12.1"}, packages: *run_cudf_packages_pip_cu12} + - {matrix: {cuda: "12.0"}, packages: *run_cudf_packages_pip_cu12} + - matrix: {cuda: "11.8"} + packages: &run_cudf_packages_pip_cu11 + - rmm-cu11==24.2.* - 
cubinlinker-cu11 - ptxcompiler-cu11 + - {matrix: {cuda: "11.5"}, packages: *run_cudf_packages_pip_cu11} + - {matrix: {cuda: "11.4"}, packages: *run_cudf_packages_pip_cu11} + - {matrix: {cuda: "11.2"}, packages: *run_cudf_packages_pip_cu11} + - {matrix: null, packages: null} + - output_types: pyproject + matrices: + - {matrix: null, packages: [cubinlinker, ptxcompiler, *rmm_conda] } run_dask_cudf: common: - output_types: [conda, requirements, pyproject] packages: - rapids-dask-dependency==24.2.* - - output_types: conda - packages: - - cupy>=12.0.0 - - output_types: pyproject - packages: - - &cudf cudf==24.2.* - - *cupy_pip - run_cudf_kafka: - common: - - output_types: [requirements, pyproject] - packages: - - *cudf run_custreamz: common: - output_types: conda @@ -521,8 +544,6 @@ dependencies: - output_types: [requirements, pyproject] packages: - confluent-kafka>=1.9.0,<1.10.0a0 - - *cudf - - cudf_kafka==24.2.* test_cpp: common: - output_types: conda @@ -606,6 +627,78 @@ dependencies: packages: - dask-cuda==24.2.* - *numba + depends_on_cudf: + common: + - output_types: conda + packages: + - &cudf_conda cudf==24.2.* + - output_types: requirements + packages: + # pip recognizes the index as a global option for the requirements.txt file + # This index is needed for rmm, cubinlinker, ptxcompiler. + - --extra-index-url=https://pypi.nvidia.com + specific: + - output_types: [requirements, pyproject] + matrices: + - matrix: {cuda: "12.2"} + packages: &cudf_packages_pip_cu12 + - cudf-cu12==24.2.* + - {matrix: {cuda: "12.1"}, packages: *cudf_packages_pip_cu12} + - {matrix: {cuda: "12.0"}, packages: *cudf_packages_pip_cu12} + - matrix: {cuda: "11.8"} + packages: &cudf_packages_pip_cu11 + - cudf-cu11==24.2.* + - {matrix: {cuda: "11.5"}, packages: *cudf_packages_pip_cu11} + - {matrix: {cuda: "11.4"}, packages: *cudf_packages_pip_cu11} + - {matrix: {cuda: "11.2"}, packages: *cudf_packages_pip_cu11} + - {matrix: null, packages: [*cudf_conda]} + depends_on_cudf_kafka: + common: + - output_types: conda + packages: + - &cudf_kafka_conda cudf_kafka==24.2.* + - output_types: requirements + packages: + # pip recognizes the index as a global option for the requirements.txt file + # This index is needed for rmm, cubinlinker, ptxcompiler. 
+ - --extra-index-url=https://pypi.nvidia.com + specific: + - output_types: [requirements, pyproject] + matrices: + - matrix: {cuda: "12.2"} + packages: &cudf_kafka_packages_pip_cu12 + - cudf_kafka-cu12==24.2.* + - {matrix: {cuda: "12.1"}, packages: *cudf_kafka_packages_pip_cu12} + - {matrix: {cuda: "12.0"}, packages: *cudf_kafka_packages_pip_cu12} + - matrix: {cuda: "11.8"} + packages: &cudf_kafka_packages_pip_cu11 + - cudf_kafka-cu11==24.2.* + - {matrix: {cuda: "11.5"}, packages: *cudf_kafka_packages_pip_cu11} + - {matrix: {cuda: "11.4"}, packages: *cudf_kafka_packages_pip_cu11} + - {matrix: {cuda: "11.2"}, packages: *cudf_kafka_packages_pip_cu11} + - {matrix: null, packages: [*cudf_kafka_conda]} + depends_on_cupy: + common: + - output_types: conda + packages: + - cupy>=12.0.0 + specific: + - output_types: [requirements, pyproject] + matrices: + # All CUDA 12 versions + - matrix: {cuda: "12.2"} + packages: &cupy_packages_cu12 + - cupy-cuda12x>=12.0.0 + - {matrix: {cuda: "12.1"}, packages: *cupy_packages_cu12} + - {matrix: {cuda: "12.0"}, packages: *cupy_packages_cu12} + # All CUDA 11 versions + - matrix: {cuda: "11.8"} + packages: &cupy_packages_cu11 + - cupy-cuda11x>=12.0.0 + - {matrix: {cuda: "11.5"}, packages: *cupy_packages_cu11} + - {matrix: {cuda: "11.4"}, packages: *cupy_packages_cu11} + - {matrix: {cuda: "11.2"}, packages: *cupy_packages_cu11} + - {matrix: null, packages: *cupy_packages_cu11} test_python_pandas_cudf: common: - output_types: pyproject From 3ef13d07057e87cff1cad4e0aa9460b3b5c45459 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 20 Nov 2023 13:02:29 -0600 Subject: [PATCH 30/36] Fix io reference in docs. (#14452) cuDF CI is failing to build docs due to an ambiguous reference `io`. This PR makes that reference unambiguous. ``` /__w/cudf/cudf/docs/cudf/source/user_guide/data-types.md:139: WARNING: Multiple matches found for 'io': pandas:std:label:io, pyarrow:std:label:io, python:py:module:io [myst.iref_ambiguous] ``` I used this output to help me find the object inventory that lists this. ```bash python -m sphinx.ext.intersphinx https://docs.rapids.ai/api/cudf/stable/objects.inv ``` I also looked at the MyST docs on external references. https://mystmd.org/guide/external-references Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/14452 --- docs/cudf/source/user_guide/data-types.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/cudf/source/user_guide/data-types.md b/docs/cudf/source/user_guide/data-types.md index 1f4cfbc7366..e6fe3109c57 100644 --- a/docs/cudf/source/user_guide/data-types.md +++ b/docs/cudf/source/user_guide/data-types.md @@ -136,7 +136,7 @@ dtype: struct StructDtype({'a': dtype('int64'), 'b': dtype('int64')}) ``` -Or by reading them from disk, using a [file format that supports nested data](io). +Or by reading them from disk, using a [file format that supports nested data](/user_guide/io/index.md). 
```python
 >>> pdf = pd.DataFrame({"a": [[1, 2], [3, 4, 5], [6, 7, 8]]})

From 823d3214a9489e3c496aa31041b5d29f650e94b3 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Mon, 20 Nov 2023 16:33:28 -0600
Subject: [PATCH 31/36] Use `pynvjitlink` for CUDA 12+ MVC (#13650)

Fixes https://github.com/rapidsai/cudf/issues/12822

This PR provides minor version compatibility in the CUDA 12.x range through `nvjitlink` via the preliminary [nvjitlink python binding](https://github.com/gmarkall/nvjitlink). Thus far this PR merely leverages a local installation of the library and should not be merged until `nvjitlink` is hosted on `conda-forge` and cuDF's dependencies are adjusted accordingly, likely as part of this PR.

Authors:
  - https://github.com/brandon-b-miller
  - Ashwin Srinath (https://github.com/shwina)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Ashwin Srinath (https://github.com/shwina)

URL: https://github.com/rapidsai/cudf/pull/13650
---
 python/cudf/cudf/tests/test_mvc.py          | 99 +++++++++++++++++++++
 python/cudf/cudf/tests/test_numba_import.py | 48 ----------
 python/cudf/cudf/utils/_numba.py            | 53 ++++++-----
 3 files changed, 128 insertions(+), 72 deletions(-)
 create mode 100644 python/cudf/cudf/tests/test_mvc.py
 delete mode 100644 python/cudf/cudf/tests/test_numba_import.py

diff --git a/python/cudf/cudf/tests/test_mvc.py b/python/cudf/cudf/tests/test_mvc.py
new file mode 100644
index 00000000000..7dd25ebc500
--- /dev/null
+++ b/python/cudf/cudf/tests/test_mvc.py
@@ -0,0 +1,99 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+import subprocess
+import sys
+
+import pytest
+
+IS_CUDA_11 = False
+IS_CUDA_12 = False
+try:
+    from ptxcompiler.patch import safe_get_versions
+except ModuleNotFoundError:
+    from cudf.utils._ptxcompiler import safe_get_versions
+
+# do not test cuda 12 if pynvjitlink isn't present
+HAVE_PYNVJITLINK = False
+try:
+    import pynvjitlink  # noqa: F401
+
+    HAVE_PYNVJITLINK = True
+except ModuleNotFoundError:
+    pass
+
+
+versions = safe_get_versions()
+driver_version, runtime_version = versions
+
+if (11, 0) <= driver_version < (12, 0):
+    IS_CUDA_11 = True
+if (12, 0) <= driver_version < (13, 0):
+    IS_CUDA_12 = True
+
+
+TEST_BODY = """
+@numba.cuda.jit
+def test_kernel(x):
+    id = numba.cuda.grid(1)
+    if id < len(x):
+        x[id] += 1
+
+s = cudf.Series([1, 2, 3])
+with _CUDFNumbaConfig():
+    test_kernel.forall(len(s))(s)
+"""
+
+CUDA_11_TEST = (
+    """
+import numba.cuda
+import cudf
+from cudf.utils._numba import _CUDFNumbaConfig, patch_numba_linker_cuda_11
+
+
+patch_numba_linker_cuda_11()
+"""
+    + TEST_BODY
+)
+
+
+CUDA_12_TEST = (
+    """
+import numba.cuda
+import cudf
+from cudf.utils._numba import _CUDFNumbaConfig
+from pynvjitlink.patch import (
+    patch_numba_linker as patch_numba_linker_pynvjitlink,
+)
+
+patch_numba_linker_pynvjitlink()
+"""
+    + TEST_BODY
+)
+
+
+@pytest.mark.parametrize(
+    "test",
+    [
+        pytest.param(
+            CUDA_11_TEST,
+            marks=pytest.mark.skipif(
+                not IS_CUDA_11,
+                reason="Minor Version Compatibility test for CUDA 11",
+            ),
+        ),
+        pytest.param(
+            CUDA_12_TEST,
+            marks=pytest.mark.skipif(
+                not IS_CUDA_12 or not HAVE_PYNVJITLINK,
+                reason="Minor Version Compatibility test for CUDA 12",
+            ),
+        ),
+    ],
+)
+def test_numba_mvc(test):
+    cp = subprocess.run(
+        [sys.executable, "-c", test],
+        capture_output=True,
+        cwd="/",
+    )
+
+    assert cp.returncode == 0
diff --git a/python/cudf/cudf/tests/test_numba_import.py 
b/python/cudf/cudf/tests/test_numba_import.py deleted file mode 100644 index 238a32a94fa..00000000000 --- a/python/cudf/cudf/tests/test_numba_import.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. -import subprocess -import sys - -import pytest - -IS_CUDA_11 = False -try: - from ptxcompiler.patch import NO_DRIVER, safe_get_versions - - versions = safe_get_versions() - if versions != NO_DRIVER: - driver_version, runtime_version = versions - if driver_version < (12, 0): - IS_CUDA_11 = True -except ModuleNotFoundError: - pass - -TEST_NUMBA_MVC_ENABLED = """ -import numba.cuda -import cudf -from cudf.utils._numba import _CUDFNumbaConfig, _patch_numba_mvc - - -_patch_numba_mvc() - -@numba.cuda.jit -def test_kernel(x): - id = numba.cuda.grid(1) - if id < len(x): - x[id] += 1 - -s = cudf.Series([1, 2, 3]) -with _CUDFNumbaConfig(): - test_kernel.forall(len(s))(s) -""" - - -@pytest.mark.skipif( - not IS_CUDA_11, reason="Minor Version Compatibility test for CUDA 11" -) -def test_numba_mvc_enabled_cuda_11(): - cp = subprocess.run( - [sys.executable, "-c", TEST_NUMBA_MVC_ENABLED], - capture_output=True, - cwd="/", - ) - assert cp.returncode == 0 diff --git a/python/cudf/cudf/utils/_numba.py b/python/cudf/cudf/utils/_numba.py index 09afb5680bd..bc0d6f37d89 100644 --- a/python/cudf/cudf/utils/_numba.py +++ b/python/cudf/cudf/utils/_numba.py @@ -7,6 +7,19 @@ from numba import config as numba_config +try: + from pynvjitlink.patch import ( + patch_numba_linker as patch_numba_linker_pynvjitlink, + ) +except ImportError: + + def patch_numba_linker_pynvjitlink(): + warnings.warn( + "CUDA Toolkit is newer than CUDA driver. " + "Numba features will not work in this configuration. " + ) + + CC_60_PTX_FILE = os.path.join( os.path.dirname(__file__), "../core/udf/shim_60.ptx" ) @@ -65,7 +78,7 @@ def _get_ptx_file(path, prefix): return regular_result[1] -def _patch_numba_mvc(): +def patch_numba_linker_cuda_11(): # Enable the config option for minor version compatibility numba_config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY = 1 @@ -106,29 +119,19 @@ def _setup_numba(): versions = safe_get_versions() if versions != NO_DRIVER: driver_version, runtime_version = versions - if driver_version >= (12, 0) and runtime_version > driver_version: - warnings.warn( - f"Using CUDA toolkit version {runtime_version} with CUDA " - f"driver version {driver_version} requires minor version " - "compatibility, which is not yet supported for CUDA " - "driver versions 12.0 and above. It is likely that many " - "cuDF operations will not work in this state. Please " - f"install CUDA toolkit version {driver_version} to " - "continue using cuDF." 
-            )
-        else:
-            # Support MVC for all CUDA versions in the 11.x range
-            ptx_toolkit_version = _get_cuda_version_from_ptx_file(
-                CC_60_PTX_FILE
-            )
-            # Numba thinks cubinlinker is only needed if the driver is older
-            # than the CUDA runtime, but when PTX files are present, it might
-            # also need to patch because those PTX files may be compiled by
-            # a CUDA version that is newer than the driver as well
-            if (driver_version < ptx_toolkit_version) or (
-                driver_version < runtime_version
-            ):
-                _patch_numba_mvc()
+        ptx_toolkit_version = _get_cuda_version_from_ptx_file(CC_60_PTX_FILE)
+
+        # MVC is required whenever any PTX is newer than the driver
+        # This could be the shipped PTX file or the PTX emitted by
+        # the version of NVVM on the user system, the latter aligning
+        # with the runtime version
+        if (driver_version < ptx_toolkit_version) or (
+            driver_version < runtime_version
+        ):
+            if driver_version < (12, 0):
+                patch_numba_linker_cuda_11()
+            else:
+                patch_numba_linker_pynvjitlink()


 def _get_cuda_version_from_ptx_file(path):
@@ -171,6 +174,8 @@ def _get_cuda_version_from_ptx_file(path):
         "7.8": (11, 8),
         "8.0": (12, 0),
         "8.1": (12, 1),
+        "8.2": (12, 2),
+        "8.3": (12, 3),
     }

     cuda_ver = ver_map.get(version)

From 5831beb80dab9cc23668b5a701d9a92a4797fe70 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic
Date: Mon, 20 Nov 2023 15:39:35 -0800
Subject: [PATCH 32/36] Remove the use of `volatile` in Parquet (#14448)

`volatile` should not be required in our code, unless there are compiler or synchronization issues. This PR removes its use in the Parquet reader and writer.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/14448
---
 cpp/src/io/parquet/decode_preprocess.cu  |  2 +-
 cpp/src/io/parquet/page_data.cu          | 31 +++++++++---------------
 cpp/src/io/parquet/page_decode.cuh       | 29 ++++++++++------------
 cpp/src/io/parquet/page_enc.cu           |  2 +-
 cpp/src/io/parquet/page_string_decode.cu |  4 +--
 5 files changed, 28 insertions(+), 40 deletions(-)

diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu
index 544c93ee616..d9f91ed564c 100644
--- a/cpp/src/io/parquet/decode_preprocess.cu
+++ b/cpp/src/io/parquet/decode_preprocess.cu
@@ -61,7 +61,7 @@ __device__ size_type gpuDecodeTotalPageStringSize(page_state_s* s, int t)
   } else if ((s->col.data_type & 7) == BYTE_ARRAY) {
     str_len = gpuInitStringDescriptors<true>(s, nullptr, target_pos, t);
   }
-  if (!t) { *(int32_t volatile*)&s->dict_pos = target_pos; }
+  if (!t) { s->dict_pos = target_pos; }
   return str_len;
 }

diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu
index 0c53877f7c7..1a94f05498e 100644
--- a/cpp/src/io/parquet/page_data.cu
+++ b/cpp/src/io/parquet/page_data.cu
@@ -39,10 +39,7 @@ constexpr int rolling_buf_size = decode_block_size * 2;
  * @param[in] dstv Pointer to row output data (string descriptor or 32-bit hash)
  */
 template <typename state_buf>
-inline __device__ void gpuOutputString(volatile page_state_s* s,
-                                       volatile state_buf* sb,
-                                       int src_pos,
-                                       void* dstv)
+inline __device__ void gpuOutputString(page_state_s* s, state_buf* sb, int src_pos, void* dstv)
 {
   auto [ptr, len] = gpuGetStringData(s, sb, src_pos);
   // make sure to only hash `BYTE_ARRAY` when specified with the output type size
@@ -69,7 +66,7 @@ inline __device__ void gpuOutputString(volatile page_state_s* s,
  * @param[in] dst Pointer to row output data
  */
 template <typename state_buf>
-inline 
__device__ void gpuOutputBoolean(volatile state_buf* sb, int src_pos, uint8_t* dst)
+inline __device__ void gpuOutputBoolean(state_buf* sb, int src_pos, uint8_t* dst)
 {
   *dst = sb->dict_idx[rolling_index(src_pos)];
 }
@@ -143,8 +140,8 @@ inline __device__ void gpuStoreOutput(uint2* dst,
  * @param[out] dst Pointer to row output data
  */
 template <typename state_buf>
-inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s* s,
-                                               volatile state_buf* sb,
+inline __device__ void gpuOutputInt96Timestamp(page_state_s* s,
+                                               state_buf* sb,
                                                int src_pos,
                                                int64_t* dst)
 {
@@ -218,8 +215,8 @@ inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s* s,
  * @param[in] dst Pointer to row output data
  */
 template <typename state_buf>
-inline __device__ void gpuOutputInt64Timestamp(volatile page_state_s* s,
-                                               volatile state_buf* sb,
+inline __device__ void gpuOutputInt64Timestamp(page_state_s* s,
+                                               state_buf* sb,
                                                int src_pos,
                                                int64_t* dst)
 {
@@ -301,10 +298,7 @@ __device__ void gpuOutputByteArrayAsInt(char const* ptr, int32_t len, T* dst)
  * @param[in] dst Pointer to row output data
 */
 template <typename T, typename state_buf>
-__device__ void gpuOutputFixedLenByteArrayAsInt(volatile page_state_s* s,
-                                                volatile state_buf* sb,
-                                                int src_pos,
-                                                T* dst)
+__device__ void gpuOutputFixedLenByteArrayAsInt(page_state_s* s, state_buf* sb, int src_pos, T* dst)
 {
   uint32_t const dtype_len_in = s->dtype_len_in;
   uint8_t const* data = s->dict_base ? s->dict_base : s->data_start;
@@ -338,10 +332,7 @@ __device__ void gpuOutputFixedLenByteArrayAsInt(volatile page_state_s* s,
  * @param[in] dst Pointer to row output data
 */
 template <typename T, typename state_buf>
-inline __device__ void gpuOutputFast(volatile page_state_s* s,
-                                     volatile state_buf* sb,
-                                     int src_pos,
-                                     T* dst)
+inline __device__ void gpuOutputFast(page_state_s* s, state_buf* sb, int src_pos, T* dst)
 {
   uint8_t const* dict;
   uint32_t dict_pos, dict_size = s->dict_size;
@@ -371,7 +362,7 @@ inline __device__ void gpuOutputFast(volatile page_state_s* s,
 */
 template <typename state_buf>
 static __device__ void gpuOutputGeneric(
-  volatile page_state_s* s, volatile state_buf* sb, int src_pos, uint8_t* dst8, int len)
+  page_state_s* s, state_buf* sb, int src_pos, uint8_t* dst8, int len)
 {
   uint8_t const* dict;
   uint32_t dict_pos, dict_size = s->dict_size;
@@ -512,7 +503,7 @@ __global__ void __launch_bounds__(decode_block_size)
         (s->col.data_type & 7) == FIXED_LEN_BYTE_ARRAY) {
       gpuInitStringDescriptors<false>(s, sb, src_target_pos, t & 0x1f);
     }
-    if (t == 32) { *(volatile int32_t*)&s->dict_pos = src_target_pos; }
+    if (t == 32) { s->dict_pos = src_target_pos; }
   } else {
     // WARP1..WARP3: Decode values
     int const dtype = s->col.data_type & 7;
@@ -601,7 +592,7 @@ __global__ void __launch_bounds__(decode_block_size)
       }
     }

-    if (t == out_thread0) { *(volatile int32_t*)&s->src_pos = target_pos; }
+    if (t == out_thread0) { s->src_pos = target_pos; }
   }
   __syncthreads();
 }
diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh
index 4db9bd3904b..a521f4af039 100644
--- a/cpp/src/io/parquet/page_decode.cuh
+++ b/cpp/src/io/parquet/page_decode.cuh
@@ -71,15 +71,15 @@ struct page_state_s {
   // points to either nesting_decode_cache above when possible, or to the global source otherwise
   PageNestingDecodeInfo* nesting_info{};

-  inline __device__ void set_error_code(decode_error err) volatile
+  inline __device__ void set_error_code(decode_error err)
   {
-    cuda::atomic_ref<int32_t, cuda::thread_scope_block> ref{const_cast<int32_t&>(error)};
+    cuda::atomic_ref<int32_t, cuda::thread_scope_block> ref{error};
     ref.fetch_or(static_cast<int32_t>(err), cuda::std::memory_order_relaxed);
   }

-  inline __device__ void reset_error_code() volatile
+  inline __device__ void 
reset_error_code()
   {
-    cuda::atomic_ref<int32_t, cuda::thread_scope_block> ref{const_cast<int32_t&>(error)};
+    cuda::atomic_ref<int32_t, cuda::thread_scope_block> ref{error};
     ref.store(0, cuda::std::memory_order_release);
   }
 };
@@ -185,8 +185,8 @@ inline __device__ bool is_page_contained(page_state_s* const s, size_t start_row
  * @return A pair containing a pointer to the string and its length
 */
 template <typename state_buf>
-inline __device__ cuda::std::pair<char const*, size_t> gpuGetStringData(page_state_s volatile* s,
-                                                                        state_buf volatile* sb,
+inline __device__ cuda::std::pair<char const*, size_t> gpuGetStringData(page_state_s* s,
+                                                                        state_buf* sb,
                                                                         int src_pos)
 {
   char const* ptr = nullptr;
@@ -232,8 +232,10 @@ inline __device__ cuda::std::pair<char const*, size_t> gpuGetStringData(page_state_s volatile* s,
 * additional values.
 */
 template <bool sizes_only, typename state_buf>
-__device__ cuda::std::pair<int, int> gpuDecodeDictionaryIndices(
-  page_state_s volatile* s, [[maybe_unused]] state_buf volatile* sb, int target_pos, int t)
+__device__ cuda::std::pair<int, int> gpuDecodeDictionaryIndices(page_state_s* s,
+                                                                [[maybe_unused]] state_buf* sb,
+                                                                int target_pos,
+                                                                int t)
 {
   uint8_t const* end = s->data_end;
   int dict_bits = s->dict_bits;
@@ -349,10 +351,7 @@ __device__ cuda::std::pair<int, int> gpuDecodeDictionaryIndices(
 * @return The new output position
 */
 template <typename state_buf>
-inline __device__ int gpuDecodeRleBooleans(page_state_s volatile* s,
-                                           state_buf volatile* sb,
-                                           int target_pos,
-                                           int t)
+inline __device__ int gpuDecodeRleBooleans(page_state_s* s, state_buf* sb, int target_pos, int t)
 {
   uint8_t const* end = s->data_end;
   int64_t pos = s->dict_pos;
@@ -420,10 +419,8 @@ inline __device__ int gpuDecodeRleBooleans(page_state_s volatile* s,
 * @return Total length of strings processed
 */
 template <bool sizes_only, typename state_buf>
-__device__ size_type gpuInitStringDescriptors(page_state_s volatile* s,
-                                              [[maybe_unused]] state_buf volatile* sb,
-                                              int target_pos,
-                                              int t)
+__device__ size_type
+gpuInitStringDescriptors(page_state_s* s, [[maybe_unused]] state_buf* sb, int target_pos, int t)
 {
   int pos = s->dict_pos;
   int total_len = 0;
diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu
index 2b7980c93e9..d75608930d5 100644
--- a/cpp/src/io/parquet/page_enc.cu
+++ b/cpp/src/io/parquet/page_enc.cu
@@ -90,7 +90,7 @@ struct page_enc_state_s {
   uint32_t rle_rpt_count;
   uint32_t page_start_val;
   uint32_t chunk_start_val;
-  volatile uint32_t rpt_map[num_encode_warps];
+  uint32_t rpt_map[num_encode_warps];
   EncPage page;
   EncColumnChunk ck;
   parquet_column_device_view col;
diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu
index e29db042401..916eaa3d681 100644
--- a/cpp/src/io/parquet/page_string_decode.cu
+++ b/cpp/src/io/parquet/page_string_decode.cu
@@ -851,7 +851,7 @@ __global__ void __launch_bounds__(decode_block_size)
       } else {
         gpuInitStringDescriptors<false>(s, sb, src_target_pos, lane_id);
       }
-      if (t == 32) { *(volatile int32_t*)&s->dict_pos = src_target_pos; }
+      if (t == 32) { s->dict_pos = src_target_pos; }
     } else {
       int const me = t - out_thread0;
@@ -934,7 +934,7 @@
       }
     }

-    if (t == out_thread0) { *(volatile int32_t*)&s->src_pos = target_pos; }
+    if (t == out_thread0) { s->src_pos = target_pos; }
   }
   __syncthreads();
 }

From 947081f5b10ca972826942b84c5c2530050325d8 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Tue, 21 Nov 2023 00:45:37 +0000
Subject: [PATCH 33/36] Remove warning in dask-cudf docs (#14454)

Move `from_delayed` and `concat` to appropriate subsections.

- Closes #14299

Authors:
  - Lawrence Mitchell (https://github.com/wence-)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/14454
---
 docs/dask_cudf/source/api.rst | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/docs/dask_cudf/source/api.rst b/docs/dask_cudf/source/api.rst
index 893f5dd7434..db32f4bbcb3 100644
--- a/docs/dask_cudf/source/api.rst
+++ b/docs/dask_cudf/source/api.rst
@@ -19,6 +19,7 @@ data reading facilities, followed by calling
    :members:
      from_cudf,
      from_dask_dataframe,
+     from_delayed,
      read_csv,
      read_json,
      read_orc,
@@ -26,14 +27,6 @@ data reading facilities, followed by calling
      read_text,
      read_parquet

-.. warning::
-
-   FIXME: where should the following live?
-
-   .. autofunction:: dask_cudf.concat
-
-   .. autofunction:: dask_cudf.from_delayed
-
 Grouping
 ========

@@ -77,3 +70,7 @@ identical. The full API is provided below.
    :members:
    :inherited-members:
    :show-inheritance:
+
+.. automodule:: dask_cudf
+   :members:
+     concat

From fcc89503c1f1e15ec287519959013adcf2bf8a52 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 21 Nov 2023 05:19:11 -1000
Subject: [PATCH 34/36] Preserve DataFrame(columns=).columns dtype during empty-like construction (#14381)

`.columns` used to always return `pd.Index([], dtype=object)` even if an
empty typed index (e.g. `DatetimeIndex([])`) was passed as `columns` into
the DataFrame constructor. We needed to preserve some information about
what column dtype was passed in so we can return a correctly typed Index.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/14381
---
 python/cudf/cudf/core/column_accessor.py | 14 +++++++++++++-
 python/cudf/cudf/core/dataframe.py       |  6 ++++++
 python/cudf/cudf/tests/test_dataframe.py | 11 +++++++++++
 3 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index 93105b4a252..b106b8bbb02 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -27,6 +27,7 @@
 from cudf.core import column

 if TYPE_CHECKING:
+    from cudf._typing import Dtype
     from cudf.core.column import ColumnBase
@@ -99,6 +100,9 @@ class ColumnAccessor(abc.MutableMapping):
     rangeindex : bool, optional
         Whether the keys should be returned as a RangeIndex
        in `to_pandas_index` (default=False).
+    label_dtype : Dtype, optional
+        What dtype should be returned in `to_pandas_index`
+        (default=None).
     """

     _data: "Dict[Any, ColumnBase]"
@@ -111,8 +115,10 @@ def __init__(
         multiindex: bool = False,
         level_names=None,
         rangeindex: bool = False,
+        label_dtype: Dtype | None = None,
     ):
         self.rangeindex = rangeindex
+        self.label_dtype = label_dtype
         if data is None:
             data = {}
         # TODO: we should validate the keys of `data`
@@ -123,6 +129,7 @@ def __init__(
             self.multiindex = multiindex
             self._level_names = level_names
             self.rangeindex = data.rangeindex
+            self.label_dtype = data.label_dtype
         else:
             # This code path is performance-critical for copies and should be
             # modified with care.
@@ -292,7 +299,12 @@ def to_pandas_index(self) -> pd.Index:
                     self.names[0], self.names[-1] + diff, diff
                 )
                 return pd.RangeIndex(new_range, name=self.name)
-        result = pd.Index(self.names, name=self.name, tupleize_cols=False)
+        result = pd.Index(
+            self.names,
+            name=self.name,
+            tupleize_cols=False,
+            dtype=self.label_dtype,
+        )
         return result

     def insert(
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index fd4a15a3391..43ae9b9e81e 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -734,6 +734,7 @@ def __init__(
                 rangeindex = isinstance(
                     columns, (range, pd.RangeIndex, cudf.RangeIndex)
                 )
+                label_dtype = getattr(columns, "dtype", None)
                 self._data = ColumnAccessor(
                     {
                         k: column.column_empty(
@@ -745,6 +746,7 @@ def __init__(
                     if isinstance(columns, pd.Index)
                     else None,
                     rangeindex=rangeindex,
+                    label_dtype=label_dtype,
                 )
             elif isinstance(data, ColumnAccessor):
                 raise TypeError(
@@ -995,12 +997,15 @@ def _init_from_list_like(self, data, index=None, columns=None):
             self._data.rangeindex = isinstance(
                 columns, (range, pd.RangeIndex, cudf.RangeIndex)
             )
+            self._data.label_dtype = getattr(columns, "dtype", None)

     @_cudf_nvtx_annotate
     def _init_from_dict_like(
         self, data, index=None, columns=None, nan_as_null=None
     ):
+        label_dtype = None
         if columns is not None:
+            label_dtype = getattr(columns, "dtype", None)
             # remove all entries in data that are not in columns,
             # inserting new empty columns for entries in columns that
             # are not in data
@@ -1069,6 +1074,7 @@ def _init_from_dict_like(
             if isinstance(columns, pd.Index)
             else self._data._level_names
         )
+        self._data.label_dtype = label_dtype

     @classmethod
     def _from_data(
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 5677f97408a..74165731683 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -4566,6 +4566,17 @@ def test_dataframe_columns_returns_rangeindex_single_col():
     assert_eq(result, expected)


+@pytest.mark.parametrize("dtype", ["int64", "datetime64[ns]", "int8"])
+@pytest.mark.parametrize("idx_data", [[], [1, 2]])
+@pytest.mark.parametrize("data", [None, [], {}])
+def test_dataframe_columns_empty_data_preserves_dtype(dtype, idx_data, data):
+    result = cudf.DataFrame(
+        data, columns=cudf.Index(idx_data, dtype=dtype)
+    ).columns
+    expected = pd.Index(idx_data, dtype=dtype)
+    assert_eq(result, expected)
+
+
 @pytest.mark.parametrize(
     "data",
     [

From cfa2d513667edabda6c4487f15f251f757f0c94d Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Wed, 22 Nov 2023 10:21:52 -0500
Subject: [PATCH 35/36] Expose stream parameter in public nvtext APIs (#14456)

Add stream parameter to public APIs:

- `nvtext::is_letter()`
- `nvtext::porter_stemmer_measure()`
- `nvtext::edit_distance()`
- `nvtext::edit_distance_matrix()`

Also cleaned up some of the doxygen comments and added stream gtests.
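For illustration only (not part of this patch), a minimal caller-side
sketch of the updated signatures; `input` and `targets` are assumed to be
pre-existing cudf::strings_column_view objects owned by the caller:

    rmm::cuda_stream stream;  // caller-owned stream; omitting the argument
                              // falls back to cudf::get_default_stream()
    auto distances = nvtext::edit_distance(input, targets, stream.view());
    auto matrix    = nvtext::edit_distance_matrix(input, stream.view());
    auto measures  = nvtext::porter_stemmer_measure(input, stream.view());
    auto vowels    = nvtext::is_letter(input, nvtext::letter_type::VOWEL, 0, stream.view());

Because the stream parameter is defaulted, existing callers compile
unchanged.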
Reference #13744

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/cudf/pull/14456
---
 cpp/include/nvtext/edit_distance.hpp          | 42 +++++++++--------
 cpp/include/nvtext/stemmer.hpp                | 46 +++++++++++--------
 cpp/src/text/edit_distance.cu                 | 10 ++--
 cpp/src/text/stemmer.cu                       | 20 ++++----
 cpp/tests/CMakeLists.txt                      | 10 +++-
 cpp/tests/streams/text/edit_distance_test.cpp | 33 +++++++++++++
 cpp/tests/streams/text/stemmer_test.cpp       | 42 +++++++++++++++++
 7 files changed, 148 insertions(+), 55 deletions(-)
 create mode 100644 cpp/tests/streams/text/edit_distance_test.cpp
 create mode 100644 cpp/tests/streams/text/stemmer_test.cpp

diff --git a/cpp/include/nvtext/edit_distance.hpp b/cpp/include/nvtext/edit_distance.hpp
index 953ecf7734d..9a24662455b 100644
--- a/cpp/include/nvtext/edit_distance.hpp
+++ b/cpp/include/nvtext/edit_distance.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -30,7 +30,7 @@ namespace nvtext {
 /**
  * @brief Compute the edit distance between individual strings in two strings columns.
  *
- * The `output[i]` is the edit distance between `strings[i]` and `targets[i]`.
+ * The `output[i]` is the edit distance between `input[i]` and `targets[i]`.
  * This edit distance calculation uses the Levenshtein algorithm as documented here:
  * https://www.cuelogic.com/blog/the-levenshtein-algorithm
  *
@@ -42,23 +42,25 @@ namespace nvtext {
  *   d is now [1, 7, 0]
  * @endcode
  *
- * Any null entries for either `strings` or `targets` is ignored and the edit distance
+ * Any null entries for either `input` or `targets` is ignored and the edit distance
  * is computed as though the null entry is an empty string.
  *
- * The `targets.size()` must equal `strings.size()` unless `targets.size()==1`.
- * In this case, all `strings` will be computed against the single `targets[0]` string.
+ * The `targets.size()` must equal `input.size()` unless `targets.size()==1`.
+ * In this case, all `input` will be computed against the single `targets[0]` string.
  *
- * @throw cudf::logic_error if `targets.size() != strings.size()` and
+ * @throw cudf::logic_error if `targets.size() != input.size()` and
  *                          if `targets.size() != 1`
  *
- * @param strings Strings column of input strings
- * @param targets Strings to compute edit distance against `strings`
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New strings columns of with replaced strings.
+ * @param input Strings column of input strings
+ * @param targets Strings to compute edit distance against `input`
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New column of edit distance values
 */
std::unique_ptr<cudf::column> edit_distance(
-  cudf::strings_column_view const& strings,
+  cudf::strings_column_view const& input,
   cudf::strings_column_view const& targets,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
@@ -67,7 +69,7 @@ std::unique_ptr<cudf::column> edit_distance(
 * This uses the Levenshtein algorithm to calculate the edit distance between
 * two strings as documented here: https://www.cuelogic.com/blog/the-levenshtein-algorithm
 *
- * The output is essentially a `strings.size() x strings.size()` square matrix of integers.
+ * The output is essentially an `input.size() x input.size()` square matrix of integers.
 * All values at diagonal `row == col` are 0 since the edit distance between two identical
 * strings is zero. All values above the diagonal are reflected below since the edit distance
 * calculation is also commutative.
@@ -81,20 +83,22 @@ std::unique_ptr<cudf::column> edit_distance(
 *    [1, 2, 0]]
 * @endcode
 *
- * Null entries for `strings` are ignored and the edit distance
+ * Null entries for `input` are ignored and the edit distance
 * is computed as though the null entry is an empty string.
 *
- * The output is a lists column of size `strings.size()` and where each list item
- * is `strings.size()` elements.
+ * The output is a lists column of size `input.size()` and where each list item
+ * is `input.size()` elements.
 *
 * @throw cudf::logic_error if `strings.size() == 1`
 *
- * @param strings Strings column of input strings
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New lists column of edit distance values.
+ * @param input Strings column of input strings
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New lists column of edit distance values
 */
std::unique_ptr<cudf::column> edit_distance_matrix(
-  cudf::strings_column_view const& strings,
+  cudf::strings_column_view const& input,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */  // end of group
diff --git a/cpp/include/nvtext/stemmer.hpp b/cpp/include/nvtext/stemmer.hpp
index 0a57f8944d4..0e1759fdc5a 100644
--- a/cpp/include/nvtext/stemmer.hpp
+++ b/cpp/include/nvtext/stemmer.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -56,7 +56,7 @@ enum class letter_type {
 *
 * A negative index value will check the character starting from the end
 * of each string. That is, for `character_index < 0` the letter checked for string
- * `strings[i]` is at position `strings[i].length + index`.
+ * `input[i]` is at position `input[i].length + index`.
 *
 * @code{.pseudo}
 * Example:
 *
@@ -68,20 +68,22 @@ enum class letter_type {
 * A null input element at row `i` produces a corresponding null entry
 * for row `i` in the output column.
 *
- * @param strings Strings column of words to measure.
- * @param ltype Specify letter type to check.
- * @param character_index The character position to check in each string.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New BOOL column.
+ * @param input Strings column of words to measure
+ * @param ltype Specify letter type to check
+ * @param character_index The character position to check in each string
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New BOOL column
 */
std::unique_ptr<cudf::column> is_letter(
-  cudf::strings_column_view const& strings,
+  cudf::strings_column_view const& input,
   letter_type ltype,
   cudf::size_type character_index,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
- * @brief Returns boolean column indicating if character at `indices[i]` of `strings[i]`
+ * @brief Returns boolean column indicating if character at `indices[i]` of `input[i]`
 *        is a consonant or vowel.
 *
 * Determining consonants and vowels is described in the following
@@ -116,19 +118,21 @@ std::unique_ptr<cudf::column> is_letter(
 * A null input element at row `i` produces a corresponding null entry
 * for row `i` in the output column.
 *
- * @throw cudf::logic_error if `indices.size() != strings.size()`
+ * @throw cudf::logic_error if `indices.size() != input.size()`
 * @throw cudf::logic_error if `indices` contain nulls.
 *
- * @param strings Strings column of words to measure.
- * @param ltype Specify letter type to check.
- * @param indices The character positions to check in each string.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New BOOL column.
+ * @param input Strings column of words to measure
+ * @param ltype Specify letter type to check
+ * @param indices The character positions to check in each string
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New BOOL column
 */
std::unique_ptr<cudf::column> is_letter(
-  cudf::strings_column_view const& strings,
+  cudf::strings_column_view const& input,
   letter_type ltype,
   cudf::column_view const& indices,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
@@ -155,12 +159,14 @@ std::unique_ptr<cudf::column> is_letter(
 * A null input element at row `i` produces a corresponding null entry
 * for row `i` in the output column.
 *
- * @param strings Strings column of words to measure.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New INT32 column of measure values.
+ * @param input Strings column of words to measure
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New INT32 column of measure values
 */
std::unique_ptr<cudf::column> porter_stemmer_measure(
-  cudf::strings_column_view const& strings,
+  cudf::strings_column_view const& input,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */  // end of group
diff --git a/cpp/src/text/edit_distance.cu b/cpp/src/text/edit_distance.cu
index 3d5f2d72e6f..a1d97409987 100644
--- a/cpp/src/text/edit_distance.cu
+++ b/cpp/src/text/edit_distance.cu
@@ -298,22 +298,24 @@ std::unique_ptr<cudf::column> edit_distance_matrix(cudf::strings_column_view con
 /**
  * @copydoc nvtext::edit_distance
  */
-std::unique_ptr<cudf::column> edit_distance(cudf::strings_column_view const& strings,
+std::unique_ptr<cudf::column> edit_distance(cudf::strings_column_view const& input,
                                             cudf::strings_column_view const& targets,
+                                            rmm::cuda_stream_view stream,
                                             rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::edit_distance(strings, targets, cudf::get_default_stream(), mr);
+  return detail::edit_distance(input, targets, stream, mr);
 }

 /**
  * @copydoc nvtext::edit_distance_matrix
  */
-std::unique_ptr<cudf::column> edit_distance_matrix(cudf::strings_column_view const& strings,
+std::unique_ptr<cudf::column> edit_distance_matrix(cudf::strings_column_view const& input,
+                                                   rmm::cuda_stream_view stream,
                                                    rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::edit_distance_matrix(strings, cudf::get_default_stream(), mr);
+  return detail::edit_distance_matrix(input, stream, mr);
 }

 }  // namespace nvtext
diff --git a/cpp/src/text/stemmer.cu b/cpp/src/text/stemmer.cu
index 2b2b8429d9c..bdcb0b2af32 100644
--- a/cpp/src/text/stemmer.cu
+++ b/cpp/src/text/stemmer.cu
@@ -250,36 +250,36 @@ std::unique_ptr<cudf::column> is_letter(cudf::strings_column_view const& strings

 // external APIs

-std::unique_ptr<cudf::column> is_letter(cudf::strings_column_view const& strings,
+std::unique_ptr<cudf::column> is_letter(cudf::strings_column_view const& input,
                                         letter_type ltype,
                                         cudf::size_type character_index,
+                                        rmm::cuda_stream_view stream,
                                         rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::is_letter(strings,
-                           ltype,
-                           thrust::make_constant_iterator(character_index),
-                           cudf::get_default_stream(),
-                           mr);
+  return detail::is_letter(
+    input, ltype, thrust::make_constant_iterator(character_index), stream, mr);
 }

-std::unique_ptr<cudf::column> is_letter(cudf::strings_column_view const& strings,
+std::unique_ptr<cudf::column> is_letter(cudf::strings_column_view const& input,
                                         letter_type ltype,
                                         cudf::column_view const& indices,
+                                        rmm::cuda_stream_view stream,
                                         rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::is_letter(strings, ltype, indices, cudf::get_default_stream(), mr);
+  return detail::is_letter(input, ltype, indices, stream, mr);
 }

 /**
  * @copydoc nvtext::porter_stemmer_measure
  */
-std::unique_ptr<cudf::column> porter_stemmer_measure(cudf::strings_column_view const& strings,
+std::unique_ptr<cudf::column> porter_stemmer_measure(cudf::strings_column_view const& input,
+                                                     rmm::cuda_stream_view stream,
                                                      rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::porter_stemmer_measure(strings, cudf::get_default_stream(), mr);
+  return detail::porter_stemmer_measure(input, stream, mr);
 }

 }  // namespace nvtext
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 1be8566fb0f..b35c72b9e9d 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -664,8 +664,14 @@ ConfigureTest(
   testing
 )
 ConfigureTest(
-  STREAM_TEXT_TEST streams/text/ngrams_test.cpp streams/text/replace_test.cpp
-  streams/text/tokenize_test.cpp STREAM_MODE testing
+  STREAM_TEXT_TEST
+  streams/text/edit_distance_test.cpp
+  streams/text/ngrams_test.cpp
+  streams/text/replace_test.cpp
+  streams/text/stemmer_test.cpp
+  streams/text/tokenize_test.cpp
+  STREAM_MODE
+  testing
 )
 ConfigureTest(STREAM_UNARY_TEST streams/unary_test.cpp STREAM_MODE testing)
diff --git a/cpp/tests/streams/text/edit_distance_test.cpp b/cpp/tests/streams/text/edit_distance_test.cpp
new file mode 100644
index 00000000000..59206c39e69
--- /dev/null
+++ b/cpp/tests/streams/text/edit_distance_test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+#include <nvtext/edit_distance.hpp>
+
+class TextEditDistanceTest : public cudf::test::BaseFixture {};
+
+TEST_F(TextEditDistanceTest, EditDistance)
+{
+  auto const input       = cudf::test::strings_column_wrapper({"dog", "cat", "mouse", "pupper"});
+  auto const input_view  = cudf::strings_column_view(input);
+  auto const target      = cudf::test::strings_column_wrapper({"hog", "cake", "house", "puppy"});
+  auto const target_view = cudf::strings_column_view(target);
+  nvtext::edit_distance(input_view, target_view, cudf::test::get_default_stream());
+  nvtext::edit_distance_matrix(input_view, cudf::test::get_default_stream());
+}
diff --git a/cpp/tests/streams/text/stemmer_test.cpp b/cpp/tests/streams/text/stemmer_test.cpp
new file mode 100644
index 00000000000..7aa51befa73
--- /dev/null
+++ b/cpp/tests/streams/text/stemmer_test.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+#include <nvtext/stemmer.hpp>
+
+class TextStemmerTest : public cudf::test::BaseFixture {};
+
+TEST_F(TextStemmerTest, IsLetter)
+{
+  auto const input =
+    cudf::test::strings_column_wrapper({"abbey", "normal", "creates", "yearly", "trouble"});
+  auto const view      = cudf::strings_column_view(input);
+  auto const delimiter = cudf::string_scalar{" ", true, cudf::test::get_default_stream()};
+  nvtext::is_letter(view, nvtext::letter_type::VOWEL, 0, cudf::test::get_default_stream());
+  auto const indices = cudf::test::fixed_width_column_wrapper<cudf::size_type>({0, 1, 3, 5, 4});
+  nvtext::is_letter(view, nvtext::letter_type::VOWEL, indices, cudf::test::get_default_stream());
+}
+
+TEST_F(TextStemmerTest, Porter)
+{
+  auto const input =
+    cudf::test::strings_column_wrapper({"abbey", "normal", "creates", "yearly", "trouble"});
+  auto const view = cudf::strings_column_view(input);
+  nvtext::porter_stemmer_measure(view, cudf::test::get_default_stream());
+}

From f02fde9de9354a829d6f4425e086c84d36c076ae Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Wed, 22 Nov 2023 18:26:18 +0000
Subject: [PATCH 36/36] Correct dtype of count aggregations on empty dataframes (#14473)

A count aggregation should always return an int64 column, even if the
grouped dataframe is empty. Previously we did not do this because the
short-circuiting for empty inputs was hit before handling the count case.
Fix this by reordering the conditions.

- Closes https://github.com/rapidsai/cudf/issues/14200

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/14473
---
 python/cudf/cudf/core/groupby/groupby.py   |  6 +++---
 python/cudf/cudf/tests/groupby/test_agg.py | 18 ++++++++++++++++++
 2 files changed, 21 insertions(+), 3 deletions(-)
 create mode 100644 python/cudf/cudf/tests/groupby/test_agg.py

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index c48e5109ff2..73e6774f5ce 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -592,7 +592,9 @@ def agg(self, func):
                 # Structs lose their labels which we reconstruct here
                 col = col._with_type_metadata(cudf.ListDtype(orig_dtype))

-            if (
+            if agg_kind in {"COUNT", "SIZE"}:
+                data[key] = col.astype("int64")
+            elif (
                 self.obj.empty
                 and (
                     isinstance(agg_name, str)
@@ -609,8 +611,6 @@ def agg(self, func):
                 )
             ):
                 data[key] = col.astype(orig_dtype)
-            elif agg_kind in {"COUNT", "SIZE"}:
-                data[key] = col.astype("int64")
             else:
                 data[key] = col
         data = ColumnAccessor(data, multiindex=multilevel)
diff --git a/python/cudf/cudf/tests/groupby/test_agg.py b/python/cudf/cudf/tests/groupby/test_agg.py
new file mode 100644
index 00000000000..7919ee4a9f1
--- /dev/null
+++ b/python/cudf/cudf/tests/groupby/test_agg.py
@@ -0,0 +1,18 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+import numpy as np
+import pytest
+
+import cudf
+
+
+@pytest.mark.parametrize(
+    "empty",
+    [True, False],
+    ids=["empty", "nonempty"],
+)
+def test_agg_count_dtype(empty):
+    df = cudf.DataFrame({"a": [1, 2, 1], "c": ["a", "b", "c"]})
+    if empty:
+        df = df.iloc[:0]
+    result = df.groupby("a").agg({"c": "count"})
+    assert result["c"].dtype == np.dtype("int64")
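To make the behavior change concrete, a small illustrative snippet (not
part of the patch; mirrors the new test above, with the result annotated
under the assumption the fix is applied):

    import cudf
    import numpy as np

    df = cudf.DataFrame({"a": [1, 2, 1], "c": ["a", "b", "c"]}).iloc[:0]  # empty frame
    out = df.groupby("a").agg({"c": "count"})
    # The empty-input short circuit used to restore the original column dtype;
    # with the reordered conditions, count always yields int64.
    assert out["c"].dtype == np.dtype("int64")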