From 02879724b3073675ff3384071ef9227427f57b41 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Wed, 23 Oct 2024 19:16:59 -0400
Subject: [PATCH 01/26] Use the full ref name of `rmm.DeviceBuffer` in the
 sphinx config file (#17150)

This is an improvement PR that uses the full name of `rmm.DeviceBuffer` in the sphinx config file. It's a follow-up to this [comment](https://github.com/rapidsai/cudf/pull/16913#discussion_r1792283249).

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/17150
---
 docs/cudf/source/conf.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py
index ecf619ddc44..5942cc16850 100644
--- a/docs/cudf/source/conf.py
+++ b/docs/cudf/source/conf.py
@@ -342,10 +342,7 @@ def clean_all_xml_files(path):
     "cudf.Series": ("cudf.core.series.Series", "cudf.Series"),
     "cudf.Index": ("cudf.core.index.Index", "cudf.Index"),
     "cupy.core.core.ndarray": ("cupy.ndarray", "cupy.ndarray"),
-    # TODO: Replace the first entry in a follow-up with rmm.pylibrmm.device_buffer.DeviceBuffer
-    # when the RMM objects inventory is generated from branch-24.12. The RMM objects inventory
-    # can be accessed here : https://docs.rapids.ai/api/rmm/nightly/objects.inv
-    "DeviceBuffer": ("rmm.DeviceBuffer", "rmm.DeviceBuffer"),
+    "DeviceBuffer": ("rmm.pylibrmm.device_buffer.DeviceBuffer", "rmm.DeviceBuffer"),
 }
 
 

From d7cdf44da2ba921c6fa63feff8749d141643f76e Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Wed, 23 Oct 2024 20:19:57 -0400
Subject: [PATCH 02/26] Migrate NVText Stemming APIs to pylibcudf (#17085)

Part of #15162

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/17085
---
 cpp/include/nvtext/stemmer.hpp                |  8 +-
 .../api_docs/pylibcudf/nvtext/index.rst       |  1 +
 .../api_docs/pylibcudf/nvtext/stemmer.rst     |  6 ++
 python/cudf/cudf/_lib/nvtext/stemmer.pyx      | 56 +++++---------
 .../pylibcudf/libcudf/nvtext/stemmer.pxd      |  7 +-
 .../pylibcudf/pylibcudf/nvtext/CMakeLists.txt |  2 +-
 .../pylibcudf/pylibcudf/nvtext/__init__.pxd   |  2 +
 python/pylibcudf/pylibcudf/nvtext/__init__.py |  2 +
 python/pylibcudf/pylibcudf/nvtext/stemmer.pxd | 14 ++++
 python/pylibcudf/pylibcudf/nvtext/stemmer.pyx | 76 +++++++++++++++++++
 .../pylibcudf/tests/test_nvtext_stemmer.py    | 47 ++++++++++++
 11 files changed, 178 insertions(+), 43 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/stemmer.rst
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/stemmer.pxd
 create mode 100644 python/pylibcudf/pylibcudf/nvtext/stemmer.pyx
 create mode 100644 python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py

diff --git a/cpp/include/nvtext/stemmer.hpp b/cpp/include/nvtext/stemmer.hpp
index 55a4124bfd0..e5b2a4cc21b 100644
--- a/cpp/include/nvtext/stemmer.hpp
+++ b/cpp/include/nvtext/stemmer.hpp
@@ -51,7 +51,7 @@ enum class letter_type {
  *
  * @code{.pseudo}
  * Example:
- * st = ["trouble", "toy", "sygyzy"]
+ * st = ["trouble", "toy", "syzygy"]
  * b1 = is_letter(st, VOWEL, 1)
  * b1 is now [false, true, true]
  * @endcode
@@ -62,7 +62,7 @@ enum class letter_type {
  *
  * @code{.pseudo}
  * Example:
- * st = ["trouble", "toy", "sygyzy"]
+ * st = ["trouble", "toy", "syzygy"]
  * b2 = is_letter(st, CONSONANT, -1) // last letter checked in each string
  * b2 is now [false, true, false]
  * @endcode
@@ -99,7 +99,7 @@ std::unique_ptr<cudf::column> is_letter(
  *
  * @code{.pseudo}
  * Example:
- * st = ["trouble", "toy", "sygyzy"]
+ * st = ["trouble", "toy", "syzygy"]
  * ix = [3, 1, 4]
  * b1 = is_letter(st, VOWEL, ix)
  * b1 is now [true, true, false]
@@ -111,7 +111,7 @@ std::unique_ptr<cudf::column> is_letter(
  *
  * @code{.pseudo}
  * Example:
- * st = ["trouble", "toy", "sygyzy"]
+ * st = ["trouble", "toy", "syzygy"]
  * ix = [3, -2, 4] // 2nd to last character in st[1] is checked
  * b2 = is_letter(st, CONSONANT, ix)
  * b2 is now [false, false, true]
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
index c5b9533597a..e0735a197fd 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst
@@ -11,3 +11,4 @@ nvtext
     ngrams_tokenize
     normalize
     replace
+    stemmer
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/stemmer.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/stemmer.rst
new file mode 100644
index 00000000000..b407ff8451a
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/stemmer.rst
@@ -0,0 +1,6 @@
+=======
+stemmer
+=======
+
+.. automodule:: pylibcudf.nvtext.stemmer
+   :members:
diff --git a/python/cudf/cudf/_lib/nvtext/stemmer.pyx b/python/cudf/cudf/_lib/nvtext/stemmer.pyx
index 5bf25562fed..63a389b64d5 100644
--- a/python/cudf/cudf/_lib/nvtext/stemmer.pyx
+++ b/python/cudf/cudf/_lib/nvtext/stemmer.pyx
@@ -1,24 +1,19 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from cudf.core.buffer import acquire_spill_lock
-
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
 from enum import IntEnum
 
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
+from cudf.core.buffer import acquire_spill_lock
+
 from pylibcudf.libcudf.nvtext.stemmer cimport (
-    is_letter as cpp_is_letter,
     letter_type,
-    porter_stemmer_measure as cpp_porter_stemmer_measure,
     underlying_type_t_letter_type,
 )
 from pylibcudf.libcudf.types cimport size_type
 
 from cudf._lib.column cimport Column
 
+from pylibcudf import nvtext
+
 
 class LetterType(IntEnum):
     CONSONANT = <underlying_type_t_letter_type> letter_type.CONSONANT
@@ -27,43 +22,34 @@ class LetterType(IntEnum):
 
 @acquire_spill_lock()
 def porter_stemmer_measure(Column strings):
-    cdef column_view c_strings = strings.view()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(cpp_porter_stemmer_measure(c_strings))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(
+        nvtext.stemmer.porter_stemmer_measure(
+            strings.to_pylibcudf(mode="read"),
+        )
+    )
 
 
 @acquire_spill_lock()
 def is_letter(Column strings,
               object ltype,
               size_type index):
-    cdef column_view c_strings = strings.view()
-    cdef letter_type c_ltype = <letter_type>(
-        <underlying_type_t_letter_type> ltype
+    return Column.from_pylibcudf(
+        nvtext.stemmer.is_letter(
+            strings.to_pylibcudf(mode="read"),
+            ltype==LetterType.VOWEL,
+            index,
+        )
     )
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(cpp_is_letter(c_strings, c_ltype, index))
-
-    return Column.from_unique_ptr(move(c_result))
 
 
 @acquire_spill_lock()
 def is_letter_multi(Column strings,
                     object ltype,
                     Column indices):
-    cdef column_view c_strings = strings.view()
-    cdef column_view c_indices = indices.view()
-    cdef letter_type c_ltype = <letter_type>(
-        <underlying_type_t_letter_type> ltype
+    return Column.from_pylibcudf(
+        nvtext.stemmer.is_letter(
+            strings.to_pylibcudf(mode="read"),
+            ltype==LetterType.VOWEL,
+            indices.to_pylibcudf(mode="read"),
+        )
     )
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(cpp_is_letter(c_strings, c_ltype, c_indices))
-
-    return Column.from_unique_ptr(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd
index 673bffa28ae..be3a2d75718 100644
--- a/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd
@@ -1,6 +1,7 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport int32_t
+from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.column.column_view cimport column_view
@@ -8,9 +9,9 @@ from pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "nvtext/stemmer.hpp" namespace "nvtext" nogil:
-    ctypedef enum letter_type:
-        CONSONANT 'nvtext::letter_type::CONSONANT'
-        VOWEL 'nvtext::letter_type::VOWEL'
+    cpdef enum class letter_type:
+        CONSONANT
+        VOWEL
 
     cdef unique_ptr[column] porter_stemmer_measure(
         const column_view & strings
diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
index 7a94490998a..d97c0a73267 100644
--- a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt
@@ -13,7 +13,7 @@
 # =============================================================================
 
 set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx
-                   ngrams_tokenize.pyx normalize.pyx replace.pyx
+                   ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx
 )
 
 set(linked_libraries cudf::cudf)
diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd
index 5a5e665d309..a658e57018e 100644
--- a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd
@@ -8,6 +8,7 @@ from . cimport (
     ngrams_tokenize,
     normalize,
     replace,
+    stemmer,
 )
 
 __all__ = [
@@ -18,4 +19,5 @@ __all__ = [
     "ngrams_tokenize",
     "normalize",
     "replace",
+    "stemmer",
 ]
diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py
index 77187f0845d..2c1feb089a2 100644
--- a/python/pylibcudf/pylibcudf/nvtext/__init__.py
+++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py
@@ -8,6 +8,7 @@
     ngrams_tokenize,
     normalize,
     replace,
+    stemmer,
 )
 
 __all__ = [
@@ -18,4 +19,5 @@
     "ngrams_tokenize",
     "normalize",
     "replace",
+    "stemmer",
 ]
diff --git a/python/pylibcudf/pylibcudf/nvtext/stemmer.pxd b/python/pylibcudf/pylibcudf/nvtext/stemmer.pxd
new file mode 100644
index 00000000000..48762efc01f
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/stemmer.pxd
@@ -0,0 +1,14 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.nvtext.stemmer cimport letter_type
+from pylibcudf.libcudf.types cimport size_type
+
+ctypedef fused ColumnOrSize:
+    Column
+    size_type
+
+cpdef Column is_letter(Column input, bool check_vowels, ColumnOrSize indices)
+
+cpdef Column porter_stemmer_measure(Column input)
diff --git a/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx
new file mode 100644
index 00000000000..854d1053624
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx
@@ -0,0 +1,76 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.nvtext.stemmer cimport (
+    is_letter as cpp_is_letter,
+    letter_type,
+    porter_stemmer_measure as cpp_porter_stemmer_measure,
+)
+from pylibcudf.libcudf.types cimport size_type
+
+
+cpdef Column is_letter(
+    Column input,
+    bool check_vowels,
+    ColumnOrSize indices
+):
+    """
+    Returns boolean column indicating if the character
+    or characters at the provided character index or
+    indices (respectively) are consonants or vowels
+
+    For details, see :cpp:func:`is_letter`
+
+    Parameters
+    ----------
+    input : Column
+        Input strings
+    check_vowels : bool
+        If true, the check is for vowels. Otherwise the check is
+        for consonants.
+    indices : Union[Column, size_type]
+        The character position(s) to check in each string
+
+    Returns
+    -------
+    Column
+        New boolean column.
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = cpp_is_letter(
+            input.view(),
+            letter_type.VOWEL if check_vowels else letter_type.CONSONANT,
+            indices if ColumnOrSize is size_type else indices.view()
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column porter_stemmer_measure(Column input):
+    """
+    Returns the Porter Stemmer measurements of a strings column.
+
+    For details, see :cpp:func:`porter_stemmer_measure`
+
+    Parameters
+    ----------
+    input : Column
+        Strings column of words to measure
+
+    Returns
+    -------
+    Column
+        New column of measure values
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = cpp_porter_stemmer_measure(input.view())
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py
new file mode 100644
index 00000000000..75d56f587a4
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py
@@ -0,0 +1,47 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pylibcudf as plc
+import pytest
+from utils import assert_column_eq
+
+
+@pytest.fixture(scope="module")
+def input_col():
+    arr = ["trouble", "toy", "syzygy"]
+    return pa.array(arr)
+
+
+@pytest.mark.parametrize("check_vowels", [True, False])
+@pytest.mark.parametrize("indices", [[3, 1, 4], 1])
+def test_is_letter(input_col, check_vowels, indices):
+    def is_letter(s, i, check):
+        vowels = "aeiouy"
+        return (s[i] in vowels) == check
+
+    result = plc.nvtext.stemmer.is_letter(
+        plc.interop.from_arrow(input_col),
+        check_vowels,
+        plc.interop.from_arrow(pa.array(indices))
+        if isinstance(indices, list)
+        else indices,
+    )
+    expected = pa.array(
+        [
+            is_letter(
+                s,
+                indices[i] if isinstance(indices, list) else indices,
+                check_vowels,
+            )
+            for i, s in enumerate(input_col.to_pylist())
+        ]
+    )
+    assert_column_eq(result, expected)
+
+
+def test_porter_stemmer_measure(input_col):
+    result = plc.nvtext.stemmer.porter_stemmer_measure(
+        plc.interop.from_arrow(input_col),
+    )
+    expected = pa.array([1, 1, 2], type=pa.int32())
+    assert_column_eq(result, expected)

From 3a623149827ec347e721dd1a18072f18b0b4bcc1 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Thu, 24 Oct 2024 14:10:33 +0100
Subject: [PATCH 03/26] Upgrade to polars 1.11 in cudf-polars (#17154)

Polars 1.11 is out, with slight updates to the IR, so we can correctly raise for dynamic groupbys and see inequality joins.

These changes adapt to that and do a first pass at supporting inequality joins (by translating to cross + filter). A followup (#17000) will use libcudf's conditional joins.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Mike Sarahan (https://github.com/msarahan)

URL: https://github.com/rapidsai/cudf/pull/17154
---
 .../all_cuda-118_arch-x86_64.yaml             |  2 +-
 .../all_cuda-125_arch-x86_64.yaml             |  2 +-
 conda/recipes/cudf-polars/meta.yaml           |  2 +-
 dependencies.yaml                             |  2 +-
 python/cudf_polars/cudf_polars/dsl/ir.py      | 17 ++---
 .../cudf_polars/cudf_polars/dsl/translate.py  | 76 ++++++++++++++++++-
 .../cudf_polars/cudf_polars/testing/plugin.py | 38 ++++++++--
 python/cudf_polars/pyproject.toml             |  2 +-
 python/cudf_polars/tests/test_join.py         | 60 ++++++++++++++-
 9 files changed, 172 insertions(+), 29 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index bd5e6c3d569..c3716c4759a 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -65,7 +65,7 @@ dependencies:
 - pandas
 - pandas>=2.0,<2.2.4dev0
 - pandoc
-- polars>=1.8,<1.9
+- polars>=1.11,<1.12
 - pre-commit
 - ptxcompiler
 - pyarrow>=14.0.0,<18.0.0a0
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index 565a3ebfa3c..38e131e79cb 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -63,7 +63,7 @@ dependencies:
 - pandas
 - pandas>=2.0,<2.2.4dev0
 - pandoc
-- polars>=1.8,<1.9
+- polars>=1.11,<1.12
 - pre-commit
 - pyarrow>=14.0.0,<18.0.0a0
 - pydata-sphinx-theme!=0.14.2
diff --git a/conda/recipes/cudf-polars/meta.yaml b/conda/recipes/cudf-polars/meta.yaml
index e8fef715c60..edf92b930d9 100644
--- a/conda/recipes/cudf-polars/meta.yaml
+++ b/conda/recipes/cudf-polars/meta.yaml
@@ -43,7 +43,7 @@ requirements:
   run:
     - python
     - pylibcudf ={{ version }}
-    - polars >=1.8,<1.9
+    - polars >=1.11,<1.12
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
 
 test:
diff --git a/dependencies.yaml b/dependencies.yaml
index ff97b67f0ce..4804f7b00b0 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -727,7 +727,7 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - polars>=1.8,<1.9
+          - polars>=1.11,<1.12
   run_dask_cudf:
     common:
       - output_types: [conda, requirements, pyproject]
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index eb93929cf61..f79e229d3f3 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -666,11 +666,11 @@ def __init__(
             raise NotImplementedError(
                 "rolling window/groupby"
             )  # pragma: no cover; rollingwindow constructor has already raised
+        if self.options.dynamic:
+            raise NotImplementedError("dynamic group by")
         if any(GroupBy.check_agg(a.value) > 1 for a in self.agg_requests):
             raise NotImplementedError("Nested aggregations in groupby")
         self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests]
-        if len(self.keys) == 0:
-            raise NotImplementedError("dynamic groupby")
 
     @staticmethod
     def check_agg(agg: expr.Expr) -> int:
@@ -802,10 +802,10 @@ class Join(IR):
     right_on: tuple[expr.NamedExpr, ...]
     """List of expressions used as keys in the right frame."""
     options: tuple[
-        Literal["inner", "left", "right", "full", "leftsemi", "leftanti", "cross"],
+        Literal["inner", "left", "right", "full", "semi", "anti", "cross"],
         bool,
         tuple[int, int] | None,
-        str | None,
+        str,
         bool,
     ]
     """
@@ -840,7 +840,7 @@ def __init__(
     @staticmethod
     @cache
     def _joiners(
-        how: Literal["inner", "left", "right", "full", "leftsemi", "leftanti"],
+        how: Literal["inner", "left", "right", "full", "semi", "anti"],
     ) -> tuple[
         Callable, plc.copying.OutOfBoundsPolicy, plc.copying.OutOfBoundsPolicy | None
     ]:
@@ -862,13 +862,13 @@ def _joiners(
                 plc.copying.OutOfBoundsPolicy.NULLIFY,
                 plc.copying.OutOfBoundsPolicy.NULLIFY,
             )
-        elif how == "leftsemi":
+        elif how == "semi":
             return (
                 plc.join.left_semi_join,
                 plc.copying.OutOfBoundsPolicy.DONT_CHECK,
                 None,
             )
-        elif how == "leftanti":
+        elif how == "anti":
             return (
                 plc.join.left_anti_join,
                 plc.copying.OutOfBoundsPolicy.DONT_CHECK,
@@ -933,7 +933,6 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         left, right = (c.evaluate(cache=cache) for c in self.children)
         how, join_nulls, zlice, suffix, coalesce = self.options
-        suffix = "_right" if suffix is None else suffix
         if how == "cross":
             # Separate implementation, since cross_join returns the
             # result, not the gather maps
@@ -955,7 +954,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
                     columns[left.num_columns :], right.column_names, strict=True
                 )
             ]
-            return DataFrame([*left_cols, *right_cols])
+            return DataFrame([*left_cols, *right_cols]).slice(zlice)
         # TODO: Waiting on clarity based on https://github.com/pola-rs/polars/issues/17184
         left_on = DataFrame(broadcast(*(e.evaluate(left) for e in self.left_on)))
         right_on = DataFrame(broadcast(*(e.evaluate(right) for e in self.right_on)))
diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
index 522c4a6729c..c28f2c2651a 100644
--- a/python/cudf_polars/cudf_polars/dsl/translate.py
+++ b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -5,10 +5,11 @@
 
 from __future__ import annotations
 
+import functools
 import json
 from contextlib import AbstractContextManager, nullcontext
 from functools import singledispatch
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import pyarrow as pa
 import pylibcudf as plc
@@ -19,9 +20,13 @@
 from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir
 
 from cudf_polars.dsl import expr, ir
+from cudf_polars.dsl.traversal import make_recursive, reuse_if_unchanged
 from cudf_polars.typing import NodeTraverser
 from cudf_polars.utils import dtypes, sorting
 
+if TYPE_CHECKING:
+    from cudf_polars.typing import ExprTransformer
+
 __all__ = ["translate_ir", "translate_named_expr"]
 
 
@@ -182,7 +187,71 @@ def _(
     with set_node(visitor, node.input_right):
         inp_right = translate_ir(visitor, n=None)
         right_on = [translate_named_expr(visitor, n=e) for e in node.right_on]
-    return ir.Join(schema, left_on, right_on, node.options, inp_left, inp_right)
+    if (how := node.options[0]) in {
+        "inner",
+        "left",
+        "right",
+        "full",
+        "cross",
+        "semi",
+        "anti",
+    }:
+        return ir.Join(schema, left_on, right_on, node.options, inp_left, inp_right)
+    else:
+        how, op1, op2 = how
+        if how != "ie_join":
+            raise NotImplementedError(
+                f"Unsupported join type {how}"
+            )  # pragma: no cover; asof joins not yet exposed
+        # No exposure of mixed/conditional joins in pylibcudf yet, so in
+        # the first instance, implement by doing a cross join followed by
+        # a filter.
+        _, join_nulls, zlice, suffix, coalesce = node.options
+        cross = ir.Join(
+            schema,
+            [],
+            [],
+            ("cross", join_nulls, None, suffix, coalesce),
+            inp_left,
+            inp_right,
+        )
+        dtype = plc.DataType(plc.TypeId.BOOL8)
+        if op2 is None:
+            ops = [op1]
+        else:
+            ops = [op1, op2]
+        suffix = cross.options[3]
+
+        # Column references in the right table refer to the post-join
+        # names, so with suffixes.
+        def _rename(e: expr.Expr, rec: ExprTransformer) -> expr.Expr:
+            if isinstance(e, expr.Col) and e.name in inp_left.schema:
+                return type(e)(e.dtype, f"{e.name}{suffix}")
+            return reuse_if_unchanged(e, rec)
+
+        mapper = make_recursive(_rename)
+        right_on = [
+            expr.NamedExpr(
+                f"{old.name}{suffix}" if old.name in inp_left.schema else old.name, new
+            )
+            for new, old in zip(
+                (mapper(e.value) for e in right_on), right_on, strict=True
+            )
+        ]
+        mask = functools.reduce(
+            functools.partial(
+                expr.BinOp, dtype, plc.binaryop.BinaryOperator.LOGICAL_AND
+            ),
+            (
+                expr.BinOp(dtype, expr.BinOp._MAPPING[op], left.value, right.value)
+                for op, left, right in zip(ops, left_on, right_on, strict=True)
+            ),
+        )
+        filtered = ir.Filter(schema, expr.NamedExpr("mask", mask), cross)
+        if zlice is not None:
+            offset, length = zlice
+            return ir.Slice(schema, offset, length, filtered)
+        return filtered
 
 
 @_translate_ir.register
@@ -319,8 +388,7 @@ def translate_ir(visitor: NodeTraverser, *, n: int | None = None) -> ir.IR:
     # IR is versioned with major.minor, minor is bumped for backwards
     # compatible changes (e.g. adding new nodes), major is bumped for
     # incompatible changes (e.g. renaming nodes).
-    # Polars 1.7 changes definition of the CSV reader options schema name.
-    if (version := visitor.version()) >= (3, 0):
+    if (version := visitor.version()) >= (4, 0):
         raise NotImplementedError(
             f"No support for polars IR {version=}"
         )  # pragma: no cover; no such version for now.
diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py
index 05b76d76808..a3607159e01 100644
--- a/python/cudf_polars/cudf_polars/testing/plugin.py
+++ b/python/cudf_polars/cudf_polars/testing/plugin.py
@@ -53,12 +53,34 @@ def pytest_configure(config: pytest.Config):
     "tests/unit/io/test_lazy_parquet.py::test_parquet_is_in_statistics": "Debug output on stderr doesn't match",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_statistics": "Debug output on stderr doesn't match",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_different_schema[False]": "Needs cudf#16394",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-columns]": "Correctly raises but different error",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-row_groups]": "Correctly raises but different error",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-prefiltered]": "Correctly raises but different error",
+    "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-none]": "Correctly raises but different error",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_mismatch_panic_17067[False]": "Needs cudf#16394",
+    "tests/unit/io/test_lazy_parquet.py::test_scan_parquet_ignores_dtype_mismatch_for_non_projected_columns_19249[False-False]": "Needs some variant of cudf#16394",
+    "tests/unit/io/test_lazy_parquet.py::test_scan_parquet_ignores_dtype_mismatch_for_non_projected_columns_19249[True-False]": "Needs some variant of cudf#16394",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_slice_pushdown_non_zero_offset[False]": "Thrift data not handled correctly/slice pushdown wrong?",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read[False]": "Incomplete handling of projected reads with mismatching schemas, cudf#16394",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_dtype_mismatch[False]": "Different exception raised, but correctly raises an exception",
     "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_missing_cols_from_first[False]": "Different exception raised, but correctly raises an exception",
     "tests/unit/io/test_parquet.py::test_read_parquet_only_loads_selected_columns_15098": "Memory usage won't be correct due to GPU",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-none]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-none]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-prefiltered]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-prefiltered]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-row_groups]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-row_groups]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-columns]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-columns]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-none]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-none]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-prefiltered]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-prefiltered]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-row_groups]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-row_groups]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-columns]": "Mismatching column read cudf#16394",
+    "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-columns]": "Mismatching column read cudf#16394",
     "tests/unit/io/test_scan.py::test_scan[single-csv-async]": "Debug output on stderr doesn't match",
     "tests/unit/io/test_scan.py::test_scan_with_limit[single-csv-async]": "Debug output on stderr doesn't match",
     "tests/unit/io/test_scan.py::test_scan_with_filter[single-csv-async]": "Debug output on stderr doesn't match",
@@ -107,6 +129,14 @@ def pytest_configure(config: pytest.Config):
     "tests/unit/operations/aggregation/test_aggregations.py::test_sum_empty_and_null_set": "libcudf sums column of all nulls to null, not zero",
     "tests/unit/operations/aggregation/test_aggregations.py::test_binary_op_agg_context_no_simplify_expr_12423": "groupby-agg of just literals should not produce collect_list",
     "tests/unit/operations/aggregation/test_aggregations.py::test_nan_inf_aggregation": "treatment of nans and nulls together is different in libcudf and polars in groupby-agg context",
+    "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func0-none]": "cudf-polars doesn't nullify division by zero",
+    "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func1-none]": "cudf-polars doesn't nullify division by zero",
+    "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func2-none]": "cudf-polars doesn't nullify division by zero",
+    "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func3-none]": "cudf-polars doesn't nullify division by zero",
+    "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func0-none]": "cudf-polars doesn't nullify division by zero",
+    "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func1-none]": "cudf-polars doesn't nullify division by zero",
+    "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func2-none]": "cudf-polars doesn't nullify division by zero",
+    "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func3-none]": "cudf-polars doesn't nullify division by zero",
     "tests/unit/operations/test_abs.py::test_abs_duration": "Need to raise for unsupported uops on timelike values",
     "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input7-expected7-Float32-Float32]": "Mismatching dtypes, needs cudf#15852",
     "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg for a particular dtype",
@@ -124,13 +154,6 @@ def pytest_configure(config: pytest.Config):
     "tests/unit/operations/test_group_by.py::test_group_by_binary_agg_with_literal": "Incorrect broadcasting of literals in groupby-agg",
     "tests/unit/operations/test_group_by.py::test_aggregated_scalar_elementwise_15602": "Unsupported boolean function/dtype combination in groupby-agg",
     "tests/unit/operations/test_group_by.py::test_schemas[data1-expr1-expected_select1-expected_gb1]": "Mismatching dtypes, needs cudf#15852",
-    "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_by_monday_and_offset_5444": "IR needs to expose groupby-dynamic information",
-    "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[left-expected0]": "IR needs to expose groupby-dynamic information",
-    "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[right-expected1]": "IR needs to expose groupby-dynamic information",
-    "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[datapoint-expected2]": "IR needs to expose groupby-dynamic information",
-    "tests/unit/operations/test_group_by_dynamic.py::test_rolling_dynamic_sortedness_check": "IR needs to expose groupby-dynamic information",
-    "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_validation": "IR needs to expose groupby-dynamic information",
-    "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_15225": "IR needs to expose groupby-dynamic information",
     "tests/unit/operations/test_join.py::test_cross_join_slice_pushdown": "Need to implement slice pushdown for cross joins",
     "tests/unit/sql/test_cast.py::test_cast_errors[values0-values::uint8-conversion from `f64` to `u64` failed]": "Casting that raises not supported on GPU",
     "tests/unit/sql/test_cast.py::test_cast_errors[values1-values::uint4-conversion from `i64` to `u32` failed]": "Casting that raises not supported on GPU",
@@ -140,6 +163,7 @@ def pytest_configure(config: pytest.Config):
     "tests/unit/streaming/test_streaming_io.py::test_parquet_eq_statistics": "Debug output on stderr doesn't match",
     "tests/unit/test_cse.py::test_cse_predicate_self_join": "Debug output on stderr doesn't match",
     "tests/unit/test_empty.py::test_empty_9137": "Mismatching dtypes, needs cudf#15852",
+    "tests/unit/test_errors.py::test_error_on_empty_group_by": "Incorrect exception raised",
     # Maybe flaky, order-dependent?
     "tests/unit/test_projections.py::test_schema_full_outer_join_projection_pd_13287": "Order-specific result check, query is correct but in different order",
     "tests/unit/test_queries.py::test_group_by_agg_equals_zero_3535": "libcudf sums all nulls to null, not zero",
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index a8bb634732f..2afdab1be4b 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -19,7 +19,7 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.10"
 dependencies = [
-    "polars>=1.8,<1.9",
+    "polars>=1.11,<1.12",
     "pylibcudf==24.12.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py
index 7d9ec98db97..501560d15b8 100644
--- a/python/cudf_polars/tests/test_join.py
+++ b/python/cudf_polars/tests/test_join.py
@@ -2,9 +2,12 @@
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations
 
+from contextlib import nullcontext
+
 import pytest
 
 import polars as pl
+from polars.testing import assert_frame_equal
 
 from cudf_polars.testing.asserts import (
     assert_gpu_result_equal,
@@ -22,6 +25,11 @@ def how(request):
     return request.param
 
 
+@pytest.fixture(params=[None, (1, 5), (1, None), (0, 2), (0, None)])
+def zlice(request):
+    return request.param
+
+
 @pytest.fixture
 def left():
     return pl.LazyFrame(
@@ -37,8 +45,9 @@ def left():
 def right():
     return pl.LazyFrame(
         {
-            "a": [1, 4, 3, 7, None, None],
-            "c": [2, 3, 4, 5, 6, 7],
+            "a": [1, 4, 3, 7, None, None, 1],
+            "c": [2, 3, 4, 5, 6, 7, 8],
+            "d": [6, None, 7, 8, -1, 2, 4],
         }
     )
 
@@ -70,11 +79,31 @@ def test_coalesce_join(left, right, how, join_nulls, join_expr):
     query = left.join(
         right, on=join_expr, how=how, join_nulls=join_nulls, coalesce=True
     )
-    assert_gpu_result_equal(query, check_row_order=False)
+    assert_gpu_result_equal(query, check_row_order=how == "left")
 
 
-def test_cross_join(left, right):
+def test_left_join_with_slice(left, right, join_nulls, zlice):
+    q = left.join(right, on="a", how="left", join_nulls=join_nulls, coalesce=True)
+    ctx = nullcontext()
+    if zlice is not None:
+        q_expect = q.collect().slice(*zlice)
+        q = q.slice(*zlice)
+        if zlice == (1, 5) or zlice == (0, 2):
+            # https://github.com/pola-rs/polars/issues/19403
+            # https://github.com/pola-rs/polars/issues/19405
+            ctx = pytest.raises(AssertionError)
+            assert_frame_equal(
+                q_expect, q.collect(engine=pl.GPUEngine(raise_on_fail=True))
+            )
+
+    with ctx:
+        assert_gpu_result_equal(q)
+
+
+def test_cross_join(left, right, zlice):
     q = left.join(right, how="cross")
+    if zlice is not None:
+        q = q.slice(*zlice)
 
     assert_gpu_result_equal(q)
 
@@ -86,3 +115,26 @@ def test_join_literal_key_unsupported(left, right, left_on, right_on):
     q = left.join(right, left_on=left_on, right_on=right_on, how="inner")
 
     assert_ir_translation_raises(q, NotImplementedError)
+
+
+@pytest.mark.parametrize(
+    "conditions",
+    [
+        [pl.col("a") < pl.col("a_right")],
+        [pl.col("a_right") <= pl.col("a") * 2],
+        [pl.col("b") * 2 > pl.col("a_right"), pl.col("a") == pl.col("c_right")],
+        [pl.col("b") * 2 <= pl.col("a_right"), pl.col("a") < pl.col("c_right")],
+        [pl.col("b") <= pl.col("a_right") * 7, pl.col("a") < pl.col("d") * 2],
+    ],
+)
+def test_join_where(left, right, conditions, zlice):
+    q = left.join_where(right, *conditions)
+
+    assert_gpu_result_equal(q, check_row_order=False)
+
+    if zlice is not None:
+        q_len = q.slice(*zlice).select(pl.len())
+        # Can't compare result, since row order is not guaranteed and
+        # therefore we only check the length
+
+        assert_gpu_result_equal(q_len)

From b75036b12a8d5713e34162571cec24ac91941b85 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 24 Oct 2024 15:34:15 -0400
Subject: [PATCH 04/26] Remove unused variable in internal merge_tdigests
 utility (#17151)

Removes an unused variable that contains a host copy of the group_offsets data.
This host variable appears to have been made obsolete by a combination of #16897 and #16780.
Found while working on #17149.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/17151
---
 .../quantiles/tdigest/tdigest_aggregation.cu  | 40 +++++--------------
 1 file changed, 9 insertions(+), 31 deletions(-)

diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
index b0a84a6d50c..d27420658d6 100644
--- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
+++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
@@ -1126,12 +1126,8 @@ std::pair<rmm::device_uvector<double>, rmm::device_uvector<double>> generate_mer
  * `max` of 0.
  *
  * @param tdv input tdigests. The tdigests within this column are grouped by key.
- * @param h_group_offsets a host iterator of the offsets to the start of each group. A group is
- * counted as one even when the cluster is empty in it. The offsets should have the same values as
- * the ones in `group_offsets`.
  * @param group_offsets a device iterator of the offsets to the start of each group. A group is
- * counted as one even when the cluster is empty in it. The offsets should have the same values as
- * the ones in `h_group_offsets`.
+ * counted as one even when the cluster is empty in it.
  * @param group_labels a device iterator of the the group label for each tdigest cluster including
  * empty clusters.
  * @param num_group_labels the number of unique group labels.
@@ -1142,9 +1138,8 @@ std::pair<rmm::device_uvector<double>, rmm::device_uvector<double>> generate_mer
  *
  * @return A column containing the merged tdigests.
  */
-template <typename HGroupOffsetIter, typename GroupOffsetIter, typename GroupLabelIter>
+template <typename GroupOffsetIter, typename GroupLabelIter>
 std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
-                                       HGroupOffsetIter h_group_offsets,
                                        GroupOffsetIter group_offsets,
                                        GroupLabelIter group_labels,
                                        size_t num_group_labels,
@@ -1313,21 +1308,13 @@ std::unique_ptr<scalar> reduce_merge_tdigest(column_view const& input,
 
   if (input.size() == 0) { return cudf::tdigest::detail::make_empty_tdigest_scalar(stream, mr); }
 
-  auto group_offsets_  = group_offsets_fn{input.size()};
-  auto h_group_offsets = cudf::detail::make_counting_transform_iterator(0, group_offsets_);
-  auto group_offsets   = cudf::detail::make_counting_transform_iterator(0, group_offsets_);
-  auto group_labels    = thrust::make_constant_iterator(0);
-  return to_tdigest_scalar(merge_tdigests(tdv,
-                                          h_group_offsets,
-                                          group_offsets,
-                                          group_labels,
-                                          input.size(),
-                                          1,
-                                          max_centroids,
-                                          stream,
-                                          mr),
-                           stream,
-                           mr);
+  auto group_offsets_ = group_offsets_fn{input.size()};
+  auto group_offsets  = cudf::detail::make_counting_transform_iterator(0, group_offsets_);
+  auto group_labels   = thrust::make_constant_iterator(0);
+  return to_tdigest_scalar(
+    merge_tdigests(tdv, group_offsets, group_labels, input.size(), 1, max_centroids, stream, mr),
+    stream,
+    mr);
 }
 
 std::unique_ptr<column> group_tdigest(column_view const& col,
@@ -1376,16 +1363,7 @@ std::unique_ptr<column> group_merge_tdigest(column_view const& input,
     return cudf::tdigest::detail::make_empty_tdigests_column(num_groups, stream, mr);
   }
 
-  // bring group offsets back to the host
-  std::vector<size_type> h_group_offsets(group_offsets.size());
-  cudaMemcpyAsync(h_group_offsets.data(),
-                  group_offsets.begin(),
-                  sizeof(size_type) * group_offsets.size(),
-                  cudaMemcpyDefault,
-                  stream);
-
   return merge_tdigests(tdv,
-                        h_group_offsets.begin(),
                         group_offsets.data(),
                         group_labels.data(),
                         group_labels.size(),

From 7115f20e91a314f07333cbd5c01adc62bf2fbb0c Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Thu, 24 Oct 2024 15:34:44 -0400
Subject: [PATCH 05/26] Move `segmented_gather` function from the copying
 module to the lists module (#17148)

This PR moves `segmented_gather` out of the copying module and into the lists module. It also switches cudf Python to use the pylibcudf `segmented_gather` implementation.

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/17148
---
 python/cudf/cudf/_lib/copying.pyx     | 26 +-----------------
 python/cudf/cudf/_lib/lists.pyx       | 38 +++++++++++++++++----------
 python/cudf/cudf/core/column/lists.py |  2 +-
 3 files changed, 26 insertions(+), 40 deletions(-)

diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx
index 30353c4be6c..4221e745e65 100644
--- a/python/cudf/cudf/_lib/copying.pyx
+++ b/python/cudf/cudf/_lib/copying.pyx
@@ -4,7 +4,7 @@ import pickle
 
 from libc.stdint cimport uint8_t, uintptr_t
 from libcpp cimport bool
-from libcpp.memory cimport make_shared, shared_ptr, unique_ptr
+from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
@@ -30,10 +30,6 @@ from libcpp.memory cimport make_unique
 cimport pylibcudf.libcudf.contiguous_split as cpp_contiguous_split
 from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.lists.gather cimport (
-    segmented_gather as cpp_segmented_gather,
-)
-from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view
 from pylibcudf.libcudf.scalar.scalar cimport scalar
 from pylibcudf.libcudf.types cimport size_type
 
@@ -339,26 +335,6 @@ def get_element(Column input_column, size_type index):
     )
 
 
-@acquire_spill_lock()
-def segmented_gather(Column source_column, Column gather_map):
-    cdef shared_ptr[lists_column_view] source_LCV = (
-        make_shared[lists_column_view](source_column.view())
-    )
-    cdef shared_ptr[lists_column_view] gather_map_LCV = (
-        make_shared[lists_column_view](gather_map.view())
-    )
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_segmented_gather(
-                source_LCV.get()[0], gather_map_LCV.get()[0])
-        )
-
-    result = Column.from_unique_ptr(move(c_result))
-    return result
-
-
 cdef class _CPackedColumns:
 
     @staticmethod
diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx
index 7e8710bedb6..12432ac6d5d 100644
--- a/python/cudf/cudf/_lib/lists.pyx
+++ b/python/cudf/cudf/_lib/lists.pyx
@@ -9,7 +9,7 @@ from pylibcudf.libcudf.types cimport null_order, size_type
 from cudf._lib.column cimport Column
 from cudf._lib.utils cimport columns_from_pylibcudf_table
 
-import pylibcudf
+import pylibcudf as plc
 
 from pylibcudf cimport Scalar
 
@@ -17,7 +17,7 @@ from pylibcudf cimport Scalar
 @acquire_spill_lock()
 def count_elements(Column col):
     return Column.from_pylibcudf(
-        pylibcudf.lists.count_elements(
+        plc.lists.count_elements(
             col.to_pylibcudf(mode="read"))
     )
 
@@ -25,8 +25,8 @@ def count_elements(Column col):
 @acquire_spill_lock()
 def explode_outer(list source_columns, int explode_column_idx):
     return columns_from_pylibcudf_table(
-        pylibcudf.lists.explode_outer(
-            pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source_columns]),
+        plc.lists.explode_outer(
+            plc.Table([c.to_pylibcudf(mode="read") for c in source_columns]),
             explode_column_idx,
         )
     )
@@ -35,7 +35,7 @@ def explode_outer(list source_columns, int explode_column_idx):
 @acquire_spill_lock()
 def distinct(Column col, bool nulls_equal, bool nans_all_equal):
     return Column.from_pylibcudf(
-        pylibcudf.lists.distinct(
+        plc.lists.distinct(
             col.to_pylibcudf(mode="read"),
             nulls_equal,
             nans_all_equal,
@@ -46,7 +46,7 @@ def distinct(Column col, bool nulls_equal, bool nans_all_equal):
 @acquire_spill_lock()
 def sort_lists(Column col, bool ascending, str na_position):
     return Column.from_pylibcudf(
-        pylibcudf.lists.sort_lists(
+        plc.lists.sort_lists(
             col.to_pylibcudf(mode="read"),
             ascending,
             null_order.BEFORE if na_position == "first" else null_order.AFTER,
@@ -58,7 +58,7 @@ def sort_lists(Column col, bool ascending, str na_position):
 @acquire_spill_lock()
 def extract_element_scalar(Column col, size_type index):
     return Column.from_pylibcudf(
-        pylibcudf.lists.extract_list_element(
+        plc.lists.extract_list_element(
             col.to_pylibcudf(mode="read"),
             index,
         )
@@ -68,7 +68,7 @@ def extract_element_scalar(Column col, size_type index):
 @acquire_spill_lock()
 def extract_element_column(Column col, Column index):
     return Column.from_pylibcudf(
-        pylibcudf.lists.extract_list_element(
+        plc.lists.extract_list_element(
             col.to_pylibcudf(mode="read"),
             index.to_pylibcudf(mode="read"),
         )
@@ -78,7 +78,7 @@ def extract_element_column(Column col, Column index):
 @acquire_spill_lock()
 def contains_scalar(Column col, py_search_key):
     return Column.from_pylibcudf(
-        pylibcudf.lists.contains(
+        plc.lists.contains(
             col.to_pylibcudf(mode="read"),
             <Scalar> py_search_key.device_value.c_value,
         )
@@ -88,7 +88,7 @@ def contains_scalar(Column col, py_search_key):
 @acquire_spill_lock()
 def index_of_scalar(Column col, object py_search_key):
     return Column.from_pylibcudf(
-        pylibcudf.lists.index_of(
+        plc.lists.index_of(
             col.to_pylibcudf(mode="read"),
             <Scalar> py_search_key.device_value.c_value,
             True,
@@ -99,7 +99,7 @@ def index_of_scalar(Column col, object py_search_key):
 @acquire_spill_lock()
 def index_of_column(Column col, Column search_keys):
     return Column.from_pylibcudf(
-        pylibcudf.lists.index_of(
+        plc.lists.index_of(
             col.to_pylibcudf(mode="read"),
             search_keys.to_pylibcudf(mode="read"),
             True,
@@ -110,8 +110,8 @@ def index_of_column(Column col, Column search_keys):
 @acquire_spill_lock()
 def concatenate_rows(list source_columns):
     return Column.from_pylibcudf(
-        pylibcudf.lists.concatenate_rows(
-            pylibcudf.Table([
+        plc.lists.concatenate_rows(
+            plc.Table([
                 c.to_pylibcudf(mode="read") for c in source_columns
             ])
         )
@@ -121,8 +121,18 @@ def concatenate_rows(list source_columns):
 @acquire_spill_lock()
 def concatenate_list_elements(Column input_column, dropna=False):
     return Column.from_pylibcudf(
-        pylibcudf.lists.concatenate_list_elements(
+        plc.lists.concatenate_list_elements(
             input_column.to_pylibcudf(mode="read"),
             dropna,
         )
     )
+
+
+@acquire_spill_lock()
+def segmented_gather(Column source_column, Column gather_map):
+    return Column.from_pylibcudf(
+        plc.lists.segmented_gather(
+            source_column.to_pylibcudf(mode="read"),
+            gather_map.to_pylibcudf(mode="read"),
+        )
+    )
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index c6a39199e3b..e9d24d4f450 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -11,7 +11,6 @@
 from typing_extensions import Self
 
 import cudf
-from cudf._lib.copying import segmented_gather
 from cudf._lib.lists import (
     concatenate_list_elements,
     concatenate_rows,
@@ -22,6 +21,7 @@
     extract_element_scalar,
     index_of_column,
     index_of_scalar,
+    segmented_gather,
     sort_lists,
 )
 from cudf._lib.strings.convert.convert_lists import format_list_column

From 03777f6b5d44d54316e55bff4e31d3e8e6583c25 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 25 Oct 2024 09:06:25 -0400
Subject: [PATCH 06/26] Fix host-to-device copy missing sync in
 strings/duration convert (#17149)

Fixes a missing stream sync when copying a temporary host vector to device. The host vector could be destroyed before the copy is completed. Updates the code to use the vector factory function `make_device_uvector_sync()` instead of `cudaMemcpyAsync`.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/17149
---
 cpp/src/strings/convert/convert_durations.cu | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu
index 0db1adf1223..f5d052c6657 100644
--- a/cpp/src/strings/convert/convert_durations.cu
+++ b/cpp/src/strings/convert/convert_durations.cu
@@ -16,6 +16,7 @@
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/strings/convert/convert_durations.hpp>
 #include <cudf/strings/detail/convert/int_to_string.cuh>
 #include <cudf/strings/detail/strings_children.cuh>
@@ -152,12 +153,8 @@ struct format_compiler {
     }
 
     // create program in device memory
-    d_items.resize(items.size(), stream);
-    CUDF_CUDA_TRY(cudaMemcpyAsync(d_items.data(),
-                                  items.data(),
-                                  items.size() * sizeof(items[0]),
-                                  cudaMemcpyDefault,
-                                  stream.value()));
+    d_items = cudf::detail::make_device_uvector_sync(
+      items, stream, cudf::get_current_device_resource_ref());
   }
 
   format_item const* compiled_format_items() { return d_items.data(); }

From e98e6b9209ff8557d85cb9b828b895884b0c7b7a Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 25 Oct 2024 09:07:06 -0400
Subject: [PATCH 07/26] Deprecate current libcudf nvtext minhash functions
 (#17152)

Deprecates the current nvtext minhash functions, some of which will be replaced in #16756 with a different signature. The others will no longer be used and will be removed in a future release. The existing gtests and benchmarks will be retained for rework in a future release as well.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/17152
---
 cpp/benchmarks/CMakeLists.txt  |  4 ++--
 cpp/include/nvtext/minhash.hpp | 24 ++++++++++++++++++------
 cpp/tests/CMakeLists.txt       |  1 -
 3 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index f013b31b3de..7f82b603912 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -348,8 +348,8 @@ ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binary
 ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp)
 
 ConfigureNVBench(
-  TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp
-  text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp text/word_minhash.cpp
+  TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/normalize.cpp
+  text/replace.cpp text/tokenize.cpp text/vocab.cpp
 )
 
 # ##################################################################################################
diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp
index 7c909f1a948..42124461cdf 100644
--- a/cpp/include/nvtext/minhash.hpp
+++ b/cpp/include/nvtext/minhash.hpp
@@ -41,6 +41,8 @@ namespace CUDF_EXPORT nvtext {
  *
  * This function uses MurmurHash3_x86_32 for the hash algorithm.
  *
+ * @deprecated Deprecated in 24.12
+ *
  * @throw std::invalid_argument if the width < 2
  *
  * @param input Strings column to compute minhash
@@ -51,7 +53,7 @@ namespace CUDF_EXPORT nvtext {
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return Minhash values for each string in input
  */
-std::unique_ptr<cudf::column> minhash(
+[[deprecated]] std::unique_ptr<cudf::column> minhash(
   cudf::strings_column_view const& input,
   cudf::numeric_scalar<uint32_t> seed = 0,
   cudf::size_type width               = 4,
@@ -71,6 +73,8 @@ std::unique_ptr<cudf::column> minhash(
  *
  * Any null row entries result in corresponding null output rows.
  *
+ * @deprecated Deprecated in 24.12 - to be replaced in a future release
+ *
  * @throw std::invalid_argument if the width < 2
  * @throw std::invalid_argument if seeds is empty
  * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
@@ -83,7 +87,7 @@ std::unique_ptr<cudf::column> minhash(
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return List column of minhash values for each string per seed
  */
-std::unique_ptr<cudf::column> minhash(
+[[deprecated]] std::unique_ptr<cudf::column> minhash(
   cudf::strings_column_view const& input,
   cudf::device_span<uint32_t const> seeds,
   cudf::size_type width             = 4,
@@ -102,6 +106,8 @@ std::unique_ptr<cudf::column> minhash(
  * The hash function returns 2 uint64 values but only the first value
  * is used with the minhash calculation.
  *
+ * @deprecated Deprecated in 24.12
+ *
  * @throw std::invalid_argument if the width < 2
  *
  * @param input Strings column to compute minhash
@@ -112,7 +118,7 @@ std::unique_ptr<cudf::column> minhash(
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return Minhash values as UINT64 for each string in input
  */
-std::unique_ptr<cudf::column> minhash64(
+[[deprecated]] std::unique_ptr<cudf::column> minhash64(
   cudf::strings_column_view const& input,
   cudf::numeric_scalar<uint64_t> seed = 0,
   cudf::size_type width               = 4,
@@ -132,6 +138,8 @@ std::unique_ptr<cudf::column> minhash64(
  *
  * Any null row entries result in corresponding null output rows.
  *
+ * @deprecated Deprecated in 24.12 - to be replaced in a future release
+ *
  * @throw std::invalid_argument if the width < 2
  * @throw std::invalid_argument if seeds is empty
  * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
@@ -144,7 +152,7 @@ std::unique_ptr<cudf::column> minhash64(
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return List column of minhash values for each string per seed
  */
-std::unique_ptr<cudf::column> minhash64(
+[[deprecated]] std::unique_ptr<cudf::column> minhash64(
   cudf::strings_column_view const& input,
   cudf::device_span<uint64_t const> seeds,
   cudf::size_type width             = 4,
@@ -164,6 +172,8 @@ std::unique_ptr<cudf::column> minhash64(
  *
  * Any null row entries result in corresponding null output rows.
  *
+ * @deprecated Deprecated in 24.12
+ *
  * @throw std::invalid_argument if seeds is empty
  * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
  *
@@ -173,7 +183,7 @@ std::unique_ptr<cudf::column> minhash64(
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return List column of minhash values for each string per seed
  */
-std::unique_ptr<cudf::column> word_minhash(
+[[deprecated]] std::unique_ptr<cudf::column> word_minhash(
   cudf::lists_column_view const& input,
   cudf::device_span<uint32_t const> seeds,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
@@ -193,6 +203,8 @@ std::unique_ptr<cudf::column> word_minhash(
  *
  * Any null row entries result in corresponding null output rows.
  *
+ * @deprecated Deprecated in 24.12
+ *
  * @throw std::invalid_argument if seeds is empty
  * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
  *
@@ -202,7 +214,7 @@ std::unique_ptr<cudf::column> word_minhash(
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return List column of minhash values for each string per seed
  */
-std::unique_ptr<cudf::column> word_minhash64(
+[[deprecated]] std::unique_ptr<cudf::column> word_minhash64(
   cudf::lists_column_view const& input,
   cudf::device_span<uint64_t const> seeds,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index a4213dcbe94..b78a64d0e55 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -611,7 +611,6 @@ ConfigureTest(
   text/bpe_tests.cpp
   text/edit_distance_tests.cpp
   text/jaccard_tests.cpp
-  text/minhash_tests.cpp
   text/ngrams_tests.cpp
   text/ngrams_tokenize_tests.cpp
   text/normalize_tests.cpp

From 0bb699e7616bbfb8564fb3d9db986756713aec8c Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 25 Oct 2024 13:10:10 -0400
Subject: [PATCH 08/26] Move nvtext ngrams benchmarks to nvbench (#17173)

Moves the `nvtext::generate_ngrams` and `nvtext::generate_character_ngrams` benchmarks from google-bench to nvbench.
Target parameters are exposed to help with profiling.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/17173
---
 cpp/benchmarks/CMakeLists.txt  |  6 ++--
 cpp/benchmarks/text/ngrams.cpp | 65 ++++++++++++++--------------------
 2 files changed, 29 insertions(+), 42 deletions(-)

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 7f82b603912..2a4ac789046 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -345,11 +345,11 @@ ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binary
 
 # ##################################################################################################
 # * nvtext benchmark -------------------------------------------------------------------
-ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp)
+ConfigureBench(TEXT_BENCH text/subword.cpp)
 
 ConfigureNVBench(
-  TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/normalize.cpp
-  text/replace.cpp text/tokenize.cpp text/vocab.cpp
+  TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/ngrams.cpp
+  text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp
 )
 
 # ##################################################################################################
diff --git a/cpp/benchmarks/text/ngrams.cpp b/cpp/benchmarks/text/ngrams.cpp
index 8e48f8e9a05..43d57201b20 100644
--- a/cpp/benchmarks/text/ngrams.cpp
+++ b/cpp/benchmarks/text/ngrams.cpp
@@ -15,58 +15,45 @@
  */
 
 #include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/string/string_bench_args.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
 
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <nvtext/generate_ngrams.hpp>
 
-class TextNGrams : public cudf::benchmark {};
+#include <nvbench/nvbench.cuh>
 
-enum class ngrams_type { tokens, characters };
-
-static void BM_ngrams(benchmark::State& state, ngrams_type nt)
+static void bench_ngrams(nvbench::state& state)
 {
-  auto const n_rows          = static_cast<cudf::size_type>(state.range(0));
-  auto const max_str_length  = static_cast<cudf::size_type>(state.range(1));
+  auto const num_rows   = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const row_width  = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const ngram_type = state.get_string("type");
+
   data_profile const profile = data_profile_builder().distribution(
-    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
-  auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile);
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
+  auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
   cudf::strings_column_view input(column->view());
   auto const separator = cudf::string_scalar("_");
 
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true);
-    switch (nt) {
-      case ngrams_type::tokens: nvtext::generate_ngrams(input, 2, separator); break;
-      case ngrams_type::characters: nvtext::generate_character_ngrams(input); break;
-    }
-  }
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
 
-  state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream()));
-}
+  auto chars_size = input.chars_size(cudf::get_default_stream());
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);
+  state.add_global_memory_writes<nvbench::int8_t>(chars_size * 2);
 
-static void generate_bench_args(benchmark::internal::Benchmark* b)
-{
-  int const min_rows   = 1 << 12;
-  int const max_rows   = 1 << 24;
-  int const row_mult   = 8;
-  int const min_rowlen = 5;
-  int const max_rowlen = 40;
-  int const len_mult   = 2;
-  generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
+  if (ngram_type == "chars") {
+    state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+      auto result = nvtext::generate_character_ngrams(input);
+    });
+  } else {
+    state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+      auto result = nvtext::generate_ngrams(input, 2, separator);
+    });
+  }
 }
 
-#define NVTEXT_BENCHMARK_DEFINE(name)                             \
-  BENCHMARK_DEFINE_F(TextNGrams, name)                            \
-  (::benchmark::State & st) { BM_ngrams(st, ngrams_type::name); } \
-  BENCHMARK_REGISTER_F(TextNGrams, name)                          \
-    ->Apply(generate_bench_args)                                  \
-    ->UseManualTime()                                             \
-    ->Unit(benchmark::kMillisecond);
-
-NVTEXT_BENCHMARK_DEFINE(tokens)
-NVTEXT_BENCHMARK_DEFINE(characters)
+NVBENCH_BENCH(bench_ngrams)
+  .set_name("ngrams")
+  .add_int64_axis("num_rows", {131072, 262144, 524288, 1048576})  // 2^17 .. 2^20
+  .add_int64_axis("row_width", {10, 20, 40, 100})
+  .add_string_axis("type", {"chars", "tokens"});

From 2113bd6bbce62028eff0fa523a85ea859bf2bc08 Mon Sep 17 00:00:00 2001
From: Jordan Jacobelli <jjacobelli@nvidia.com>
Date: Fri, 25 Oct 2024 19:18:53 +0200
Subject: [PATCH 09/26] devcontainer: replace `VAULT_HOST` with `AWS_ROLE_ARN`
 (#17134)

This PR is replacing the `VAULT_HOST` variable with `AWS_ROLE_ARN`. This is required to use the new token service to get AWS credentials.

Authors:
  - Jordan Jacobelli (https://github.com/jjacobelli)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Paul Taylor (https://github.com/trxcllnt)

URL: https://github.com/rapidsai/cudf/pull/17134
---
 .devcontainer/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
index 8190b5d0297..315a389339a 100644
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -31,6 +31,6 @@ ENV PYTHONDONTWRITEBYTECODE="1"
 
 ENV SCCACHE_REGION="us-east-2"
 ENV SCCACHE_BUCKET="rapids-sccache-devs"
-ENV VAULT_HOST="https://vault.ops.k8s.rapids.ai"
+ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs"
 ENV HISTFILE="/home/coder/.cache/._bash_history"
 ENV LIBCUDF_KERNEL_CACHE_PATH="/home/coder/cudf/cpp/build/${PYTHON_PACKAGE_MANAGER}/cuda-${CUDA_VERSION}/latest/jitify_cache"

From 5cba4fb18883dd511e3f892bfbe3ac46caa2db6c Mon Sep 17 00:00:00 2001
From: Jirka Borovec <6035284+Borda@users.noreply.github.com>
Date: Fri, 25 Oct 2024 22:38:38 +0200
Subject: [PATCH 10/26] lint: replace `isort` with Ruff's rule I (#16685)

Since #15312 moved formatting from Black to Ruff, it makes sense to also unify import sorting under Ruff: use its built-in `I` rule instead of the separate `isort` tool.

Authors:
  - Jirka Borovec (https://github.com/Borda)
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - https://github.com/jakirkham

URL: https://github.com/rapidsai/cudf/pull/16685
---
 .pre-commit-config.yaml                       | 12 +---
 CONTRIBUTING.md                               |  4 +-
 .../developer_guide/contributing_guide.md     |  3 +-
 docs/cudf/source/user_guide/10min.ipynb       |  2 +-
 .../source/user_guide/guide-to-udfs.ipynb     |  6 ++
 pyproject.toml                                |  2 +
 python/cudf/benchmarks/conftest.py            | 18 +++---
 python/cudf/cudf/_typing.py                   | 10 ++--
 python/cudf/cudf/core/buffer/buffer.py        |  5 +-
 .../core/buffer/exposure_tracked_buffer.py    |  5 +-
 python/cudf/cudf/core/column/__init__.py      |  1 -
 python/cudf/cudf/core/column/categorical.py   |  3 +-
 python/cudf/cudf/core/column/column.py        |  3 +-
 python/cudf/cudf/core/column/datetime.py      |  4 +-
 python/cudf/cudf/core/column/decimal.py       |  3 +-
 python/cudf/cudf/core/column/lists.py         |  4 +-
 python/cudf/cudf/core/column/methods.py       |  4 +-
 python/cudf/cudf/core/column/numerical.py     |  4 +-
 python/cudf/cudf/core/column/string.py        |  4 +-
 python/cudf/cudf/core/column/timedelta.py     |  4 +-
 python/cudf/cudf/core/column_accessor.py      |  3 +-
 python/cudf/cudf/core/dataframe.py            |  4 +-
 python/cudf/cudf/core/df_protocol.py          |  7 ++-
 python/cudf/cudf/core/frame.py                |  3 +-
 python/cudf/cudf/core/groupby/groupby.py      |  4 +-
 python/cudf/cudf/core/index.py                |  4 +-
 python/cudf/cudf/core/indexed_frame.py        |  4 +-
 python/cudf/cudf/core/indexing_utils.py       | 12 ++--
 python/cudf/cudf/core/multiindex.py           |  4 +-
 python/cudf/cudf/core/series.py               |  4 +-
 python/cudf/cudf/core/tools/datetimes.py      |  5 +-
 python/cudf/cudf/pandas/fast_slow_proxy.py    |  4 +-
 python/cudf/cudf/pandas/module_accelerator.py |  2 +-
 .../pandas/scripts/analyze-test-failures.py   |  1 +
 .../cudf/pandas/scripts/conftest-patch.py     |  2 +-
 .../pandas/scripts/summarize-test-results.py  |  3 +-
 .../cudf_pandas_tests/test_fast_slow_proxy.py |  3 +-
 python/cudf/pyproject.toml                    | 59 +++++--------------
 python/cudf_kafka/pyproject.toml              | 59 +++++--------------
 python/custreamz/custreamz/tests/conftest.py  |  1 +
 python/custreamz/pyproject.toml               | 58 +++++-------------
 python/dask_cudf/dask_cudf/__init__.py        | 20 +++----
 python/dask_cudf/dask_cudf/expr/__init__.py   |  4 +-
 python/dask_cudf/dask_cudf/io/__init__.py     | 12 ++--
 python/dask_cudf/pyproject.toml               | 51 +++-------------
 .../pylibcudf/pylibcudf/tests/common/utils.py |  3 +-
 python/pylibcudf/pylibcudf/tests/conftest.py  |  3 +-
 .../pylibcudf/pylibcudf/tests/io/test_avro.py |  3 +-
 .../pylibcudf/pylibcudf/tests/io/test_csv.py  |  5 +-
 .../pylibcudf/pylibcudf/tests/io/test_json.py |  5 +-
 .../pylibcudf/pylibcudf/tests/io/test_orc.py  |  3 +-
 .../pylibcudf/tests/io/test_parquet.py        |  5 +-
 .../tests/io/test_source_sink_info.py         |  3 +-
 .../pylibcudf/tests/io/test_timezone.py       |  3 +-
 .../pylibcudf/tests/test_binaryops.py         |  3 +-
 .../pylibcudf/tests/test_column_factories.py  |  3 +-
 .../tests/test_column_from_device.py          |  3 +-
 .../pylibcudf/tests/test_contiguous_split.py  |  3 +-
 .../pylibcudf/pylibcudf/tests/test_copying.py |  3 +-
 .../pylibcudf/tests/test_datetime.py          |  3 +-
 .../pylibcudf/tests/test_expressions.py       |  3 +-
 .../pylibcudf/pylibcudf/tests/test_interop.py |  3 +-
 python/pylibcudf/pylibcudf/tests/test_join.py |  3 +-
 python/pylibcudf/pylibcudf/tests/test_json.py |  3 +-
 .../pylibcudf/tests/test_labeling.py          |  3 +-
 .../pylibcudf/pylibcudf/tests/test_lists.py   |  3 +-
 .../pylibcudf/tests/test_null_mask.py         |  5 +-
 .../tests/test_nvtext_edit_distance.py        |  3 +-
 .../tests/test_nvtext_generate_ngrams.py      |  3 +-
 .../pylibcudf/tests/test_nvtext_jaccard.py    |  3 +-
 .../pylibcudf/tests/test_nvtext_minhash.py    |  3 +-
 .../tests/test_nvtext_ngrams_tokenize.py      |  3 +-
 .../pylibcudf/tests/test_nvtext_normalize.py  |  3 +-
 .../pylibcudf/tests/test_nvtext_replace.py    |  3 +-
 .../pylibcudf/tests/test_nvtext_stemmer.py    |  3 +-
 .../pylibcudf/tests/test_partitioning.py      |  3 +-
 .../pylibcudf/tests/test_quantiles.py         |  3 +-
 .../pylibcudf/tests/test_regex_program.py     |  3 +-
 .../pylibcudf/pylibcudf/tests/test_reshape.py |  3 +-
 .../pylibcudf/pylibcudf/tests/test_round.py   |  3 +-
 .../pylibcudf/tests/test_string_attributes.py |  3 +-
 .../pylibcudf/tests/test_string_capitalize.py |  3 +-
 .../pylibcudf/tests/test_string_case.py       |  3 +-
 .../pylibcudf/tests/test_string_char_types.py |  3 +-
 .../pylibcudf/tests/test_string_combine.py    |  3 +-
 .../pylibcudf/tests/test_string_contains.py   |  3 +-
 .../pylibcudf/tests/test_string_convert.py    |  3 +-
 .../tests/test_string_convert_booleans.py     |  3 +-
 .../tests/test_string_convert_datetime.py     |  3 +-
 .../tests/test_string_convert_durations.py    |  3 +-
 .../tests/test_string_convert_fixed_point.py  |  3 +-
 .../tests/test_string_convert_floats.py       |  3 +-
 .../tests/test_string_convert_integers.py     |  3 +-
 .../tests/test_string_convert_ipv4.py         |  3 +-
 .../tests/test_string_convert_lists.py        |  3 +-
 .../tests/test_string_convert_urls.py         |  3 +-
 .../pylibcudf/tests/test_string_extract.py    |  1 +
 .../pylibcudf/tests/test_string_find.py       |  3 +-
 .../tests/test_string_find_multiple.py        |  3 +-
 .../pylibcudf/tests/test_string_findall.py    |  3 +-
 .../pylibcudf/tests/test_string_padding.py    |  1 +
 .../pylibcudf/tests/test_string_repeat.py     |  3 +-
 .../pylibcudf/tests/test_string_replace.py    |  3 +-
 .../pylibcudf/tests/test_string_replace_re.py |  3 +-
 .../pylibcudf/tests/test_string_slice.py      |  3 +-
 .../tests/test_string_split_partition.py      |  3 +-
 .../tests/test_string_split_split.py          |  3 +-
 .../pylibcudf/tests/test_string_strip.py      |  3 +-
 .../pylibcudf/tests/test_string_translate.py  |  3 +-
 .../pylibcudf/tests/test_string_wrap.py       |  3 +-
 .../pylibcudf/pylibcudf/tests/test_table.py   |  3 +-
 .../pylibcudf/tests/test_transform.py         |  3 +-
 .../pylibcudf/tests/test_transpose.py         |  3 +-
 python/pylibcudf/pyproject.toml               | 56 +++++-------------
 114 files changed, 318 insertions(+), 380 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 174dc72bf02..0e86407de11 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -16,17 +16,6 @@ repos:
             ^cpp/cmake/thirdparty/patches/.*|
             ^python/cudf/cudf/tests/data/subword_tokenizer_data/.*
           )
-  - repo: https://github.com/PyCQA/isort
-    rev: 5.13.2
-    hooks:
-      - id: isort
-        # Use the config file specific to each subproject so that each
-        # project can specify its own first/third-party packages.
-        args: ["--config-root=python/", "--resolve-all-configs"]
-        files: python/.*
-        exclude: |
-          (?x)^(^python/cudf_polars/.*)
-        types_or: [python, cython, pyi]
   - repo: https://github.com/MarcoGorelli/cython-lint
     rev: v0.16.2
     hooks:
@@ -150,6 +139,7 @@ repos:
     rev: v0.4.8
     hooks:
       - id: ruff
+        args: ["--fix"]
         files: python/.*$
       - id: ruff-format
         files: python/.*$
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index f9cdde7c2b7..b55af21a300 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -293,8 +293,8 @@ In order to run doxygen as a linter on C++/CUDA code, run
 ./ci/checks/doxygen.sh
 ```
 
-Python code runs several linters including [Black](https://black.readthedocs.io/en/stable/),
-[isort](https://pycqa.github.io/isort/), and [flake8](https://flake8.pycqa.org/en/latest/).
+Python code runs several linters including [Ruff](https://docs.astral.sh/ruff/)
+with its various rules for Black-like formatting and isort-style import sorting.
 
 cuDF also uses [codespell](https://github.com/codespell-project/codespell) to find spelling
 mistakes, and this check is run as a pre-commit hook. To apply the suggested spelling fixes,
diff --git a/docs/cudf/source/developer_guide/contributing_guide.md b/docs/cudf/source/developer_guide/contributing_guide.md
index 6fce268f309..f4d2c7319b3 100644
--- a/docs/cudf/source/developer_guide/contributing_guide.md
+++ b/docs/cudf/source/developer_guide/contributing_guide.md
@@ -15,8 +15,7 @@ Developers are strongly recommended to set up `pre-commit` prior to any developm
 The `.pre-commit-config.yaml` file at the root of the repo is the primary source of truth linting.
 Specifically, cuDF uses the following tools:
 
-- [`ruff`](https://beta.ruff.rs/) checks for general code formatting compliance.
-- [`isort`](https://pycqa.github.io/isort/) ensures imports are sorted consistently.
+- [`ruff`](https://docs.astral.sh/ruff/) checks for general code formatting compliance.
 - [`mypy`](http://mypy-lang.org/) performs static type checking.
   In conjunction with [type hints](https://docs.python.org/3/library/typing.html),
   `mypy` can help catch various bugs that are otherwise difficult to find.
diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb
index 95f5f9734dd..46221b6015b 100644
--- a/docs/cudf/source/user_guide/10min.ipynb
+++ b/docs/cudf/source/user_guide/10min.ipynb
@@ -38,10 +38,10 @@
     "import os\n",
     "\n",
     "import cupy as cp\n",
+    "import dask_cudf\n",
     "import pandas as pd\n",
     "\n",
     "import cudf\n",
-    "import dask_cudf\n",
     "\n",
     "cp.random.seed(12)\n",
     "\n",
diff --git a/docs/cudf/source/user_guide/guide-to-udfs.ipynb b/docs/cudf/source/user_guide/guide-to-udfs.ipynb
index 75eafcc5387..abfe5a1b178 100644
--- a/docs/cudf/source/user_guide/guide-to-udfs.ipynb
+++ b/docs/cudf/source/user_guide/guide-to-udfs.ipynb
@@ -101,6 +101,8 @@
    "outputs": [],
    "source": [
     "# define a scalar function\n",
+    "\n",
+    "\n",
     "def f(x):\n",
     "    return x + 1"
    ]
@@ -247,6 +249,8 @@
    "outputs": [],
    "source": [
     "# redefine the same function from above\n",
+    "\n",
+    "\n",
     "def f(x):\n",
     "    return x + 1"
    ]
@@ -1622,6 +1626,8 @@
    "outputs": [],
    "source": [
     "# a user defined aggregation function.\n",
+    "\n",
+    "\n",
     "def udaf(df):\n",
     "    return df[\"b\"].max() - df[\"b\"].min() / 2"
    ]
diff --git a/pyproject.toml b/pyproject.toml
index 661c68ee62e..6933484f4e7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,6 +34,8 @@ select = [
     "F",
     # pycodestyle Warning
     "W",
+    # isort
+    "I",
     # no-blank-line-before-function
     "D201",
     # one-blank-line-after-class
diff --git a/python/cudf/benchmarks/conftest.py b/python/cudf/benchmarks/conftest.py
index 7b2b71cf216..0e4afadccf5 100644
--- a/python/cudf/benchmarks/conftest.py
+++ b/python/cudf/benchmarks/conftest.py
@@ -56,27 +56,23 @@
 # into the main repo.
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common"))
 
-from config import cudf  # noqa: W0611, E402, F401
-from utils import (  # noqa: E402
-    OrderedSet,
-    collapse_fixtures,
-    column_generators,
-    make_fixture,
-)
-
 # Turn off isort until we upgrade to 5.8.0
 # https://github.com/pycqa/isort/issues/1594
-# isort: off
 from config import (  # noqa: W0611, E402, F401
     NUM_COLS,
     NUM_ROWS,
     collect_ignore,
+    cudf,  # noqa: W0611, E402, F401
     pytest_collection_modifyitems,
     pytest_sessionfinish,
     pytest_sessionstart,
 )
-
-# isort: on
+from utils import (  # noqa: E402
+    OrderedSet,
+    collapse_fixtures,
+    column_generators,
+    make_fixture,
+)
 
 
 @pytest_cases.fixture(params=[0, 1], ids=["AxisIndex", "AxisColumn"])
diff --git a/python/cudf/cudf/_typing.py b/python/cudf/cudf/_typing.py
index 6e8ad556b08..3b13cc258ab 100644
--- a/python/cudf/cudf/_typing.py
+++ b/python/cudf/cudf/_typing.py
@@ -1,8 +1,8 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 import sys
-from collections.abc import Callable
-from typing import TYPE_CHECKING, Any, Dict, Iterable, TypeVar, Union
+from collections.abc import Callable, Iterable
+from typing import TYPE_CHECKING, Any, TypeVar, Union
 
 import numpy as np
 from pandas import Period, Timedelta, Timestamp
@@ -42,7 +42,7 @@
 SeriesOrSingleColumnIndex = Union["cudf.Series", "cudf.core.index.Index"]
 
 # Groupby aggregation
-AggType = Union[str, Callable]
-MultiColumnAggType = Union[
-    AggType, Iterable[AggType], Dict[Any, Iterable[AggType]]
+AggType = Union[str, Callable]  # noqa: UP007
+MultiColumnAggType = Union[  # noqa: UP007
+    AggType, Iterable[AggType], dict[Any, Iterable[AggType]]
 ]
diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py
index caff019f575..ffa306bf93f 100644
--- a/python/cudf/cudf/core/buffer/buffer.py
+++ b/python/cudf/cudf/core/buffer/buffer.py
@@ -6,7 +6,7 @@
 import pickle
 import weakref
 from types import SimpleNamespace
-from typing import Any, Literal, Mapping
+from typing import TYPE_CHECKING, Any, Literal
 
 import numpy
 from typing_extensions import Self
@@ -18,6 +18,9 @@
 from cudf.core.abc import Serializable
 from cudf.utils.string import format_bytes
 
+if TYPE_CHECKING:
+    from collections.abc import Mapping
+
 
 def host_memory_allocation(nbytes: int) -> memoryview:
     """Allocate host memory using NumPy
diff --git a/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py b/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py
index 0bd8d6054b3..ecf9807cfc2 100644
--- a/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py
+++ b/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py
@@ -2,13 +2,16 @@
 
 from __future__ import annotations
 
-from typing import Literal, Mapping
+from typing import TYPE_CHECKING, Literal
 
 from typing_extensions import Self
 
 import cudf
 from cudf.core.buffer.buffer import Buffer, BufferOwner
 
+if TYPE_CHECKING:
+    from collections.abc import Mapping
+
 
 class ExposureTrackedBuffer(Buffer):
     """An exposure tracked buffer.
diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py
index 06791df7dc0..a1e87d04bc9 100644
--- a/python/cudf/cudf/core/column/__init__.py
+++ b/python/cudf/cudf/core/column/__init__.py
@@ -29,4 +29,3 @@
     Decimal128Column,
     DecimalBaseColumn,
 )
-from cudf.core.column.interval import IntervalColumn  # noqa: F401
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 864e87b5377..087d0ed65f5 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -4,7 +4,7 @@
 
 import warnings
 from functools import cached_property
-from typing import TYPE_CHECKING, Any, Mapping, Sequence, cast
+from typing import TYPE_CHECKING, Any, cast
 
 import numpy as np
 import pandas as pd
@@ -26,6 +26,7 @@
 
 if TYPE_CHECKING:
     from collections import abc
+    from collections.abc import Mapping, Sequence
 
     import numba.cuda
 
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 7674565e2c3..d2cd6e8ac8f 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -4,10 +4,11 @@
 
 import pickle
 from collections import abc
+from collections.abc import MutableSequence, Sequence
 from functools import cached_property
 from itertools import chain
 from types import SimpleNamespace
-from typing import TYPE_CHECKING, Any, Literal, MutableSequence, Sequence, cast
+from typing import TYPE_CHECKING, Any, Literal, cast
 
 import cupy
 import numpy as np
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 2c9b0baa9b6..b6dc250e64d 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -8,7 +8,7 @@
 import locale
 import re
 from locale import nl_langinfo
-from typing import TYPE_CHECKING, Literal, Sequence, cast
+from typing import TYPE_CHECKING, Literal, cast
 
 import numpy as np
 import pandas as pd
@@ -31,6 +31,8 @@
 from cudf.utils.utils import _all_bools_with_nulls
 
 if TYPE_CHECKING:
+    from collections.abc import Sequence
+
     from cudf._typing import (
         ColumnBinaryOperand,
         DatetimeLikeScalar,
diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
index 8803ebd6791..8ae06f72d1e 100644
--- a/python/cudf/cudf/core/column/decimal.py
+++ b/python/cudf/cudf/core/column/decimal.py
@@ -3,8 +3,9 @@
 from __future__ import annotations
 
 import warnings
+from collections.abc import Sequence
 from decimal import Decimal
-from typing import TYPE_CHECKING, Sequence, cast
+from typing import TYPE_CHECKING, cast
 
 import cupy as cp
 import numpy as np
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index e9d24d4f450..6b25e568f00 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 from functools import cached_property
-from typing import TYPE_CHECKING, Sequence, cast
+from typing import TYPE_CHECKING, cast
 
 import numpy as np
 import pandas as pd
@@ -34,6 +34,8 @@
 from cudf.core.missing import NA
 
 if TYPE_CHECKING:
+    from collections.abc import Sequence
+
     from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike
     from cudf.core.buffer import Buffer
 
diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py
index 05a0ab2e09a..a91c080fe21 100644
--- a/python/cudf/cudf/core/column/methods.py
+++ b/python/cudf/cudf/core/column/methods.py
@@ -2,9 +2,7 @@
 
 from __future__ import annotations
 
-from typing import Union, overload
-
-from typing_extensions import Literal
+from typing import Literal, Union, overload
 
 import cudf
 import cudf.core.column
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 78d2814ed26..620cae65374 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 import functools
-from typing import TYPE_CHECKING, Any, Sequence, cast
+from typing import TYPE_CHECKING, Any, cast
 
 import numpy as np
 import pandas as pd
@@ -28,7 +28,7 @@
 from .numerical_base import NumericalBaseColumn
 
 if TYPE_CHECKING:
-    from collections.abc import Callable
+    from collections.abc import Callable, Sequence
 
     from cudf._typing import (
         ColumnBinaryOperand,
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index b25e486d855..856ce0f75de 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -5,7 +5,7 @@
 import re
 import warnings
 from functools import cached_property
-from typing import TYPE_CHECKING, Sequence, cast, overload
+from typing import TYPE_CHECKING, cast, overload
 
 import numpy as np
 import pandas as pd
@@ -35,6 +35,8 @@ def str_to_boolean(column: StringColumn):
 
 
 if TYPE_CHECKING:
+    from collections.abc import Sequence
+
     import cupy
     import numba.cuda
 
diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index 6b6f3e517a8..087d6474e7f 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -4,7 +4,7 @@
 
 import datetime
 import functools
-from typing import TYPE_CHECKING, Sequence, cast
+from typing import TYPE_CHECKING, cast
 
 import numpy as np
 import pandas as pd
@@ -19,6 +19,8 @@
 from cudf.utils.utils import _all_bools_with_nulls
 
 if TYPE_CHECKING:
+    from collections.abc import Sequence
+
     from cudf._typing import ColumnBinaryOperand, DatetimeLikeScalar, Dtype
 
 _unit_to_nanoseconds_conversion = {
diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index bc093fdaa9a..496e86ed709 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -5,8 +5,9 @@
 import itertools
 import sys
 from collections import abc
+from collections.abc import Mapping
 from functools import cached_property, reduce
-from typing import TYPE_CHECKING, Any, Mapping, cast
+from typing import TYPE_CHECKING, Any, cast
 
 import numpy as np
 import pandas as pd
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 7d4d34f5b04..bf1c39b23da 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -13,8 +13,8 @@
 import textwrap
 import warnings
 from collections import abc, defaultdict
-from collections.abc import Callable, Iterator
-from typing import TYPE_CHECKING, Any, Literal, MutableMapping, cast
+from collections.abc import Callable, Iterator, MutableMapping
+from typing import TYPE_CHECKING, Any, Literal, cast
 
 import cupy
 import numba
diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index 5250a741d3d..aa601a2b322 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -3,7 +3,7 @@
 
 import enum
 from collections import abc
-from typing import Any, Iterable, Mapping, Sequence, Tuple, cast
+from typing import TYPE_CHECKING, Any, cast
 
 import cupy as cp
 import numpy as np
@@ -20,6 +20,9 @@
     build_column,
 )
 
+if TYPE_CHECKING:
+    from collections.abc import Iterable, Mapping, Sequence
+
 # Implementation of interchange protocol classes
 # ----------------------------------------------
 
@@ -61,7 +64,7 @@ class _MaskKind(enum.IntEnum):
     _DtypeKind.BOOL,
     _DtypeKind.STRING,
 }
-ProtoDtype = Tuple[_DtypeKind, int, str, str]
+ProtoDtype = tuple[_DtypeKind, int, str, str]
 
 
 class _CuDFBuffer:
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 37ad6b8fabb..205edd91d9d 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -6,7 +6,7 @@
 import pickle
 import warnings
 from collections import abc
-from typing import TYPE_CHECKING, Any, Literal, MutableMapping
+from typing import TYPE_CHECKING, Any, Literal
 
 # TODO: The `numpy` import is needed for typing purposes during doc builds
 # only, need to figure out why the `np` alias is insufficient then remove.
@@ -36,6 +36,7 @@
 from cudf.utils.utils import _array_ufunc, _warn_no_dask_cudf
 
 if TYPE_CHECKING:
+    from collections.abc import MutableMapping
     from types import ModuleType
 
     from cudf._typing import Dtype, ScalarLike
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 81b20488d8d..6630bd96c01 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -8,7 +8,7 @@
 import warnings
 from collections import abc
 from functools import cached_property
-from typing import TYPE_CHECKING, Any, Iterable, Literal
+from typing import TYPE_CHECKING, Any, Literal
 
 import cupy as cp
 import numpy as np
@@ -36,6 +36,8 @@
 from cudf.utils.utils import GetAttrGetItemMixin
 
 if TYPE_CHECKING:
+    from collections.abc import Iterable
+
     from cudf._typing import (
         AggType,
         DataFrameOrSeries,
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index cd07c58c5d9..1b90e9f9df0 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -5,10 +5,10 @@
 import operator
 import pickle
 import warnings
-from collections.abc import Hashable
+from collections.abc import Hashable, MutableMapping
 from functools import cache, cached_property
 from numbers import Number
-from typing import TYPE_CHECKING, Any, Literal, MutableMapping, cast
+from typing import TYPE_CHECKING, Any, Literal, cast
 
 import cupy
 import numpy as np
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 5952815deef..e031f2a4e8e 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -10,9 +10,7 @@
 from typing import (
     TYPE_CHECKING,
     Any,
-    Callable,
     Literal,
-    MutableMapping,
     TypeVar,
     cast,
 )
@@ -63,6 +61,8 @@
 from cudf.utils.utils import _warn_no_dask_cudf
 
 if TYPE_CHECKING:
+    from collections.abc import Callable, MutableMapping
+
     from cudf._typing import (
         ColumnLike,
         DataFrameOrSeries,
diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py
index 8182e5cede2..ce6a5c960dd 100644
--- a/python/cudf/cudf/core/indexing_utils.py
+++ b/python/cudf/cudf/core/indexing_utils.py
@@ -3,9 +3,7 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
-from typing import Any, List, Union
-
-from typing_extensions import TypeAlias
+from typing import Any, TypeAlias
 
 import cudf
 from cudf.api.types import _is_scalar_or_zero_d_array, is_integer
@@ -46,11 +44,11 @@ class ScalarIndexer:
     key: GatherMap
 
 
-IndexingSpec: TypeAlias = Union[
-    EmptyIndexer, MapIndexer, MaskIndexer, ScalarIndexer, SliceIndexer
-]
+IndexingSpec: TypeAlias = (
+    EmptyIndexer | MapIndexer | MaskIndexer | ScalarIndexer | SliceIndexer
+)
 
-ColumnLabels: TypeAlias = List[str]
+ColumnLabels: TypeAlias = list[str]
 
 
 def destructure_iloc_key(
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 92d094d9de5..bfff62f0a89 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -8,7 +8,7 @@
 import pickle
 import warnings
 from functools import cached_property
-from typing import TYPE_CHECKING, Any, MutableMapping
+from typing import TYPE_CHECKING, Any
 
 import cupy as cp
 import numpy as np
@@ -36,7 +36,7 @@
 from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name
 
 if TYPE_CHECKING:
-    from collections.abc import Generator, Hashable
+    from collections.abc import Generator, Hashable, MutableMapping
 
     from typing_extensions import Self
 
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 29ed18ac0ce..9b60424c924 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -9,7 +9,7 @@
 import warnings
 from collections import abc
 from shutil import get_terminal_size
-from typing import TYPE_CHECKING, Any, Literal, MutableMapping
+from typing import TYPE_CHECKING, Any, Literal
 
 import cupy
 import numpy as np
@@ -71,6 +71,8 @@
 from cudf.utils.performance_tracking import _performance_tracking
 
 if TYPE_CHECKING:
+    from collections.abc import MutableMapping
+
     import pyarrow as pa
 
     from cudf._typing import (
diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index 68f34fa28ff..885e7b16644 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -4,7 +4,7 @@
 import math
 import re
 import warnings
-from typing import Literal, Sequence
+from typing import TYPE_CHECKING, Literal
 
 import numpy as np
 import pandas as pd
@@ -20,6 +20,9 @@
 from cudf.core import column
 from cudf.core.index import ensure_index
 
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
 # https://github.com/pandas-dev/pandas/blob/2.2.x/pandas/core/tools/datetimes.py#L1112
 _unit_map = {
     "year": "year",
diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py
index c364d55e677..73afde407db 100644
--- a/python/cudf/cudf/pandas/fast_slow_proxy.py
+++ b/python/cudf/cudf/pandas/fast_slow_proxy.py
@@ -10,9 +10,9 @@
 import pickle
 import types
 import warnings
-from collections.abc import Callable, Iterator
+from collections.abc import Callable, Iterator, Mapping
 from enum import IntEnum
-from typing import Any, Literal, Mapping
+from typing import Any, Literal
 
 import numpy as np
 
diff --git a/python/cudf/cudf/pandas/module_accelerator.py b/python/cudf/cudf/pandas/module_accelerator.py
index f82e300e83d..38103a71908 100644
--- a/python/cudf/cudf/pandas/module_accelerator.py
+++ b/python/cudf/cudf/pandas/module_accelerator.py
@@ -17,7 +17,7 @@
 from abc import abstractmethod
 from importlib._bootstrap import _ImportLockContext as ImportLock
 from types import ModuleType
-from typing import Any, ContextManager, NamedTuple
+from typing import Any, ContextManager, NamedTuple  # noqa: UP035
 
 from typing_extensions import Self
 
diff --git a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py
index 8870fbc5c28..bb2fc00d9fc 100644
--- a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py
+++ b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py
@@ -9,6 +9,7 @@
     python analyze-test-failures.py <path-to-test-log> <file-or-pattern>
 
 Example:
+-------
     python analyze-test-failures.py log.json frame/*
 """
 
diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py
index d12d2697729..59966a5ff0c 100644
--- a/python/cudf/cudf/pandas/scripts/conftest-patch.py
+++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py
@@ -35,7 +35,7 @@ def null_assert_warnings(*args, **kwargs):
 
 @pytest.fixture(scope="session", autouse=True)  # type: ignore
 def patch_testing_functions():
-    tm.assert_produces_warning = null_assert_warnings
+    tm.assert_produces_warning = null_assert_warnings  # noqa: F821
     pytest.raises = replace_kwargs({"match": None})(pytest.raises)
 
 
diff --git a/python/cudf/cudf/pandas/scripts/summarize-test-results.py b/python/cudf/cudf/pandas/scripts/summarize-test-results.py
index 4ea0b3b4413..a0ad872e4c7 100644
--- a/python/cudf/cudf/pandas/scripts/summarize-test-results.py
+++ b/python/cudf/cudf/pandas/scripts/summarize-test-results.py
@@ -5,7 +5,8 @@
 """
 Summarizes the test results per module.
 
-Examples:
+Examples
+--------
     python summarize-test-results.py log.json
     python summarize-test-results.py log.json --output json
     python summarize-test-results.py log.json --output table
diff --git a/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py b/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py
index a75a20a4681..63fd9601fc1 100644
--- a/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py
+++ b/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py
@@ -387,7 +387,8 @@ def test_dir_bound_method(
 ):
     """This test will fail because dir for bound methods is currently
     incorrect, but we have no way to fix it without materializing the slow
-    type, which is unnecessarily expensive."""
+    type, which is unnecessarily expensive.
+    """
     Fast, FastIntermediate = fast_and_intermediate_with_doc
     Slow, SlowIntermediate = slow_and_intermediate_with_doc
 
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index feab04ffadc..80201dd84db 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -81,50 +81,6 @@ cudf-pandas-tests = [
 Homepage = "https://github.com/rapidsai/cudf"
 Documentation = "https://docs.rapids.ai/api/cudf/stable/"
 
-[tool.isort]
-line_length = 79
-multi_line_output = 3
-include_trailing_comma = true
-force_grid_wrap = 0
-combine_as_imports = true
-order_by_type = true
-known_dask = [
-    "dask",
-    "distributed",
-    "dask_cuda",
-]
-known_rapids = [
-    "rmm",
-    "pylibcudf"
-]
-known_first_party = [
-    "cudf",
-]
-default_section = "THIRDPARTY"
-sections = [
-    "FUTURE",
-    "STDLIB",
-    "THIRDPARTY",
-    "DASK",
-    "RAPIDS",
-    "FIRSTPARTY",
-    "LOCALFOLDER",
-]
-skip = [
-    "thirdparty",
-    ".eggs",
-    ".git",
-    ".hg",
-    ".mypy_cache",
-    ".tox",
-    ".venv",
-    "_build",
-    "buck-out",
-    "build",
-    "dist",
-    "__init__.py",
-]
-
 [tool.pytest.ini_options]
 addopts = "--tb=native --strict-config --strict-markers"
 empty_parameter_set_mark = "fail_at_collect"
@@ -174,3 +130,18 @@ wheel.packages = ["cudf"]
 provider = "scikit_build_core.metadata.regex"
 input = "cudf/VERSION"
 regex = "(?P<value>.*)"
+
+[tool.ruff]
+extend = "../../pyproject.toml"
+
+[tool.ruff.lint.isort]
+combine-as-imports = true
+known-first-party = ["cudf"]
+section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"]
+
+[tool.ruff.lint.isort.sections]
+dask = ["dask", "distributed", "dask_cuda"]
+rapids = ["rmm", "pylibcudf"]
+
+[tool.ruff.lint.per-file-ignores]
+"__init__.py" = ["E402", "F401"]
diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml
index 87e19a2bccf..667cd7b1db8 100644
--- a/python/cudf_kafka/pyproject.toml
+++ b/python/cudf_kafka/pyproject.toml
@@ -32,51 +32,20 @@ test = [
 Homepage = "https://github.com/rapidsai/cudf"
 Documentation = "https://docs.rapids.ai/api/cudf/stable/"
 
-[tool.isort]
-line_length = 79
-multi_line_output = 3
-include_trailing_comma = true
-force_grid_wrap = 0
-combine_as_imports = true
-order_by_type = true
-known_dask = [
-    "dask",
-    "distributed",
-    "dask_cuda",
-    "streamz",
-]
-known_rapids = [
-    "rmm",
-    "cudf",
-    "dask_cudf",
-]
-known_first_party = [
-    "cudf_kafka",
-]
-default_section = "THIRDPARTY"
-sections = [
-    "FUTURE",
-    "STDLIB",
-    "THIRDPARTY",
-    "DASK",
-    "RAPIDS",
-    "FIRSTPARTY",
-    "LOCALFOLDER",
-]
-skip = [
-    "thirdparty",
-    ".eggs",
-    ".git",
-    ".hg",
-    ".mypy_cache",
-    ".tox",
-    ".venv",
-    "_build",
-    "buck-out",
-    "build",
-    "dist",
-    "__init__.py",
-]
+[tool.ruff]
+extend = "../../pyproject.toml"
+
+[tool.ruff.lint.isort]
+combine-as-imports = true
+known-first-party = ["cudf_kafka"]
+section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"]
+
+[tool.ruff.lint.isort.sections]
+dask = ["dask", "distributed", "dask_cuda", "streamz"]
+rapids = ["rmm", "cudf", "dask_cudf"]
+
+[tool.ruff.lint.per-file-ignores]
+"__init__.py" = ["E402", "F401"]
 
 [tool.pytest.ini_options]
 addopts = "--tb=native --strict-config --strict-markers"
diff --git a/python/custreamz/custreamz/tests/conftest.py b/python/custreamz/custreamz/tests/conftest.py
index 1cda9b71387..c5135bc6414 100644
--- a/python/custreamz/custreamz/tests/conftest.py
+++ b/python/custreamz/custreamz/tests/conftest.py
@@ -2,6 +2,7 @@
 import socket
 
 import pytest
+
 from custreamz import kafka
 
 
diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml
index af45f49d9b4..a8ab05a3922 100644
--- a/python/custreamz/pyproject.toml
+++ b/python/custreamz/pyproject.toml
@@ -65,50 +65,20 @@ include = [
 ]
 exclude = ["*tests*"]
 
-[tool.isort]
-line_length = 79
-multi_line_output = 3
-include_trailing_comma = true
-force_grid_wrap = 0
-combine_as_imports = true
-order_by_type = true
-known_dask = [
-    "dask",
-    "distributed",
-    "dask_cuda",
-]
-known_rapids = [
-    "rmm",
-    "cudf",
-    "dask_cudf",
-]
-known_first_party = [
-    "streamz",
-]
-default_section = "THIRDPARTY"
-sections = [
-    "FUTURE",
-    "STDLIB",
-    "THIRDPARTY",
-    "DASK",
-    "RAPIDS",
-    "FIRSTPARTY",
-    "LOCALFOLDER",
-]
-skip = [
-    "thirdparty",
-    ".eggs",
-    ".git",
-    ".hg",
-    ".mypy_cache",
-    ".tox",
-    ".venv",
-    "_build",
-    "buck-out",
-    "build",
-    "dist",
-    "__init__.py",
-]
+[tool.ruff]
+extend = "../../pyproject.toml"
+
+[tool.ruff.lint.isort]
+combine-as-imports = true
+known-first-party = ["streamz"]
+section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"]
+
+[tool.ruff.lint.isort.sections]
+dask = ["dask", "distributed", "dask_cuda"]
+rapids = ["rmm", "cudf", "dask_cudf"]
+
+[tool.ruff.lint.per-file-ignores]
+"__init__.py" = ["E402", "F401"]
 
 [tool.pytest.ini_options]
 addopts = "--tb=native --strict-config --strict-markers"
diff --git a/python/dask_cudf/dask_cudf/__init__.py b/python/dask_cudf/dask_cudf/__init__.py
index 04c2ad65b99..f9df22cc436 100644
--- a/python/dask_cudf/dask_cudf/__init__.py
+++ b/python/dask_cudf/dask_cudf/__init__.py
@@ -7,15 +7,15 @@
 # do anything for dask==2024.2.0)
 config.set({"dataframe.query-planning-warning": False})
 
-import dask.dataframe as dd
-from dask.dataframe import from_delayed
+import dask.dataframe as dd  # noqa: E402
+from dask.dataframe import from_delayed  # noqa: E402
 
-import cudf
+import cudf  # noqa: E402
 
-from . import backends
-from ._version import __git_commit__, __version__
-from .core import concat, from_cudf, from_dask_dataframe
-from .expr import QUERY_PLANNING_ON
+from . import backends  # noqa: E402, F401
+from ._version import __git_commit__, __version__  # noqa: E402, F401
+from .core import concat, from_cudf, from_dask_dataframe  # noqa: E402
+from .expr import QUERY_PLANNING_ON  # noqa: E402
 
 
 def read_csv(*args, **kwargs):
@@ -55,9 +55,9 @@ def inner_func(*args, **kwargs):
     to_orc = raise_not_implemented_error("to_orc")
 
 else:
-    from .core import DataFrame, Index, Series
-    from .groupby import groupby_agg
-    from .io import read_text, to_orc
+    from .core import DataFrame, Index, Series  # noqa: F401
+    from .groupby import groupby_agg  # noqa: F401
+    from .io import read_text, to_orc  # noqa: F401
 
 
 __all__ = [
diff --git a/python/dask_cudf/dask_cudf/expr/__init__.py b/python/dask_cudf/dask_cudf/expr/__init__.py
index a76b655ef42..6dadadd5263 100644
--- a/python/dask_cudf/dask_cudf/expr/__init__.py
+++ b/python/dask_cudf/dask_cudf/expr/__init__.py
@@ -12,8 +12,8 @@
     config.set({"dataframe.shuffle.method": "tasks"})
 
     try:
-        import dask_cudf.expr._collection
-        import dask_cudf.expr._expr
+        import dask_cudf.expr._collection  # noqa: F401
+        import dask_cudf.expr._expr  # noqa: F401
 
     except ImportError as err:
         # Dask *should* raise an error before this.
diff --git a/python/dask_cudf/dask_cudf/io/__init__.py b/python/dask_cudf/dask_cudf/io/__init__.py
index 76bb2ea99b4..0421bd755f4 100644
--- a/python/dask_cudf/dask_cudf/io/__init__.py
+++ b/python/dask_cudf/dask_cudf/io/__init__.py
@@ -1,11 +1,11 @@
-# Copyright (c) 2018-2022, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
-from .csv import read_csv
-from .json import read_json
-from .orc import read_orc, to_orc
-from .text import read_text
+from .csv import read_csv  # noqa: F401
+from .json import read_json  # noqa: F401
+from .orc import read_orc, to_orc  # noqa: F401
+from .text import read_text  # noqa: F401
 
 try:
-    from .parquet import read_parquet, to_parquet
+    from .parquet import read_parquet, to_parquet  # noqa: F401
 except ImportError:
     pass
diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml
index 705865d083b..862e8f36eaa 100644
--- a/python/dask_cudf/pyproject.toml
+++ b/python/dask_cudf/pyproject.toml
@@ -69,50 +69,17 @@ version = {file = "dask_cudf/VERSION"}
 [tool.setuptools.packages.find]
 exclude = ["*tests*"]
 
-[tool.isort]
-line_length = 79
-multi_line_output = 3
-include_trailing_comma = true
-force_grid_wrap = 0
-combine_as_imports = true
-order_by_type = true
+[tool.ruff]
+extend = "../../pyproject.toml"
 
-known_dask = [
-    "dask",
-    "distributed",
-    "dask_cuda",
-]
-known_rapids = [
-    "rmm",
-    "cudf",
-]
-known_first_party = [
-    "dask_cudf",
-]
+[tool.ruff.lint.isort]
+combine-as-imports = true
+known-first-party = ["dask_cudf"]
+section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"]
 
-default_section = "THIRDPARTY"
-sections = [
-    "FUTURE",
-    "STDLIB",
-    "THIRDPARTY",
-    "DASK",
-    "RAPIDS",
-    "FIRSTPARTY",
-    "LOCALFOLDER",
-]
-skip = [
-    "thirdparty",
-    ".eggs",
-    ".git",
-    ".hg",
-    ".mypy_cache",
-    ".tox",
-    ".venv",
-    "_build",
-    "buck-out",
-    "build",
-    "dist",
-]
+[tool.ruff.lint.isort.sections]
+dask = ["dask", "distributed", "dask_cuda"]
+rapids = ["rmm", "cudf"]
 
 [tool.pytest.ini_options]
 addopts = "--tb=native --strict-config --strict-markers"
diff --git a/python/pylibcudf/pylibcudf/tests/common/utils.py b/python/pylibcudf/pylibcudf/tests/common/utils.py
index 9f389fa42c4..d95849ef371 100644
--- a/python/pylibcudf/pylibcudf/tests/common/utils.py
+++ b/python/pylibcudf/pylibcudf/tests/common/utils.py
@@ -7,10 +7,11 @@
 import numpy as np
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from pyarrow.orc import write_table as orc_write_table
 from pyarrow.parquet import write_table as pq_write_table
+
+import pylibcudf as plc
 from pylibcudf.io.types import CompressionType
 
 
diff --git a/python/pylibcudf/pylibcudf/tests/conftest.py b/python/pylibcudf/pylibcudf/tests/conftest.py
index fdce6f353ca..a19a8835498 100644
--- a/python/pylibcudf/pylibcudf/tests/conftest.py
+++ b/python/pylibcudf/pylibcudf/tests/conftest.py
@@ -8,8 +8,9 @@
 
 import numpy as np
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
+
+import pylibcudf as plc
 from pylibcudf.io.types import CompressionType
 
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common"))
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_avro.py b/python/pylibcudf/pylibcudf/tests/io/test_avro.py
index 0cd5064a697..3d9d99ffa61 100644
--- a/python/pylibcudf/pylibcudf/tests/io/test_avro.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_avro.py
@@ -5,10 +5,11 @@
 
 import fastavro
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_table_and_meta_eq
 
+import pylibcudf as plc
+
 avro_dtype_pairs = [
     ("boolean", pa.bool_()),
     ("int", pa.int32()),
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_csv.py b/python/pylibcudf/pylibcudf/tests/io/test_csv.py
index ab26f23418d..22c83acc47c 100644
--- a/python/pylibcudf/pylibcudf/tests/io/test_csv.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_csv.py
@@ -5,9 +5,7 @@
 
 import pandas as pd
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
-from pylibcudf.io.types import CompressionType
 from utils import (
     _convert_types,
     assert_table_and_meta_eq,
@@ -15,6 +13,9 @@
     write_source_str,
 )
 
+import pylibcudf as plc
+from pylibcudf.io.types import CompressionType
+
 # Shared kwargs to pass to make_source
 _COMMON_CSV_SOURCE_KWARGS = {
     "format": "csv",
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_json.py b/python/pylibcudf/pylibcudf/tests/io/test_json.py
index 9d976fedf00..453e5ce32a8 100644
--- a/python/pylibcudf/pylibcudf/tests/io/test_json.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_json.py
@@ -3,9 +3,7 @@
 
 import pandas as pd
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
-from pylibcudf.io.types import CompressionType
 from utils import (
     assert_table_and_meta_eq,
     make_source,
@@ -13,6 +11,9 @@
     write_source_str,
 )
 
+import pylibcudf as plc
+from pylibcudf.io.types import CompressionType
+
 # Shared kwargs to pass to make_source
 _COMMON_JSON_SOURCE_KWARGS = {"format": "json", "orient": "records"}
 
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_orc.py b/python/pylibcudf/pylibcudf/tests/io/test_orc.py
index 42b14b1feff..5ed660ba6cf 100644
--- a/python/pylibcudf/pylibcudf/tests/io/test_orc.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_orc.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import _convert_types, assert_table_and_meta_eq, make_source
 
+import pylibcudf as plc
+
 # Shared kwargs to pass to make_source
 _COMMON_ORC_SOURCE_KWARGS = {"format": "orc"}
 
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_parquet.py b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py
index f6e843ccf66..41298601539 100644
--- a/python/pylibcudf/pylibcudf/tests/io/test_parquet.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py
@@ -1,9 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from pyarrow.parquet import read_table
+from utils import assert_table_and_meta_eq, make_source
+
+import pylibcudf as plc
 from pylibcudf.expressions import (
     ASTOperator,
     ColumnNameReference,
@@ -11,7 +13,6 @@
     Literal,
     Operation,
 )
-from utils import assert_table_and_meta_eq, make_source
 
 # Shared kwargs to pass to make_source
 _COMMON_PARQUET_SOURCE_KWARGS = {"format": "parquet"}
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py b/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py
index 747f58ec8cf..0c43c363e55 100644
--- a/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py
@@ -2,9 +2,10 @@
 
 import io
 
-import pylibcudf as plc
 import pytest
 
+import pylibcudf as plc
+
 
 @pytest.fixture(params=[plc.io.SourceInfo, plc.io.SinkInfo])
 def io_class(request):
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_timezone.py b/python/pylibcudf/pylibcudf/tests/io/test_timezone.py
index 76b0424b2af..b3555013927 100644
--- a/python/pylibcudf/pylibcudf/tests/io/test_timezone.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_timezone.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 import zoneinfo
 
-import pylibcudf as plc
 import pytest
 
+import pylibcudf as plc
+
 
 def test_make_timezone_transition_table():
     if len(zoneinfo.TZPATH) == 0:
diff --git a/python/pylibcudf/pylibcudf/tests/test_binaryops.py b/python/pylibcudf/pylibcudf/tests/test_binaryops.py
index f784cb3c191..bbb08e8b95a 100644
--- a/python/pylibcudf/pylibcudf/tests/test_binaryops.py
+++ b/python/pylibcudf/pylibcudf/tests/test_binaryops.py
@@ -4,10 +4,11 @@
 
 import numpy as np
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 def idfn(param):
     ltype, rtype, outtype, plc_op, _ = param
diff --git a/python/pylibcudf/pylibcudf/tests/test_column_factories.py b/python/pylibcudf/pylibcudf/tests/test_column_factories.py
index 8cedbc6d42f..e317362a76b 100644
--- a/python/pylibcudf/pylibcudf/tests/test_column_factories.py
+++ b/python/pylibcudf/pylibcudf/tests/test_column_factories.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import DEFAULT_STRUCT_TESTING_TYPE, assert_column_eq
 
+import pylibcudf as plc
+
 EMPTY_COL_SIZE = 3
 
 NUMERIC_TYPES = [
diff --git a/python/pylibcudf/pylibcudf/tests/test_column_from_device.py b/python/pylibcudf/pylibcudf/tests/test_column_from_device.py
index 0e129fdf0ef..24cd6b9e35f 100644
--- a/python/pylibcudf/pylibcudf/tests/test_column_from_device.py
+++ b/python/pylibcudf/pylibcudf/tests/test_column_from_device.py
@@ -1,12 +1,13 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
 import rmm
 
+import pylibcudf as plc
+
 VALID_TYPES = [
     pa.int8(),
     pa.int16(),
diff --git a/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py
index 7a5c1664eed..6d8b5993964 100644
--- a/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py
+++ b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_table_eq
 
+import pylibcudf as plc
+
 param_pyarrow_tables = [
     pa.table([]),
     pa.table({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}),
diff --git a/python/pylibcudf/pylibcudf/tests/test_copying.py b/python/pylibcudf/pylibcudf/tests/test_copying.py
index 628682d0a66..c0a41b96b1a 100644
--- a/python/pylibcudf/pylibcudf/tests/test_copying.py
+++ b/python/pylibcudf/pylibcudf/tests/test_copying.py
@@ -2,7 +2,6 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import (
     DEFAULT_STRUCT_TESTING_TYPE,
@@ -16,6 +15,8 @@
     metadata_from_arrow_type,
 )
 
+import pylibcudf as plc
+
 
 # TODO: consider moving this to conftest and "pairing"
 # it with pa_type, so that they don't get out of sync
diff --git a/python/pylibcudf/pylibcudf/tests/test_datetime.py b/python/pylibcudf/pylibcudf/tests/test_datetime.py
index 75930d59058..a80ab8d9f65 100644
--- a/python/pylibcudf/pylibcudf/tests/test_datetime.py
+++ b/python/pylibcudf/pylibcudf/tests/test_datetime.py
@@ -4,10 +4,11 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module", params=["s", "ms", "us", "ns"])
 def datetime_column(has_nulls, request):
diff --git a/python/pylibcudf/pylibcudf/tests/test_expressions.py b/python/pylibcudf/pylibcudf/tests/test_expressions.py
index 5894ef4624c..6eabd6db617 100644
--- a/python/pylibcudf/pylibcudf/tests/test_expressions.py
+++ b/python/pylibcudf/pylibcudf/tests/test_expressions.py
@@ -1,8 +1,9 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 
+import pylibcudf as plc
+
 # We can't really evaluate these expressions, so just make sure
 # construction works properly
 
diff --git a/python/pylibcudf/pylibcudf/tests/test_interop.py b/python/pylibcudf/pylibcudf/tests/test_interop.py
index 01c998f16d4..e4f5174ad9b 100644
--- a/python/pylibcudf/pylibcudf/tests/test_interop.py
+++ b/python/pylibcudf/pylibcudf/tests/test_interop.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 
+import pylibcudf as plc
+
 
 def test_list_dtype_roundtrip():
     list_type = pa.list_(pa.int32())
diff --git a/python/pylibcudf/pylibcudf/tests/test_join.py b/python/pylibcudf/pylibcudf/tests/test_join.py
index 61e02f4d28d..f43a56046a4 100644
--- a/python/pylibcudf/pylibcudf/tests/test_join.py
+++ b/python/pylibcudf/pylibcudf/tests/test_join.py
@@ -2,9 +2,10 @@
 
 import numpy as np
 import pyarrow as pa
-import pylibcudf as plc
 from utils import assert_table_eq
 
+import pylibcudf as plc
+
 
 def test_cross_join():
     left = pa.Table.from_arrays([[0, 1, 2], [3, 4, 5]], names=["a", "b"])
diff --git a/python/pylibcudf/pylibcudf/tests/test_json.py b/python/pylibcudf/pylibcudf/tests/test_json.py
index 3d2955211f8..486a9524e92 100644
--- a/python/pylibcudf/pylibcudf/tests/test_json.py
+++ b/python/pylibcudf/pylibcudf/tests/test_json.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module")
 def plc_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_labeling.py b/python/pylibcudf/pylibcudf/tests/test_labeling.py
index f7fb7463b50..beacfc63ce5 100644
--- a/python/pylibcudf/pylibcudf/tests/test_labeling.py
+++ b/python/pylibcudf/pylibcudf/tests/test_labeling.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 
+import pylibcudf as plc
+
 
 @pytest.mark.parametrize("left_inclusive", [True, False])
 @pytest.mark.parametrize("right_inclusive", [True, False])
diff --git a/python/pylibcudf/pylibcudf/tests/test_lists.py b/python/pylibcudf/pylibcudf/tests/test_lists.py
index 2353a6ff8f9..f3ef555f11d 100644
--- a/python/pylibcudf/pylibcudf/tests/test_lists.py
+++ b/python/pylibcudf/pylibcudf/tests/test_lists.py
@@ -3,10 +3,11 @@
 import numpy as np
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture
 def test_data():
diff --git a/python/pylibcudf/pylibcudf/tests/test_null_mask.py b/python/pylibcudf/pylibcudf/tests/test_null_mask.py
index 3edcae59edc..cd3da856de2 100644
--- a/python/pylibcudf/pylibcudf/tests/test_null_mask.py
+++ b/python/pylibcudf/pylibcudf/tests/test_null_mask.py
@@ -1,12 +1,13 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
-from pylibcudf.null_mask import MaskState
 
 import rmm
 
+import pylibcudf as plc
+from pylibcudf.null_mask import MaskState
+
 
 @pytest.fixture(params=[False, True])
 def nullable(request):
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py
index 7d93c471cc4..8b14e0db576 100644
--- a/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module")
 def edit_distance_data():
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py
index 5cf9874d595..fae4685f81b 100644
--- a/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module")
 def input_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py
index d5a168426b1..05fe7b53c16 100644
--- a/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module")
 def input_data():
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py
index 4e389a63f90..ead9ee094af 100644
--- a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module", params=[pa.uint32(), pa.uint64()])
 def minhash_input_data(request):
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py
index 283a009288d..84748b5597e 100644
--- a/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module")
 def input_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py
index fe28b83c09a..25b6d1389ec 100644
--- a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module")
 def norm_spaces_input_data():
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py
index 0fb54bb4ee1..65687f31c85 100644
--- a/python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module")
 def input_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py
index 75d56f587a4..e7f4a971f08 100644
--- a/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module")
 def input_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_partitioning.py b/python/pylibcudf/pylibcudf/tests/test_partitioning.py
index 444d0089d2c..c55e54cebc6 100644
--- a/python/pylibcudf/pylibcudf/tests/test_partitioning.py
+++ b/python/pylibcudf/pylibcudf/tests/test_partitioning.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_table_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module")
 def partitioning_data():
diff --git a/python/pylibcudf/pylibcudf/tests/test_quantiles.py b/python/pylibcudf/pylibcudf/tests/test_quantiles.py
index bac56691306..e4a24fb1c98 100644
--- a/python/pylibcudf/pylibcudf/tests/test_quantiles.py
+++ b/python/pylibcudf/pylibcudf/tests/test_quantiles.py
@@ -3,10 +3,11 @@
 import numpy as np
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq, assert_table_eq
 
+import pylibcudf as plc
+
 # Map pylibcudf interpolation options to pyarrow options
 interp_mapping = {
     plc.types.Interpolation.LINEAR: "linear",
diff --git a/python/pylibcudf/pylibcudf/tests/test_regex_program.py b/python/pylibcudf/pylibcudf/tests/test_regex_program.py
index 777315df538..52598f2c462 100644
--- a/python/pylibcudf/pylibcudf/tests/test_regex_program.py
+++ b/python/pylibcudf/pylibcudf/tests/test_regex_program.py
@@ -1,8 +1,9 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-import pylibcudf as plc
 import pytest
 
+import pylibcudf as plc
+
 
 @pytest.mark.parametrize("pat", ["(", "*", "\\"])
 def test_regex_program_invalid(pat):
diff --git a/python/pylibcudf/pylibcudf/tests/test_reshape.py b/python/pylibcudf/pylibcudf/tests/test_reshape.py
index 01115bc363a..ef23e23766a 100644
--- a/python/pylibcudf/pylibcudf/tests/test_reshape.py
+++ b/python/pylibcudf/pylibcudf/tests/test_reshape.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq, assert_table_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module")
 def reshape_data():
diff --git a/python/pylibcudf/pylibcudf/tests/test_round.py b/python/pylibcudf/pylibcudf/tests/test_round.py
index 0b30316b9a0..2526580bc13 100644
--- a/python/pylibcudf/pylibcudf/tests/test_round.py
+++ b/python/pylibcudf/pylibcudf/tests/test_round.py
@@ -2,10 +2,11 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(params=["float32", "float64"])
 def column(request, has_nulls):
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_attributes.py b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py
index a1820def0b1..f461657281a 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_attributes.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py
@@ -2,10 +2,11 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture()
 def str_data():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py b/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py
index 176ccc55b96..3e31c75c38a 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py
@@ -2,10 +2,11 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module")
 def str_data():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_case.py b/python/pylibcudf/pylibcudf/tests/test_string_case.py
index 233cc253b14..08ac371fd96 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_case.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_case.py
@@ -2,10 +2,11 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module")
 def string_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_char_types.py b/python/pylibcudf/pylibcudf/tests/test_string_char_types.py
index bcd030c019e..06b44210d74 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_char_types.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_char_types.py
@@ -2,9 +2,10 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 def test_all_characters_of_type():
     pa_array = pa.array(["1", "A"])
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_combine.py b/python/pylibcudf/pylibcudf/tests/test_string_combine.py
index 4a7007a0d6b..eea3ac68e84 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_combine.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_combine.py
@@ -2,10 +2,11 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 def test_concatenate_scalar_seperator():
     plc_table = plc.interop.from_arrow(
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_contains.py b/python/pylibcudf/pylibcudf/tests/test_string_contains.py
index 4e4dd7cbb00..ba9a4a7d3b8 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_contains.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_contains.py
@@ -2,10 +2,11 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module")
 def target_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert.py b/python/pylibcudf/pylibcudf/tests/test_string_convert.py
index 69f7a0fdd33..3f3f452c4f6 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_convert.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(
     scope="module",
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py
index 117c59ff1b8..b391d2b290e 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 def test_to_booleans():
     pa_array = pa.array(["true", None, "True"])
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py
index f3e84286a36..c9368d858a4 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py
@@ -3,10 +3,11 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture
 def fmt():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py
index 6d704309bfd..2d3578e4e71 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py
@@ -3,10 +3,11 @@
 from datetime import datetime, timedelta
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(
     params=[
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py
index b1c4d729604..012e722038e 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py
@@ -2,9 +2,10 @@
 import decimal
 
 import pyarrow as pa
-import pylibcudf as plc
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 def test_to_fixed_point():
     typ = pa.decimal128(38, 2)
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py
index e9918fab559..8ee2b5075af 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 def test_to_floats():
     typ = pa.float32()
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py
index 6d1d565af30..01192c2d1f8 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py
@@ -1,8 +1,9 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 import pyarrow as pa
-import pylibcudf as plc
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 def test_to_integers():
     typ = pa.int8()
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py
index 4dc3e512624..b533809f106 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py
@@ -1,8 +1,9 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 import pyarrow as pa
-import pylibcudf as plc
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 def test_ipv4_to_integers():
     arr = pa.array(["123.45.67.890", None])
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py
index 8591732b39e..737036a4f0f 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.mark.parametrize("na_rep", [None, pa.scalar("")])
 @pytest.mark.parametrize("separators", [None, pa.array([",", "[", "]"])])
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py
index fee8c3fb8f6..528736798c7 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py
@@ -2,9 +2,10 @@
 import urllib
 
 import pyarrow as pa
-import pylibcudf as plc
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 def test_url_encode():
     data = ["/home/nfs", None]
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_extract.py b/python/pylibcudf/pylibcudf/tests/test_string_extract.py
index 788b86423c4..e70edf4fb33 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_extract.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_extract.py
@@ -2,6 +2,7 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
+
 import pylibcudf as plc
 
 
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_find.py b/python/pylibcudf/pylibcudf/tests/test_string_find.py
index db3b13a5aae..82ec18832a9 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_find.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_find.py
@@ -2,10 +2,11 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module")
 def data_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py b/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py
index d6b37a388f0..fa9eee3594b 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 def test_find_multiple():
     arr = pa.array(["abc", "def"])
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_findall.py b/python/pylibcudf/pylibcudf/tests/test_string_findall.py
index debfad92d00..b73d812c898 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_findall.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_findall.py
@@ -2,9 +2,10 @@
 import re
 
 import pyarrow as pa
-import pylibcudf as plc
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 def test_findall():
     arr = pa.array(["bunny", "rabbit", "hare", "dog"])
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_padding.py b/python/pylibcudf/pylibcudf/tests/test_string_padding.py
index 2ba775d17ae..79498132097 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_padding.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_padding.py
@@ -2,6 +2,7 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
+
 import pylibcudf as plc
 
 
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_repeat.py b/python/pylibcudf/pylibcudf/tests/test_string_repeat.py
index 18b5d8bf4d0..c06c06be7c6 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_repeat.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_repeat.py
@@ -2,9 +2,10 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 
+import pylibcudf as plc
+
 
 @pytest.mark.parametrize("repeats", [pa.array([2, 2]), 2])
 def test_repeat_strings(repeats):
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_replace.py b/python/pylibcudf/pylibcudf/tests/test_string_replace.py
index 5a9c2007b73..2c7d25133de 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_replace.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_replace.py
@@ -2,10 +2,11 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module")
 def data_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_replace_re.py b/python/pylibcudf/pylibcudf/tests/test_string_replace_re.py
index ff2ce348d3b..511f826441a 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_replace_re.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_replace_re.py
@@ -2,10 +2,11 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.mark.parametrize("max_replace_count", [-1, 1])
 def test_replace_re_regex_program_scalar(max_replace_count):
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_slice.py b/python/pylibcudf/pylibcudf/tests/test_string_slice.py
index d9ce5591b98..1759f739e31 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_slice.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_slice.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module")
 def pa_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py b/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py
index 80cae8d1c6b..4e80f19b814 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_table_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture
 def data_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_split_split.py b/python/pylibcudf/pylibcudf/tests/test_string_split_split.py
index 2aeffac8209..450b336ce65 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_split_split.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_split_split.py
@@ -2,10 +2,11 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq, assert_table_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture
 def data_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_strip.py b/python/pylibcudf/pylibcudf/tests/test_string_strip.py
index 005e5e4a405..5869e5f4920 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_strip.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_strip.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 data_strings = [
     "AbC",
     "123abc",
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_translate.py b/python/pylibcudf/pylibcudf/tests/test_string_translate.py
index 2ae893e69fb..84fd3354ac6 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_translate.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_translate.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture
 def data_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_wrap.py b/python/pylibcudf/pylibcudf/tests/test_string_wrap.py
index a1c820cd586..00442d866e9 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_wrap.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_wrap.py
@@ -2,9 +2,10 @@
 import textwrap
 
 import pyarrow as pa
-import pylibcudf as plc
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 def test_wrap():
     width = 12
diff --git a/python/pylibcudf/pylibcudf/tests/test_table.py b/python/pylibcudf/pylibcudf/tests/test_table.py
index e822d6a97a8..ac39ef4c5c9 100644
--- a/python/pylibcudf/pylibcudf/tests/test_table.py
+++ b/python/pylibcudf/pylibcudf/tests/test_table.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 
+import pylibcudf as plc
+
 
 @pytest.mark.parametrize(
     "arrow_tbl",
diff --git a/python/pylibcudf/pylibcudf/tests/test_transform.py b/python/pylibcudf/pylibcudf/tests/test_transform.py
index d5c618f07e4..49802fe64ac 100644
--- a/python/pylibcudf/pylibcudf/tests/test_transform.py
+++ b/python/pylibcudf/pylibcudf/tests/test_transform.py
@@ -3,9 +3,10 @@
 import math
 
 import pyarrow as pa
-import pylibcudf as plc
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 def test_nans_to_nulls(has_nans):
     if has_nans:
diff --git a/python/pylibcudf/pylibcudf/tests/test_transpose.py b/python/pylibcudf/pylibcudf/tests/test_transpose.py
index ac11123f680..b0c0bc72ead 100644
--- a/python/pylibcudf/pylibcudf/tests/test_transpose.py
+++ b/python/pylibcudf/pylibcudf/tests/test_transpose.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
 from packaging.version import parse
 
+import pylibcudf as plc
+
 
 @pytest.mark.skipif(
     parse(pa.__version__) < parse("16.0.0"),
diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml
index ea5b3065896..a80c85a1fa8 100644
--- a/python/pylibcudf/pyproject.toml
+++ b/python/pylibcudf/pyproject.toml
@@ -53,48 +53,20 @@ test = [
 Homepage = "https://github.com/rapidsai/cudf"
 Documentation = "https://docs.rapids.ai/api/cudf/stable/"
 
-[tool.isort]
-line_length = 79
-multi_line_output = 3
-include_trailing_comma = true
-force_grid_wrap = 0
-combine_as_imports = true
-order_by_type = true
-known_dask = [
-    "dask",
-    "distributed",
-    "dask_cuda",
-]
-known_rapids = [
-    "rmm",
-]
-known_first_party = [
-    "cudf",
-]
-default_section = "THIRDPARTY"
-sections = [
-    "FUTURE",
-    "STDLIB",
-    "THIRDPARTY",
-    "DASK",
-    "RAPIDS",
-    "FIRSTPARTY",
-    "LOCALFOLDER",
-]
-skip = [
-    "thirdparty",
-    ".eggs",
-    ".git",
-    ".hg",
-    ".mypy_cache",
-    ".tox",
-    ".venv",
-    "_build",
-    "buck-out",
-    "build",
-    "dist",
-    "__init__.py",
-]
+[tool.ruff]
+extend = "../../pyproject.toml"
+
+[tool.ruff.lint.isort]
+combine-as-imports = true
+known-first-party = ["cudf"]
+section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"]
+
+[tool.ruff.lint.isort.sections]
+dask = ["dask", "distributed", "dask_cuda"]
+rapids = ["rmm"]
+
+[tool.ruff.lint.per-file-ignores]
+"__init__.py" = ["E402", "F401"]
 
 [tool.pytest.ini_options]
 # --import-mode=importlib because two test_json.py exists and tests directory is not a structured module

From 8bc9f19ebbb57bbc9bfa98efd94c8d7f8c65d316 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 25 Oct 2024 10:50:11 -1000
Subject: [PATCH 11/26] Add to_dlpack/from_dlpack APIs to pylibcudf (#17055)

Contributes to https://github.com/rapidsai/cudf/issues/15162

Could use some advice on how to type the input of `from_dlpack` and the output of `to_dlpack`, which are PyCapsule objects.
EDIT: I noticed that Cython just types them as `object`: https://github.com/cython/cython/blob/master/Cython/Includes/cpython/pycapsule.pxd. Stylistically, do we want to add `object var_name` or just leave them untyped?

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Matthew Murray (https://github.com/Matt711)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/17055
---
 python/cudf/cudf/_lib/interop.pyx             | 69 ++------------
 python/pylibcudf/pylibcudf/__init__.pxd       |  2 +
 python/pylibcudf/pylibcudf/interop.pxd        |  8 ++
 python/pylibcudf/pylibcudf/interop.pyx        | 94 ++++++++++++++++++-
 .../pylibcudf/pylibcudf/libcudf/interop.pxd   | 10 +-
 .../pylibcudf/pylibcudf/tests/test_interop.py | 31 ++++++
 6 files changed, 148 insertions(+), 66 deletions(-)
 create mode 100644 python/pylibcudf/pylibcudf/interop.pxd

diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx
index 1dc586bb257..1c9d3a01b80 100644
--- a/python/cudf/cudf/_lib/interop.pyx
+++ b/python/cudf/cudf/_lib/interop.pyx
@@ -1,49 +1,22 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from cpython cimport pycapsule
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
 import pylibcudf
 
-from pylibcudf.libcudf.interop cimport (
-    DLManagedTensor,
-    from_dlpack as cpp_from_dlpack,
-    to_dlpack as cpp_to_dlpack,
-)
-from pylibcudf.libcudf.table.table cimport table
-from pylibcudf.libcudf.table.table_view cimport table_view
-
-from cudf._lib.utils cimport (
-    columns_from_pylibcudf_table,
-    columns_from_unique_ptr,
-    table_view_from_columns,
-)
+from cudf._lib.utils cimport columns_from_pylibcudf_table
 
 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.dtypes import ListDtype, StructDtype
 
 
-def from_dlpack(dlpack_capsule):
+def from_dlpack(object dlpack_capsule):
     """
     Converts a DLPack Tensor PyCapsule into a list of columns.
 
     DLPack Tensor PyCapsule is expected to have the name "dltensor".
     """
-    cdef DLManagedTensor* dlpack_tensor = <DLManagedTensor*>pycapsule.\
-        PyCapsule_GetPointer(dlpack_capsule, 'dltensor')
-    pycapsule.PyCapsule_SetName(dlpack_capsule, 'used_dltensor')
-
-    cdef unique_ptr[table] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_from_dlpack(dlpack_tensor)
-        )
-
-    res = columns_from_unique_ptr(move(c_result))
-    dlpack_tensor.deleter(dlpack_tensor)
-    return res
+    return columns_from_pylibcudf_table(
+        pylibcudf.interop.from_dlpack(dlpack_capsule)
+    )
 
 
 def to_dlpack(list source_columns):
@@ -52,39 +25,13 @@ def to_dlpack(list source_columns):
 
     DLPack Tensor PyCapsule will have the name "dltensor".
     """
-    if any(column.null_count for column in source_columns):
-        raise ValueError(
-            "Cannot create a DLPack tensor with null values. \
-                Input is required to have null count as zero."
-        )
-
-    cdef DLManagedTensor *dlpack_tensor
-    cdef table_view source_table_view = table_view_from_columns(source_columns)
-
-    with nogil:
-        dlpack_tensor = cpp_to_dlpack(
-            source_table_view
+    return pylibcudf.interop.to_dlpack(
+        pylibcudf.Table(
+            [col.to_pylibcudf(mode="read") for col in source_columns]
         )
-
-    return pycapsule.PyCapsule_New(
-        dlpack_tensor,
-        'dltensor',
-        dlmanaged_tensor_pycapsule_deleter
     )
 
 
-cdef void dlmanaged_tensor_pycapsule_deleter(object pycap_obj) noexcept:
-    cdef DLManagedTensor* dlpack_tensor = <DLManagedTensor*>0
-    try:
-        dlpack_tensor = <DLManagedTensor*>pycapsule.PyCapsule_GetPointer(
-            pycap_obj, 'used_dltensor')
-        return  # we do not call a used capsule's deleter
-    except Exception:
-        dlpack_tensor = <DLManagedTensor*>pycapsule.PyCapsule_GetPointer(
-            pycap_obj, 'dltensor')
-    dlpack_tensor.deleter(dlpack_tensor)
-
-
 def gather_metadata(object cols_dtypes):
     """
     Generates a ColumnMetadata vector for each column.
diff --git a/python/pylibcudf/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd
index aa67b4b1149..9bdfdab97c2 100644
--- a/python/pylibcudf/pylibcudf/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/__init__.pxd
@@ -13,6 +13,7 @@ from . cimport (
     expressions,
     filling,
     groupby,
+    interop,
     join,
     json,
     labeling,
@@ -62,6 +63,7 @@ __all__ = [
     "filling",
     "gpumemoryview",
     "groupby",
+    "interop",
     "join",
     "json",
     "lists",
diff --git a/python/pylibcudf/pylibcudf/interop.pxd b/python/pylibcudf/pylibcudf/interop.pxd
new file mode 100644
index 00000000000..2a0a8c15fdd
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/interop.pxd
@@ -0,0 +1,8 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.table cimport Table
+
+
+cpdef Table from_dlpack(object managed_tensor)
+
+cpdef object to_dlpack(Table input)
diff --git a/python/pylibcudf/pylibcudf/interop.pyx b/python/pylibcudf/pylibcudf/interop.pyx
index 642516a1b90..61e812353b7 100644
--- a/python/pylibcudf/pylibcudf/interop.pyx
+++ b/python/pylibcudf/pylibcudf/interop.pyx
@@ -1,6 +1,11 @@
 # Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
-from cpython.pycapsule cimport PyCapsule_GetPointer, PyCapsule_New
+from cpython.pycapsule cimport (
+    PyCapsule_GetPointer,
+    PyCapsule_IsValid,
+    PyCapsule_New,
+    PyCapsule_SetName,
+)
 from libc.stdlib cimport free
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
@@ -16,11 +21,14 @@ from pylibcudf.libcudf.interop cimport (
     ArrowArray,
     ArrowArrayStream,
     ArrowSchema,
+    DLManagedTensor,
     column_metadata,
     from_arrow_column as cpp_from_arrow_column,
     from_arrow_stream as cpp_from_arrow_stream,
+    from_dlpack as cpp_from_dlpack,
     to_arrow_host_raw,
     to_arrow_schema_raw,
+    to_dlpack as cpp_to_dlpack,
 )
 from pylibcudf.libcudf.table.table cimport table
 
@@ -315,3 +323,87 @@ def _to_arrow_scalar(cudf_object, metadata=None):
     # Note that metadata for scalars is primarily important for preserving
     # information on nested types since names are otherwise irrelevant.
     return to_arrow(Column.from_scalar(cudf_object, 1), metadata=metadata)[0]
+
+
+cpdef Table from_dlpack(object managed_tensor):
+    """
+    Convert a DLPack DLTensor into a cudf table.
+
+    For details, see :cpp:func:`cudf::from_dlpack`
+
+    Parameters
+    ----------
+    managed_tensor : PyCapsule
+        A 1D or 2D column-major (Fortran order) tensor.
+
+    Returns
+    -------
+    Table
+        Table with a copy of the tensor data.
+    """
+    if not PyCapsule_IsValid(managed_tensor, "dltensor"):
+        raise ValueError("Invalid PyCapsule object")
+    cdef unique_ptr[table] c_result
+    cdef DLManagedTensor* dlpack_tensor = <DLManagedTensor*>PyCapsule_GetPointer(
+        managed_tensor, "dltensor"
+    )
+    if dlpack_tensor is NULL:
+        raise ValueError("PyCapsule object contained a NULL pointer")
+    PyCapsule_SetName(managed_tensor, "used_dltensor")
+
+    # Note: A copy is always performed when converting the dlpack
+    # data to a libcudf table. We also delete the dlpack_tensor pointer
+    # as the pointer is not deleted by libcudf's from_dlpack function.
+    # TODO: https://github.com/rapidsai/cudf/issues/10874
+    # TODO: https://github.com/rapidsai/cudf/issues/10849
+    with nogil:
+        c_result = cpp_from_dlpack(dlpack_tensor)
+
+    cdef Table result = Table.from_libcudf(move(c_result))
+    dlpack_tensor.deleter(dlpack_tensor)
+    return result
+
+
+cpdef object to_dlpack(Table input):
+    """
+    Convert a cudf table into a DLPack DLTensor.
+
+    For details, see :cpp:func:`cudf::to_dlpack`
+
+    Parameters
+    ----------
+    input : Table
+        A 1D or 2D column-major (Fortran order) tensor.
+
+    Returns
+    -------
+    PyCapsule
+        1D or 2D DLPack tensor with a copy of the table data, or nullptr.
+    """
+    for col in input._columns:
+        if col.null_count():
+            raise ValueError(
+                "Cannot create a DLPack tensor with null values. "
+                "Input is required to have null count as zero."
+            )
+    cdef DLManagedTensor *dlpack_tensor
+
+    with nogil:
+        dlpack_tensor = cpp_to_dlpack(input.view())
+
+    return PyCapsule_New(
+        dlpack_tensor,
+        "dltensor",
+        dlmanaged_tensor_pycapsule_deleter
+    )
+
+
+cdef void dlmanaged_tensor_pycapsule_deleter(object pycap_obj) noexcept:
+    if PyCapsule_IsValid(pycap_obj, "used_dltensor"):
+        # we do not call a used capsule's deleter
+        return
+    cdef DLManagedTensor* dlpack_tensor = <DLManagedTensor*>PyCapsule_GetPointer(
+        pycap_obj, "dltensor"
+    )
+    if dlpack_tensor is not NULL:
+        dlpack_tensor.deleter(dlpack_tensor)
diff --git a/python/pylibcudf/pylibcudf/libcudf/interop.pxd b/python/pylibcudf/pylibcudf/libcudf/interop.pxd
index 30b97fdec34..b75e9ca7001 100644
--- a/python/pylibcudf/pylibcudf/libcudf/interop.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/interop.pxd
@@ -32,11 +32,13 @@ cdef extern from "cudf/interop.hpp" nogil:
 
 cdef extern from "cudf/interop.hpp" namespace "cudf" \
         nogil:
-    cdef unique_ptr[table] from_dlpack(const DLManagedTensor* tensor
-                                       ) except +
+    cdef unique_ptr[table] from_dlpack(
+        const DLManagedTensor* managed_tensor
+    ) except +
 
-    DLManagedTensor* to_dlpack(table_view input_table
-                               ) except +
+    DLManagedTensor* to_dlpack(
+        const table_view& input
+    ) except +
 
     cdef cppclass column_metadata:
         column_metadata() except +
diff --git a/python/pylibcudf/pylibcudf/tests/test_interop.py b/python/pylibcudf/pylibcudf/tests/test_interop.py
index e4f5174ad9b..af80b6e5978 100644
--- a/python/pylibcudf/pylibcudf/tests/test_interop.py
+++ b/python/pylibcudf/pylibcudf/tests/test_interop.py
@@ -1,7 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
+import cupy as cp
+import numpy as np
 import pyarrow as pa
 import pytest
+from utils import assert_table_eq
 
 import pylibcudf as plc
 
@@ -67,3 +70,31 @@ def test_decimal_other(data_type):
 
     arrow_type = plc.interop.to_arrow(data_type, precision=precision)
     assert arrow_type == pa.decimal128(precision, 0)
+
+
+def test_round_trip_dlpack_plc_table():
+    expected = pa.table({"a": [1, 2, 3], "b": [5, 6, 7]})
+    plc_table = plc.interop.from_arrow(expected)
+    result = plc.interop.from_dlpack(plc.interop.to_dlpack(plc_table))
+    assert_table_eq(expected, result)
+
+
+@pytest.mark.parametrize("array", [np.array, cp.array])
+def test_round_trip_dlpack_array(array):
+    arr = array([1, 2, 3])
+    result = plc.interop.from_dlpack(arr.__dlpack__())
+    expected = pa.table({"a": [1, 2, 3]})
+    assert_table_eq(expected, result)
+
+
+def test_to_dlpack_error():
+    plc_table = plc.interop.from_arrow(
+        pa.table({"a": [1, None, 3], "b": [5, 6, 7]})
+    )
+    with pytest.raises(ValueError, match="Cannot create a DLPack tensor"):
+        plc.interop.from_dlpack(plc.interop.to_dlpack(plc_table))
+
+
+def test_from_dlpack_error():
+    with pytest.raises(ValueError, match="Invalid PyCapsule object"):
+        plc.interop.from_dlpack(1)

From 8c4d1f201043a6802598bea3dcb58fa1e061d9e5 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 28 Oct 2024 09:22:20 -0400
Subject: [PATCH 12/26] Use make_device_uvector instead of cudaMemcpyAsync in
 inplace_bitmask_binop (#17181)

Changes `cudf::detail::inplace_bitmask_binop()` to use `make_device_uvector()` instead of `cudaMemcpyAsync()`.
Found while working on #17149.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/17181
---
 cpp/include/cudf/detail/null_mask.cuh | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh
index 482265d633e..025e2ccc3ec 100644
--- a/cpp/include/cudf/detail/null_mask.cuh
+++ b/cpp/include/cudf/detail/null_mask.cuh
@@ -166,16 +166,9 @@ size_type inplace_bitmask_binop(Binop op,
 
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref();
   cudf::detail::device_scalar<size_type> d_counter{0, stream, mr};
-  rmm::device_uvector<bitmask_type const*> d_masks(masks.size(), stream, mr);
-  rmm::device_uvector<size_type> d_begin_bits(masks_begin_bits.size(), stream, mr);
-
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
-    d_masks.data(), masks.data(), masks.size_bytes(), cudaMemcpyDefault, stream.value()));
-  CUDF_CUDA_TRY(cudaMemcpyAsync(d_begin_bits.data(),
-                                masks_begin_bits.data(),
-                                masks_begin_bits.size_bytes(),
-                                cudaMemcpyDefault,
-                                stream.value()));
+
+  auto d_masks      = cudf::detail::make_device_uvector_async(masks, stream, mr);
+  auto d_begin_bits = cudf::detail::make_device_uvector_async(masks_begin_bits, stream, mr);
 
   auto constexpr block_size = 256;
   cudf::detail::grid_1d config(dest_mask.size(), block_size);

From ef28cddeccbcc790e05dd49794ecdfcae4f008c2 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Mon, 28 Oct 2024 10:04:33 -0700
Subject: [PATCH 13/26] Add compute_mapping_indices used by shared memory
 groupby (#17147)

This work is part of splitting the original bulk shared memory groupby PR https://github.com/rapidsai/cudf/pull/16619.

This PR introduces the `compute_mapping_indices` API, which is used by the shared memory groupby. libcudf will opt for the shared memory code path when the aggregation request is compatible with shared memory, i.e. there is enough shared memory space and no dictionary aggregation requests. Aggregating with shared memory involves two steps. The first step, introduced in this PR, calculates the offset for each input key within the shared memory aggregation storage, as well as the offset when merging the shared memory results into global memory.

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/17147
---
 cpp/CMakeLists.txt                            |   2 +
 .../groupby/hash/compute_mapping_indices.cu   |  35 ++++
 .../groupby/hash/compute_mapping_indices.cuh  | 192 ++++++++++++++++++
 .../groupby/hash/compute_mapping_indices.hpp  |  43 ++++
 .../hash/compute_mapping_indices_null.cu      |  35 ++++
 5 files changed, 307 insertions(+)
 create mode 100644 cpp/src/groupby/hash/compute_mapping_indices.cu
 create mode 100644 cpp/src/groupby/hash/compute_mapping_indices.cuh
 create mode 100644 cpp/src/groupby/hash/compute_mapping_indices.hpp
 create mode 100644 cpp/src/groupby/hash/compute_mapping_indices_null.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index e4b9cbf8921..60132f651d2 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -369,6 +369,8 @@ add_library(
   src/filling/sequence.cu
   src/groupby/groupby.cu
   src/groupby/hash/compute_groupby.cu
+  src/groupby/hash/compute_mapping_indices.cu
+  src/groupby/hash/compute_mapping_indices_null.cu
   src/groupby/hash/compute_single_pass_aggs.cu
   src/groupby/hash/create_sparse_results_table.cu
   src/groupby/hash/flatten_single_pass_aggs.cpp
diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cu b/cpp/src/groupby/hash/compute_mapping_indices.cu
new file mode 100644
index 00000000000..519d7cd2f1c
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_mapping_indices.cu
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "compute_mapping_indices.cuh"
+#include "compute_mapping_indices.hpp"
+
+namespace cudf::groupby::detail::hash {
+template cudf::size_type max_occupancy_grid_size<hash_set_ref_t<cuco::insert_and_find_tag>>(
+  cudf::size_type n);
+
+template void compute_mapping_indices<hash_set_ref_t<cuco::insert_and_find_tag>>(
+  cudf::size_type grid_size,
+  cudf::size_type num,
+  hash_set_ref_t<cuco::insert_and_find_tag> global_set,
+  bitmask_type const* row_bitmask,
+  bool skip_rows_with_nulls,
+  cudf::size_type* local_mapping_index,
+  cudf::size_type* global_mapping_index,
+  cudf::size_type* block_cardinality,
+  cuda::std::atomic_flag* needs_global_memory_fallback,
+  rmm::cuda_stream_view stream);
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cuh b/cpp/src/groupby/hash/compute_mapping_indices.cuh
new file mode 100644
index 00000000000..d353830780f
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_mapping_indices.cuh
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "compute_mapping_indices.hpp"
+#include "helpers.cuh"
+
+#include <cudf/detail/cuco_helpers.hpp>
+#include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/detail/utilities/cuda.hpp>
+#include <cudf/types.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+#include <cooperative_groups.h>
+#include <cuco/static_set_ref.cuh>
+#include <cuda/std/atomic>
+#include <cuda/std/utility>
+
+#include <algorithm>
+
+namespace cudf::groupby::detail::hash {
+template <typename SetType>
+__device__ void find_local_mapping(cooperative_groups::thread_block const& block,
+                                   cudf::size_type idx,
+                                   cudf::size_type num_input_rows,
+                                   SetType shared_set,
+                                   bitmask_type const* row_bitmask,
+                                   bool skip_rows_with_nulls,
+                                   cudf::size_type* cardinality,
+                                   cudf::size_type* local_mapping_index,
+                                   cudf::size_type* shared_set_indices)
+{
+  auto const is_valid_input =
+    idx < num_input_rows and (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, idx));
+  auto const [result_idx, inserted] = [&]() {
+    if (is_valid_input) {
+      auto const result      = shared_set.insert_and_find(idx);
+      auto const matched_idx = *result.first;
+      auto const inserted    = result.second;
+      // inserted a new element
+      if (result.second) {
+        auto const shared_set_index          = atomicAdd(cardinality, 1);
+        shared_set_indices[shared_set_index] = idx;
+        local_mapping_index[idx]             = shared_set_index;
+      }
+      return cuda::std::pair{matched_idx, inserted};
+    }
+    return cuda::std::pair{0, false};  // dummy values
+  }();
+  // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all
+  // threads in the thread block.
+  block.sync();
+  if (is_valid_input) {
+    // element was already in set
+    if (!inserted) { local_mapping_index[idx] = local_mapping_index[result_idx]; }
+  }
+}
+
+template <typename SetRef>
+__device__ void find_global_mapping(cooperative_groups::thread_block const& block,
+                                    cudf::size_type cardinality,
+                                    SetRef global_set,
+                                    cudf::size_type* shared_set_indices,
+                                    cudf::size_type* global_mapping_index)
+{
+  // for all unique keys in shared memory hash set, stores their matches in
+  // global hash set to `global_mapping_index`
+  for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) {
+    auto const input_idx = shared_set_indices[idx];
+    global_mapping_index[block.group_index().x * GROUPBY_SHM_MAX_ELEMENTS + idx] =
+      *global_set.insert_and_find(input_idx).first;
+  }
+}
+
+/*
+ * @brief Inserts keys into the shared memory hash set, and stores the block-wise rank for a given
+ * row index in `local_mapping_index`. If the number of unique keys found in a threadblock reaches
+ * `GROUPBY_CARDINALITY_THRESHOLD`, the threads in that block will exit without updating
+ * `global_set` or setting `global_mapping_index`. Otherwise, we insert the unique keys found into
+ * the global hash set, and save the row index of the global sparse table in `global_mapping_index`.
+ */
+template <class SetRef>
+CUDF_KERNEL void mapping_indices_kernel(cudf::size_type num_input_rows,
+                                        SetRef global_set,
+                                        bitmask_type const* row_bitmask,
+                                        bool skip_rows_with_nulls,
+                                        cudf::size_type* local_mapping_index,
+                                        cudf::size_type* global_mapping_index,
+                                        cudf::size_type* block_cardinality,
+                                        cuda::std::atomic_flag* needs_global_memory_fallback)
+{
+  __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS];
+
+  // Shared set initialization
+  __shared__ cuco::window<cudf::size_type, GROUPBY_WINDOW_SIZE> windows[window_extent.value()];
+
+  auto raw_set = cuco::static_set_ref{
+    cuco::empty_key<cudf::size_type>{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
+    global_set.key_eq(),
+    probing_scheme_t{global_set.hash_function()},
+    cuco::thread_scope_block,
+    cuco::aow_storage_ref<cudf::size_type, GROUPBY_WINDOW_SIZE, decltype(window_extent)>{
+      window_extent, windows}};
+  auto shared_set = raw_set.rebind_operators(cuco::insert_and_find);
+
+  auto const block = cooperative_groups::this_thread_block();
+  shared_set.initialize(block);
+
+  __shared__ cudf::size_type cardinality;
+  if (block.thread_rank() == 0) { cardinality = 0; }
+  block.sync();
+
+  auto const stride = cudf::detail::grid_1d::grid_stride();
+
+  for (auto idx = cudf::detail::grid_1d::global_thread_id();
+       idx - block.thread_rank() < num_input_rows;
+       idx += stride) {
+    find_local_mapping(block,
+                       idx,
+                       num_input_rows,
+                       shared_set,
+                       row_bitmask,
+                       skip_rows_with_nulls,
+                       &cardinality,
+                       local_mapping_index,
+                       shared_set_indices);
+
+    block.sync();
+
+    if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) {
+      if (block.thread_rank() == 0) { needs_global_memory_fallback->test_and_set(); }
+      break;
+    }
+  }
+
+  // Insert unique keys from shared to global hash set if block-cardinality
+  // doesn't exceed the threshold upper-limit
+  if (cardinality < GROUPBY_CARDINALITY_THRESHOLD) {
+    find_global_mapping(block, cardinality, global_set, shared_set_indices, global_mapping_index);
+  }
+
+  if (block.thread_rank() == 0) { block_cardinality[block.group_index().x] = cardinality; }
+}
+
+template <class SetRef>
+cudf::size_type max_occupancy_grid_size(cudf::size_type n)
+{
+  cudf::size_type max_active_blocks{-1};
+  CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+    &max_active_blocks, mapping_indices_kernel<SetRef>, GROUPBY_BLOCK_SIZE, 0));
+  auto const grid_size  = max_active_blocks * cudf::detail::num_multiprocessors();
+  auto const num_blocks = cudf::util::div_rounding_up_safe(n, GROUPBY_BLOCK_SIZE);
+  return std::min(grid_size, num_blocks);
+}
+
+template <class SetRef>
+void compute_mapping_indices(cudf::size_type grid_size,
+                             cudf::size_type num,
+                             SetRef global_set,
+                             bitmask_type const* row_bitmask,
+                             bool skip_rows_with_nulls,
+                             cudf::size_type* local_mapping_index,
+                             cudf::size_type* global_mapping_index,
+                             cudf::size_type* block_cardinality,
+                             cuda::std::atomic_flag* needs_global_memory_fallback,
+                             rmm::cuda_stream_view stream)
+{
+  mapping_indices_kernel<<<grid_size, GROUPBY_BLOCK_SIZE, 0, stream>>>(
+    num,
+    global_set,
+    row_bitmask,
+    skip_rows_with_nulls,
+    local_mapping_index,
+    global_mapping_index,
+    block_cardinality,
+    needs_global_memory_fallback);
+}
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_mapping_indices.hpp b/cpp/src/groupby/hash/compute_mapping_indices.hpp
new file mode 100644
index 00000000000..473ad99e650
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_mapping_indices.hpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/types.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+#include <cuda/std/atomic>
+
+namespace cudf::groupby::detail::hash {
+/*
+ * @brief Computes the maximum number of active blocks of the given kernel that can be executed on
+ * the underlying device, capped at the number of blocks needed to cover `n` input elements
+ */
+template <class SetRef>
+[[nodiscard]] cudf::size_type max_occupancy_grid_size(cudf::size_type n);
+
+template <class SetRef>
+void compute_mapping_indices(cudf::size_type grid_size,
+                             cudf::size_type num,
+                             SetRef global_set,
+                             bitmask_type const* row_bitmask,
+                             bool skip_rows_with_nulls,
+                             cudf::size_type* local_mapping_index,
+                             cudf::size_type* global_mapping_index,
+                             cudf::size_type* block_cardinality,
+                             cuda::std::atomic_flag* needs_global_memory_fallback,
+                             rmm::cuda_stream_view stream);
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_mapping_indices_null.cu b/cpp/src/groupby/hash/compute_mapping_indices_null.cu
new file mode 100644
index 00000000000..81c3c9e456f
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_mapping_indices_null.cu
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "compute_mapping_indices.cuh"
+#include "compute_mapping_indices.hpp"
+
+namespace cudf::groupby::detail::hash {
+template cudf::size_type
+max_occupancy_grid_size<nullable_hash_set_ref_t<cuco::insert_and_find_tag>>(cudf::size_type n);
+
+template void compute_mapping_indices<nullable_hash_set_ref_t<cuco::insert_and_find_tag>>(
+  cudf::size_type grid_size,
+  cudf::size_type num,
+  nullable_hash_set_ref_t<cuco::insert_and_find_tag> global_set,
+  bitmask_type const* row_bitmask,
+  bool skip_rows_with_nulls,
+  cudf::size_type* local_mapping_index,
+  cudf::size_type* global_mapping_index,
+  cudf::size_type* block_cardinality,
+  cuda::std::atomic_flag* needs_global_memory_fallback,
+  rmm::cuda_stream_view stream);
+}  // namespace cudf::groupby::detail::hash

From a83e1a3766d8b647e34a09f4bc79530e298dfed9 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 28 Oct 2024 14:54:32 -0400
Subject: [PATCH 14/26] Add 2-cpp approvers text to contributing guide [no ci]
 (#17182)

Adds text to the contributing guide mentioning that 2 cpp-codeowner approvals are required for any C++ changes.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/17182
---
 CONTRIBUTING.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index b55af21a300..3db1ed35294 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -38,6 +38,7 @@ conduct. More information can be found at:
 8. Verify that CI passes all [status checks](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/about-status-checks).
    Fix if needed.
 9. Wait for other developers to review your code and update code as needed.
+   Changes to any C++ files require at least 2 approvals from the cudf-cpp-codeowners before merging.
 10. Once reviewed and approved, a RAPIDS developer will merge your pull request.
 
 If you are unsure about anything, don't hesitate to comment on issues and ask for clarification!

From 7b17fbe41b3bd5f56ec0c1836f80d3d942578f78 Mon Sep 17 00:00:00 2001
From: "Robert (Bobby) Evans" <bobby@apache.org>
Date: Mon, 28 Oct 2024 14:43:29 -0500
Subject: [PATCH 15/26] Remove java reservation (#17189)

This removes a file for a feature that we intended to use but never did. The other parts of that feature were already removed, but this file was missed.

Authors:
  - Robert (Bobby) Evans (https://github.com/revans2)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/17189
---
 .../ai/rapids/cudf/HostMemoryReservation.java | 32 -------------------
 1 file changed, 32 deletions(-)
 delete mode 100644 java/src/main/java/ai/rapids/cudf/HostMemoryReservation.java

diff --git a/java/src/main/java/ai/rapids/cudf/HostMemoryReservation.java b/java/src/main/java/ai/rapids/cudf/HostMemoryReservation.java
deleted file mode 100644
index 72c2e659372..00000000000
--- a/java/src/main/java/ai/rapids/cudf/HostMemoryReservation.java
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- *
- *  Copyright (c) 2023, NVIDIA CORPORATION.
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- */
-
-package ai.rapids.cudf;
-
-/**
- * Represents some amount of host memory that has been reserved. A reservation guarantees that one
- * or more allocations up to the reserved amount, minus padding for alignment will succeed. A
- * reservation typically guarantees the amount can be allocated one, meaning when a buffer
- * allocated from a reservation is freed it is not returned to the reservation, but to the pool of
- * memory the reservation originally came from. If more memory is allocated from the reservation
- * an OutOfMemoryError may be thrown, but it is not guaranteed to happen.
- *
- * When the reservation is closed any unused reservation will be returned to the pool of memory
- * the reservation came from.
- */
-public interface HostMemoryReservation extends HostMemoryAllocator, AutoCloseable {}

From abecd0b54d65dd2678f617f8c1a88320f523465f Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Mon, 28 Oct 2024 15:33:59 -0500
Subject: [PATCH 16/26] build wheels without build isolation (#17088)

Contributes to https://github.com/rapidsai/build-planning/issues/108
Contributes to https://github.com/rapidsai/build-planning/issues/111

Proposes some small packaging/CI changes, matching similar changes being made across RAPIDS.

* building `libcudf` wheels with `--no-build-isolation` (for better `sccache` hit rate)
* printing `sccache` stats to CI logs
* updating to the latest `rapids-dependency-file-generator` (v1.16.0)
* always explicitly specifying `cpp` / `python` in calls to `rapids-upload-wheels-to-s3`

#

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/17088
---
 .pre-commit-config.yaml       |  2 +-
 ci/build_cpp.sh               |  4 ++++
 ci/build_python.sh            | 10 ++++++++++
 ci/build_wheel.sh             | 15 +++++++++++++--
 ci/build_wheel_cudf.sh        |  2 +-
 ci/build_wheel_cudf_polars.sh |  4 ++--
 ci/build_wheel_dask_cudf.sh   |  4 ++--
 ci/build_wheel_libcudf.sh     | 24 ++++++++++++++++++++++--
 ci/build_wheel_pylibcudf.sh   |  4 ++--
 dependencies.yaml             |  2 +-
 10 files changed, 58 insertions(+), 13 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 0e86407de11..f5234f58efe 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -155,7 +155,7 @@ repos:
           )
       - id: verify-alpha-spec
   - repo: https://github.com/rapidsai/dependency-file-generator
-    rev: v1.13.11
+    rev: v1.16.0
     hooks:
       - id: rapids-dependency-file-generator
         args: ["--clean"]
diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh
index e5fcef17a83..3d06eacf9ff 100755
--- a/ci/build_cpp.sh
+++ b/ci/build_cpp.sh
@@ -15,8 +15,12 @@ rapids-print-env
 
 rapids-logger "Begin cpp build"
 
+sccache --zero-stats
+
 # With boa installed conda build forward to boa
 RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) rapids-conda-retry mambabuild \
     conda/recipes/libcudf
 
+sccache --show-adv-stats
+
 rapids-upload-conda-to-s3 cpp
diff --git a/ci/build_python.sh b/ci/build_python.sh
index 823d7f62290..ed90041cc77 100755
--- a/ci/build_python.sh
+++ b/ci/build_python.sh
@@ -19,6 +19,8 @@ rapids-logger "Begin py build"
 
 CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp)
 
+sccache --zero-stats
+
 # TODO: Remove `--no-test` flag once importing on a CPU
 # node works correctly
 # With boa installed conda build forwards to the boa builder
@@ -28,12 +30,18 @@ RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \
   --channel "${CPP_CHANNEL}" \
   conda/recipes/pylibcudf
 
+sccache --show-adv-stats
+sccache --zero-stats
+
 RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \
   --no-test \
   --channel "${CPP_CHANNEL}" \
   --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \
   conda/recipes/cudf
 
+sccache --show-adv-stats
+sccache --zero-stats
+
 RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \
   --no-test \
   --channel "${CPP_CHANNEL}" \
@@ -46,6 +54,8 @@ RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \
   --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \
   conda/recipes/cudf_kafka
 
+sccache --show-adv-stats
+
 RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \
   --no-test \
   --channel "${CPP_CHANNEL}" \
diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
index bf76f4ed29a..78b8a8a08cf 100755
--- a/ci/build_wheel.sh
+++ b/ci/build_wheel.sh
@@ -3,7 +3,8 @@
 
 set -euo pipefail
 
-package_dir=$1
+package_name=$1
+package_dir=$2
 
 source rapids-configure-sccache
 source rapids-date-string
@@ -12,4 +13,14 @@ rapids-generate-version > ./VERSION
 
 cd "${package_dir}"
 
-python -m pip wheel . -w dist -v --no-deps --disable-pip-version-check
+sccache --zero-stats
+
+rapids-logger "Building '${package_name}' wheel"
+python -m pip wheel \
+    -w dist \
+    -v \
+    --no-deps \
+    --disable-pip-version-check \
+    .
+
+sccache --show-adv-stats
diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh
index fb93b06dbe2..fef4416a366 100755
--- a/ci/build_wheel_cudf.sh
+++ b/ci/build_wheel_cudf.sh
@@ -18,7 +18,7 @@ echo "libcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/libcudf_dist/libcudf
 echo "pylibcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/pylibcudf_dist/pylibcudf_*.whl)" >> /tmp/constraints.txt
 export PIP_CONSTRAINT="/tmp/constraints.txt"
 
-./ci/build_wheel.sh ${package_dir}
+./ci/build_wheel.sh cudf ${package_dir}
 
 python -m auditwheel repair \
     --exclude libcudf.so \
diff --git a/ci/build_wheel_cudf_polars.sh b/ci/build_wheel_cudf_polars.sh
index 9c945e11c00..79853cdbdb2 100755
--- a/ci/build_wheel_cudf_polars.sh
+++ b/ci/build_wheel_cudf_polars.sh
@@ -5,7 +5,7 @@ set -euo pipefail
 
 package_dir="python/cudf_polars"
 
-./ci/build_wheel.sh ${package_dir}
+./ci/build_wheel.sh cudf-polars ${package_dir}
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
-RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 ${package_dir}/dist
+RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 python ${package_dir}/dist
diff --git a/ci/build_wheel_dask_cudf.sh b/ci/build_wheel_dask_cudf.sh
index eb2a91289f7..00c64afa2ef 100755
--- a/ci/build_wheel_dask_cudf.sh
+++ b/ci/build_wheel_dask_cudf.sh
@@ -5,7 +5,7 @@ set -euo pipefail
 
 package_dir="python/dask_cudf"
 
-./ci/build_wheel.sh ${package_dir}
+./ci/build_wheel.sh dask-cudf ${package_dir}
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
-RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 ${package_dir}/dist
+RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 python ${package_dir}/dist
diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh
index 91bc071583e..b3d6778ea04 100755
--- a/ci/build_wheel_libcudf.sh
+++ b/ci/build_wheel_libcudf.sh
@@ -3,10 +3,30 @@
 
 set -euo pipefail
 
+package_name="libcudf"
 package_dir="python/libcudf"
 
+rapids-logger "Generating build requirements"
+
+rapids-dependency-file-generator \
+  --output requirements \
+  --file-key "py_build_${package_name}" \
+  --file-key "py_rapids_build_${package_name}" \
+  --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};cuda_suffixed=true" \
+| tee /tmp/requirements-build.txt
+
+rapids-logger "Installing build requirements"
+python -m pip install \
+    -v \
+    --prefer-binary \
+    -r /tmp/requirements-build.txt
+
+# build with '--no-build-isolation', for better sccache hit rate
+# 0 really means "add --no-build-isolation" (ref: https://github.com/pypa/pip/issues/5735)
+export PIP_NO_BUILD_ISOLATION=0
+
 export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON"
-./ci/build_wheel.sh ${package_dir}
+./ci/build_wheel.sh "${package_name}" "${package_dir}"
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 
@@ -16,4 +36,4 @@ python -m auditwheel repair \
     -w ${package_dir}/final_dist \
     ${package_dir}/dist/*
 
-RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp ${package_dir}/final_dist
+RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp "${package_dir}/final_dist"
diff --git a/ci/build_wheel_pylibcudf.sh b/ci/build_wheel_pylibcudf.sh
index 5e9f7f8a0c4..839d98846fe 100755
--- a/ci/build_wheel_pylibcudf.sh
+++ b/ci/build_wheel_pylibcudf.sh
@@ -16,7 +16,7 @@ RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-f
 echo "libcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/libcudf_dist/libcudf_*.whl)" > /tmp/constraints.txt
 export PIP_CONSTRAINT="/tmp/constraints.txt"
 
-./ci/build_wheel.sh ${package_dir}
+./ci/build_wheel.sh pylibcudf ${package_dir}
 
 python -m auditwheel repair \
     --exclude libcudf.so \
@@ -24,4 +24,4 @@ python -m auditwheel repair \
     -w ${package_dir}/final_dist \
     ${package_dir}/dist/*
 
-RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist
+RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python ${package_dir}/final_dist
diff --git a/dependencies.yaml b/dependencies.yaml
index 4804f7b00b0..7c7aa43fa41 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -232,7 +232,7 @@ files:
       key: cudf-pandas-tests
     includes:
       - test_python_cudf_pandas
-  py_rapids_build_cudf_polars:
+  py_build_cudf_polars:
     output: pyproject
     pyproject_dir: python/cudf_polars
     extras:

From 4c04b7c8790263dc68c5753609f3cb867806359f Mon Sep 17 00:00:00 2001
From: Basit Ayantunde <rlamarrr@gmail.com>
Date: Mon, 28 Oct 2024 21:15:29 +0000
Subject: [PATCH 17/26] Added strings AST vs BINARY_OP benchmarks (#17128)

This merge request implements benchmarks that compare strings AST operations against BINARY_OPs. It also moves the common string input generator function out to a common benchmarks header, as it is repeated across other benchmarks.

Authors:
  - Basit Ayantunde (https://github.com/lamarrr)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/17128
---
 cpp/benchmarks/ast/transform.cpp              | 95 ++++++++++++++++++-
 cpp/benchmarks/binaryop/binaryop.cpp          | 82 +++++++++++++++-
 cpp/benchmarks/binaryop/compiled_binaryop.cpp |  2 +-
 cpp/benchmarks/common/generate_input.cu       | 56 +++++++++++
 cpp/benchmarks/common/generate_input.hpp      | 12 +++
 cpp/benchmarks/string/contains.cpp            | 57 +----------
 cpp/benchmarks/string/find.cpp                |  7 +-
 cpp/benchmarks/string/like.cpp                | 58 +----------
 8 files changed, 246 insertions(+), 123 deletions(-)

diff --git a/cpp/benchmarks/ast/transform.cpp b/cpp/benchmarks/ast/transform.cpp
index f44f26e4d2c..7fe61054a26 100644
--- a/cpp/benchmarks/ast/transform.cpp
+++ b/cpp/benchmarks/ast/transform.cpp
@@ -16,16 +16,29 @@
 
 #include <benchmarks/common/generate_input.hpp>
 
+#include <cudf_test/column_wrapper.hpp>
+
+#include <cudf/ast/expressions.hpp>
+#include <cudf/column/column.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/table/table.hpp>
 #include <cudf/transform.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 
 #include <nvbench/nvbench.cuh>
+#include <nvbench/types.cuh>
 
 #include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <iterator>
 #include <list>
 #include <memory>
 #include <optional>
@@ -86,7 +99,71 @@ static void BM_ast_transform(nvbench::state& state)
   auto const& expression_tree_root = expressions.back();
 
   // Use the number of bytes read from global memory
-  state.add_global_memory_reads<key_type>(table_size * (tree_levels + 1));
+  state.add_global_memory_reads<key_type>(static_cast<size_t>(table_size) * (tree_levels + 1));
+  state.add_global_memory_writes<key_type>(table_size);
+
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); });
+}
+
+template <cudf::ast::ast_operator cmp_op, cudf::ast::ast_operator reduce_op>
+static void BM_string_compare_ast_transform(nvbench::state& state)
+{
+  auto const string_width    = static_cast<cudf::size_type>(state.get_int64("string_width"));
+  auto const num_rows        = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const num_comparisons = static_cast<cudf::size_type>(state.get_int64("num_comparisons"));
+  auto const hit_rate        = static_cast<cudf::size_type>(state.get_int64("hit_rate"));
+
+  CUDF_EXPECTS(num_comparisons > 0, "benchmarks require 1 or more comparisons");
+
+  // Create table data
+  auto const num_cols = num_comparisons * 2;
+  std::vector<std::unique_ptr<cudf::column>> columns;
+  std::for_each(
+    thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_cols), [&](size_t) {
+      columns.emplace_back(create_string_column(num_rows, string_width, hit_rate));
+    });
+
+  cudf::table table{std::move(columns)};
+  cudf::table_view const table_view = table.view();
+
+  int64_t const chars_size = std::accumulate(
+    table_view.begin(),
+    table_view.end(),
+    static_cast<int64_t>(0),
+    [](int64_t size, auto& column) -> int64_t {
+      return size + cudf::strings_column_view{column}.chars_size(cudf::get_default_stream());
+    });
+
+  // Create column references
+  auto column_refs = std::vector<cudf::ast::column_reference>();
+  std::transform(thrust::make_counting_iterator(0),
+                 thrust::make_counting_iterator(num_cols),
+                 std::back_inserter(column_refs),
+                 [](auto const& column_id) { return cudf::ast::column_reference(column_id); });
+
+  // Create expression trees
+  std::list<cudf::ast::operation> expressions;
+
+  // Construct AST tree (a == b && c == d && e == f && ...)
+
+  expressions.emplace_back(cudf::ast::operation(cmp_op, column_refs[0], column_refs[1]));
+
+  std::for_each(thrust::make_counting_iterator(1),
+                thrust::make_counting_iterator(num_comparisons),
+                [&](size_t idx) {
+                  auto const& lhs = expressions.back();
+                  auto const& rhs = expressions.emplace_back(
+                    cudf::ast::operation(cmp_op, column_refs[idx * 2], column_refs[idx * 2 + 1]));
+                  expressions.emplace_back(cudf::ast::operation(reduce_op, lhs, rhs));
+                });
+
+  auto const& expression_tree_root = expressions.back();
+
+  // Use the number of bytes read from global memory
+  state.add_element_count(chars_size, "chars_size");
+  state.add_global_memory_reads<nvbench::uint8_t>(chars_size);
+  state.add_global_memory_writes<nvbench::int32_t>(num_rows);
 
   state.exec(nvbench::exec_tag::sync,
              [&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); });
@@ -115,3 +192,19 @@ AST_TRANSFORM_BENCHMARK_DEFINE(
   ast_int32_imbalanced_reuse_nulls, int32_t, TreeType::IMBALANCED_LEFT, true, true);
 AST_TRANSFORM_BENCHMARK_DEFINE(
   ast_double_imbalanced_unique_nulls, double, TreeType::IMBALANCED_LEFT, false, true);
+
+#define AST_STRING_COMPARE_TRANSFORM_BENCHMARK_DEFINE(name, cmp_op, reduce_op) \
+  static void name(::nvbench::state& st)                                       \
+  {                                                                            \
+    ::BM_string_compare_ast_transform<cmp_op, reduce_op>(st);                  \
+  }                                                                            \
+  NVBENCH_BENCH(name)                                                          \
+    .set_name(#name)                                                           \
+    .add_int64_axis("string_width", {32, 64, 128, 256})                        \
+    .add_int64_axis("num_rows", {32768, 262144, 2097152})                      \
+    .add_int64_axis("num_comparisons", {1, 2, 3, 4})                           \
+    .add_int64_axis("hit_rate", {50, 100})
+
+AST_STRING_COMPARE_TRANSFORM_BENCHMARK_DEFINE(ast_string_equal_logical_and,
+                                              cudf::ast::ast_operator::EQUAL,
+                                              cudf::ast::ast_operator::LOGICAL_AND);
diff --git a/cpp/benchmarks/binaryop/binaryop.cpp b/cpp/benchmarks/binaryop/binaryop.cpp
index 7d267a88764..35e41c6c2a4 100644
--- a/cpp/benchmarks/binaryop/binaryop.cpp
+++ b/cpp/benchmarks/binaryop/binaryop.cpp
@@ -17,12 +17,18 @@
 #include <benchmarks/common/generate_input.hpp>
 
 #include <cudf/binaryop.hpp>
+#include <cudf/column/column.hpp>
+#include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
+#include <thrust/iterator/counting_iterator.h>
+
 #include <nvbench/nvbench.cuh>
 
 #include <algorithm>
+#include <cstddef>
+#include <memory>
 
 // This set of benchmarks is designed to be a comparison for the AST benchmarks
 
@@ -44,7 +50,8 @@ static void BM_binaryop_transform(nvbench::state& state)
   cudf::table_view table{*source_table};
 
   // Use the number of bytes read from global memory
-  state.add_global_memory_reads<key_type>(table_size * (tree_levels + 1));
+  state.add_global_memory_reads<key_type>(static_cast<size_t>(table_size) * (tree_levels + 1));
+  state.add_global_memory_writes<key_type>(table_size);
 
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
     // Execute tree that chains additions like (((a + b) + c) + d)
@@ -64,11 +71,65 @@ static void BM_binaryop_transform(nvbench::state& state)
   });
 }
 
+template <cudf::binary_operator cmp_op, cudf::binary_operator reduce_op>
+static void BM_string_compare_binaryop_transform(nvbench::state& state)
+{
+  auto const string_width    = static_cast<cudf::size_type>(state.get_int64("string_width"));
+  auto const num_rows        = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const num_comparisons = static_cast<cudf::size_type>(state.get_int64("num_comparisons"));
+  auto const hit_rate        = static_cast<cudf::size_type>(state.get_int64("hit_rate"));
+
+  CUDF_EXPECTS(num_comparisons > 0, "benchmarks require 1 or more comparisons");
+
+  // Create table data
+  auto const num_cols = num_comparisons * 2;
+  std::vector<std::unique_ptr<cudf::column>> columns;
+  std::for_each(
+    thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_cols), [&](size_t) {
+      columns.emplace_back(create_string_column(num_rows, string_width, hit_rate));
+    });
+
+  cudf::table table{std::move(columns)};
+  cudf::table_view const table_view = table.view();
+
+  int64_t const chars_size = std::accumulate(
+    table_view.begin(), table_view.end(), static_cast<int64_t>(0), [](int64_t size, auto& column) {
+      return size + cudf::strings_column_view{column}.chars_size(cudf::get_default_stream());
+    });
+
+  // Create column references
+
+  // Use the number of bytes read from global memory
+  state.add_element_count(chars_size, "chars_size");
+  state.add_global_memory_reads<nvbench::uint8_t>(chars_size);
+  state.add_global_memory_writes<nvbench::int32_t>(num_rows);
+
+  // Construct binary operations (a == b && c == d && e == f && ...)
+  auto constexpr bool_type = cudf::data_type{cudf::type_id::BOOL8};
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    rmm::cuda_stream_view stream{launch.get_stream().get_stream()};
+    std::unique_ptr<cudf::column> reduction =
+      cudf::binary_operation(table.get_column(0), table.get_column(1), cmp_op, bool_type, stream);
+    std::for_each(
+      thrust::make_counting_iterator(1),
+      thrust::make_counting_iterator(num_comparisons),
+      [&](size_t idx) {
+        std::unique_ptr<cudf::column> comparison = cudf::binary_operation(
+          table.get_column(idx * 2), table.get_column(idx * 2 + 1), cmp_op, bool_type, stream);
+        std::unique_ptr<cudf::column> reduced =
+          cudf::binary_operation(*comparison, *reduction, reduce_op, bool_type, stream);
+        stream.synchronize();
+        reduction = std::move(reduced);
+      });
+  });
+}
+
 #define BINARYOP_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns) \
                                                                                       \
   static void name(::nvbench::state& st)                                              \
   {                                                                                   \
-    BM_binaryop_transform<key_type, tree_type, reuse_columns>(st);                    \
+    ::BM_binaryop_transform<key_type, tree_type, reuse_columns>(st);                  \
   }                                                                                   \
   NVBENCH_BENCH(name)                                                                 \
     .add_int64_axis("tree_levels", {1, 2, 5, 10})                                     \
@@ -86,3 +147,20 @@ BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_double_imbalanced_unique,
                                     double,
                                     TreeType::IMBALANCED_LEFT,
                                     false);
+
+#define STRING_COMPARE_BINARYOP_TRANSFORM_BENCHMARK_DEFINE(name, cmp_op, reduce_op) \
+                                                                                    \
+  static void name(::nvbench::state& st)                                            \
+  {                                                                                 \
+    ::BM_string_compare_binaryop_transform<cmp_op, reduce_op>(st);                  \
+  }                                                                                 \
+  NVBENCH_BENCH(name)                                                               \
+    .set_name(#name)                                                                \
+    .add_int64_axis("string_width", {32, 64, 128, 256})                             \
+    .add_int64_axis("num_rows", {32768, 262144, 2097152})                           \
+    .add_int64_axis("num_comparisons", {1, 2, 3, 4})                                \
+    .add_int64_axis("hit_rate", {50, 100})
+
+STRING_COMPARE_BINARYOP_TRANSFORM_BENCHMARK_DEFINE(string_compare_binaryop_transform,
+                                                   cudf::binary_operator::EQUAL,
+                                                   cudf::binary_operator::LOGICAL_AND);
diff --git a/cpp/benchmarks/binaryop/compiled_binaryop.cpp b/cpp/benchmarks/binaryop/compiled_binaryop.cpp
index bc0ff69bce9..cd3c3871a2e 100644
--- a/cpp/benchmarks/binaryop/compiled_binaryop.cpp
+++ b/cpp/benchmarks/binaryop/compiled_binaryop.cpp
@@ -39,7 +39,7 @@ void BM_compiled_binaryop(nvbench::state& state, cudf::binary_operator binop)
   // use number of bytes read and written to global memory
   state.add_global_memory_reads<TypeLhs>(table_size);
   state.add_global_memory_reads<TypeRhs>(table_size);
-  state.add_global_memory_reads<TypeOut>(table_size);
+  state.add_global_memory_writes<TypeOut>(table_size);
 
   state.exec(nvbench::exec_tag::sync,
              [&](nvbench::launch&) { cudf::binary_operation(lhs, rhs, binop, output_dtype); });
diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu
index dc258e32dc5..bdce8a31176 100644
--- a/cpp/benchmarks/common/generate_input.cu
+++ b/cpp/benchmarks/common/generate_input.cu
@@ -17,13 +17,17 @@
 #include "generate_input.hpp"
 #include "random_distribution_factory.cuh"
 
+#include <cudf_test/column_wrapper.hpp>
+
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_factories.hpp>
+#include <cudf/copying.hpp>
 #include <cudf/detail/gather.hpp>
 #include <cudf/detail/valid_if.cuh>
 #include <cudf/filling.hpp>
 #include <cudf/null_mask.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
+#include <cudf/strings/combine.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -918,6 +922,58 @@ std::unique_ptr<cudf::table> create_sequence_table(std::vector<cudf::type_id> co
   return std::make_unique<cudf::table>(std::move(columns));
 }
 
+std::unique_ptr<cudf::column> create_string_column(cudf::size_type num_rows,
+                                                   cudf::size_type row_width,
+                                                   int32_t hit_rate)
+{
+  // build input table using the following data
+  auto raw_data = cudf::test::strings_column_wrapper(
+                    {
+                      "123 abc 4567890 DEFGHI 0987 5W43",  // matches both patterns;
+                      "012345 6789 01234 56789 0123 456",  // the rest do not match
+                      "abc 4567890 DEFGHI 0987 Wxyz 123",
+                      "abcdefghijklmnopqrstuvwxyz 01234",
+                      "",
+                      "AbcéDEFGHIJKLMNOPQRSTUVWXYZ 01",
+                      "9876543210,abcdefghijklmnopqrstU",
+                      "9876543210,abcdefghijklmnopqrstU",
+                      "123 édf 4567890 DéFG 0987 X5",
+                      "1",
+                    })
+                    .release();
+
+  if (row_width / 32 > 1) {
+    std::vector<cudf::column_view> columns;
+    for (int i = 0; i < row_width / 32; ++i) {
+      columns.push_back(raw_data->view());
+    }
+    raw_data = cudf::strings::concatenate(cudf::table_view(columns));
+  }
+  auto data_view = raw_data->view();
+
+  // compute number of rows in n_rows that should match
+  auto const num_matches = (static_cast<int64_t>(num_rows) * hit_rate) / 100;
+
+  // Create a randomized gather-map to build a column out of the strings in data.
+  data_profile gather_profile =
+    data_profile_builder().cardinality(0).null_probability(0.0).distribution(
+      cudf::type_id::INT32, distribution_id::UNIFORM, 1, data_view.size() - 1);
+  auto gather_table =
+    create_random_table({cudf::type_id::INT32}, row_count{num_rows}, gather_profile);
+  gather_table->get_column(0).set_null_mask(rmm::device_buffer{}, 0);
+
+  // Create scatter map by placing 0-index values throughout the gather-map
+  auto scatter_data = cudf::sequence(num_matches,
+                                     cudf::numeric_scalar<int32_t>(0),
+                                     cudf::numeric_scalar<int32_t>(num_rows / num_matches));
+  auto zero_scalar  = cudf::numeric_scalar<int32_t>(0);
+  auto table        = cudf::scatter({zero_scalar}, scatter_data->view(), gather_table->view());
+  auto gather_map   = table->view().column(0);
+  table             = cudf::gather(cudf::table_view({data_view}), gather_map);
+
+  return std::move(table->release().front());
+}
+
 std::pair<rmm::device_buffer, cudf::size_type> create_random_null_mask(
   cudf::size_type size, std::optional<double> null_probability, unsigned seed)
 {
diff --git a/cpp/benchmarks/common/generate_input.hpp b/cpp/benchmarks/common/generate_input.hpp
index 68d3dc492f5..57834fd11d2 100644
--- a/cpp/benchmarks/common/generate_input.hpp
+++ b/cpp/benchmarks/common/generate_input.hpp
@@ -670,6 +670,18 @@ std::unique_ptr<cudf::column> create_random_column(cudf::type_id dtype_id,
                                                    data_profile const& data_params = data_profile{},
                                                    unsigned seed                   = 1);
 
+/**
+ * @brief Deterministically generates a large string column filled with data with the given
+ * parameters.
+ *
+ * @param num_rows Number of rows in the output column
+ * @param row_width Width of each string in the column
+ * @param hit_rate The hit rate percentage, ranging from 0 to 100
+ */
+std::unique_ptr<cudf::column> create_string_column(cudf::size_type num_rows,
+                                                   cudf::size_type row_width,
+                                                   int32_t hit_rate);
+
 /**
  * @brief Generate sequence columns starting with value 0 in first row and increasing by 1 in
  * subsequent rows.
diff --git a/cpp/benchmarks/string/contains.cpp b/cpp/benchmarks/string/contains.cpp
index ae6c8b844c8..a73017dda18 100644
--- a/cpp/benchmarks/string/contains.cpp
+++ b/cpp/benchmarks/string/contains.cpp
@@ -17,10 +17,6 @@
 #include <benchmarks/common/generate_input.hpp>
 #include <benchmarks/fixture/benchmark_fixture.hpp>
 
-#include <cudf_test/column_wrapper.hpp>
-
-#include <cudf/filling.hpp>
-#include <cudf/strings/combine.hpp>
 #include <cudf/strings/contains.hpp>
 #include <cudf/strings/regex/regex_program.hpp>
 #include <cudf/strings/strings_column_view.hpp>
@@ -28,57 +24,6 @@
 
 #include <nvbench/nvbench.cuh>
 
-std::unique_ptr<cudf::column> build_input_column(cudf::size_type n_rows,
-                                                 cudf::size_type row_width,
-                                                 int32_t hit_rate)
-{
-  // build input table using the following data
-  auto raw_data = cudf::test::strings_column_wrapper(
-                    {
-                      "123 abc 4567890 DEFGHI 0987 5W43",  // matches both patterns;
-                      "012345 6789 01234 56789 0123 456",  // the rest do not match
-                      "abc 4567890 DEFGHI 0987 Wxyz 123",
-                      "abcdefghijklmnopqrstuvwxyz 01234",
-                      "",
-                      "AbcéDEFGHIJKLMNOPQRSTUVWXYZ 01",
-                      "9876543210,abcdefghijklmnopqrstU",
-                      "9876543210,abcdefghijklmnopqrstU",
-                      "123 édf 4567890 DéFG 0987 X5",
-                      "1",
-                    })
-                    .release();
-
-  if (row_width / 32 > 1) {
-    std::vector<cudf::column_view> columns;
-    for (int i = 0; i < row_width / 32; ++i) {
-      columns.push_back(raw_data->view());
-    }
-    raw_data = cudf::strings::concatenate(cudf::table_view(columns));
-  }
-  auto data_view = raw_data->view();
-
-  // compute number of rows in n_rows that should match
-  auto matches = static_cast<int32_t>(n_rows * hit_rate) / 100;
-
-  // Create a randomized gather-map to build a column out of the strings in data.
-  data_profile gather_profile =
-    data_profile_builder().cardinality(0).null_probability(0.0).distribution(
-      cudf::type_id::INT32, distribution_id::UNIFORM, 1, data_view.size() - 1);
-  auto gather_table =
-    create_random_table({cudf::type_id::INT32}, row_count{n_rows}, gather_profile);
-  gather_table->get_column(0).set_null_mask(rmm::device_buffer{}, 0);
-
-  // Create scatter map by placing 0-index values throughout the gather-map
-  auto scatter_data = cudf::sequence(
-    matches, cudf::numeric_scalar<int32_t>(0), cudf::numeric_scalar<int32_t>(n_rows / matches));
-  auto zero_scalar = cudf::numeric_scalar<int32_t>(0);
-  auto table       = cudf::scatter({zero_scalar}, scatter_data->view(), gather_table->view());
-  auto gather_map  = table->view().column(0);
-  table            = cudf::gather(cudf::table_view({data_view}), gather_map);
-
-  return std::move(table->release().front());
-}
-
 // longer pattern lengths demand more working memory per string
 std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$", "5W43"};
 
@@ -94,7 +39,7 @@ static void bench_contains(nvbench::state& state)
     state.skip("Skip benchmarks greater than size_type limit");
   }
 
-  auto col   = build_input_column(n_rows, row_width, hit_rate);
+  auto col   = create_string_column(n_rows, row_width, hit_rate);
   auto input = cudf::strings_column_view(col->view());
 
   auto pattern = patterns[pattern_index];
diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp
index a9c620e4bf0..996bdcf0332 100644
--- a/cpp/benchmarks/string/find.cpp
+++ b/cpp/benchmarks/string/find.cpp
@@ -19,7 +19,6 @@
 
 #include <cudf_test/column_wrapper.hpp>
 
-#include <cudf/filling.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/combine.hpp>
 #include <cudf/strings/find.hpp>
@@ -29,10 +28,6 @@
 
 #include <nvbench/nvbench.cuh>
 
-std::unique_ptr<cudf::column> build_input_column(cudf::size_type n_rows,
-                                                 cudf::size_type row_width,
-                                                 int32_t hit_rate);
-
 static void bench_find_string(nvbench::state& state)
 {
   auto const n_rows    = static_cast<cudf::size_type>(state.get_int64("num_rows"));
@@ -46,7 +41,7 @@ static void bench_find_string(nvbench::state& state)
   }
 
   auto const stream = cudf::get_default_stream();
-  auto const col    = build_input_column(n_rows, row_width, hit_rate);
+  auto const col    = create_string_column(n_rows, row_width, hit_rate);
   auto const input  = cudf::strings_column_view(col->view());
 
   std::vector<std::string> h_targets({"5W", "5W43", "0987 5W43"});
diff --git a/cpp/benchmarks/string/like.cpp b/cpp/benchmarks/string/like.cpp
index 99cef640dc3..105ae65cbe8 100644
--- a/cpp/benchmarks/string/like.cpp
+++ b/cpp/benchmarks/string/like.cpp
@@ -18,68 +18,12 @@
 
 #include <cudf_test/column_wrapper.hpp>
 
-#include <cudf/copying.hpp>
-#include <cudf/filling.hpp>
-#include <cudf/strings/combine.hpp>
 #include <cudf/strings/contains.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
 #include <nvbench/nvbench.cuh>
 
-namespace {
-std::unique_ptr<cudf::column> build_input_column(cudf::size_type n_rows,
-                                                 cudf::size_type row_width,
-                                                 int32_t hit_rate)
-{
-  // build input table using the following data
-  auto raw_data = cudf::test::strings_column_wrapper(
-                    {
-                      "123 abc 4567890 DEFGHI 0987 5W43",  // matches always;
-                      "012345 6789 01234 56789 0123 456",  // the rest do not match
-                      "abc 4567890 DEFGHI 0987 Wxyz 123",
-                      "abcdefghijklmnopqrstuvwxyz 01234",
-                      "",
-                      "AbcéDEFGHIJKLMNOPQRSTUVWXYZ 01",
-                      "9876543210,abcdefghijklmnopqrstU",
-                      "9876543210,abcdefghijklmnopqrstU",
-                      "123 édf 4567890 DéFG 0987 X5",
-                      "1",
-                    })
-                    .release();
-  if (row_width / 32 > 1) {
-    std::vector<cudf::column_view> columns;
-    for (int i = 0; i < row_width / 32; ++i) {
-      columns.push_back(raw_data->view());
-    }
-    raw_data = cudf::strings::concatenate(cudf::table_view(columns));
-  }
-  auto data_view = raw_data->view();
-
-  // compute number of rows in n_rows that should match
-  auto matches = static_cast<int32_t>(n_rows * hit_rate) / 100;
-
-  // Create a randomized gather-map to build a column out of the strings in data.
-  data_profile gather_profile =
-    data_profile_builder().cardinality(0).null_probability(0.0).distribution(
-      cudf::type_id::INT32, distribution_id::UNIFORM, 1, data_view.size() - 1);
-  auto gather_table =
-    create_random_table({cudf::type_id::INT32}, row_count{n_rows}, gather_profile);
-  gather_table->get_column(0).set_null_mask(rmm::device_buffer{}, 0);
-
-  // Create scatter map by placing 0-index values throughout the gather-map
-  auto scatter_data = cudf::sequence(
-    matches, cudf::numeric_scalar<int32_t>(0), cudf::numeric_scalar<int32_t>(n_rows / matches));
-  auto zero_scalar = cudf::numeric_scalar<int32_t>(0);
-  auto table       = cudf::scatter({zero_scalar}, scatter_data->view(), gather_table->view());
-  auto gather_map  = table->view().column(0);
-  table            = cudf::gather(cudf::table_view({data_view}), gather_map);
-
-  return std::move(table->release().front());
-}
-
-}  // namespace
-
 static void bench_like(nvbench::state& state)
 {
   auto const n_rows    = static_cast<cudf::size_type>(state.get_int64("num_rows"));
@@ -91,7 +35,7 @@ static void bench_like(nvbench::state& state)
     state.skip("Skip benchmarks greater than size_type limit");
   }
 
-  auto col   = build_input_column(n_rows, row_width, hit_rate);
+  auto col   = create_string_column(n_rows, row_width, hit_rate);
   auto input = cudf::strings_column_view(col->view());
 
   // This pattern forces reading the entire target string (when matched expected)

From 1ad9fc1feef0ea0ee38adaa8f05cde6bb05aff0f Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 28 Oct 2024 16:33:50 -0700
Subject: [PATCH 18/26] Remove includes suggested by include-what-you-use
 (#17170)

This PR cherry-picks the include removals suggested by include-what-you-use (IWYU) that were generated in #17078.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/17170
---
 cpp/include/cudf/ast/detail/expression_parser.hpp   |  4 ----
 cpp/include/cudf/column/column_factories.hpp        |  2 --
 cpp/include/cudf/column/column_view.hpp             |  1 -
 .../cudf/detail/aggregation/result_cache.hpp        |  1 -
 cpp/include/cudf/detail/is_element_valid.hpp        |  1 -
 .../cudf/dictionary/dictionary_column_view.hpp      |  1 -
 cpp/include/cudf/io/text/detail/bgzip_utils.hpp     |  6 ------
 cpp/include/cudf/utilities/default_stream.hpp       |  2 --
 cpp/include/cudf/utilities/traits.hpp               |  2 --
 cpp/include/cudf/utilities/type_dispatcher.hpp      |  1 -
 cpp/src/ast/expression_parser.cpp                   |  5 +----
 cpp/src/ast/expressions.cpp                         |  5 +----
 cpp/src/binaryop/binaryop.cpp                       |  5 -----
 cpp/src/column/column_factories.cpp                 |  6 ------
 cpp/src/column/column_view.cpp                      |  3 ---
 cpp/src/copying/copy.cpp                            |  5 -----
 cpp/src/copying/pack.cpp                            |  1 -
 cpp/src/copying/split.cpp                           |  2 --
 cpp/src/datetime/timezone.cpp                       |  2 --
 cpp/src/groupby/hash/flatten_single_pass_aggs.hpp   |  1 -
 cpp/src/groupby/sort/aggregate.cpp                  |  1 -
 cpp/src/interop/arrow_utilities.cpp                 |  5 -----
 cpp/src/interop/arrow_utilities.hpp                 |  1 -
 cpp/src/interop/dlpack.cpp                          |  3 ---
 cpp/src/io/avro/avro.cpp                            |  1 -
 cpp/src/io/avro/avro.hpp                            |  2 --
 cpp/src/io/comp/nvcomp_adapter.hpp                  |  2 --
 cpp/src/io/comp/uncomp.cpp                          |  2 --
 cpp/src/io/functions.cpp                            |  2 --
 cpp/src/io/json/host_tree_algorithms.cu             |  1 +
 cpp/src/io/json/nested_json.hpp                     |  3 ---
 cpp/src/io/orc/reader_impl_decode.cu                |  1 +
 cpp/src/io/orc/reader_impl_helpers.cpp              |  2 --
 cpp/src/io/orc/reader_impl_helpers.hpp              |  3 ---
 cpp/src/io/parquet/arrow_schema_writer.cpp          |  1 -
 cpp/src/io/parquet/arrow_schema_writer.hpp          |  5 ++---
 cpp/src/io/parquet/compact_protocol_reader.hpp      |  3 ---
 cpp/src/io/parquet/compact_protocol_writer.hpp      |  1 -
 cpp/src/io/parquet/predicate_pushdown.cpp           |  1 -
 cpp/src/io/parquet/reader.cpp                       |  2 --
 cpp/src/io/parquet/reader_impl.cpp                  |  2 --
 cpp/src/io/text/data_chunk_source_factories.cpp     |  4 ----
 cpp/src/io/utilities/column_buffer.cpp              |  2 +-
 cpp/src/io/utilities/column_buffer.hpp              |  5 ++---
 cpp/src/io/utilities/config_utils.cpp               |  4 ----
 cpp/src/io/utilities/datasource.cpp                 |  1 -
 cpp/src/io/utilities/file_io_utilities.cpp          |  2 --
 cpp/src/io/utilities/row_selection.cpp              |  3 ---
 cpp/src/io/utilities/row_selection.hpp              |  2 +-
 cpp/src/jit/cache.cpp                               |  3 ---
 cpp/src/jit/util.cpp                                |  4 +---
 cpp/src/quantiles/tdigest/tdigest_column_view.cpp   |  3 +--
 cpp/src/reductions/reductions.cpp                   |  2 --
 cpp/src/reductions/scan/scan.cpp                    |  3 ---
 cpp/src/reductions/segmented/reductions.cpp         |  4 ----
 .../rolling/detail/optimized_unbounded_window.cpp   |  3 ---
 cpp/src/rolling/detail/range_window_bounds.hpp      |  5 +----
 cpp/src/rolling/range_window_bounds.cpp             |  1 -
 cpp/src/scalar/scalar.cpp                           |  2 --
 cpp/src/scalar/scalar_factories.cpp                 |  2 --
 cpp/src/strings/regex/regexec.cpp                   |  1 -
 cpp/src/strings/strings_scalar_factories.cpp        |  1 -
 cpp/src/structs/structs_column_view.cpp             |  3 +--
 cpp/src/structs/utilities.cpp                       |  3 ---
 cpp/src/table/table.cpp                             |  1 -
 cpp/src/table/table_view.cpp                        |  3 ---
 cpp/src/transform/transform.cpp                     |  2 --
 cpp/src/utilities/cuda.cpp                          |  2 --
 cpp/src/utilities/host_memory.cpp                   |  1 -
 cpp/src/utilities/prefetch.cpp                      |  1 -
 cpp/src/utilities/stream_pool.cpp                   |  1 -
 cpp/src/utilities/traits.cpp                        |  2 --
 cpp/src/utilities/type_checks.cpp                   |  2 --
 cpp/tests/ast/transform_tests.cpp                   |  8 --------
 cpp/tests/binaryop/binop-compiled-test.cpp          |  2 --
 cpp/tests/binaryop/binop-generic-ptx-test.cpp       |  3 +--
 cpp/tests/bitmask/bitmask_tests.cpp                 |  1 -
 cpp/tests/column/bit_cast_test.cpp                  |  3 ---
 cpp/tests/column/column_test.cpp                    |  1 -
 cpp/tests/column/column_view_device_span_test.cpp   |  1 -
 cpp/tests/column/column_view_shallow_test.cpp       |  2 --
 cpp/tests/column/factories_test.cpp                 |  3 ---
 cpp/tests/copying/concatenate_tests.cpp             |  2 --
 cpp/tests/copying/copy_if_else_nested_tests.cpp     |  3 +--
 cpp/tests/copying/copy_range_tests.cpp              |  1 -
 cpp/tests/copying/copy_tests.cpp                    |  1 -
 cpp/tests/copying/gather_list_tests.cpp             |  4 +---
 cpp/tests/copying/gather_str_tests.cpp              |  1 -
 cpp/tests/copying/gather_struct_tests.cpp           |  5 -----
 cpp/tests/copying/gather_tests.cpp                  |  1 -
 cpp/tests/copying/get_value_tests.cpp               |  2 --
 cpp/tests/copying/purge_nonempty_nulls_tests.cpp    |  3 ---
 cpp/tests/copying/reverse_tests.cpp                 |  6 +-----
 cpp/tests/copying/sample_tests.cpp                  |  5 +----
 cpp/tests/copying/scatter_list_scalar_tests.cpp     |  3 +--
 cpp/tests/copying/scatter_list_tests.cpp            |  1 -
 cpp/tests/copying/scatter_struct_scalar_tests.cpp   |  3 +--
 cpp/tests/copying/scatter_struct_tests.cpp          |  1 -
 cpp/tests/copying/scatter_tests.cpp                 |  2 --
 cpp/tests/copying/segmented_gather_list_tests.cpp   |  3 +--
 cpp/tests/copying/shift_tests.cpp                   |  2 --
 cpp/tests/copying/slice_tests.cpp                   |  4 ----
 cpp/tests/copying/utility_tests.cpp                 |  1 -
 cpp/tests/datetime/datetime_ops_test.cpp            |  3 ---
 cpp/tests/dictionary/add_keys_test.cpp              |  2 --
 cpp/tests/dictionary/encode_test.cpp                |  2 --
 cpp/tests/dictionary/fill_test.cpp                  |  3 ---
 cpp/tests/dictionary/search_test.cpp                |  1 -
 cpp/tests/dictionary/slice_test.cpp                 |  1 -
 cpp/tests/filling/fill_tests.cpp                    |  1 -
 cpp/tests/filling/repeat_tests.cpp                  |  4 ----
 cpp/tests/filling/sequence_tests.cpp                |  1 -
 cpp/tests/fixed_point/fixed_point_tests.cpp         |  3 ---
 cpp/tests/groupby/collect_list_tests.cpp            |  2 --
 cpp/tests/groupby/collect_set_tests.cpp             |  1 -
 cpp/tests/groupby/correlation_tests.cpp             |  1 -
 cpp/tests/groupby/covariance_tests.cpp              |  2 --
 cpp/tests/groupby/groupby_test_util.cpp             |  5 +----
 cpp/tests/groupby/groupby_test_util.hpp             |  5 +----
 cpp/tests/groupby/histogram_tests.cpp               |  1 -
 cpp/tests/groupby/max_scan_tests.cpp                |  1 -
 cpp/tests/groupby/merge_lists_tests.cpp             |  1 -
 cpp/tests/groupby/merge_sets_tests.cpp              |  1 -
 cpp/tests/groupby/rank_scan_tests.cpp               |  2 --
 cpp/tests/groupby/shift_tests.cpp                   |  1 -
 cpp/tests/hashing/md5_test.cpp                      |  1 -
 cpp/tests/hashing/murmurhash3_x86_32_test.cpp       |  2 --
 cpp/tests/hashing/sha1_test.cpp                     |  1 -
 cpp/tests/hashing/sha224_test.cpp                   |  1 -
 cpp/tests/hashing/sha256_test.cpp                   |  1 -
 cpp/tests/hashing/sha384_test.cpp                   |  1 -
 cpp/tests/hashing/sha512_test.cpp                   |  1 -
 cpp/tests/hashing/xxhash_64_test.cpp                |  3 ---
 cpp/tests/interop/from_arrow_device_test.cpp        |  4 ----
 cpp/tests/interop/from_arrow_host_test.cpp          |  2 --
 cpp/tests/interop/from_arrow_stream_test.cpp        | 13 -------------
 cpp/tests/interop/from_arrow_test.cpp               |  4 ----
 cpp/tests/interop/to_arrow_device_test.cpp          |  6 ------
 cpp/tests/interop/to_arrow_host_test.cpp            |  6 ------
 cpp/tests/interop/to_arrow_test.cpp                 |  2 --
 cpp/tests/io/csv_test.cpp                           |  8 --------
 cpp/tests/io/file_io_test.cpp                       |  3 ---
 cpp/tests/io/json/json_quote_normalization_test.cpp |  2 --
 cpp/tests/io/json/json_test.cpp                     |  2 --
 cpp/tests/io/json/json_tree.cpp                     |  6 +-----
 cpp/tests/io/json/nested_json_test.cpp              |  8 --------
 cpp/tests/io/orc_test.cpp                           |  1 -
 cpp/tests/io/parquet_common.hpp                     |  2 --
 cpp/tests/io/parquet_misc_test.cpp                  |  2 --
 cpp/tests/io/parquet_reader_test.cpp                |  2 ++
 cpp/tests/io/parquet_test.cpp                       |  1 -
 cpp/tests/io/row_selection_test.cpp                 |  1 -
 cpp/tests/io/text/data_chunk_source_test.cpp        |  3 ---
 cpp/tests/io/text/multibyte_split_test.cpp          |  4 ----
 cpp/tests/iterator/value_iterator.cpp               |  1 -
 cpp/tests/jit/parse_ptx_function.cpp                |  1 -
 cpp/tests/join/cross_join_tests.cpp                 |  1 -
 cpp/tests/join/distinct_join_tests.cpp              |  5 -----
 cpp/tests/join/join_tests.cpp                       |  5 -----
 cpp/tests/join/semi_anti_join_tests.cpp             |  1 -
 cpp/tests/json/json_tests.cpp                       |  1 -
 cpp/tests/large_strings/large_strings_fixture.cpp   |  2 --
 cpp/tests/large_strings/parquet_tests.cpp           |  2 --
 cpp/tests/lists/contains_tests.cpp                  |  1 -
 cpp/tests/lists/extract_tests.cpp                   |  4 ----
 cpp/tests/lists/sequences_tests.cpp                 |  1 -
 .../stream_compaction/apply_boolean_mask_tests.cpp  |  2 --
 cpp/tests/merge/merge_dictionary_test.cpp           |  2 --
 cpp/tests/merge/merge_string_test.cpp               |  6 ------
 cpp/tests/merge/merge_test.cpp                      |  2 --
 cpp/tests/partitioning/round_robin_test.cpp         |  7 -------
 cpp/tests/quantiles/quantile_test.cpp               |  1 -
 cpp/tests/quantiles/quantiles_test.cpp              |  1 -
 cpp/tests/reductions/ewm_tests.cpp                  |  2 --
 cpp/tests/reductions/list_rank_test.cpp             |  5 -----
 cpp/tests/reductions/rank_tests.cpp                 |  3 +--
 cpp/tests/reductions/reduction_tests.cpp            |  4 ----
 cpp/tests/reductions/scan_tests.cpp                 |  2 --
 cpp/tests/reductions/scan_tests.hpp                 |  5 +----
 cpp/tests/replace/clamp_test.cpp                    |  1 -
 cpp/tests/replace/normalize_replace_tests.cpp       |  1 -
 cpp/tests/replace/replace_nans_tests.cpp            |  1 -
 cpp/tests/replace/replace_nulls_tests.cpp           |  2 --
 cpp/tests/replace/replace_tests.cpp                 |  4 ----
 cpp/tests/reshape/byte_cast_tests.cpp               |  1 -
 cpp/tests/reshape/tile_tests.cpp                    |  3 +--
 cpp/tests/rolling/collect_ops_test.cpp              |  1 -
 cpp/tests/rolling/empty_input_test.cpp              |  4 +---
 cpp/tests/rolling/grouped_rolling_range_test.cpp    |  7 +------
 cpp/tests/rolling/grouped_rolling_test.cpp          |  1 -
 cpp/tests/rolling/lead_lag_test.cpp                 |  4 +---
 cpp/tests/rolling/nth_element_test.cpp              |  7 -------
 cpp/tests/rolling/offset_row_window_test.cpp        |  4 ----
 cpp/tests/rolling/range_rolling_window_test.cpp     |  5 -----
 cpp/tests/rolling/range_window_bounds_test.cpp      |  5 -----
 cpp/tests/rolling/rolling_test.cpp                  |  2 --
 cpp/tests/scalar/factories_test.cpp                 |  3 ---
 cpp/tests/search/search_dictionary_test.cpp         |  1 -
 cpp/tests/search/search_list_test.cpp               |  1 -
 cpp/tests/search/search_struct_test.cpp             |  3 +--
 cpp/tests/search/search_test.cpp                    |  1 -
 cpp/tests/sort/is_sorted_tests.cpp                  |  1 -
 cpp/tests/sort/rank_test.cpp                        |  2 --
 cpp/tests/sort/sort_nested_types_tests.cpp          |  3 +--
 cpp/tests/sort/sort_test.cpp                        |  1 -
 cpp/tests/sort/stable_sort_tests.cpp                |  3 ---
 .../stream_compaction/apply_boolean_mask_tests.cpp  |  4 ----
 .../stream_compaction/distinct_count_tests.cpp      |  5 -----
 cpp/tests/stream_compaction/distinct_tests.cpp      |  3 ---
 cpp/tests/stream_compaction/drop_nans_tests.cpp     |  3 ---
 cpp/tests/stream_compaction/drop_nulls_tests.cpp    |  2 --
 .../stream_compaction/stable_distinct_tests.cpp     |  4 ----
 cpp/tests/stream_compaction/unique_count_tests.cpp  |  5 -----
 cpp/tests/stream_compaction/unique_tests.cpp        |  6 ------
 cpp/tests/streams/binaryop_test.cpp                 |  1 -
 cpp/tests/streams/io/csv_test.cpp                   |  4 ----
 cpp/tests/streams/io/json_test.cpp                  |  2 --
 cpp/tests/streams/io/multibyte_split_test.cpp       |  1 -
 cpp/tests/streams/io/orc_test.cpp                   |  8 --------
 cpp/tests/streams/io/parquet_test.cpp               |  4 ----
 cpp/tests/streams/join_test.cpp                     |  2 --
 cpp/tests/streams/null_mask_test.cpp                |  3 ---
 cpp/tests/streams/reduction_test.cpp                |  3 ---
 cpp/tests/streams/rolling_test.cpp                  |  2 --
 cpp/tests/streams/stream_compaction_test.cpp        |  4 ----
 cpp/tests/streams/strings/factory_test.cpp          |  1 -
 cpp/tests/streams/strings/reverse_test.cpp          |  1 -
 cpp/tests/streams/transform_test.cpp                |  6 ------
 cpp/tests/strings/array_tests.cpp                   |  2 --
 cpp/tests/strings/combine/concatenate_tests.cpp     |  1 -
 .../strings/combine/join_list_elements_tests.cpp    |  1 -
 cpp/tests/strings/concatenate_tests.cpp             |  3 +--
 cpp/tests/strings/datetime_tests.cpp                |  1 -
 cpp/tests/strings/extract_tests.cpp                 |  1 -
 cpp/tests/strings/findall_tests.cpp                 |  3 ---
 cpp/tests/strings/fixed_point_tests.cpp             |  2 --
 cpp/tests/strings/integers_tests.cpp                |  3 ---
 cpp/tests/structs/structs_column_tests.cpp          | 10 ----------
 cpp/tests/structs/utilities_tests.cpp               |  6 ------
 cpp/tests/table/row_operators_tests.cpp             |  1 -
 cpp/tests/table/table_tests.cpp                     |  3 ---
 cpp/tests/text/minhash_tests.cpp                    |  4 ----
 cpp/tests/text/ngrams_tests.cpp                     |  2 --
 cpp/tests/text/normalize_tests.cpp                  |  1 -
 cpp/tests/text/stemmer_tests.cpp                    |  1 -
 cpp/tests/text/subword_tests.cpp                    |  2 --
 cpp/tests/transform/bools_to_mask_test.cpp          |  2 --
 cpp/tests/transform/nans_to_null_test.cpp           |  2 --
 cpp/tests/transpose/transpose_test.cpp              |  1 -
 cpp/tests/types/traits_test.cpp                     |  1 -
 cpp/tests/unary/cast_tests.cpp                      |  3 ---
 cpp/tests/unary/math_ops_test.cpp                   |  4 ----
 cpp/tests/unary/unary_ops_test.cpp                  |  1 -
 cpp/tests/utilities/random_seed.cpp                 |  3 ++-
 cpp/tests/utilities_tests/column_debug_tests.cpp    |  3 ---
 .../utilities_tests/column_utilities_tests.cpp      |  4 ----
 cpp/tests/utilities_tests/column_wrapper_tests.cpp  |  1 -
 .../utilities_tests/lists_column_wrapper_tests.cpp  |  1 -
 cpp/tests/utilities_tests/type_check_tests.cpp      |  1 -
 cpp/tests/utilities_tests/type_list_tests.cpp       |  2 +-
 260 files changed, 39 insertions(+), 643 deletions(-)

diff --git a/cpp/include/cudf/ast/detail/expression_parser.hpp b/cpp/include/cudf/ast/detail/expression_parser.hpp
index a254171ef11..f4cce8e6da6 100644
--- a/cpp/include/cudf/ast/detail/expression_parser.hpp
+++ b/cpp/include/cudf/ast/detail/expression_parser.hpp
@@ -17,12 +17,8 @@
 
 #include <cudf/ast/detail/operators.hpp>
 #include <cudf/ast/expressions.hpp>
-#include <cudf/scalar/scalar_device_view.cuh>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
-#include <cudf/utilities/memory_resource.hpp>
-
-#include <thrust/scan.h>
 
 #include <functional>
 #include <numeric>
diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp
index 6bbe32de134..e72661ce49a 100644
--- a/cpp/include/cudf/column/column_factories.hpp
+++ b/cpp/include/cudf/column/column_factories.hpp
@@ -24,8 +24,6 @@
 
 #include <rmm/cuda_stream_view.hpp>
 
-#include <thrust/pair.h>
-
 namespace CUDF_EXPORT cudf {
 /**
  * @addtogroup column_factories
diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp
index 48f89b8be25..6db5c8b3c7b 100644
--- a/cpp/include/cudf/column/column_view.hpp
+++ b/cpp/include/cudf/column/column_view.hpp
@@ -16,7 +16,6 @@
 #pragma once
 
 #include <cudf/types.hpp>
-#include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/prefetch.hpp>
 #include <cudf/utilities/span.hpp>
diff --git a/cpp/include/cudf/detail/aggregation/result_cache.hpp b/cpp/include/cudf/detail/aggregation/result_cache.hpp
index ec5a511bb7c..486808ebe18 100644
--- a/cpp/include/cudf/detail/aggregation/result_cache.hpp
+++ b/cpp/include/cudf/detail/aggregation/result_cache.hpp
@@ -19,7 +19,6 @@
 #include <cudf/column/column.hpp>
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/hashing/detail/hashing.hpp>
-#include <cudf/types.hpp>
 
 #include <unordered_map>
 
diff --git a/cpp/include/cudf/detail/is_element_valid.hpp b/cpp/include/cudf/detail/is_element_valid.hpp
index 4b74d12f306..26b1bec2ced 100644
--- a/cpp/include/cudf/detail/is_element_valid.hpp
+++ b/cpp/include/cudf/detail/is_element_valid.hpp
@@ -17,7 +17,6 @@
 #pragma once
 
 #include <cudf/column/column_view.hpp>
-#include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
diff --git a/cpp/include/cudf/dictionary/dictionary_column_view.hpp b/cpp/include/cudf/dictionary/dictionary_column_view.hpp
index 5596f78a90b..0a799f27d00 100644
--- a/cpp/include/cudf/dictionary/dictionary_column_view.hpp
+++ b/cpp/include/cudf/dictionary/dictionary_column_view.hpp
@@ -15,7 +15,6 @@
  */
 #pragma once
 
-#include <cudf/column/column.hpp>
 #include <cudf/column/column_view.hpp>
 
 /**
diff --git a/cpp/include/cudf/io/text/detail/bgzip_utils.hpp b/cpp/include/cudf/io/text/detail/bgzip_utils.hpp
index 11eb4518210..5659f86b0c4 100644
--- a/cpp/include/cudf/io/text/detail/bgzip_utils.hpp
+++ b/cpp/include/cudf/io/text/detail/bgzip_utils.hpp
@@ -16,16 +16,10 @@
 
 #pragma once
 
-#include <cudf/utilities/error.hpp>
 #include <cudf/utilities/export.hpp>
 #include <cudf/utilities/span.hpp>
 
-#include <zlib.h>
-
-#include <algorithm>
-#include <array>
 #include <fstream>
-#include <limits>
 
 namespace CUDF_EXPORT cudf {
 namespace io::text::detail::bgzip {
diff --git a/cpp/include/cudf/utilities/default_stream.hpp b/cpp/include/cudf/utilities/default_stream.hpp
index 97a42243250..3e740b81cc9 100644
--- a/cpp/include/cudf/utilities/default_stream.hpp
+++ b/cpp/include/cudf/utilities/default_stream.hpp
@@ -16,10 +16,8 @@
 
 #pragma once
 
-#include <cudf/detail/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
 
-#include <rmm/cuda_stream.hpp>
 #include <rmm/cuda_stream_view.hpp>
 
 namespace CUDF_EXPORT cudf {
diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp
index 3f37ae02151..cf8413b597f 100644
--- a/cpp/include/cudf/utilities/traits.hpp
+++ b/cpp/include/cudf/utilities/traits.hpp
@@ -22,8 +22,6 @@
 #include <cudf/wrappers/durations.hpp>
 #include <cudf/wrappers/timestamps.hpp>
 
-#include <cuda/std/type_traits>
-
 namespace CUDF_EXPORT cudf {
 
 /**
diff --git a/cpp/include/cudf/utilities/type_dispatcher.hpp b/cpp/include/cudf/utilities/type_dispatcher.hpp
index 15b5f921c1b..6351a84e38f 100644
--- a/cpp/include/cudf/utilities/type_dispatcher.hpp
+++ b/cpp/include/cudf/utilities/type_dispatcher.hpp
@@ -16,7 +16,6 @@
 
 #pragma once
 
-#include <cudf/detail/utilities/assert.cuh>
 #include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
diff --git a/cpp/src/ast/expression_parser.cpp b/cpp/src/ast/expression_parser.cpp
index 3b650d791aa..5815ce33e33 100644
--- a/cpp/src/ast/expression_parser.cpp
+++ b/cpp/src/ast/expression_parser.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,9 +16,6 @@
 #include <cudf/ast/detail/expression_parser.hpp>
 #include <cudf/ast/detail/operators.hpp>
 #include <cudf/ast/expressions.hpp>
-#include <cudf/scalar/scalar.hpp>
-#include <cudf/scalar/scalar_device_view.cuh>
-#include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
diff --git a/cpp/src/ast/expressions.cpp b/cpp/src/ast/expressions.cpp
index b45b9d0c78c..4c2b56dd4f5 100644
--- a/cpp/src/ast/expressions.cpp
+++ b/cpp/src/ast/expressions.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,9 +17,6 @@
 #include <cudf/ast/detail/expression_transformer.hpp>
 #include <cudf/ast/detail/operators.hpp>
 #include <cudf/ast/expressions.hpp>
-#include <cudf/scalar/scalar.hpp>
-#include <cudf/scalar/scalar_device_view.cuh>
-#include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
 
diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp
index a6c878efbbc..1b23ea12a5e 100644
--- a/cpp/src/binaryop/binaryop.cpp
+++ b/cpp/src/binaryop/binaryop.cpp
@@ -27,15 +27,10 @@
 #include <cudf/detail/binaryop.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/scalar/scalar.hpp>
-#include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
-#include <cudf/unary.hpp>
-#include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
diff --git a/cpp/src/column/column_factories.cpp b/cpp/src/column/column_factories.cpp
index 482413d0ccb..972f97e8668 100644
--- a/cpp/src/column/column_factories.cpp
+++ b/cpp/src/column/column_factories.cpp
@@ -15,19 +15,13 @@
  */
 
 #include <cudf/column/column_factories.hpp>
-#include <cudf/detail/fill.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/dictionary/dictionary_factories.hpp>
-#include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
-#include <cudf/strings/detail/fill.hpp>
 #include <cudf/utilities/error.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 
-#include <thrust/iterator/constant_iterator.h>
-
 namespace cudf {
 namespace {
 struct size_of_helper {
diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp
index 386c5ebe478..e831aa9645d 100644
--- a/cpp/src/column/column_view.cpp
+++ b/cpp/src/column/column_view.cpp
@@ -15,7 +15,6 @@
  */
 
 #include <cudf/column/column_view.hpp>
-#include <cudf/detail/get_value.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/hashing/detail/hashing.hpp>
 #include <cudf/strings/strings_column_view.hpp>
@@ -27,9 +26,7 @@
 #include <thrust/iterator/transform_iterator.h>
 
 #include <algorithm>
-#include <exception>
 #include <numeric>
-#include <string>
 #include <vector>
 
 namespace cudf {
diff --git a/cpp/src/copying/copy.cpp b/cpp/src/copying/copy.cpp
index d60fb5ce110..5e2065ba844 100644
--- a/cpp/src/copying/copy.cpp
+++ b/cpp/src/copying/copy.cpp
@@ -20,16 +20,11 @@
 #include <cudf/detail/copy.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/lists/lists_column_view.hpp>
 #include <cudf/table/table.hpp>
-#include <cudf/utilities/default_stream.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
-#include <thrust/iterator/transform_iterator.h>
-
 #include <algorithm>
 
 namespace cudf {
diff --git a/cpp/src/copying/pack.cpp b/cpp/src/copying/pack.cpp
index 1282eec6c44..a001807c82b 100644
--- a/cpp/src/copying/pack.cpp
+++ b/cpp/src/copying/pack.cpp
@@ -18,7 +18,6 @@
 #include <cudf/detail/contiguous_split.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/utilities/default_stream.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
diff --git a/cpp/src/copying/split.cpp b/cpp/src/copying/split.cpp
index 832a72ed5b0..116e3516460 100644
--- a/cpp/src/copying/split.cpp
+++ b/cpp/src/copying/split.cpp
@@ -14,10 +14,8 @@
  * limitations under the License.
  */
 
-#include <cudf/column/column.hpp>
 #include <cudf/detail/copy.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp
index 2196ee97fee..f786624680c 100644
--- a/cpp/src/datetime/timezone.cpp
+++ b/cpp/src/datetime/timezone.cpp
@@ -13,12 +13,10 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <cudf/column/column_factories.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/timezone.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/table/table.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 
 #include <algorithm>
 #include <filesystem>
diff --git a/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp
index 2bf983e5e90..dfad51f27d4 100644
--- a/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp
+++ b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp
@@ -17,7 +17,6 @@
 
 #include <cudf/aggregation.hpp>
 #include <cudf/groupby.hpp>
-#include <cudf/types.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <memory>
diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp
index a9085a1f1fd..3041e261945 100644
--- a/cpp/src/groupby/sort/aggregate.cpp
+++ b/cpp/src/groupby/sort/aggregate.cpp
@@ -26,7 +26,6 @@
 #include <cudf/detail/binaryop.hpp>
 #include <cudf/detail/gather.hpp>
 #include <cudf/detail/groupby/sort_helper.hpp>
-#include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/tdigest/tdigest.hpp>
 #include <cudf/detail/unary.hpp>
 #include <cudf/dictionary/dictionary_column_view.hpp>
diff --git a/cpp/src/interop/arrow_utilities.cpp b/cpp/src/interop/arrow_utilities.cpp
index a99262fb3bf..c69ebe12d2c 100644
--- a/cpp/src/interop/arrow_utilities.cpp
+++ b/cpp/src/interop/arrow_utilities.cpp
@@ -20,11 +20,6 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
 
-#include <rmm/exec_policy.hpp>
-
-#include <thrust/for_each.h>
-#include <thrust/iterator/counting_iterator.h>
-
 #include <nanoarrow/nanoarrow.h>
 
 namespace cudf {
diff --git a/cpp/src/interop/arrow_utilities.hpp b/cpp/src/interop/arrow_utilities.hpp
index 1b79fbf9eda..e4bdedf6603 100644
--- a/cpp/src/interop/arrow_utilities.hpp
+++ b/cpp/src/interop/arrow_utilities.hpp
@@ -17,7 +17,6 @@
 #pragma once
 
 #include <cudf/types.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp
index a1be6aade4e..4395b741e53 100644
--- a/cpp/src/interop/dlpack.cpp
+++ b/cpp/src/interop/dlpack.cpp
@@ -16,11 +16,8 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/interop.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/lists/list_view.hpp>
-#include <cudf/structs/struct_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_checks.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
diff --git a/cpp/src/io/avro/avro.cpp b/cpp/src/io/avro/avro.cpp
index d5caa4720ac..b3fcca62314 100644
--- a/cpp/src/io/avro/avro.cpp
+++ b/cpp/src/io/avro/avro.cpp
@@ -17,7 +17,6 @@
 #include "avro.hpp"
 
 #include <array>
-#include <cstring>
 #include <unordered_map>
 
 namespace cudf {
diff --git a/cpp/src/io/avro/avro.hpp b/cpp/src/io/avro/avro.hpp
index 2e992546ccc..fd2c781b8a1 100644
--- a/cpp/src/io/avro/avro.hpp
+++ b/cpp/src/io/avro/avro.hpp
@@ -18,11 +18,9 @@
 
 #include "avro_common.hpp"
 
-#include <algorithm>
 #include <array>
 #include <cstddef>
 #include <cstdint>
-#include <cstdio>
 #include <cstring>
 #include <map>
 #include <string>
diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp b/cpp/src/io/comp/nvcomp_adapter.hpp
index 583bd6a3523..2e1cda2d6b7 100644
--- a/cpp/src/io/comp/nvcomp_adapter.hpp
+++ b/cpp/src/io/comp/nvcomp_adapter.hpp
@@ -18,9 +18,7 @@
 
 #include "gpuinflate.hpp"
 
-#include <cudf/io/config_utils.hpp>
 #include <cudf/io/nvcomp_adapter.hpp>
-#include <cudf/utilities/error.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp
index d4d6f46b99a..fb8c308065d 100644
--- a/cpp/src/io/comp/uncomp.cpp
+++ b/cpp/src/io/comp/uncomp.cpp
@@ -24,8 +24,6 @@
 #include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
-#include <cuda_runtime.h>
-
 #include <zlib.h>  // uncompress
 
 #include <cstring>  // memset
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index a8682e6a760..ceaeb5d8f85 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -32,10 +32,8 @@
 #include <cudf/io/orc_metadata.hpp>
 #include <cudf/io/parquet.hpp>
 #include <cudf/io/parquet_metadata.hpp>
-#include <cudf/table/table.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 
 #include <algorithm>
 #include <utility>
diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu
index d06338c6f69..570a00cbfc2 100644
--- a/cpp/src/io/json/host_tree_algorithms.cu
+++ b/cpp/src/io/json/host_tree_algorithms.cu
@@ -24,6 +24,7 @@
 #include <cudf/detail/utilities/visitor_overload.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp
index f6be4539d7f..7b3b04dea16 100644
--- a/cpp/src/io/json/nested_json.hpp
+++ b/cpp/src/io/json/nested_json.hpp
@@ -19,10 +19,7 @@
 #include <cudf/io/detail/tokenize_json.hpp>
 #include <cudf/io/types.hpp>
 #include <cudf/types.hpp>
-#include <cudf/utilities/bit.hpp>
-#include <cudf/utilities/error.hpp>
 #include <cudf/utilities/export.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 
 #include <map>
 #include <vector>
diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu
index c42348a165f..0081ed30d17 100644
--- a/cpp/src/io/orc/reader_impl_decode.cu
+++ b/cpp/src/io/orc/reader_impl_decode.cu
@@ -23,6 +23,7 @@
 
 #include <cudf/detail/copy.hpp>
 #include <cudf/detail/device_scalar.hpp>
+#include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/transform.hpp>
 #include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
diff --git a/cpp/src/io/orc/reader_impl_helpers.cpp b/cpp/src/io/orc/reader_impl_helpers.cpp
index 4c1079cffe8..7e5db4b7617 100644
--- a/cpp/src/io/orc/reader_impl_helpers.cpp
+++ b/cpp/src/io/orc/reader_impl_helpers.cpp
@@ -16,8 +16,6 @@
 
 #include "reader_impl_helpers.hpp"
 
-#include <cudf/utilities/memory_resource.hpp>
-
 namespace cudf::io::orc::detail {
 
 std::unique_ptr<column> create_empty_column(size_type orc_col_id,
diff --git a/cpp/src/io/orc/reader_impl_helpers.hpp b/cpp/src/io/orc/reader_impl_helpers.hpp
index 5528b2ee763..4cded30d89b 100644
--- a/cpp/src/io/orc/reader_impl_helpers.hpp
+++ b/cpp/src/io/orc/reader_impl_helpers.hpp
@@ -20,9 +20,6 @@
 #include "io/orc/orc.hpp"
 #include "io/utilities/column_buffer.hpp"
 
-#include <cudf/io/orc.hpp>
-#include <cudf/utilities/memory_resource.hpp>
-
 #include <rmm/cuda_stream_view.hpp>
 
 #include <memory>
diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp
index ddf65e9020f..d15435b2553 100644
--- a/cpp/src/io/parquet/arrow_schema_writer.cpp
+++ b/cpp/src/io/parquet/arrow_schema_writer.cpp
@@ -27,7 +27,6 @@
 #include "ipc/Schema_generated.h"
 #include "writer_impl_helpers.hpp"
 
-#include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp
index 9bc435bf6c8..66810ee163a 100644
--- a/cpp/src/io/parquet/arrow_schema_writer.hpp
+++ b/cpp/src/io/parquet/arrow_schema_writer.hpp
@@ -22,10 +22,9 @@
 #pragma once
 
 #include <cudf/detail/utilities/linked_column.hpp>
-#include <cudf/io/data_sink.hpp>
-#include <cudf/io/detail/parquet.hpp>
+#include <cudf/io/detail/utils.hpp>
+#include <cudf/io/types.hpp>
 #include <cudf/strings/detail/utilities.hpp>
-#include <cudf/types.hpp>
 
 namespace cudf::io::parquet::detail {
 
diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp
index 12c24e2b848..b87f2e9c692 100644
--- a/cpp/src/io/parquet/compact_protocol_reader.hpp
+++ b/cpp/src/io/parquet/compact_protocol_reader.hpp
@@ -22,10 +22,7 @@
 
 #include <algorithm>
 #include <cstddef>
-#include <optional>
-#include <string>
 #include <utility>
-#include <vector>
 
 namespace CUDF_EXPORT cudf {
 namespace io::parquet::detail {
diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp b/cpp/src/io/parquet/compact_protocol_writer.hpp
index d4778b1ea15..05859d60c03 100644
--- a/cpp/src/io/parquet/compact_protocol_writer.hpp
+++ b/cpp/src/io/parquet/compact_protocol_writer.hpp
@@ -17,7 +17,6 @@
 #pragma once
 
 #include "parquet.hpp"
-#include "parquet_common.hpp"
 
 #include <algorithm>
 #include <cstddef>
diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp
index 32e922b04bb..a965f3325d5 100644
--- a/cpp/src/io/parquet/predicate_pushdown.cpp
+++ b/cpp/src/io/parquet/predicate_pushdown.cpp
@@ -23,7 +23,6 @@
 #include <cudf/detail/transform.hpp>
 #include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
-#include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
diff --git a/cpp/src/io/parquet/reader.cpp b/cpp/src/io/parquet/reader.cpp
index dd354b905f3..170c6e8857f 100644
--- a/cpp/src/io/parquet/reader.cpp
+++ b/cpp/src/io/parquet/reader.cpp
@@ -16,8 +16,6 @@
 
 #include "reader_impl.hpp"
 
-#include <cudf/utilities/memory_resource.hpp>
-
 namespace cudf::io::parquet::detail {
 
 reader::reader() = default;
diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp
index 0705ff6f5cc..fed1a309064 100644
--- a/cpp/src/io/parquet/reader_impl.cpp
+++ b/cpp/src/io/parquet/reader_impl.cpp
@@ -21,11 +21,9 @@
 #include <cudf/detail/stream_compaction.hpp>
 #include <cudf/detail/transform.hpp>
 #include <cudf/detail/utilities/stream_pool.hpp>
-#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/strings/detail/utilities.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 
-#include <thrust/binary_search.h>
 #include <thrust/iterator/counting_iterator.h>
 
 #include <bitset>
diff --git a/cpp/src/io/text/data_chunk_source_factories.cpp b/cpp/src/io/text/data_chunk_source_factories.cpp
index 4baea8655e0..f4a2f29026a 100644
--- a/cpp/src/io/text/data_chunk_source_factories.cpp
+++ b/cpp/src/io/text/data_chunk_source_factories.cpp
@@ -22,10 +22,6 @@
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/io/text/data_chunk_source_factories.hpp>
 
-#include <rmm/device_buffer.hpp>
-
-#include <thrust/host_vector.h>
-
 #include <fstream>
 
 namespace cudf::io::text {
diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp
index 249dc3b5875..6d954753af8 100644
--- a/cpp/src/io/utilities/column_buffer.cpp
+++ b/cpp/src/io/utilities/column_buffer.cpp
@@ -21,12 +21,12 @@
 
 #include "column_buffer.hpp"
 
+#include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 
-#include <iomanip>
 #include <sstream>
 
 namespace cudf::io::detail {
diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp
index e73b2bc88de..31c8b781e77 100644
--- a/cpp/src/io/utilities/column_buffer.hpp
+++ b/cpp/src/io/utilities/column_buffer.hpp
@@ -22,12 +22,9 @@
 #pragma once
 
 #include <cudf/column/column_factories.hpp>
-#include <cudf/detail/null_mask.hpp>
 #include <cudf/io/types.hpp>
-#include <cudf/null_mask.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/memory_resource.hpp>
-#include <cudf/utilities/traits.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
@@ -35,6 +32,8 @@
 
 #include <thrust/pair.h>
 
+#include <optional>
+
 namespace cudf {
 namespace io {
 namespace detail {
diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp
index 813743fa7b4..b66742569d9 100644
--- a/cpp/src/io/utilities/config_utils.cpp
+++ b/cpp/src/io/utilities/config_utils.cpp
@@ -16,14 +16,10 @@
 
 #include "getenv_or.hpp"
 
-#include <cudf/detail/utilities/logger.hpp>
 #include <cudf/utilities/error.hpp>
 
 #include <kvikio/defaults.hpp>
 
-#include <cstdlib>
-#include <mutex>
-#include <sstream>
 #include <string>
 
 namespace cudf::io {
diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp
index 4e8908a8942..9668b30e9a9 100644
--- a/cpp/src/io/utilities/datasource.cpp
+++ b/cpp/src/io/utilities/datasource.cpp
@@ -33,7 +33,6 @@
 #include <sys/mman.h>
 #include <unistd.h>
 
-#include <unordered_map>
 #include <vector>
 
 namespace cudf {
diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp
index 98ed9b28f0a..93cdccfbb9f 100644
--- a/cpp/src/io/utilities/file_io_utilities.cpp
+++ b/cpp/src/io/utilities/file_io_utilities.cpp
@@ -22,8 +22,6 @@
 #include <cudf/detail/utilities/logger.hpp>
 #include <cudf/io/config_utils.hpp>
 
-#include <rmm/device_buffer.hpp>
-
 #include <dlfcn.h>
 
 #include <cerrno>
diff --git a/cpp/src/io/utilities/row_selection.cpp b/cpp/src/io/utilities/row_selection.cpp
index c0bbca39167..cf252fe63af 100644
--- a/cpp/src/io/utilities/row_selection.cpp
+++ b/cpp/src/io/utilities/row_selection.cpp
@@ -16,10 +16,7 @@
 
 #include "io/utilities/row_selection.hpp"
 
-#include <cudf/utilities/error.hpp>
-
 #include <algorithm>
-#include <limits>
 
 namespace cudf::io::detail {
 
diff --git a/cpp/src/io/utilities/row_selection.hpp b/cpp/src/io/utilities/row_selection.hpp
index 7c607099cdc..e826feff201 100644
--- a/cpp/src/io/utilities/row_selection.hpp
+++ b/cpp/src/io/utilities/row_selection.hpp
@@ -15,7 +15,7 @@
  */
 #pragma once
 
-#include <cudf/types.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <cstdint>
 #include <optional>
diff --git a/cpp/src/jit/cache.cpp b/cpp/src/jit/cache.cpp
index 89c47d246d0..34a0bdce124 100644
--- a/cpp/src/jit/cache.cpp
+++ b/cpp/src/jit/cache.cpp
@@ -16,11 +16,8 @@
 
 #include <cudf/utilities/error.hpp>
 
-#include <cuda.h>
-
 #include <jitify2.hpp>
 
-#include <cstddef>
 #include <filesystem>
 
 namespace cudf {
diff --git a/cpp/src/jit/util.cpp b/cpp/src/jit/util.cpp
index 0585e02a031..d9a29203133 100644
--- a/cpp/src/jit/util.cpp
+++ b/cpp/src/jit/util.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,8 +19,6 @@
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
-#include <string>
-
 namespace cudf {
 namespace jit {
 struct get_data_ptr_functor {
diff --git a/cpp/src/quantiles/tdigest/tdigest_column_view.cpp b/cpp/src/quantiles/tdigest/tdigest_column_view.cpp
index a9f86ac1b5f..17844b6bb0a 100644
--- a/cpp/src/quantiles/tdigest/tdigest_column_view.cpp
+++ b/cpp/src/quantiles/tdigest/tdigest_column_view.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,6 @@
  * limitations under the License.
  */
 
-#include <cudf/detail/tdigest/tdigest.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/structs/structs_column_view.hpp>
 #include <cudf/tdigest/tdigest_column_view.hpp>
diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp
index d187375b69f..75ebc078930 100644
--- a/cpp/src/reductions/reductions.cpp
+++ b/cpp/src/reductions/reductions.cpp
@@ -26,8 +26,6 @@
 #include <cudf/reduction/detail/histogram.hpp>
 #include <cudf/reduction/detail/reduction_functions.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
-#include <cudf/structs/structs_column_view.hpp>
-#include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_checks.hpp>
diff --git a/cpp/src/reductions/scan/scan.cpp b/cpp/src/reductions/scan/scan.cpp
index d3c0b54f286..b91ae19b51a 100644
--- a/cpp/src/reductions/scan/scan.cpp
+++ b/cpp/src/reductions/scan/scan.cpp
@@ -14,13 +14,10 @@
  * limitations under the License.
  */
 
-#include <cudf/column/column_view.hpp>
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/scan.hpp>
 #include <cudf/reduction.hpp>
-#include <cudf/utilities/default_stream.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 
 namespace cudf {
 
diff --git a/cpp/src/reductions/segmented/reductions.cpp b/cpp/src/reductions/segmented/reductions.cpp
index 40d1d8a0a53..c4f6c135dde 100644
--- a/cpp/src/reductions/segmented/reductions.cpp
+++ b/cpp/src/reductions/segmented/reductions.cpp
@@ -13,16 +13,12 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <cudf/column/column.hpp>
-#include <cudf/copying.hpp>
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/reduction.hpp>
 #include <cudf/reduction/detail/segmented_reduction_functions.hpp>
 #include <cudf/types.hpp>
-#include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
diff --git a/cpp/src/rolling/detail/optimized_unbounded_window.cpp b/cpp/src/rolling/detail/optimized_unbounded_window.cpp
index 72c23395a93..7cad31c0658 100644
--- a/cpp/src/rolling/detail/optimized_unbounded_window.cpp
+++ b/cpp/src/rolling/detail/optimized_unbounded_window.cpp
@@ -18,13 +18,10 @@
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/detail/gather.hpp>
 #include <cudf/detail/groupby/sort_helper.hpp>
-#include <cudf/detail/utilities/assert.cuh>
 #include <cudf/groupby.hpp>
 #include <cudf/reduction/detail/reduction.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/types.hpp>
-#include <cudf/unary.hpp>
-#include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 
 namespace cudf::detail {
diff --git a/cpp/src/rolling/detail/range_window_bounds.hpp b/cpp/src/rolling/detail/range_window_bounds.hpp
index 8a53e937f98..77cb2a8c7f5 100644
--- a/cpp/src/rolling/detail/range_window_bounds.hpp
+++ b/cpp/src/rolling/detail/range_window_bounds.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,10 +16,7 @@
 #pragma once
 
 #include <cudf/rolling/range_window_bounds.hpp>
-#include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/types.hpp>
-#include <cudf/utilities/default_stream.hpp>
-#include <cudf/wrappers/durations.hpp>
 
 namespace cudf {
 namespace detail {
diff --git a/cpp/src/rolling/range_window_bounds.cpp b/cpp/src/rolling/range_window_bounds.cpp
index 69792136c64..7f698dfcd6b 100644
--- a/cpp/src/rolling/range_window_bounds.cpp
+++ b/cpp/src/rolling/range_window_bounds.cpp
@@ -19,7 +19,6 @@
 #include <cudf/rolling/range_window_bounds.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/types.hpp>
-#include <cudf/wrappers/durations.hpp>
 
 namespace cudf {
 namespace {
diff --git a/cpp/src/scalar/scalar.cpp b/cpp/src/scalar/scalar.cpp
index 31535198c58..4ec2174a96f 100644
--- a/cpp/src/scalar/scalar.cpp
+++ b/cpp/src/scalar/scalar.cpp
@@ -26,8 +26,6 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
 
-#include <thrust/iterator/counting_iterator.h>
-
 #include <string>
 
 namespace cudf {
diff --git a/cpp/src/scalar/scalar_factories.cpp b/cpp/src/scalar/scalar_factories.cpp
index 656fe61fbbe..9f242bdffe0 100644
--- a/cpp/src/scalar/scalar_factories.cpp
+++ b/cpp/src/scalar/scalar_factories.cpp
@@ -16,10 +16,8 @@
 
 #include <cudf/detail/copy.hpp>
 #include <cudf/lists/lists_column_view.hpp>
-#include <cudf/null_mask.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/utilities/error.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
diff --git a/cpp/src/strings/regex/regexec.cpp b/cpp/src/strings/regex/regexec.cpp
index d1990733e81..60ad714dfec 100644
--- a/cpp/src/strings/regex/regexec.cpp
+++ b/cpp/src/strings/regex/regexec.cpp
@@ -24,7 +24,6 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
 
-#include <algorithm>
 #include <functional>
 #include <numeric>
 
diff --git a/cpp/src/strings/strings_scalar_factories.cpp b/cpp/src/strings/strings_scalar_factories.cpp
index 219d1174d42..1cc405234b2 100644
--- a/cpp/src/strings/strings_scalar_factories.cpp
+++ b/cpp/src/strings/strings_scalar_factories.cpp
@@ -16,7 +16,6 @@
 
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
diff --git a/cpp/src/structs/structs_column_view.cpp b/cpp/src/structs/structs_column_view.cpp
index b0284e9cb96..e14142a9ad1 100644
--- a/cpp/src/structs/structs_column_view.cpp
+++ b/cpp/src/structs/structs_column_view.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,6 @@
  * limitations under the License.
  */
 
-#include <cudf/column/column.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/structs/structs_column_view.hpp>
 #include <cudf/types.hpp>
diff --git a/cpp/src/structs/utilities.cpp b/cpp/src/structs/utilities.cpp
index 5df9943303d..4012ee3d21c 100644
--- a/cpp/src/structs/utilities.cpp
+++ b/cpp/src/structs/utilities.cpp
@@ -21,13 +21,10 @@
 #include <cudf/detail/structs/utilities.hpp>
 #include <cudf/detail/unary.hpp>
 #include <cudf/structs/structs_column_view.hpp>
-#include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
-#include <cudf/utilities/traits.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
diff --git a/cpp/src/table/table.cpp b/cpp/src/table/table.cpp
index cb707c94288..41c64c6decb 100644
--- a/cpp/src/table/table.cpp
+++ b/cpp/src/table/table.cpp
@@ -18,7 +18,6 @@
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/error.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
diff --git a/cpp/src/table/table_view.cpp b/cpp/src/table/table_view.cpp
index 8a5340dc20d..659beb749af 100644
--- a/cpp/src/table/table_view.cpp
+++ b/cpp/src/table/table_view.cpp
@@ -20,10 +20,7 @@
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/type_checks.hpp>
 
-#include <thrust/iterator/counting_iterator.h>
-
 #include <algorithm>
-#include <cassert>
 #include <vector>
 
 namespace cudf {
diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index 52b96bc9039..b919ac16956 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -23,8 +23,6 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/transform.hpp>
 #include <cudf/null_mask.hpp>
-#include <cudf/utilities/default_stream.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
diff --git a/cpp/src/utilities/cuda.cpp b/cpp/src/utilities/cuda.cpp
index 53ca0608170..d979bda41d0 100644
--- a/cpp/src/utilities/cuda.cpp
+++ b/cpp/src/utilities/cuda.cpp
@@ -18,8 +18,6 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
 
-#include <cuda_runtime.h>
-
 namespace cudf::detail {
 
 cudf::size_type num_multiprocessors()
diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp
index 9d8e3cf2fa6..e30806a5011 100644
--- a/cpp/src/utilities/host_memory.cpp
+++ b/cpp/src/utilities/host_memory.cpp
@@ -18,7 +18,6 @@
 #include <cudf/detail/utilities/stream_pool.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/export.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/pinned_memory.hpp>
 
 #include <rmm/cuda_device.hpp>
diff --git a/cpp/src/utilities/prefetch.cpp b/cpp/src/utilities/prefetch.cpp
index 58971552758..000526723c4 100644
--- a/cpp/src/utilities/prefetch.cpp
+++ b/cpp/src/utilities/prefetch.cpp
@@ -14,7 +14,6 @@
  * limitations under the License.
  */
 
-#include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/prefetch.hpp>
 
diff --git a/cpp/src/utilities/stream_pool.cpp b/cpp/src/utilities/stream_pool.cpp
index 8c29182bfb5..7069b59be26 100644
--- a/cpp/src/utilities/stream_pool.cpp
+++ b/cpp/src/utilities/stream_pool.cpp
@@ -23,7 +23,6 @@
 
 #include <algorithm>
 #include <cstddef>
-#include <memory>
 #include <mutex>
 #include <vector>
 
diff --git a/cpp/src/utilities/traits.cpp b/cpp/src/utilities/traits.cpp
index a68dc84e340..c1e71f5f8f9 100644
--- a/cpp/src/utilities/traits.cpp
+++ b/cpp/src/utilities/traits.cpp
@@ -19,8 +19,6 @@
 #include <cudf/utilities/type_dispatcher.hpp>
 #include <cudf/wrappers/dictionary.hpp>
 
-#include <cuda_runtime.h>
-
 namespace cudf {
 
 namespace {
diff --git a/cpp/src/utilities/type_checks.cpp b/cpp/src/utilities/type_checks.cpp
index 3095b342748..84c8529641d 100644
--- a/cpp/src/utilities/type_checks.cpp
+++ b/cpp/src/utilities/type_checks.cpp
@@ -21,8 +21,6 @@
 #include <cudf/utilities/type_checks.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
-#include <thrust/iterator/counting_iterator.h>
-
 #include <algorithm>
 
 namespace cudf {
diff --git a/cpp/tests/ast/transform_tests.cpp b/cpp/tests/ast/transform_tests.cpp
index a4bde50a21e..7af88d8aa34 100644
--- a/cpp/tests/ast/transform_tests.cpp
+++ b/cpp/tests/ast/transform_tests.cpp
@@ -18,7 +18,6 @@
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/iterator_utilities.hpp>
-#include <cudf_test/table_utilities.hpp>
 #include <cudf_test/testing_main.hpp>
 
 #include <cudf/ast/expressions.hpp>
@@ -26,14 +25,8 @@
 #include <cudf/column/column_view.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/scalar/scalar.hpp>
-#include <cudf/scalar/scalar_device_view.cuh>
-#include <cudf/scalar/scalar_factories.hpp>
-#include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/transform.hpp>
-#include <cudf/types.hpp>
-
-#include <rmm/device_uvector.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 
@@ -41,7 +34,6 @@
 #include <limits>
 #include <list>
 #include <random>
-#include <type_traits>
 #include <vector>
 
 template <typename T>
diff --git a/cpp/tests/binaryop/binop-compiled-test.cpp b/cpp/tests/binaryop/binop-compiled-test.cpp
index aa5b49567e6..3bd67001c16 100644
--- a/cpp/tests/binaryop/binop-compiled-test.cpp
+++ b/cpp/tests/binaryop/binop-compiled-test.cpp
@@ -26,9 +26,7 @@
 #include <cudf/binaryop.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/fixed_point/fixed_point.hpp>
-#include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/types.hpp>
-#include <cudf/unary.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 
diff --git a/cpp/tests/binaryop/binop-generic-ptx-test.cpp b/cpp/tests/binaryop/binop-generic-ptx-test.cpp
index 03cc87a1968..e9a2761db4a 100644
--- a/cpp/tests/binaryop/binop-generic-ptx-test.cpp
+++ b/cpp/tests/binaryop/binop-generic-ptx-test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Copyright 2018-2019 BlazingDB, Inc.
  *     Copyright 2018 Christian Noboa Mardini <christian@blazingdb.com>
@@ -19,7 +19,6 @@
 
 #include <tests/binaryop/assert-binops.h>
 #include <tests/binaryop/binop-fixture.hpp>
-#include <tests/binaryop/util/operation.h>
 #include <tests/binaryop/util/runtime_support.h>
 
 #include <cudf/binaryop.hpp>
diff --git a/cpp/tests/bitmask/bitmask_tests.cpp b/cpp/tests/bitmask/bitmask_tests.cpp
index fe221fb1c48..799bf646e52 100644
--- a/cpp/tests/bitmask/bitmask_tests.cpp
+++ b/cpp/tests/bitmask/bitmask_tests.cpp
@@ -16,7 +16,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/random.hpp>
 #include <cudf_test/testing_main.hpp>
 
diff --git a/cpp/tests/column/bit_cast_test.cpp b/cpp/tests/column/bit_cast_test.cpp
index ab230ab036e..5570a7d498c 100644
--- a/cpp/tests/column/bit_cast_test.cpp
+++ b/cpp/tests/column/bit_cast_test.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column_view.hpp>
@@ -26,8 +25,6 @@
 
 #include <thrust/iterator/counting_iterator.h>
 
-#include <random>
-
 template <typename T, typename T2 = void>
 struct rep_type_impl {
   using type = void;
diff --git a/cpp/tests/column/column_test.cpp b/cpp/tests/column/column_test.cpp
index 631f5150829..d700adaebd5 100644
--- a/cpp/tests/column/column_test.cpp
+++ b/cpp/tests/column/column_test.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_list_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
diff --git a/cpp/tests/column/column_view_device_span_test.cpp b/cpp/tests/column/column_view_device_span_test.cpp
index 6de9121158b..470437f4112 100644
--- a/cpp/tests/column/column_view_device_span_test.cpp
+++ b/cpp/tests/column/column_view_device_span_test.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column_view.hpp>
diff --git a/cpp/tests/column/column_view_shallow_test.cpp b/cpp/tests/column/column_view_shallow_test.cpp
index 37ab4b8f387..ad344476332 100644
--- a/cpp/tests/column/column_view_shallow_test.cpp
+++ b/cpp/tests/column/column_view_shallow_test.cpp
@@ -15,9 +15,7 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column_view.hpp>
diff --git a/cpp/tests/column/factories_test.cpp b/cpp/tests/column/factories_test.cpp
index 603187f0330..aa9d508b6aa 100644
--- a/cpp/tests/column/factories_test.cpp
+++ b/cpp/tests/column/factories_test.cpp
@@ -26,11 +26,8 @@
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/strings/utilities.hpp>
 #include <cudf/types.hpp>
-#include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
-#include <rmm/cuda_stream_view.hpp>
-
 #include <thrust/iterator/counting_iterator.h>
 
 class ColumnFactoryTest : public cudf::test::BaseFixture {
diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp
index 18140c34abd..aedc498964a 100644
--- a/cpp/tests/copying/concatenate_tests.cpp
+++ b/cpp/tests/copying/concatenate_tests.cpp
@@ -34,8 +34,6 @@
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 
-#include <thrust/iterator/constant_iterator.h>
-
 #include <numeric>
 #include <stdexcept>
 #include <string>
diff --git a/cpp/tests/copying/copy_if_else_nested_tests.cpp b/cpp/tests/copying/copy_if_else_nested_tests.cpp
index cfbd181f944..e1cdfe9beed 100644
--- a/cpp/tests/copying/copy_if_else_nested_tests.cpp
+++ b/cpp/tests/copying/copy_if_else_nested_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
diff --git a/cpp/tests/copying/copy_range_tests.cpp b/cpp/tests/copying/copy_range_tests.cpp
index 25d93da277b..e2133a546e4 100644
--- a/cpp/tests/copying/copy_range_tests.cpp
+++ b/cpp/tests/copying/copy_range_tests.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column.hpp>
diff --git a/cpp/tests/copying/copy_tests.cpp b/cpp/tests/copying/copy_tests.cpp
index 4124f749012..9c00725d5d2 100644
--- a/cpp/tests/copying/copy_tests.cpp
+++ b/cpp/tests/copying/copy_tests.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
diff --git a/cpp/tests/copying/gather_list_tests.cpp b/cpp/tests/copying/gather_list_tests.cpp
index 247090aac90..93f71345c5c 100644
--- a/cpp/tests/copying/gather_list_tests.cpp
+++ b/cpp/tests/copying/gather_list_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,8 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
-#include <cudf_test/table_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column_view.hpp>
diff --git a/cpp/tests/copying/gather_str_tests.cpp b/cpp/tests/copying/gather_str_tests.cpp
index 28098878086..795e3f30aa1 100644
--- a/cpp/tests/copying/gather_str_tests.cpp
+++ b/cpp/tests/copying/gather_str_tests.cpp
@@ -16,7 +16,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/table_utilities.hpp>
 
 #include <cudf/column/column_view.hpp>
diff --git a/cpp/tests/copying/gather_struct_tests.cpp b/cpp/tests/copying/gather_struct_tests.cpp
index 1598ab2646a..b2c0f7acc3a 100644
--- a/cpp/tests/copying/gather_struct_tests.cpp
+++ b/cpp/tests/copying/gather_struct_tests.cpp
@@ -17,20 +17,15 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/copying.hpp>
 #include <cudf/detail/iterator.cuh>
-#include <cudf/lists/lists_column_view.hpp>
-#include <cudf/null_mask.hpp>
-#include <cudf/structs/structs_column_view.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
-#include <cudf/utilities/error.hpp>
 
 #include <rmm/device_buffer.hpp>
 
diff --git a/cpp/tests/copying/gather_tests.cpp b/cpp/tests/copying/gather_tests.cpp
index 07ce672b14d..908dcd67673 100644
--- a/cpp/tests/copying/gather_tests.cpp
+++ b/cpp/tests/copying/gather_tests.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/random.hpp>
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
diff --git a/cpp/tests/copying/get_value_tests.cpp b/cpp/tests/copying/get_value_tests.cpp
index 90ff97e7355..b2d64dac7c8 100644
--- a/cpp/tests/copying/get_value_tests.cpp
+++ b/cpp/tests/copying/get_value_tests.cpp
@@ -16,10 +16,8 @@
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/table_utilities.hpp>
-#include <cudf_test/type_list_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column_factories.hpp>
diff --git a/cpp/tests/copying/purge_nonempty_nulls_tests.cpp b/cpp/tests/copying/purge_nonempty_nulls_tests.cpp
index 4f28ff12941..1f76efdc4c3 100644
--- a/cpp/tests/copying/purge_nonempty_nulls_tests.cpp
+++ b/cpp/tests/copying/purge_nonempty_nulls_tests.cpp
@@ -16,13 +16,10 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 
 #include <cudf/column/column_view.hpp>
 #include <cudf/copying.hpp>
-#include <cudf/detail/gather.hpp>
-#include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/strings/strings_column_view.hpp>
diff --git a/cpp/tests/copying/reverse_tests.cpp b/cpp/tests/copying/reverse_tests.cpp
index e4b2d319ddf..46516436901 100644
--- a/cpp/tests/copying/reverse_tests.cpp
+++ b/cpp/tests/copying/reverse_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,17 +17,13 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/copying.hpp>
 #include <cudf/detail/iterator.cuh>
-#include <cudf/scalar/scalar.hpp>
-#include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 
-#include <thrust/execution_policy.h>
 #include <thrust/host_vector.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/tabulate.h>
diff --git a/cpp/tests/copying/sample_tests.cpp b/cpp/tests/copying/sample_tests.cpp
index 2f76e3f1fcd..8be5d8c1fbb 100644
--- a/cpp/tests/copying/sample_tests.cpp
+++ b/cpp/tests/copying/sample_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,12 +15,9 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/table_utilities.hpp>
 
-#include <cudf/column/column.hpp>
 #include <cudf/copying.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/sorting.hpp>
diff --git a/cpp/tests/copying/scatter_list_scalar_tests.cpp b/cpp/tests/copying/scatter_list_scalar_tests.cpp
index 42d2e004d6b..23faa6e5b86 100644
--- a/cpp/tests/copying/scatter_list_scalar_tests.cpp
+++ b/cpp/tests/copying/scatter_list_scalar_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,7 +21,6 @@
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/null_mask.hpp>
-#include <cudf/scalar/scalar_factories.hpp>
 
 using mask_vector = std::vector<cudf::valid_type>;
 using size_column = cudf::test::fixed_width_column_wrapper<cudf::size_type>;
diff --git a/cpp/tests/copying/scatter_list_tests.cpp b/cpp/tests/copying/scatter_list_tests.cpp
index a82860a3eec..1f87fcfcc99 100644
--- a/cpp/tests/copying/scatter_list_tests.cpp
+++ b/cpp/tests/copying/scatter_list_tests.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column_view.hpp>
diff --git a/cpp/tests/copying/scatter_struct_scalar_tests.cpp b/cpp/tests/copying/scatter_struct_scalar_tests.cpp
index 78572b0bb37..1d1da8a1b1e 100644
--- a/cpp/tests/copying/scatter_struct_scalar_tests.cpp
+++ b/cpp/tests/copying/scatter_struct_scalar_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,7 +19,6 @@
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/column/column_factories.hpp>
 #include <cudf/copying.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
diff --git a/cpp/tests/copying/scatter_struct_tests.cpp b/cpp/tests/copying/scatter_struct_tests.cpp
index c92244d047b..7d88e9af85f 100644
--- a/cpp/tests/copying/scatter_struct_tests.cpp
+++ b/cpp/tests/copying/scatter_struct_tests.cpp
@@ -21,7 +21,6 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/copying.hpp>
-#include <cudf/lists/lists_column_view.hpp>
 #include <cudf/table/table_view.hpp>
 
 using namespace cudf::test::iterators;
diff --git a/cpp/tests/copying/scatter_tests.cpp b/cpp/tests/copying/scatter_tests.cpp
index 41a753cd0ac..74c04446bdd 100644
--- a/cpp/tests/copying/scatter_tests.cpp
+++ b/cpp/tests/copying/scatter_tests.cpp
@@ -15,7 +15,6 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
@@ -23,7 +22,6 @@
 #include <cudf/copying.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/scalar/scalar_factories.hpp>
-#include <cudf/stream_compaction.hpp>
 
 #include <stdexcept>
 
diff --git a/cpp/tests/copying/segmented_gather_list_tests.cpp b/cpp/tests/copying/segmented_gather_list_tests.cpp
index 8881fb344a2..a133ae43872 100644
--- a/cpp/tests/copying/segmented_gather_list_tests.cpp
+++ b/cpp/tests/copying/segmented_gather_list_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,7 +22,6 @@
 #include <cudf/column/column_view.hpp>
 #include <cudf/copying.hpp>
 #include <cudf/detail/iterator.cuh>
-#include <cudf/detail/null_mask.hpp>
 #include <cudf/lists/gather.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 
diff --git a/cpp/tests/copying/shift_tests.cpp b/cpp/tests/copying/shift_tests.cpp
index ff6808d9a79..72a8e7357bc 100644
--- a/cpp/tests/copying/shift_tests.cpp
+++ b/cpp/tests/copying/shift_tests.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/copying.hpp>
@@ -30,7 +29,6 @@
 
 #include <limits>
 #include <memory>
-#include <stdexcept>
 
 using TestTypes = cudf::test::Types<int32_t>;
 
diff --git a/cpp/tests/copying/slice_tests.cpp b/cpp/tests/copying/slice_tests.cpp
index aef0d4ad78a..3868a147fa8 100644
--- a/cpp/tests/copying/slice_tests.cpp
+++ b/cpp/tests/copying/slice_tests.cpp
@@ -22,12 +22,8 @@
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/column/column_factories.hpp>
 #include <cudf/copying.hpp>
 #include <cudf/detail/iterator.cuh>
-#include <cudf/strings/strings_column_view.hpp>
-#include <cudf/utilities/type_dispatcher.hpp>
-#include <cudf/wrappers/timestamps.hpp>
 
 #include <array>
 #include <stdexcept>
diff --git a/cpp/tests/copying/utility_tests.cpp b/cpp/tests/copying/utility_tests.cpp
index 0905f9babdc..90457f8d74c 100644
--- a/cpp/tests/copying/utility_tests.cpp
+++ b/cpp/tests/copying/utility_tests.cpp
@@ -23,7 +23,6 @@
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/copying.hpp>
-#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp
index 603edb27c7c..44f99adc0e9 100644
--- a/cpp/tests/datetime/datetime_ops_test.cpp
+++ b/cpp/tests/datetime/datetime_ops_test.cpp
@@ -23,14 +23,11 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column_factories.hpp>
-#include <cudf/column/column_view.hpp>
 #include <cudf/datetime.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/types.hpp>
 #include <cudf/wrappers/timestamps.hpp>
 
-#include <thrust/transform.h>
-
 #define XXX false  // stub for null values
 
 constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS};
diff --git a/cpp/tests/dictionary/add_keys_test.cpp b/cpp/tests/dictionary/add_keys_test.cpp
index 46bf5468922..ebc8c11e86c 100644
--- a/cpp/tests/dictionary/add_keys_test.cpp
+++ b/cpp/tests/dictionary/add_keys_test.cpp
@@ -24,8 +24,6 @@
 #include <cudf/dictionary/update_keys.hpp>
 #include <cudf/utilities/error.hpp>
 
-#include <vector>
-
 struct DictionaryAddKeysTest : public cudf::test::BaseFixture {};
 
 TEST_F(DictionaryAddKeysTest, StringsColumn)
diff --git a/cpp/tests/dictionary/encode_test.cpp b/cpp/tests/dictionary/encode_test.cpp
index 5db0e9fa1e4..dfa3ede5d46 100644
--- a/cpp/tests/dictionary/encode_test.cpp
+++ b/cpp/tests/dictionary/encode_test.cpp
@@ -21,8 +21,6 @@
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/dictionary/encode.hpp>
 
-#include <vector>
-
 struct DictionaryEncodeTest : public cudf::test::BaseFixture {};
 
 TEST_F(DictionaryEncodeTest, EncodeStringColumn)
diff --git a/cpp/tests/dictionary/fill_test.cpp b/cpp/tests/dictionary/fill_test.cpp
index 18696b66e48..bc7d19201aa 100644
--- a/cpp/tests/dictionary/fill_test.cpp
+++ b/cpp/tests/dictionary/fill_test.cpp
@@ -18,13 +18,10 @@
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 
-#include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/dictionary/encode.hpp>
 #include <cudf/filling.hpp>
 #include <cudf/scalar/scalar.hpp>
 
-#include <vector>
-
 struct DictionaryFillTest : public cudf::test::BaseFixture {};
 
 TEST_F(DictionaryFillTest, StringsColumn)
diff --git a/cpp/tests/dictionary/search_test.cpp b/cpp/tests/dictionary/search_test.cpp
index 25501b4fde7..2774173b80a 100644
--- a/cpp/tests/dictionary/search_test.cpp
+++ b/cpp/tests/dictionary/search_test.cpp
@@ -15,7 +15,6 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 
 #include <cudf/dictionary/detail/search.hpp>
diff --git a/cpp/tests/dictionary/slice_test.cpp b/cpp/tests/dictionary/slice_test.cpp
index d80f8dee079..8c15d6dbecd 100644
--- a/cpp/tests/dictionary/slice_test.cpp
+++ b/cpp/tests/dictionary/slice_test.cpp
@@ -19,7 +19,6 @@
 #include <cudf_test/column_wrapper.hpp>
 
 #include <cudf/copying.hpp>
-#include <cudf/detail/copy.hpp>
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/dictionary/encode.hpp>
 #include <cudf/dictionary/update_keys.hpp>
diff --git a/cpp/tests/filling/fill_tests.cpp b/cpp/tests/filling/fill_tests.cpp
index 26badefe698..a5e2db6a005 100644
--- a/cpp/tests/filling/fill_tests.cpp
+++ b/cpp/tests/filling/fill_tests.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
diff --git a/cpp/tests/filling/repeat_tests.cpp b/cpp/tests/filling/repeat_tests.cpp
index 6326765c68b..c856984a4a3 100644
--- a/cpp/tests/filling/repeat_tests.cpp
+++ b/cpp/tests/filling/repeat_tests.cpp
@@ -17,14 +17,11 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/random.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/iterator.cuh>
 #include <cudf/filling.hpp>
-#include <cudf/scalar/scalar.hpp>
-#include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 
@@ -33,7 +30,6 @@
 
 #include <algorithm>
 #include <numeric>
-#include <random>
 
 constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS};
 
diff --git a/cpp/tests/filling/sequence_tests.cpp b/cpp/tests/filling/sequence_tests.cpp
index 0783b4e5bbb..53782c90c26 100644
--- a/cpp/tests/filling/sequence_tests.cpp
+++ b/cpp/tests/filling/sequence_tests.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/filling.hpp>
diff --git a/cpp/tests/fixed_point/fixed_point_tests.cpp b/cpp/tests/fixed_point/fixed_point_tests.cpp
index a222289216d..b96c6909e55 100644
--- a/cpp/tests/fixed_point/fixed_point_tests.cpp
+++ b/cpp/tests/fixed_point/fixed_point_tests.cpp
@@ -18,17 +18,14 @@
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/testing_main.hpp>
-#include <cudf_test/type_lists.hpp>
 
 #include <cudf/binaryop.hpp>
 #include <cudf/fixed_point/fixed_point.hpp>
-#include <cudf/null_mask.hpp>
 #include <cudf/unary.hpp>
 
 #include <algorithm>
 #include <limits>
 #include <numeric>
-#include <type_traits>
 #include <vector>
 
 using namespace numeric;
diff --git a/cpp/tests/groupby/collect_list_tests.cpp b/cpp/tests/groupby/collect_list_tests.cpp
index a79b6a32916..ba456084a7c 100644
--- a/cpp/tests/groupby/collect_list_tests.cpp
+++ b/cpp/tests/groupby/collect_list_tests.cpp
@@ -20,8 +20,6 @@
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/detail/aggregation/aggregation.hpp>
-
 template <typename V>
 struct groupby_collect_list_test : public cudf::test::BaseFixture {};
 
diff --git a/cpp/tests/groupby/collect_set_tests.cpp b/cpp/tests/groupby/collect_set_tests.cpp
index 61d2838590b..dfd7eb82c4a 100644
--- a/cpp/tests/groupby/collect_set_tests.cpp
+++ b/cpp/tests/groupby/collect_set_tests.cpp
@@ -19,7 +19,6 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/copying.hpp>
-#include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/groupby.hpp>
 #include <cudf/lists/sorting.hpp>
 #include <cudf/sorting.hpp>
diff --git a/cpp/tests/groupby/correlation_tests.cpp b/cpp/tests/groupby/correlation_tests.cpp
index 26f714632dd..f8cc813e877 100644
--- a/cpp/tests/groupby/correlation_tests.cpp
+++ b/cpp/tests/groupby/correlation_tests.cpp
@@ -25,7 +25,6 @@
 #include <cudf/detail/aggregation/aggregation.hpp>
 
 #include <limits>
-#include <vector>
 
 using namespace cudf::test::iterators;
 
diff --git a/cpp/tests/groupby/covariance_tests.cpp b/cpp/tests/groupby/covariance_tests.cpp
index e3eb2da201f..81378bb91e8 100644
--- a/cpp/tests/groupby/covariance_tests.cpp
+++ b/cpp/tests/groupby/covariance_tests.cpp
@@ -23,10 +23,8 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/aggregation/aggregation.hpp>
-#include <cudf/utilities/traits.hpp>
 
 #include <limits>
-#include <vector>
 
 using namespace cudf::test::iterators;
 
diff --git a/cpp/tests/groupby/groupby_test_util.cpp b/cpp/tests/groupby/groupby_test_util.cpp
index 5d99d15ae77..df0375d6a09 100644
--- a/cpp/tests/groupby/groupby_test_util.cpp
+++ b/cpp/tests/groupby/groupby_test_util.cpp
@@ -17,8 +17,8 @@
 #include "groupby_test_util.hpp"
 
 #include <cudf_test/column_utilities.hpp>
-#include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/cudf_gtest.hpp>
+#include <cudf_test/default_stream.hpp>
 #include <cudf_test/table_utilities.hpp>
 
 #include <cudf/column/column_view.hpp>
@@ -27,9 +27,6 @@
 #include <cudf/sorting.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/types.hpp>
-#include <cudf/unary.hpp>
-
-#include <random>
 
 void test_single_agg(cudf::column_view const& keys,
                      cudf::column_view const& values,
diff --git a/cpp/tests/groupby/groupby_test_util.hpp b/cpp/tests/groupby/groupby_test_util.hpp
index 755b0c20f17..9d2e613be3e 100644
--- a/cpp/tests/groupby/groupby_test_util.hpp
+++ b/cpp/tests/groupby/groupby_test_util.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,11 +16,8 @@
 
 #pragma once
 
-#include <cudf/column/column_view.hpp>
 #include <cudf/groupby.hpp>
-#include <cudf/sorting.hpp>
 #include <cudf/types.hpp>
-#include <cudf/utilities/default_stream.hpp>
 
 enum class force_use_sort_impl : bool { NO, YES };
 
diff --git a/cpp/tests/groupby/histogram_tests.cpp b/cpp/tests/groupby/histogram_tests.cpp
index 2d447025919..783cfb17e49 100644
--- a/cpp/tests/groupby/histogram_tests.cpp
+++ b/cpp/tests/groupby/histogram_tests.cpp
@@ -20,7 +20,6 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/copying.hpp>
-#include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/groupby.hpp>
 #include <cudf/lists/sorting.hpp>
 #include <cudf/sorting.hpp>
diff --git a/cpp/tests/groupby/max_scan_tests.cpp b/cpp/tests/groupby/max_scan_tests.cpp
index d86de798844..6195e0179ec 100644
--- a/cpp/tests/groupby/max_scan_tests.cpp
+++ b/cpp/tests/groupby/max_scan_tests.cpp
@@ -22,7 +22,6 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/aggregation/aggregation.hpp>
-#include <cudf/dictionary/update_keys.hpp>
 
 using namespace cudf::test::iterators;
 
diff --git a/cpp/tests/groupby/merge_lists_tests.cpp b/cpp/tests/groupby/merge_lists_tests.cpp
index 279d71560b4..4481e2dc022 100644
--- a/cpp/tests/groupby/merge_lists_tests.cpp
+++ b/cpp/tests/groupby/merge_lists_tests.cpp
@@ -21,7 +21,6 @@
 
 #include <cudf/concatenate.hpp>
 #include <cudf/copying.hpp>
-#include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/groupby.hpp>
 #include <cudf/table/table_view.hpp>
 
diff --git a/cpp/tests/groupby/merge_sets_tests.cpp b/cpp/tests/groupby/merge_sets_tests.cpp
index 9736bb84dd6..1bfba265478 100644
--- a/cpp/tests/groupby/merge_sets_tests.cpp
+++ b/cpp/tests/groupby/merge_sets_tests.cpp
@@ -21,7 +21,6 @@
 
 #include <cudf/concatenate.hpp>
 #include <cudf/copying.hpp>
-#include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/groupby.hpp>
 #include <cudf/lists/sorting.hpp>
 #include <cudf/sorting.hpp>
diff --git a/cpp/tests/groupby/rank_scan_tests.cpp b/cpp/tests/groupby/rank_scan_tests.cpp
index 7f31bc9089f..f2a50248b4a 100644
--- a/cpp/tests/groupby/rank_scan_tests.cpp
+++ b/cpp/tests/groupby/rank_scan_tests.cpp
@@ -22,8 +22,6 @@
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/detail/aggregation/aggregation.hpp>
-
 using namespace cudf::test::iterators;
 
 template <typename T>
diff --git a/cpp/tests/groupby/shift_tests.cpp b/cpp/tests/groupby/shift_tests.cpp
index 14c9ceb4508..49f9d7cb10a 100644
--- a/cpp/tests/groupby/shift_tests.cpp
+++ b/cpp/tests/groupby/shift_tests.cpp
@@ -21,7 +21,6 @@
 
 #include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/groupby.hpp>
-#include <cudf/scalar/scalar.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
 
 template <typename T>
diff --git a/cpp/tests/hashing/md5_test.cpp b/cpp/tests/hashing/md5_test.cpp
index 69e518cbf8d..b54adb52496 100644
--- a/cpp/tests/hashing/md5_test.cpp
+++ b/cpp/tests/hashing/md5_test.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/hashing.hpp>
diff --git a/cpp/tests/hashing/murmurhash3_x86_32_test.cpp b/cpp/tests/hashing/murmurhash3_x86_32_test.cpp
index c1a6e6ff6e1..b4622f5eb81 100644
--- a/cpp/tests/hashing/murmurhash3_x86_32_test.cpp
+++ b/cpp/tests/hashing/murmurhash3_x86_32_test.cpp
@@ -17,11 +17,9 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/detail/iterator.cuh>
 #include <cudf/hashing.hpp>
 
 constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS};
diff --git a/cpp/tests/hashing/sha1_test.cpp b/cpp/tests/hashing/sha1_test.cpp
index e28e71442a6..1e86751bb4c 100644
--- a/cpp/tests/hashing/sha1_test.cpp
+++ b/cpp/tests/hashing/sha1_test.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/hashing.hpp>
diff --git a/cpp/tests/hashing/sha224_test.cpp b/cpp/tests/hashing/sha224_test.cpp
index 61b584f94df..259e7102ee2 100644
--- a/cpp/tests/hashing/sha224_test.cpp
+++ b/cpp/tests/hashing/sha224_test.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/hashing.hpp>
diff --git a/cpp/tests/hashing/sha256_test.cpp b/cpp/tests/hashing/sha256_test.cpp
index 8bc47c92c6b..a4affc87874 100644
--- a/cpp/tests/hashing/sha256_test.cpp
+++ b/cpp/tests/hashing/sha256_test.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/hashing.hpp>
diff --git a/cpp/tests/hashing/sha384_test.cpp b/cpp/tests/hashing/sha384_test.cpp
index 4c79934f98d..8a5c090eeea 100644
--- a/cpp/tests/hashing/sha384_test.cpp
+++ b/cpp/tests/hashing/sha384_test.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/hashing.hpp>
diff --git a/cpp/tests/hashing/sha512_test.cpp b/cpp/tests/hashing/sha512_test.cpp
index 0eb1c60b8fc..77fc56b5f13 100644
--- a/cpp/tests/hashing/sha512_test.cpp
+++ b/cpp/tests/hashing/sha512_test.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/hashing.hpp>
diff --git a/cpp/tests/hashing/xxhash_64_test.cpp b/cpp/tests/hashing/xxhash_64_test.cpp
index ab4ed829681..d8694a72d94 100644
--- a/cpp/tests/hashing/xxhash_64_test.cpp
+++ b/cpp/tests/hashing/xxhash_64_test.cpp
@@ -17,11 +17,8 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/detail/iterator.cuh>
-#include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/hashing.hpp>
 
 using NumericTypesNoBools =
diff --git a/cpp/tests/interop/from_arrow_device_test.cpp b/cpp/tests/interop/from_arrow_device_test.cpp
index 2151ec6e22f..1ddc33e749a 100644
--- a/cpp/tests/interop/from_arrow_device_test.cpp
+++ b/cpp/tests/interop/from_arrow_device_test.cpp
@@ -17,17 +17,13 @@
 #include "nanoarrow_utils.hpp"
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/table_utilities.hpp>
-#include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_view.hpp>
 #include <cudf/copying.hpp>
-#include <cudf/detail/copy.hpp>
-#include <cudf/detail/get_value.cuh>
 #include <cudf/detail/interop.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/dictionary/dictionary_column_view.hpp>
diff --git a/cpp/tests/interop/from_arrow_host_test.cpp b/cpp/tests/interop/from_arrow_host_test.cpp
index ef9936b214c..d93ef28aab8 100644
--- a/cpp/tests/interop/from_arrow_host_test.cpp
+++ b/cpp/tests/interop/from_arrow_host_test.cpp
@@ -20,7 +20,6 @@
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/table_utilities.hpp>
-#include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column.hpp>
@@ -28,7 +27,6 @@
 #include <cudf/copying.hpp>
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/dictionary/dictionary_factories.hpp>
-#include <cudf/dictionary/encode.hpp>
 #include <cudf/interop.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
diff --git a/cpp/tests/interop/from_arrow_stream_test.cpp b/cpp/tests/interop/from_arrow_stream_test.cpp
index 80a2e4b2ffd..3916025bf22 100644
--- a/cpp/tests/interop/from_arrow_stream_test.cpp
+++ b/cpp/tests/interop/from_arrow_stream_test.cpp
@@ -17,27 +17,14 @@
 #include "nanoarrow_utils.hpp"
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
-#include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/table_utilities.hpp>
-#include <cudf_test/testing_main.hpp>
-#include <cudf_test/type_lists.hpp>
 
-#include <cudf/column/column.hpp>
-#include <cudf/column/column_view.hpp>
 #include <cudf/concatenate.hpp>
-#include <cudf/copying.hpp>
-#include <cudf/dictionary/dictionary_column_view.hpp>
-#include <cudf/dictionary/dictionary_factories.hpp>
-#include <cudf/dictionary/encode.hpp>
 #include <cudf/interop.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
-#include <cudf/types.hpp>
 #include <cudf/utilities/type_checks.hpp>
 
-#include <thrust/iterator/counting_iterator.h>
-
 struct VectorOfArrays {
   std::vector<nanoarrow::UniqueArray> arrays;
   nanoarrow::UniqueSchema schema;
diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp
index 6e742b9e4cf..18efae75cb1 100644
--- a/cpp/tests/interop/from_arrow_test.cpp
+++ b/cpp/tests/interop/from_arrow_test.cpp
@@ -25,9 +25,7 @@
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_view.hpp>
 #include <cudf/copying.hpp>
-#include <cudf/detail/copy.hpp>
 #include <cudf/detail/iterator.cuh>
-#include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/dictionary/encode.hpp>
 #include <cudf/interop.hpp>
 #include <cudf/table/table.hpp>
@@ -37,8 +35,6 @@
 #include <thrust/iterator/counting_iterator.h>
 
 #include <arrow/c/bridge.h>
-#include <nanoarrow/nanoarrow.h>
-#include <nanoarrow/nanoarrow_device.h>
 
 std::unique_ptr<cudf::table> get_cudf_table()
 {
diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp
index 7ba586461dc..29aa928c277 100644
--- a/cpp/tests/interop/to_arrow_device_test.cpp
+++ b/cpp/tests/interop/to_arrow_device_test.cpp
@@ -17,21 +17,15 @@
 #include "nanoarrow_utils.hpp"
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/table_utilities.hpp>
-#include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column.hpp>
-#include <cudf/copying.hpp>
-#include <cudf/detail/copy.hpp>
 #include <cudf/detail/interop.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/dictionary/encode.hpp>
 #include <cudf/interop.hpp>
-#include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/types.hpp>
 
diff --git a/cpp/tests/interop/to_arrow_host_test.cpp b/cpp/tests/interop/to_arrow_host_test.cpp
index fcb4433b42e..fa3aa82fee2 100644
--- a/cpp/tests/interop/to_arrow_host_test.cpp
+++ b/cpp/tests/interop/to_arrow_host_test.cpp
@@ -17,20 +17,14 @@
 #include "nanoarrow_utils.hpp"
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/table_utilities.hpp>
-#include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_view.hpp>
 #include <cudf/copying.hpp>
-#include <cudf/detail/copy.hpp>
 #include <cudf/detail/interop.hpp>
 #include <cudf/detail/iterator.cuh>
-#include <cudf/dictionary/dictionary_column_view.hpp>
-#include <cudf/dictionary/encode.hpp>
 #include <cudf/interop.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp
index a6aa4b22eca..86295d8efb1 100644
--- a/cpp/tests/interop/to_arrow_test.cpp
+++ b/cpp/tests/interop/to_arrow_test.cpp
@@ -19,14 +19,12 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/table_utilities.hpp>
 #include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_view.hpp>
 #include <cudf/copying.hpp>
-#include <cudf/detail/copy.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/dictionary/encode.hpp>
diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp
index b265dcf9273..cc1e367d114 100644
--- a/cpp/tests/io/csv_test.cpp
+++ b/cpp/tests/io/csv_test.cpp
@@ -17,14 +17,12 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/random.hpp>
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/iterator.cuh>
-#include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/io/csv.hpp>
 #include <cudf/io/datasource.hpp>
 #include <cudf/strings/convert/convert_datetime.hpp>
@@ -32,18 +30,12 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
-#include <cudf/unary.hpp>
 
-#include <thrust/copy.h>
 #include <thrust/execution_policy.h>
-#include <thrust/find.h>
 #include <thrust/iterator/counting_iterator.h>
 
-#include <arrow/io/api.h>
-
 #include <algorithm>
 #include <fstream>
-#include <iostream>
 #include <iterator>
 #include <limits>
 #include <numeric>
diff --git a/cpp/tests/io/file_io_test.cpp b/cpp/tests/io/file_io_test.cpp
index 3c41f21b0a4..1b85541687a 100644
--- a/cpp/tests/io/file_io_test.cpp
+++ b/cpp/tests/io/file_io_test.cpp
@@ -15,13 +15,10 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/testing_main.hpp>
 
 #include <src/io/utilities/file_io_utilities.hpp>
 
-#include <type_traits>
-
 // Base test fixture for tests
 struct CuFileIOTest : public cudf::test::BaseFixture {};
 
diff --git a/cpp/tests/io/json/json_quote_normalization_test.cpp b/cpp/tests/io/json/json_quote_normalization_test.cpp
index d23acf3ae00..c8c2d18903f 100644
--- a/cpp/tests/io/json/json_quote_normalization_test.cpp
+++ b/cpp/tests/io/json/json_quote_normalization_test.cpp
@@ -20,7 +20,6 @@
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/testing_main.hpp>
 
-#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/io/datasource.hpp>
 #include <cudf/io/detail/json.hpp>
 #include <cudf/io/json.hpp>
@@ -29,7 +28,6 @@
 
 #include <rmm/device_buffer.hpp>
 #include <rmm/mr/device/cuda_memory_resource.hpp>
-#include <rmm/mr/device/device_memory_resource.hpp>
 
 #include <string>
 
diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp
index cb6716f4a18..5f070bd53b9 100644
--- a/cpp/tests/io/json/json_test.cpp
+++ b/cpp/tests/io/json/json_test.cpp
@@ -39,8 +39,6 @@
 
 #include <thrust/iterator/constant_iterator.h>
 
-#include <arrow/io/api.h>
-
 #include <fstream>
 #include <limits>
 #include <memory>
diff --git a/cpp/tests/io/json/json_tree.cpp b/cpp/tests/io/json/json_tree.cpp
index 15682c6ae6b..887d4fa783f 100644
--- a/cpp/tests/io/json/json_tree.cpp
+++ b/cpp/tests/io/json/json_tree.cpp
@@ -15,12 +15,8 @@
  */
 
 #include "io/json/nested_json.hpp"
-#include "io/utilities/hostdevice_vector.hpp"
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
-#include <cudf_test/cudf_gtest.hpp>
-#include <cudf_test/random.hpp>
 
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/hashing/detail/hashing.hpp>
@@ -29,9 +25,9 @@
 #include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
-#include <rmm/cuda_stream.hpp>
 #include <rmm/cuda_stream_view.hpp>
 
+#include <numeric>
 #include <stack>
 #include <string>
 #include <unordered_map>
diff --git a/cpp/tests/io/json/nested_json_test.cpp b/cpp/tests/io/json/nested_json_test.cpp
index f32aba0e632..e0e955c4f48 100644
--- a/cpp/tests/io/json/nested_json_test.cpp
+++ b/cpp/tests/io/json/nested_json_test.cpp
@@ -21,24 +21,16 @@
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/cudf_gtest.hpp>
-#include <cudf_test/io_metadata_utilities.hpp>
 #include <cudf_test/random.hpp>
-#include <cudf_test/table_utilities.hpp>
 #include <cudf_test/testing_main.hpp>
 
-#include <cudf/io/datasource.hpp>
 #include <cudf/io/json.hpp>
-#include <cudf/io/parquet.hpp>
 #include <cudf/io/types.hpp>
-#include <cudf/lists/lists_column_view.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
-#include <rmm/exec_policy.hpp>
-
-#include <thrust/copy.h>
 #include <thrust/iterator/zip_iterator.h>
 
 #include <string>
diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp
index cce0adbf317..fce99187516 100644
--- a/cpp/tests/io/orc_test.cpp
+++ b/cpp/tests/io/orc_test.cpp
@@ -31,7 +31,6 @@
 #include <cudf/io/data_sink.hpp>
 #include <cudf/io/orc.hpp>
 #include <cudf/io/orc_metadata.hpp>
-#include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/span.hpp>
diff --git a/cpp/tests/io/parquet_common.hpp b/cpp/tests/io/parquet_common.hpp
index c90b81ed27a..d66aa3bde9d 100644
--- a/cpp/tests/io/parquet_common.hpp
+++ b/cpp/tests/io/parquet_common.hpp
@@ -22,13 +22,11 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/io/datasource.hpp>
-#include <cudf/io/parquet_metadata.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 
 #include <src/io/parquet/compact_protocol_reader.hpp>
 #include <src/io/parquet/parquet.hpp>
-#include <src/io/parquet/parquet_gpu.hpp>
 
 #include <random>
 #include <type_traits>
diff --git a/cpp/tests/io/parquet_misc_test.cpp b/cpp/tests/io/parquet_misc_test.cpp
index f1286a00d22..d66f685cd9c 100644
--- a/cpp/tests/io/parquet_misc_test.cpp
+++ b/cpp/tests/io/parquet_misc_test.cpp
@@ -20,8 +20,6 @@
 #include <cudf_test/table_utilities.hpp>
 
 #include <cudf/io/parquet.hpp>
-#include <cudf/stream_compaction.hpp>
-#include <cudf/transform.hpp>
 
 #include <array>
 
diff --git a/cpp/tests/io/parquet_reader_test.cpp b/cpp/tests/io/parquet_reader_test.cpp
index 7986a3c6d70..177e6163d4f 100644
--- a/cpp/tests/io/parquet_reader_test.cpp
+++ b/cpp/tests/io/parquet_reader_test.cpp
@@ -29,6 +29,8 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/transform.hpp>
 
+#include <src/io/parquet/parquet_gpu.hpp>
+
 #include <array>
 
 TEST_F(ParquetReaderTest, UserBounds)
diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp
index be2ecd56424..5c3c8342cd2 100644
--- a/cpp/tests/io/parquet_test.cpp
+++ b/cpp/tests/io/parquet_test.cpp
@@ -14,7 +14,6 @@
  * limitations under the License.
  */
 
-#include <cudf_test/base_fixture.hpp>
 #include <cudf_test/testing_main.hpp>
 
 // NOTE: this file exists to define the parquet test's `main()` function.
diff --git a/cpp/tests/io/row_selection_test.cpp b/cpp/tests/io/row_selection_test.cpp
index ebadd870091..c40d3bbd299 100644
--- a/cpp/tests/io/row_selection_test.cpp
+++ b/cpp/tests/io/row_selection_test.cpp
@@ -15,7 +15,6 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/testing_main.hpp>
 
 #include <src/io/utilities/row_selection.hpp>
diff --git a/cpp/tests/io/text/data_chunk_source_test.cpp b/cpp/tests/io/text/data_chunk_source_test.cpp
index 6f46df20633..79ce908f3e0 100644
--- a/cpp/tests/io/text/data_chunk_source_test.cpp
+++ b/cpp/tests/io/text/data_chunk_source_test.cpp
@@ -15,14 +15,11 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/testing_main.hpp>
 
 #include <cudf/io/text/data_chunk_source_factories.hpp>
 #include <cudf/io/text/detail/bgzip_utils.hpp>
 
-#include <rmm/cuda_stream_view.hpp>
-
 #include <fstream>
 #include <random>
 
diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp
index 74d08061df9..60244462e2c 100644
--- a/cpp/tests/io/text/multibyte_split_test.cpp
+++ b/cpp/tests/io/text/multibyte_split_test.cpp
@@ -19,16 +19,12 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
-#include <cudf_test/table_utilities.hpp>
 #include <cudf_test/testing_main.hpp>
-#include <cudf_test/type_lists.hpp>
 
 #include <cudf/concatenate.hpp>
 #include <cudf/io/text/byte_range_info.hpp>
 #include <cudf/io/text/data_chunk_source_factories.hpp>
 #include <cudf/io/text/multibyte_split.hpp>
-#include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
 using cudf::test::strings_column_wrapper;
diff --git a/cpp/tests/iterator/value_iterator.cpp b/cpp/tests/iterator/value_iterator.cpp
index 22bc7475dbe..f7f7c0f2721 100644
--- a/cpp/tests/iterator/value_iterator.cpp
+++ b/cpp/tests/iterator/value_iterator.cpp
@@ -13,7 +13,6 @@
  * the License.
  */
 
-#include <cudf_test/base_fixture.hpp>
 #include <cudf_test/testing_main.hpp>
 
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/jit/parse_ptx_function.cpp b/cpp/tests/jit/parse_ptx_function.cpp
index 6f9dfd06730..c9bb691907a 100644
--- a/cpp/tests/jit/parse_ptx_function.cpp
+++ b/cpp/tests/jit/parse_ptx_function.cpp
@@ -16,7 +16,6 @@
 
 #include "jit/parser.hpp"
 
-#include <cudf_test/base_fixture.hpp>
 #include <cudf_test/testing_main.hpp>
 
 #include <algorithm>
diff --git a/cpp/tests/join/cross_join_tests.cpp b/cpp/tests/join/cross_join_tests.cpp
index d87f5e54153..971913443e5 100644
--- a/cpp/tests/join/cross_join_tests.cpp
+++ b/cpp/tests/join/cross_join_tests.cpp
@@ -15,7 +15,6 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
diff --git a/cpp/tests/join/distinct_join_tests.cpp b/cpp/tests/join/distinct_join_tests.cpp
index 178edc52dd3..9070efa38fe 100644
--- a/cpp/tests/join/distinct_join_tests.cpp
+++ b/cpp/tests/join/distinct_join_tests.cpp
@@ -15,12 +15,8 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/table_utilities.hpp>
-#include <cudf_test/testing_main.hpp>
-#include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column.hpp>
 #include <cudf/filling.hpp>
@@ -31,7 +27,6 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 
-#include <limits>
 #include <vector>
 
 template <typename T>
diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp
index 3431e941359..6a8a54c8465 100644
--- a/cpp/tests/join/join_tests.cpp
+++ b/cpp/tests/join/join_tests.cpp
@@ -20,17 +20,12 @@
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/testing_main.hpp>
-#include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column.hpp>
-#include <cudf/column/column_factories.hpp>
 #include <cudf/column/column_view.hpp>
 #include <cudf/copying.hpp>
-#include <cudf/detail/null_mask.hpp>
-#include <cudf/detail/structs/utilities.hpp>
 #include <cudf/dictionary/encode.hpp>
 #include <cudf/join.hpp>
-#include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/sorting.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
diff --git a/cpp/tests/join/semi_anti_join_tests.cpp b/cpp/tests/join/semi_anti_join_tests.cpp
index 554d5754e39..ddc65c3f379 100644
--- a/cpp/tests/join/semi_anti_join_tests.cpp
+++ b/cpp/tests/join/semi_anti_join_tests.cpp
@@ -22,7 +22,6 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_view.hpp>
-#include <cudf/dictionary/encode.hpp>
 #include <cudf/join.hpp>
 #include <cudf/sorting.hpp>
 #include <cudf/table/table.hpp>
diff --git a/cpp/tests/json/json_tests.cpp b/cpp/tests/json/json_tests.cpp
index 42a574ac5c0..53166e04173 100644
--- a/cpp/tests/json/json_tests.cpp
+++ b/cpp/tests/json/json_tests.cpp
@@ -19,7 +19,6 @@
 #include <cudf_test/testing_main.hpp>
 
 #include <cudf/json/json.hpp>
-#include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/strings/replace.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 
diff --git a/cpp/tests/large_strings/large_strings_fixture.cpp b/cpp/tests/large_strings/large_strings_fixture.cpp
index 7b61be113f9..f1404990354 100644
--- a/cpp/tests/large_strings/large_strings_fixture.cpp
+++ b/cpp/tests/large_strings/large_strings_fixture.cpp
@@ -16,12 +16,10 @@
 
 #include "large_strings_fixture.hpp"
 
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/testing_main.hpp>
 
 #include <cudf/column/column.hpp>
-#include <cudf/strings/combine.hpp>
 #include <cudf/strings/repeat_strings.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 
diff --git a/cpp/tests/large_strings/parquet_tests.cpp b/cpp/tests/large_strings/parquet_tests.cpp
index 007c08ce0fb..f47782a2d02 100644
--- a/cpp/tests/large_strings/parquet_tests.cpp
+++ b/cpp/tests/large_strings/parquet_tests.cpp
@@ -16,8 +16,6 @@
 
 #include "large_strings_fixture.hpp"
 
-#include <cudf_test/column_utilities.hpp>
-#include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/table_utilities.hpp>
 
 #include <cudf/io/parquet.hpp>
diff --git a/cpp/tests/lists/contains_tests.cpp b/cpp/tests/lists/contains_tests.cpp
index 8fb2b403051..7ae7a6a7414 100644
--- a/cpp/tests/lists/contains_tests.cpp
+++ b/cpp/tests/lists/contains_tests.cpp
@@ -22,7 +22,6 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column_factories.hpp>
-#include <cudf/detail/copy.hpp>
 #include <cudf/lists/contains.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
 
diff --git a/cpp/tests/lists/extract_tests.cpp b/cpp/tests/lists/extract_tests.cpp
index 92dd5df5ec7..2c24f695c29 100644
--- a/cpp/tests/lists/extract_tests.cpp
+++ b/cpp/tests/lists/extract_tests.cpp
@@ -21,12 +21,8 @@
 #include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/column/column_factories.hpp>
-#include <cudf/detail/null_mask.hpp>
 #include <cudf/lists/extract.hpp>
 
-#include <rmm/cuda_stream_view.hpp>
-
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
diff --git a/cpp/tests/lists/sequences_tests.cpp b/cpp/tests/lists/sequences_tests.cpp
index 74545903eb3..dcb906cd2ef 100644
--- a/cpp/tests/lists/sequences_tests.cpp
+++ b/cpp/tests/lists/sequences_tests.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
diff --git a/cpp/tests/lists/stream_compaction/apply_boolean_mask_tests.cpp b/cpp/tests/lists/stream_compaction/apply_boolean_mask_tests.cpp
index 5625b47e7ea..18aa118bb81 100644
--- a/cpp/tests/lists/stream_compaction/apply_boolean_mask_tests.cpp
+++ b/cpp/tests/lists/stream_compaction/apply_boolean_mask_tests.cpp
@@ -20,8 +20,6 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column_factories.hpp>
-#include <cudf/detail/null_mask.hpp>
-#include <cudf/lists/extract.hpp>
 #include <cudf/lists/stream_compaction.hpp>
 
 namespace cudf::test {
diff --git a/cpp/tests/merge/merge_dictionary_test.cpp b/cpp/tests/merge/merge_dictionary_test.cpp
index dd528c19e4e..1d7a31fd797 100644
--- a/cpp/tests/merge/merge_dictionary_test.cpp
+++ b/cpp/tests/merge/merge_dictionary_test.cpp
@@ -17,9 +17,7 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 
-#include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/dictionary/encode.hpp>
 #include <cudf/merge.hpp>
 #include <cudf/table/table.hpp>
diff --git a/cpp/tests/merge/merge_string_test.cpp b/cpp/tests/merge/merge_string_test.cpp
index bea044496b3..d9fdb6099f0 100644
--- a/cpp/tests/merge/merge_string_test.cpp
+++ b/cpp/tests/merge/merge_string_test.cpp
@@ -17,10 +17,8 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/column/column_factories.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/merge.hpp>
 #include <cudf/table/table.hpp>
@@ -30,10 +28,6 @@
 
 #include <gtest/gtest.h>
 
-#include <algorithm>
-#include <cassert>
-#include <initializer_list>
-#include <limits>
 #include <memory>
 #include <vector>
 
diff --git a/cpp/tests/merge/merge_test.cpp b/cpp/tests/merge/merge_test.cpp
index 6208d395f0a..fad390105d7 100644
--- a/cpp/tests/merge/merge_test.cpp
+++ b/cpp/tests/merge/merge_test.cpp
@@ -21,7 +21,6 @@
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/testing_main.hpp>
-#include <cudf_test/type_list_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column_factories.hpp>
@@ -34,7 +33,6 @@
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
-#include <thrust/merge.h>
 
 #include <vector>
 
diff --git a/cpp/tests/partitioning/round_robin_test.cpp b/cpp/tests/partitioning/round_robin_test.cpp
index 89d23c39dca..3693cfbcc72 100644
--- a/cpp/tests/partitioning/round_robin_test.cpp
+++ b/cpp/tests/partitioning/round_robin_test.cpp
@@ -17,10 +17,8 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/column/column_factories.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/partitioning.hpp>
 #include <cudf/table/table.hpp>
@@ -30,12 +28,7 @@
 
 #include <gtest/gtest.h>
 
-#include <algorithm>
-#include <cassert>
-#include <initializer_list>
-#include <limits>
 #include <memory>
-#include <numeric>
 #include <vector>
 
 using cudf::test::fixed_width_column_wrapper;
diff --git a/cpp/tests/quantiles/quantile_test.cpp b/cpp/tests/quantiles/quantile_test.cpp
index 6e88365b6e8..23b58618fe1 100644
--- a/cpp/tests/quantiles/quantile_test.cpp
+++ b/cpp/tests/quantiles/quantile_test.cpp
@@ -22,7 +22,6 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/quantiles.hpp>
-#include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
 #include <limits>
diff --git a/cpp/tests/quantiles/quantiles_test.cpp b/cpp/tests/quantiles/quantiles_test.cpp
index 44d4ec61852..c7e11af8c85 100644
--- a/cpp/tests/quantiles/quantiles_test.cpp
+++ b/cpp/tests/quantiles/quantiles_test.cpp
@@ -16,7 +16,6 @@
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
diff --git a/cpp/tests/reductions/ewm_tests.cpp b/cpp/tests/reductions/ewm_tests.cpp
index 09cec688509..1117b0d1acf 100644
--- a/cpp/tests/reductions/ewm_tests.cpp
+++ b/cpp/tests/reductions/ewm_tests.cpp
@@ -18,9 +18,7 @@
 
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/iterator_utilities.hpp>
 
-#include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/reduction.hpp>
 
 template <typename T>
diff --git a/cpp/tests/reductions/list_rank_test.cpp b/cpp/tests/reductions/list_rank_test.cpp
index f5470f7d881..736b5081d8f 100644
--- a/cpp/tests/reductions/list_rank_test.cpp
+++ b/cpp/tests/reductions/list_rank_test.cpp
@@ -14,14 +14,9 @@
  * limitations under the License.
  */
 
-#include <benchmarks/common/generate_input.hpp>
-
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/iterator_utilities.hpp>
 
-#include <cudf/detail/aggregation/aggregation.hpp>
-#include <cudf/filling.hpp>
 #include <cudf/reduction.hpp>
 
 struct ListRankScanTest : public cudf::test::BaseFixture {
diff --git a/cpp/tests/reductions/rank_tests.cpp b/cpp/tests/reductions/rank_tests.cpp
index 3ab1fc01eaa..19633211192 100644
--- a/cpp/tests/reductions/rank_tests.cpp
+++ b/cpp/tests/reductions/rank_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,7 +21,6 @@
 #include <cudf_test/iterator_utilities.hpp>
 
 #include <cudf/aggregation.hpp>
-#include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/reduction.hpp>
 #include <cudf/types.hpp>
 
diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp
index bdb98372836..c09cde8f9e4 100644
--- a/cpp/tests/reductions/reduction_tests.cpp
+++ b/cpp/tests/reductions/reduction_tests.cpp
@@ -22,9 +22,7 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/copying.hpp>
-#include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/detail/iterator.cuh>
-#include <cudf/dictionary/encode.hpp>
 #include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/reduction.hpp>
 #include <cudf/scalar/scalar.hpp>
@@ -33,11 +31,9 @@
 #include <cudf/types.hpp>
 #include <cudf/wrappers/timestamps.hpp>
 
-#include <thrust/copy.h>
 #include <thrust/iterator/counting_iterator.h>
 
 #include <algorithm>
-#include <iostream>
 #include <iterator>
 #include <vector>
 
diff --git a/cpp/tests/reductions/scan_tests.cpp b/cpp/tests/reductions/scan_tests.cpp
index c4463d68a68..72d92c5ac53 100644
--- a/cpp/tests/reductions/scan_tests.cpp
+++ b/cpp/tests/reductions/scan_tests.cpp
@@ -20,13 +20,11 @@
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 
-#include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/utilities/device_operators.cuh>
 #include <cudf/reduction.hpp>
 
 #include <thrust/host_vector.h>
-#include <thrust/iterator/zip_iterator.h>
 #include <thrust/tuple.h>
 
 #include <algorithm>
diff --git a/cpp/tests/reductions/scan_tests.hpp b/cpp/tests/reductions/scan_tests.hpp
index 858697d8ef5..c2cce4bbbfa 100644
--- a/cpp/tests/reductions/scan_tests.hpp
+++ b/cpp/tests/reductions/scan_tests.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,9 +20,7 @@
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/column/column.hpp>
 #include <cudf/fixed_point/fixed_point.hpp>
-#include <cudf/strings/string_view.hpp>
 #include <cudf/utilities/span.hpp>
 #include <cudf/utilities/traits.hpp>
 
@@ -30,7 +28,6 @@
 
 #include <initializer_list>
 #include <type_traits>
-#include <vector>
 
 template <typename T>
 struct TypeParam_to_host_type {
diff --git a/cpp/tests/replace/clamp_test.cpp b/cpp/tests/replace/clamp_test.cpp
index 239c9ce6ddd..e972ea35ed0 100644
--- a/cpp/tests/replace/clamp_test.cpp
+++ b/cpp/tests/replace/clamp_test.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
diff --git a/cpp/tests/replace/normalize_replace_tests.cpp b/cpp/tests/replace/normalize_replace_tests.cpp
index 2de17388ee8..c35f385329a 100644
--- a/cpp/tests/replace/normalize_replace_tests.cpp
+++ b/cpp/tests/replace/normalize_replace_tests.cpp
@@ -19,7 +19,6 @@
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/testing_main.hpp>
 
-#include <cudf/column/column_factories.hpp>
 #include <cudf/replace.hpp>
 
 // This is the main test fixture
diff --git a/cpp/tests/replace/replace_nans_tests.cpp b/cpp/tests/replace/replace_nans_tests.cpp
index 35232204db7..1b9fe92066a 100644
--- a/cpp/tests/replace/replace_nans_tests.cpp
+++ b/cpp/tests/replace/replace_nans_tests.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
diff --git a/cpp/tests/replace/replace_nulls_tests.cpp b/cpp/tests/replace/replace_nulls_tests.cpp
index fcee27305f2..0c8ccea52a6 100644
--- a/cpp/tests/replace/replace_nulls_tests.cpp
+++ b/cpp/tests/replace/replace_nulls_tests.cpp
@@ -20,13 +20,11 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/dictionary/encode.hpp>
-#include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/replace.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
diff --git a/cpp/tests/replace/replace_tests.cpp b/cpp/tests/replace/replace_tests.cpp
index b12bf08520f..ae4041bcfaf 100644
--- a/cpp/tests/replace/replace_tests.cpp
+++ b/cpp/tests/replace/replace_tests.cpp
@@ -20,20 +20,16 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/iterator.cuh>
 #include <cudf/dictionary/encode.hpp>
-#include <cudf/fixed_point/fixed_point.hpp>
-#include <cudf/null_mask.hpp>
 #include <cudf/replace.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
 
 #include <thrust/host_vector.h>
-#include <thrust/iterator/transform_iterator.h>
 
 #include <gtest/gtest.h>
 
diff --git a/cpp/tests/reshape/byte_cast_tests.cpp b/cpp/tests/reshape/byte_cast_tests.cpp
index b3d9b2e2f5f..59585c0e947 100644
--- a/cpp/tests/reshape/byte_cast_tests.cpp
+++ b/cpp/tests/reshape/byte_cast_tests.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/iterator_utilities.hpp>
-#include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/iterator.cuh>
 #include <cudf/reshape.hpp>
diff --git a/cpp/tests/reshape/tile_tests.cpp b/cpp/tests/reshape/tile_tests.cpp
index ed76b9d2ea5..25cfc5c5108 100644
--- a/cpp/tests/reshape/tile_tests.cpp
+++ b/cpp/tests/reshape/tile_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,7 +15,6 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
diff --git a/cpp/tests/rolling/collect_ops_test.cpp b/cpp/tests/rolling/collect_ops_test.cpp
index 165e0347785..e8a36d9ab48 100644
--- a/cpp/tests/rolling/collect_ops_test.cpp
+++ b/cpp/tests/rolling/collect_ops_test.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
diff --git a/cpp/tests/rolling/empty_input_test.cpp b/cpp/tests/rolling/empty_input_test.cpp
index e7d1e3f0b10..2e1815671a9 100644
--- a/cpp/tests/rolling/empty_input_test.cpp
+++ b/cpp/tests/rolling/empty_input_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,9 +15,7 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/aggregation.hpp>
diff --git a/cpp/tests/rolling/grouped_rolling_range_test.cpp b/cpp/tests/rolling/grouped_rolling_range_test.cpp
index fcfbd0eee78..2cb9b60000b 100644
--- a/cpp/tests/rolling/grouped_rolling_range_test.cpp
+++ b/cpp/tests/rolling/grouped_rolling_range_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,21 +17,16 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/aggregation.hpp>
 #include <cudf/column/column.hpp>
-#include <cudf/detail/aggregation/aggregation.hpp>
-#include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/null_mask.hpp>
 #include <cudf/rolling.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/table/table_view.hpp>
-#include <cudf/utilities/bit.hpp>
 
-#include <thrust/host_vector.h>
 #include <thrust/iterator/counting_iterator.h>
 
 #include <algorithm>
diff --git a/cpp/tests/rolling/grouped_rolling_test.cpp b/cpp/tests/rolling/grouped_rolling_test.cpp
index 78d5daf7e83..78b444bcd93 100644
--- a/cpp/tests/rolling/grouped_rolling_test.cpp
+++ b/cpp/tests/rolling/grouped_rolling_test.cpp
@@ -19,7 +19,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
diff --git a/cpp/tests/rolling/lead_lag_test.cpp b/cpp/tests/rolling/lead_lag_test.cpp
index de057e96320..6519b0ed4ee 100644
--- a/cpp/tests/rolling/lead_lag_test.cpp
+++ b/cpp/tests/rolling/lead_lag_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,7 +16,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
@@ -26,7 +25,6 @@
 #include <cudf/dictionary/dictionary_factories.hpp>
 #include <cudf/rolling.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
-#include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
diff --git a/cpp/tests/rolling/nth_element_test.cpp b/cpp/tests/rolling/nth_element_test.cpp
index 2444992e68f..5f2b383ed55 100644
--- a/cpp/tests/rolling/nth_element_test.cpp
+++ b/cpp/tests/rolling/nth_element_test.cpp
@@ -17,22 +17,15 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/aggregation.hpp>
-#include <cudf/groupby.hpp>
-#include <cudf/null_mask.hpp>
 #include <cudf/rolling.hpp>
 
-#include <rmm/device_buffer.hpp>
-
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
 
-#include <gtest/gtest-typed-test.h>
-
 #include <memory>
 #include <optional>
 
diff --git a/cpp/tests/rolling/offset_row_window_test.cpp b/cpp/tests/rolling/offset_row_window_test.cpp
index 0eaab0c9f7a..dcaa47e722b 100644
--- a/cpp/tests/rolling/offset_row_window_test.cpp
+++ b/cpp/tests/rolling/offset_row_window_test.cpp
@@ -17,14 +17,10 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 
 #include <cudf/aggregation.hpp>
-#include <cudf/groupby.hpp>
-#include <cudf/lists/explode.hpp>
 #include <cudf/rolling.hpp>
-#include <cudf/utilities/default_stream.hpp>
 
 template <typename T>
 using fwcw = cudf::test::fixed_width_column_wrapper<T>;
diff --git a/cpp/tests/rolling/range_rolling_window_test.cpp b/cpp/tests/rolling/range_rolling_window_test.cpp
index 461c41025e9..daf5fcc1d96 100644
--- a/cpp/tests/rolling/range_rolling_window_test.cpp
+++ b/cpp/tests/rolling/range_rolling_window_test.cpp
@@ -17,22 +17,17 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/aggregation.hpp>
-#include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/rolling.hpp>
 #include <cudf/table/table_view.hpp>
-#include <cudf/unary.hpp>
-#include <cudf/utilities/bit.hpp>
 
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
 
 #include <src/rolling/detail/range_window_bounds.hpp>
-#include <src/rolling/detail/rolling.hpp>
 
 #include <utility>
 #include <vector>
diff --git a/cpp/tests/rolling/range_window_bounds_test.cpp b/cpp/tests/rolling/range_window_bounds_test.cpp
index b77451bf0bc..a67555280f4 100644
--- a/cpp/tests/rolling/range_window_bounds_test.cpp
+++ b/cpp/tests/rolling/range_window_bounds_test.cpp
@@ -15,9 +15,6 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
-#include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/rolling/range_window_bounds.hpp>
@@ -25,8 +22,6 @@
 
 #include <src/rolling/detail/range_window_bounds.hpp>
 
-#include <vector>
-
 struct RangeWindowBoundsTest : public cudf::test::BaseFixture {};
 
 template <typename Timestamp>
diff --git a/cpp/tests/rolling/rolling_test.cpp b/cpp/tests/rolling/rolling_test.cpp
index 6e0dc16dca9..72a511fd5f1 100644
--- a/cpp/tests/rolling/rolling_test.cpp
+++ b/cpp/tests/rolling/rolling_test.cpp
@@ -19,7 +19,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/random.hpp>
 #include <cudf_test/testing_main.hpp>
@@ -30,7 +29,6 @@
 #include <cudf/detail/iterator.cuh>
 #include <cudf/dictionary/encode.hpp>
 #include <cudf/rolling.hpp>
-#include <cudf/unary.hpp>
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/traits.hpp>
 
diff --git a/cpp/tests/scalar/factories_test.cpp b/cpp/tests/scalar/factories_test.cpp
index 5f132f3ace9..26987ea1b7b 100644
--- a/cpp/tests/scalar/factories_test.cpp
+++ b/cpp/tests/scalar/factories_test.cpp
@@ -22,11 +22,8 @@
 
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/types.hpp>
-#include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
-#include <rmm/cuda_stream_view.hpp>
-
 class ScalarFactoryTest : public cudf::test::BaseFixture {};
 
 template <typename T>
diff --git a/cpp/tests/search/search_dictionary_test.cpp b/cpp/tests/search/search_dictionary_test.cpp
index 78f79ccc648..a3bb1dfda10 100644
--- a/cpp/tests/search/search_dictionary_test.cpp
+++ b/cpp/tests/search/search_dictionary_test.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/type_lists.hpp>
 
 #include <cudf/search.hpp>
 
diff --git a/cpp/tests/search/search_list_test.cpp b/cpp/tests/search/search_list_test.cpp
index 7584003e800..fb5d0fcc889 100644
--- a/cpp/tests/search/search_list_test.cpp
+++ b/cpp/tests/search/search_list_test.cpp
@@ -20,7 +20,6 @@
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/lists/lists_column_view.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/search.hpp>
 #include <cudf/table/table_view.hpp>
diff --git a/cpp/tests/search/search_struct_test.cpp b/cpp/tests/search/search_struct_test.cpp
index c35d359e75c..05b9deb3463 100644
--- a/cpp/tests/search/search_struct_test.cpp
+++ b/cpp/tests/search/search_struct_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,7 +20,6 @@
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/lists/lists_column_view.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/search.hpp>
 #include <cudf/table/table_view.hpp>
diff --git a/cpp/tests/search/search_test.cpp b/cpp/tests/search/search_test.cpp
index 7550cc27161..8d750be5677 100644
--- a/cpp/tests/search/search_test.cpp
+++ b/cpp/tests/search/search_test.cpp
@@ -20,7 +20,6 @@
 #include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/search.hpp>
 
 #include <thrust/iterator/transform_iterator.h>
diff --git a/cpp/tests/sort/is_sorted_tests.cpp b/cpp/tests/sort/is_sorted_tests.cpp
index 109095192f9..e3c9f8d349e 100644
--- a/cpp/tests/sort/is_sorted_tests.cpp
+++ b/cpp/tests/sort/is_sorted_tests.cpp
@@ -15,7 +15,6 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/testing_main.hpp>
diff --git a/cpp/tests/sort/rank_test.cpp b/cpp/tests/sort/rank_test.cpp
index e08a2105aea..ded46cb1f31 100644
--- a/cpp/tests/sort/rank_test.cpp
+++ b/cpp/tests/sort/rank_test.cpp
@@ -18,10 +18,8 @@
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/iterator_utilities.hpp>
-#include <cudf_test/table_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/copying.hpp>
 #include <cudf/sorting.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
diff --git a/cpp/tests/sort/sort_nested_types_tests.cpp b/cpp/tests/sort/sort_nested_types_tests.cpp
index 8ab23936ceb..ce4148a941e 100644
--- a/cpp/tests/sort/sort_nested_types_tests.cpp
+++ b/cpp/tests/sort/sort_nested_types_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/iterator_utilities.hpp>
-#include <cudf_test/type_lists.hpp>
 
 #include <cudf/copying.hpp>
 #include <cudf/sorting.hpp>
diff --git a/cpp/tests/sort/sort_test.cpp b/cpp/tests/sort/sort_test.cpp
index 6a35e977b46..e1505c7a474 100644
--- a/cpp/tests/sort/sort_test.cpp
+++ b/cpp/tests/sort/sort_test.cpp
@@ -28,7 +28,6 @@
 #include <cudf/types.hpp>
 
 #include <thrust/host_vector.h>
-#include <thrust/sort.h>
 
 #include <type_traits>
 #include <vector>
diff --git a/cpp/tests/sort/stable_sort_tests.cpp b/cpp/tests/sort/stable_sort_tests.cpp
index 655166e0d62..88de9d51523 100644
--- a/cpp/tests/sort/stable_sort_tests.cpp
+++ b/cpp/tests/sort/stable_sort_tests.cpp
@@ -25,9 +25,6 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
-#include <thrust/host_vector.h>
-#include <thrust/sort.h>
-
 #include <type_traits>
 #include <vector>
 
diff --git a/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp b/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp
index 6c0582fb846..1204b019739 100644
--- a/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp
+++ b/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp
@@ -20,9 +20,7 @@
 #include <cudf_test/random.hpp>
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/testing_main.hpp>
-#include <cudf_test/type_lists.hpp>
 
-#include <cudf/copying.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/stream_compaction.hpp>
@@ -31,8 +29,6 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
-#include <thrust/copy.h>
-#include <thrust/execution_policy.h>
 #include <thrust/functional.h>
 
 struct ApplyBooleanMask : public cudf::test::BaseFixture {};
diff --git a/cpp/tests/stream_compaction/distinct_count_tests.cpp b/cpp/tests/stream_compaction/distinct_count_tests.cpp
index a2dab649961..ee1bb3ead92 100644
--- a/cpp/tests/stream_compaction/distinct_count_tests.cpp
+++ b/cpp/tests/stream_compaction/distinct_count_tests.cpp
@@ -15,16 +15,11 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/iterator_utilities.hpp>
-#include <cudf_test/table_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/copying.hpp>
-#include <cudf/sorting.hpp>
 #include <cudf/stream_compaction.hpp>
-#include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
diff --git a/cpp/tests/stream_compaction/distinct_tests.cpp b/cpp/tests/stream_compaction/distinct_tests.cpp
index 14d7d8789ac..c618ff68cbb 100644
--- a/cpp/tests/stream_compaction/distinct_tests.cpp
+++ b/cpp/tests/stream_compaction/distinct_tests.cpp
@@ -15,7 +15,6 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/table_utilities.hpp>
@@ -27,8 +26,6 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
-#include <cmath>
-
 auto constexpr null{0};  // null at current level
 auto constexpr XXX{0};   // null pushed down from parent level
 auto constexpr NaN          = std::numeric_limits<double>::quiet_NaN();
diff --git a/cpp/tests/stream_compaction/drop_nans_tests.cpp b/cpp/tests/stream_compaction/drop_nans_tests.cpp
index bf72da5c840..71321361564 100644
--- a/cpp/tests/stream_compaction/drop_nans_tests.cpp
+++ b/cpp/tests/stream_compaction/drop_nans_tests.cpp
@@ -15,12 +15,9 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/table_utilities.hpp>
-#include <cudf_test/type_lists.hpp>
 
-#include <cudf/copying.hpp>
 #include <cudf/stream_compaction.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
diff --git a/cpp/tests/stream_compaction/drop_nulls_tests.cpp b/cpp/tests/stream_compaction/drop_nulls_tests.cpp
index dbac1d58195..d3b45c2323e 100644
--- a/cpp/tests/stream_compaction/drop_nulls_tests.cpp
+++ b/cpp/tests/stream_compaction/drop_nulls_tests.cpp
@@ -15,12 +15,10 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/copying.hpp>
 #include <cudf/stream_compaction.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
diff --git a/cpp/tests/stream_compaction/stable_distinct_tests.cpp b/cpp/tests/stream_compaction/stable_distinct_tests.cpp
index 6c6c53331d4..cc847da6340 100644
--- a/cpp/tests/stream_compaction/stable_distinct_tests.cpp
+++ b/cpp/tests/stream_compaction/stable_distinct_tests.cpp
@@ -15,20 +15,16 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/table_utilities.hpp>
 
 #include <cudf/copying.hpp>
-#include <cudf/sorting.hpp>
 #include <cudf/stream_compaction.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
-#include <cmath>
-
 auto constexpr null{0};  // null at current level
 auto constexpr XXX{0};   // null pushed down from parent level
 auto constexpr NaN          = std::numeric_limits<double>::quiet_NaN();
diff --git a/cpp/tests/stream_compaction/unique_count_tests.cpp b/cpp/tests/stream_compaction/unique_count_tests.cpp
index 640d159fc4f..bad93e92712 100644
--- a/cpp/tests/stream_compaction/unique_count_tests.cpp
+++ b/cpp/tests/stream_compaction/unique_count_tests.cpp
@@ -15,16 +15,11 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/iterator_utilities.hpp>
-#include <cudf_test/table_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/copying.hpp>
-#include <cudf/sorting.hpp>
 #include <cudf/stream_compaction.hpp>
-#include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
diff --git a/cpp/tests/stream_compaction/unique_tests.cpp b/cpp/tests/stream_compaction/unique_tests.cpp
index d5b6915b520..e2b32b898b3 100644
--- a/cpp/tests/stream_compaction/unique_tests.cpp
+++ b/cpp/tests/stream_compaction/unique_tests.cpp
@@ -15,22 +15,16 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/table_utilities.hpp>
-#include <cudf_test/type_lists.hpp>
 
 #include <cudf/copying.hpp>
-#include <cudf/sorting.hpp>
 #include <cudf/stream_compaction.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
-#include <algorithm>
-#include <cmath>
-
 using cudf::nan_policy;
 using cudf::null_equality;
 using cudf::null_policy;
diff --git a/cpp/tests/streams/binaryop_test.cpp b/cpp/tests/streams/binaryop_test.cpp
index 2a7b52b1b6b..3dcc6f9e632 100644
--- a/cpp/tests/streams/binaryop_test.cpp
+++ b/cpp/tests/streams/binaryop_test.cpp
@@ -21,7 +21,6 @@
 #include <cudf_test/default_stream.hpp>
 
 #include <cudf/binaryop.hpp>
-#include <cudf/column/column_view.hpp>
 #include <cudf/scalar/scalar.hpp>
 
 class BinaryopTest : public cudf::test::BaseFixture {};
diff --git a/cpp/tests/streams/io/csv_test.cpp b/cpp/tests/streams/io/csv_test.cpp
index 42894a0ebcb..a74ee64f8de 100644
--- a/cpp/tests/streams/io/csv_test.cpp
+++ b/cpp/tests/streams/io/csv_test.cpp
@@ -17,13 +17,9 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/default_stream.hpp>
-#include <cudf_test/iterator_utilities.hpp>
 
 #include <cudf/io/csv.hpp>
-#include <cudf/io/detail/csv.hpp>
-#include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
-#include <cudf/types.hpp>
 
 #include <string>
 #include <vector>
diff --git a/cpp/tests/streams/io/json_test.cpp b/cpp/tests/streams/io/json_test.cpp
index f98e685ed0c..d352c6c3b2a 100644
--- a/cpp/tests/streams/io/json_test.cpp
+++ b/cpp/tests/streams/io/json_test.cpp
@@ -19,9 +19,7 @@
 #include <cudf_test/default_stream.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 
-#include <cudf/io/detail/json.hpp>
 #include <cudf/io/json.hpp>
-#include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
diff --git a/cpp/tests/streams/io/multibyte_split_test.cpp b/cpp/tests/streams/io/multibyte_split_test.cpp
index b0eff1d3340..5bb17226029 100644
--- a/cpp/tests/streams/io/multibyte_split_test.cpp
+++ b/cpp/tests/streams/io/multibyte_split_test.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/default_stream.hpp>
 
-#include <cudf/io/text/byte_range_info.hpp>
 #include <cudf/io/text/data_chunk_source_factories.hpp>
 #include <cudf/io/text/multibyte_split.hpp>
 
diff --git a/cpp/tests/streams/io/orc_test.cpp b/cpp/tests/streams/io/orc_test.cpp
index cc43bf15b5d..10722557e6a 100644
--- a/cpp/tests/streams/io/orc_test.cpp
+++ b/cpp/tests/streams/io/orc_test.cpp
@@ -17,19 +17,11 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/default_stream.hpp>
-#include <cudf_test/iterator_utilities.hpp>
 
-#include <cudf/io/detail/orc.hpp>
 #include <cudf/io/orc.hpp>
 #include <cudf/io/orc_metadata.hpp>
-#include <cudf/io/orc_types.hpp>
 #include <cudf/table/table.hpp>
-#include <cudf/table/table_view.hpp>
-#include <cudf/types.hpp>
 
-#include <iostream>
-#include <random>
-#include <sstream>
 #include <string>
 #include <vector>
 
diff --git a/cpp/tests/streams/io/parquet_test.cpp b/cpp/tests/streams/io/parquet_test.cpp
index 9d2dec2d697..18bb80e64af 100644
--- a/cpp/tests/streams/io/parquet_test.cpp
+++ b/cpp/tests/streams/io/parquet_test.cpp
@@ -17,13 +17,9 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/default_stream.hpp>
-#include <cudf_test/iterator_utilities.hpp>
 
-#include <cudf/io/detail/parquet.hpp>
 #include <cudf/io/parquet.hpp>
 #include <cudf/table/table.hpp>
-#include <cudf/table/table_view.hpp>
-#include <cudf/types.hpp>
 
 #include <string>
 #include <vector>
diff --git a/cpp/tests/streams/join_test.cpp b/cpp/tests/streams/join_test.cpp
index 2811bb676fa..27bd7e080c9 100644
--- a/cpp/tests/streams/join_test.cpp
+++ b/cpp/tests/streams/join_test.cpp
@@ -19,11 +19,9 @@
 #include <cudf_test/default_stream.hpp>
 
 #include <cudf/column/column.hpp>
-#include <cudf/column/column_view.hpp>
 #include <cudf/join.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/types.hpp>
-#include <cudf/utilities/memory_resource.hpp>
 
 #include <cstdint>
 #include <memory>
diff --git a/cpp/tests/streams/null_mask_test.cpp b/cpp/tests/streams/null_mask_test.cpp
index e96224003f4..ed37a72545f 100644
--- a/cpp/tests/streams/null_mask_test.cpp
+++ b/cpp/tests/streams/null_mask_test.cpp
@@ -14,15 +14,12 @@
  * limitations under the License.
  */
 
-#include <tests/binaryop/util/runtime_support.h>
-
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/default_stream.hpp>
 
 #include <cudf/column/column_view.hpp>
 #include <cudf/null_mask.hpp>
-#include <cudf/scalar/scalar.hpp>
 
 class NullMaskTest : public cudf::test::BaseFixture {};
 
diff --git a/cpp/tests/streams/reduction_test.cpp b/cpp/tests/streams/reduction_test.cpp
index b4f013fc960..9ab972302e4 100644
--- a/cpp/tests/streams/reduction_test.cpp
+++ b/cpp/tests/streams/reduction_test.cpp
@@ -17,11 +17,8 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/default_stream.hpp>
-#include <cudf_test/type_lists.hpp>
 
-#include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/reduction.hpp>
-#include <cudf/scalar/scalar.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 
diff --git a/cpp/tests/streams/rolling_test.cpp b/cpp/tests/streams/rolling_test.cpp
index b352ad2c0d2..4d9899870b4 100644
--- a/cpp/tests/streams/rolling_test.cpp
+++ b/cpp/tests/streams/rolling_test.cpp
@@ -17,12 +17,10 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/default_stream.hpp>
-#include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/rolling.hpp>
 #include <cudf/scalar/scalar.hpp>
-#include <cudf/scalar/scalar_factories.hpp>
 
 class RollingTest : public cudf::test::BaseFixture {};
 
diff --git a/cpp/tests/streams/stream_compaction_test.cpp b/cpp/tests/streams/stream_compaction_test.cpp
index 07b2d77cc04..e7b282601e1 100644
--- a/cpp/tests/streams/stream_compaction_test.cpp
+++ b/cpp/tests/streams/stream_compaction_test.cpp
@@ -15,20 +15,16 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/table_utilities.hpp>
 
-#include <cudf/copying.hpp>
 #include <cudf/sorting.hpp>
 #include <cudf/stream_compaction.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
-#include <cmath>
-
 auto constexpr NaN          = std::numeric_limits<double>::quiet_NaN();
 auto constexpr KEEP_ANY     = cudf::duplicate_keep_option::KEEP_ANY;
 auto constexpr KEEP_FIRST   = cudf::duplicate_keep_option::KEEP_FIRST;
diff --git a/cpp/tests/streams/strings/factory_test.cpp b/cpp/tests/streams/strings/factory_test.cpp
index 36e595ab9fa..449e0830b0c 100644
--- a/cpp/tests/streams/strings/factory_test.cpp
+++ b/cpp/tests/streams/strings/factory_test.cpp
@@ -15,7 +15,6 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/default_stream.hpp>
 
 #include <cudf/column/column_factories.hpp>
diff --git a/cpp/tests/streams/strings/reverse_test.cpp b/cpp/tests/streams/strings/reverse_test.cpp
index 4b4d0a7aff5..154e1c1b715 100644
--- a/cpp/tests/streams/strings/reverse_test.cpp
+++ b/cpp/tests/streams/strings/reverse_test.cpp
@@ -21,7 +21,6 @@
 #include <cudf/strings/reverse.hpp>
 
 #include <string>
-#include <vector>
 
 class StringsReverseTest : public cudf::test::BaseFixture {};
 
diff --git a/cpp/tests/streams/transform_test.cpp b/cpp/tests/streams/transform_test.cpp
index cf81dc6fb42..9f168abcb31 100644
--- a/cpp/tests/streams/transform_test.cpp
+++ b/cpp/tests/streams/transform_test.cpp
@@ -15,17 +15,11 @@
  */
 
 #include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/default_stream.hpp>
-#include <cudf_test/table_utilities.hpp>
-#include <cudf_test/type_lists.hpp>
 
 #include <cudf/ast/expressions.hpp>
-#include <cudf/column/column.hpp>
-#include <cudf/column/column_factories.hpp>
 #include <cudf/column/column_view.hpp>
-#include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/transform.hpp>
 #include <cudf/types.hpp>
 
diff --git a/cpp/tests/strings/array_tests.cpp b/cpp/tests/strings/array_tests.cpp
index 9c0ecaa52c0..06b9c2fa3c1 100644
--- a/cpp/tests/strings/array_tests.cpp
+++ b/cpp/tests/strings/array_tests.cpp
@@ -23,10 +23,8 @@
 #include <cudf/copying.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/sorting.hpp>
-#include <cudf/strings/strings_column_view.hpp>
 #include <cudf/strings/utilities.hpp>
 #include <cudf/table/table_view.hpp>
-#include <cudf/utilities/default_stream.hpp>
 
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
diff --git a/cpp/tests/strings/combine/concatenate_tests.cpp b/cpp/tests/strings/combine/concatenate_tests.cpp
index bb57d6f5e8a..e53adcf373a 100644
--- a/cpp/tests/strings/combine/concatenate_tests.cpp
+++ b/cpp/tests/strings/combine/concatenate_tests.cpp
@@ -22,7 +22,6 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/combine.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-#include <cudf/table/table.hpp>
 #include <cudf/types.hpp>
 
 #include <thrust/iterator/transform_iterator.h>
diff --git a/cpp/tests/strings/combine/join_list_elements_tests.cpp b/cpp/tests/strings/combine/join_list_elements_tests.cpp
index 00317146088..c92f1cfc8f8 100644
--- a/cpp/tests/strings/combine/join_list_elements_tests.cpp
+++ b/cpp/tests/strings/combine/join_list_elements_tests.cpp
@@ -22,7 +22,6 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/combine.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-#include <cudf/types.hpp>
 
 using namespace cudf::test::iterators;
 
diff --git a/cpp/tests/strings/concatenate_tests.cpp b/cpp/tests/strings/concatenate_tests.cpp
index 5cf4015b9e9..51dcc60d95e 100644
--- a/cpp/tests/strings/concatenate_tests.cpp
+++ b/cpp/tests/strings/concatenate_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,7 +20,6 @@
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/concatenate.hpp>
-#include <cudf/strings/strings_column_view.hpp>
 
 #include <vector>
 
diff --git a/cpp/tests/strings/datetime_tests.cpp b/cpp/tests/strings/datetime_tests.cpp
index b3dc3010c67..da0db0fc056 100644
--- a/cpp/tests/strings/datetime_tests.cpp
+++ b/cpp/tests/strings/datetime_tests.cpp
@@ -19,7 +19,6 @@
 #include <cudf_test/column_wrapper.hpp>
 
 #include <cudf/strings/convert/convert_datetime.hpp>
-#include <cudf/strings/convert/convert_durations.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/unary.hpp>
 #include <cudf/wrappers/durations.hpp>
diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp
index 7e0338f1bf4..37b25d9b287 100644
--- a/cpp/tests/strings/extract_tests.cpp
+++ b/cpp/tests/strings/extract_tests.cpp
@@ -21,7 +21,6 @@
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/table_utilities.hpp>
 
-#include <cudf/detail/iterator.cuh>
 #include <cudf/strings/extract.hpp>
 #include <cudf/strings/regex/regex_program.hpp>
 #include <cudf/strings/strings_column_view.hpp>
diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp
index 4821a7fa999..7eb4b32d078 100644
--- a/cpp/tests/strings/findall_tests.cpp
+++ b/cpp/tests/strings/findall_tests.cpp
@@ -20,7 +20,6 @@
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/iterator_utilities.hpp>
-#include <cudf_test/table_utilities.hpp>
 
 #include <cudf/strings/findall.hpp>
 #include <cudf/strings/regex/regex_program.hpp>
@@ -28,8 +27,6 @@
 
 #include <thrust/iterator/transform_iterator.h>
 
-#include <vector>
-
 struct StringsFindallTests : public cudf::test::BaseFixture {};
 
 TEST_F(StringsFindallTests, FindallTest)
diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp
index 79054551498..b788c05c152 100644
--- a/cpp/tests/strings/fixed_point_tests.cpp
+++ b/cpp/tests/strings/fixed_point_tests.cpp
@@ -23,8 +23,6 @@
 #include <cudf/strings/convert/convert_fixed_point.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 
-#include <limits>
-
 struct StringsConvertTest : public cudf::test::BaseFixture {};
 
 template <typename T>
diff --git a/cpp/tests/strings/integers_tests.cpp b/cpp/tests/strings/integers_tests.cpp
index 26bcfe8028d..c08effdb969 100644
--- a/cpp/tests/strings/integers_tests.cpp
+++ b/cpp/tests/strings/integers_tests.cpp
@@ -24,9 +24,6 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_uvector.hpp>
-
 #include <thrust/host_vector.h>
 #include <thrust/iterator/transform_iterator.h>
 
diff --git a/cpp/tests/structs/structs_column_tests.cpp b/cpp/tests/structs/structs_column_tests.cpp
index 219bd6d8b01..a34ff25cb69 100644
--- a/cpp/tests/structs/structs_column_tests.cpp
+++ b/cpp/tests/structs/structs_column_tests.cpp
@@ -17,28 +17,18 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column_factories.hpp>
-#include <cudf/copying.hpp>
 #include <cudf/detail/iterator.cuh>
-#include <cudf/detail/utilities/device_operators.cuh>
 #include <cudf/lists/lists_column_view.hpp>
-#include <cudf/null_mask.hpp>
-#include <cudf/structs/structs_column_view.hpp>
-#include <cudf/table/table.hpp>
-#include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
 
 #include <rmm/device_buffer.hpp>
 
-#include <thrust/host_vector.h>
 #include <thrust/iterator/counting_iterator.h>
-#include <thrust/scan.h>
-#include <thrust/sequence.h>
 
 #include <algorithm>
 #include <functional>
diff --git a/cpp/tests/structs/utilities_tests.cpp b/cpp/tests/structs/utilities_tests.cpp
index c33eedf9bd9..c0df2f01a63 100644
--- a/cpp/tests/structs/utilities_tests.cpp
+++ b/cpp/tests/structs/utilities_tests.cpp
@@ -14,21 +14,15 @@
  * limitations under the License.
  */
 
-#include "cudf_test/default_stream.hpp"
-
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/column/column_factories.hpp>
-#include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/structs/utilities.hpp>
-#include <cudf/null_mask.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 
diff --git a/cpp/tests/table/row_operators_tests.cpp b/cpp/tests/table/row_operators_tests.cpp
index 5fa63c47cf0..216c4d7b6bb 100644
--- a/cpp/tests/table/row_operators_tests.cpp
+++ b/cpp/tests/table/row_operators_tests.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column_view.hpp>
 #include <cudf/copying.hpp>
diff --git a/cpp/tests/table/table_tests.cpp b/cpp/tests/table/table_tests.cpp
index 1637ba7d7d3..363f1a0ba5d 100644
--- a/cpp/tests/table/table_tests.cpp
+++ b/cpp/tests/table/table_tests.cpp
@@ -17,17 +17,14 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/table_utilities.hpp>
 #include <cudf_test/testing_main.hpp>
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_view.hpp>
-#include <cudf/copying.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 
 #include <memory>
-#include <random>
 
 template <typename T>
 using column_wrapper = cudf::test::fixed_width_column_wrapper<T>;
diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp
index e23f3f6e7d8..ef35a4472cf 100644
--- a/cpp/tests/text/minhash_tests.cpp
+++ b/cpp/tests/text/minhash_tests.cpp
@@ -21,13 +21,9 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-#include <cudf/utilities/span.hpp>
 
 #include <nvtext/minhash.hpp>
 
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_uvector.hpp>
-
 #include <vector>
 
 struct MinHashTest : public cudf::test::BaseFixture {};
diff --git a/cpp/tests/text/ngrams_tests.cpp b/cpp/tests/text/ngrams_tests.cpp
index 1acb4fc4265..c72c7cfc80e 100644
--- a/cpp/tests/text/ngrams_tests.cpp
+++ b/cpp/tests/text/ngrams_tests.cpp
@@ -28,8 +28,6 @@
 
 #include <thrust/iterator/transform_iterator.h>
 
-#include <vector>
-
 struct TextGenerateNgramsTest : public cudf::test::BaseFixture {};
 
 TEST_F(TextGenerateNgramsTest, Ngrams)
diff --git a/cpp/tests/text/normalize_tests.cpp b/cpp/tests/text/normalize_tests.cpp
index b0d41004e7e..2515cc917fa 100644
--- a/cpp/tests/text/normalize_tests.cpp
+++ b/cpp/tests/text/normalize_tests.cpp
@@ -20,7 +20,6 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/copying.hpp>
-#include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <nvtext/normalize.hpp>
diff --git a/cpp/tests/text/stemmer_tests.cpp b/cpp/tests/text/stemmer_tests.cpp
index a343913411c..82c4bf53cfc 100644
--- a/cpp/tests/text/stemmer_tests.cpp
+++ b/cpp/tests/text/stemmer_tests.cpp
@@ -19,7 +19,6 @@
 #include <cudf_test/column_wrapper.hpp>
 
 #include <cudf/column/column.hpp>
-#include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <nvtext/stemmer.hpp>
diff --git a/cpp/tests/text/subword_tests.cpp b/cpp/tests/text/subword_tests.cpp
index a615780c02a..782551ad66e 100644
--- a/cpp/tests/text/subword_tests.cpp
+++ b/cpp/tests/text/subword_tests.cpp
@@ -19,13 +19,11 @@
 #include <cudf_test/column_wrapper.hpp>
 
 #include <cudf/column/column.hpp>
-#include <cudf/column/column_view.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <nvtext/subword_tokenize.hpp>
 
 #include <fstream>
-#include <iostream>
 #include <vector>
 
 // Global environment for temporary files
diff --git a/cpp/tests/transform/bools_to_mask_test.cpp b/cpp/tests/transform/bools_to_mask_test.cpp
index 2684123c08a..9437440f34d 100644
--- a/cpp/tests/transform/bools_to_mask_test.cpp
+++ b/cpp/tests/transform/bools_to_mask_test.cpp
@@ -20,10 +20,8 @@
 #include <cudf_test/testing_main.hpp>
 
 #include <cudf/column/column.hpp>
-#include <cudf/column/column_view.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/transform.hpp>
-#include <cudf/types.hpp>
 
 #include <thrust/host_vector.h>
 
diff --git a/cpp/tests/transform/nans_to_null_test.cpp b/cpp/tests/transform/nans_to_null_test.cpp
index ba16c100e7a..42ca872a936 100644
--- a/cpp/tests/transform/nans_to_null_test.cpp
+++ b/cpp/tests/transform/nans_to_null_test.cpp
@@ -17,12 +17,10 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_view.hpp>
 #include <cudf/transform.hpp>
-#include <cudf/types.hpp>
 
 template <typename T>
 struct NaNsToNullTest : public cudf::test::BaseFixture {
diff --git a/cpp/tests/transpose/transpose_test.cpp b/cpp/tests/transpose/transpose_test.cpp
index 5a88c402b8c..7797b2b2cf8 100644
--- a/cpp/tests/transpose/transpose_test.cpp
+++ b/cpp/tests/transpose/transpose_test.cpp
@@ -22,7 +22,6 @@
 #include <cudf/transpose.hpp>
 
 #include <algorithm>
-#include <limits>
 #include <random>
 #include <string>
 
diff --git a/cpp/tests/types/traits_test.cpp b/cpp/tests/types/traits_test.cpp
index 0d9092c33da..46468af515d 100644
--- a/cpp/tests/types/traits_test.cpp
+++ b/cpp/tests/types/traits_test.cpp
@@ -14,7 +14,6 @@
  * limitations under the License.
  */
 
-#include <cudf_test/base_fixture.hpp>
 #include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
diff --git a/cpp/tests/unary/cast_tests.cpp b/cpp/tests/unary/cast_tests.cpp
index 45b89b76070..ed4c1340dbb 100644
--- a/cpp/tests/unary/cast_tests.cpp
+++ b/cpp/tests/unary/cast_tests.cpp
@@ -20,18 +20,15 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/iterator.cuh>
-#include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/unary.hpp>
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/wrappers/timestamps.hpp>
 
-#include <cuda/std/limits>
 #include <thrust/host_vector.h>
 #include <thrust/iterator/counting_iterator.h>
 
-#include <type_traits>
 #include <vector>
 
 static auto const test_timestamps_D = std::vector<int32_t>{
diff --git a/cpp/tests/unary/math_ops_test.cpp b/cpp/tests/unary/math_ops_test.cpp
index 5bfbf70d5f9..663a919f3f4 100644
--- a/cpp/tests/unary/math_ops_test.cpp
+++ b/cpp/tests/unary/math_ops_test.cpp
@@ -22,10 +22,6 @@
 #include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/dictionary/encode.hpp>
 #include <cudf/unary.hpp>
-#include <cudf/utilities/bit.hpp>
-#include <cudf/wrappers/timestamps.hpp>
-
-#include <cuda/std/climits>
 
 #include <vector>
 
diff --git a/cpp/tests/unary/unary_ops_test.cpp b/cpp/tests/unary/unary_ops_test.cpp
index e7477c34642..3c616461c74 100644
--- a/cpp/tests/unary/unary_ops_test.cpp
+++ b/cpp/tests/unary/unary_ops_test.cpp
@@ -23,7 +23,6 @@
 #include <cudf/detail/iterator.cuh>
 #include <cudf/unary.hpp>
 
-#include <cuda/std/limits>
 #include <thrust/iterator/counting_iterator.h>
 
 template <typename T>
diff --git a/cpp/tests/utilities/random_seed.cpp b/cpp/tests/utilities/random_seed.cpp
index ab5a31ce161..555d89b7dc5 100644
--- a/cpp/tests/utilities/random_seed.cpp
+++ b/cpp/tests/utilities/random_seed.cpp
@@ -13,8 +13,9 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#include <cudf/utilities/export.hpp>
 
-#include <cudf/types.hpp>
+#include <cstdint>
 
 namespace cudf {
 namespace test {
diff --git a/cpp/tests/utilities_tests/column_debug_tests.cpp b/cpp/tests/utilities_tests/column_debug_tests.cpp
index 7aa05af4591..2a57d678d07 100644
--- a/cpp/tests/utilities_tests/column_debug_tests.cpp
+++ b/cpp/tests/utilities_tests/column_debug_tests.cpp
@@ -16,12 +16,9 @@
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/debug_utilities.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/strings/strings_column_view.hpp>
-
 #include <thrust/iterator/transform_iterator.h>
 
 #include <type_traits>
diff --git a/cpp/tests/utilities_tests/column_utilities_tests.cpp b/cpp/tests/utilities_tests/column_utilities_tests.cpp
index 9d6d5ccb9b5..a13ce825d0b 100644
--- a/cpp/tests/utilities_tests/column_utilities_tests.cpp
+++ b/cpp/tests/utilities_tests/column_utilities_tests.cpp
@@ -17,20 +17,16 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/random.hpp>
 #include <cudf_test/testing_main.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/copying.hpp>
 #include <cudf/detail/iterator.cuh>
-#include <cudf/strings/strings_column_view.hpp>
 
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
 
-#include <type_traits>
-
 template <typename T>
 struct ColumnUtilitiesTest : public cudf::test::BaseFixture {
   cudf::test::UniformRandomGenerator<cudf::size_type> random;
diff --git a/cpp/tests/utilities_tests/column_wrapper_tests.cpp b/cpp/tests/utilities_tests/column_wrapper_tests.cpp
index 479c6687e75..339678f3be8 100644
--- a/cpp/tests/utilities_tests/column_wrapper_tests.cpp
+++ b/cpp/tests/utilities_tests/column_wrapper_tests.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/random.hpp>
 #include <cudf_test/type_lists.hpp>
 
diff --git a/cpp/tests/utilities_tests/lists_column_wrapper_tests.cpp b/cpp/tests/utilities_tests/lists_column_wrapper_tests.cpp
index 5e3fda5e6f7..ff50dc39979 100644
--- a/cpp/tests/utilities_tests/lists_column_wrapper_tests.cpp
+++ b/cpp/tests/utilities_tests/lists_column_wrapper_tests.cpp
@@ -17,7 +17,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/column/column_factories.hpp>
diff --git a/cpp/tests/utilities_tests/type_check_tests.cpp b/cpp/tests/utilities_tests/type_check_tests.cpp
index fecb896f95a..c1c5776be74 100644
--- a/cpp/tests/utilities_tests/type_check_tests.cpp
+++ b/cpp/tests/utilities_tests/type_check_tests.cpp
@@ -18,7 +18,6 @@
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/type_lists.hpp>
 
-#include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/type_checks.hpp>
 #include <cudf/wrappers/durations.hpp>
diff --git a/cpp/tests/utilities_tests/type_list_tests.cpp b/cpp/tests/utilities_tests/type_list_tests.cpp
index 849457056e4..6c3a84763a0 100644
--- a/cpp/tests/utilities_tests/type_list_tests.cpp
+++ b/cpp/tests/utilities_tests/type_list_tests.cpp
@@ -14,7 +14,6 @@
  * limitations under the License.
  */
 
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/type_list_utilities.hpp>
 
 using namespace cudf::test;  // this will make reading code way easier
@@ -23,6 +22,7 @@ namespace {
 // Work around to remove parentheses surrounding a type
 template <typename T>
 struct argument_type;
+
 template <typename T, typename U>
 struct argument_type<T(U)> {
   using type = U;

From bf5b778c265b3bfa712f509be0ba268216bcf3d0 Mon Sep 17 00:00:00 2001
From: Christopher Harris <xixonia@gmail.com>
Date: Mon, 28 Oct 2024 23:51:03 -0500
Subject: [PATCH 19/26] Check `num_children() == 0` in
 `Column.from_column_view` (#17193)

This fixes a bug where `Column.from_column_view` does not verify that a string column's offsets child column exists before accessing it, resulting in a segmentation fault when passing a `column_view` from `Column.view()` to `Column.from_column_view(...)`.

The issue can be reproduced with:
```
import cudf
from cudf.core.column.column import as_column
df = cudf.DataFrame({'a': cudf.Series([[]], dtype=cudf.core.dtypes.ListDtype('string'))})
s = df['a']
col = as_column(s)
col2 = cudf._lib.column.Column.back_and_forth(col)
print(col)
print(col2)
```
where `back_and_forth` is defined as:
```
    @staticmethod
    def back_and_forth(Column input_column):
        cdef column_view input_column_view = input_column.view()
        return Column.from_column_view(input_column_view, input_column)
```
I don't have the expertise to write the appropriate tests for this without introducing the `back_and_forth` function as an API, which seems undesirable.

Authors:
  - Christopher Harris (https://github.com/cwharris)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/17193
---
 python/cudf/cudf/_lib/column.pyx | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx
index 065655505b8..94dbdf5534d 100644
--- a/python/cudf/cudf/_lib/column.pyx
+++ b/python/cudf/cudf/_lib/column.pyx
@@ -688,15 +688,18 @@ cdef class Column:
         # special case for string column
         is_string_column = (cv.type().id() == libcudf_types.type_id.STRING)
         if is_string_column:
-            # get the size from offset child column (device to host copy)
-            offsets_column_index = 0
-            offset_child_column = cv.child(offsets_column_index)
-            if offset_child_column.size() == 0:
+            if cv.num_children() == 0:
                 base_nbytes = 0
             else:
-                chars_size = get_element(
-                    offset_child_column, offset_child_column.size()-1).value
-                base_nbytes = chars_size
+                # get the size from offset child column (device to host copy)
+                offsets_column_index = 0
+                offset_child_column = cv.child(offsets_column_index)
+                if offset_child_column.size() == 0:
+                    base_nbytes = 0
+                else:
+                    chars_size = get_element(
+                        offset_child_column, offset_child_column.size()-1).value
+                    base_nbytes = chars_size
 
         if data_ptr:
             if data_owner is None:

From 4b0a634e51f64c68f107683d82ebfea87290efaf Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Tue, 29 Oct 2024 10:42:07 -0400
Subject: [PATCH 20/26] Auto assign PR to author (#16969)

I think most PRs remain unassigned, so this PR adds a workflow that auto-assigns each PR to its author. I think this will help keep our project boards up to date.

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16969
---
 .github/workflows/auto-assign.yml | 17 +++++++++++++++++
 .github/workflows/labeler.yml     |  1 +
 2 files changed, 18 insertions(+)
 create mode 100644 .github/workflows/auto-assign.yml

diff --git a/.github/workflows/auto-assign.yml b/.github/workflows/auto-assign.yml
new file mode 100644
index 00000000000..673bebd4ecc
--- /dev/null
+++ b/.github/workflows/auto-assign.yml
@@ -0,0 +1,17 @@
+name: "Auto Assign PR"
+
+on:
+  pull_request_target:
+    types:
+      - opened
+      - reopened
+      - synchronize
+
+jobs:
+  add_assignees:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions-ecosystem/action-add-assignees@v1
+        with:
+          repo_token: "${{ secrets.GITHUB_TOKEN }}"
+          assignees: ${{ github.actor }}
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
index 31e78f82a62..f5cb71bfc14 100644
--- a/.github/workflows/labeler.yml
+++ b/.github/workflows/labeler.yml
@@ -1,4 +1,5 @@
 name: "Pull Request Labeler"
+
 on:
 - pull_request_target
 

From 3775f7b9f6509bd0f2f75c46edb60abf2522de86 Mon Sep 17 00:00:00 2001
From: Basit Ayantunde <rlamarrr@gmail.com>
Date: Tue, 29 Oct 2024 14:49:52 +0000
Subject: [PATCH 21/26] Fixed unused attribute compilation error for GCC 13
 (#17188)

With `decltype(&pclose)` as the deleter type of the `unique_ptr`, gcc makes the deleter's signature inherit the attributes of `pclose`. The compiler then ignores these attributes with a warning because they do not apply in that context, and since we have `-Werror` on for ignored attributes, the build fails.
This happens on gcc 13.2.0.

Authors:
  - Basit Ayantunde (https://github.com/lamarrr)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Paul Mattione (https://github.com/pmattione-nvidia)
  - Shruti Shivakumar (https://github.com/shrshi)

URL: https://github.com/rapidsai/cudf/pull/17188
---
 cpp/benchmarks/io/cuio_common.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp
index fe24fb58728..45b46005c47 100644
--- a/cpp/benchmarks/io/cuio_common.cpp
+++ b/cpp/benchmarks/io/cuio_common.cpp
@@ -186,7 +186,7 @@ std::string exec_cmd(std::string_view cmd)
   std::fflush(nullptr);
   // Switch stderr and stdout to only capture stderr
   auto const redirected_cmd = std::string{"( "}.append(cmd).append(" 3>&2 2>&1 1>&3) 2>/dev/null");
-  std::unique_ptr<FILE, decltype(&pclose)> pipe(popen(redirected_cmd.c_str(), "r"), pclose);
+  std::unique_ptr<FILE, int (*)(FILE*)> pipe(popen(redirected_cmd.c_str(), "r"), pclose);
   CUDF_EXPECTS(pipe != nullptr, "popen() failed");
 
   std::array<char, 128> buffer;

From ddfb2848d6b7bb3cd03b8377f349f401030f558c Mon Sep 17 00:00:00 2001
From: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
Date: Tue, 29 Oct 2024 09:51:19 -0700
Subject: [PATCH 22/26] Support storing `precision` of decimal types in
 `Schema` class (#17176)

In Spark, the `DecimalType` has a precision, i.e. a specific number of digits used to represent its values. However, when creating a data `Schema`, only the type and name of each column are stored, so that precision information is lost. As such, it would be difficult to reconstruct the original decimal types from cudf's `Schema` instance.

This PR adds a `precision` member variable to the `Schema` class in cudf Java, allowing it to store the precision number of the original decimal column.

Partially contributes to https://github.com/NVIDIA/spark-rapids/issues/11560.

Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Robert (Bobby) Evans (https://github.com/revans2)

URL: https://github.com/rapidsai/cudf/pull/17176
---
 java/src/main/java/ai/rapids/cudf/Schema.java | 77 +++++++++++++++++--
 1 file changed, 70 insertions(+), 7 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/Schema.java b/java/src/main/java/ai/rapids/cudf/Schema.java
index 76b2799aad6..6da591d659f 100644
--- a/java/src/main/java/ai/rapids/cudf/Schema.java
+++ b/java/src/main/java/ai/rapids/cudf/Schema.java
@@ -29,26 +29,52 @@ public class Schema {
   public static final Schema INFERRED = new Schema();
 
   private final DType topLevelType;
+
+  /**
+   * Default value for precision value, when it is not specified or the column type is not decimal.
+   */
+  private static final int UNKNOWN_PRECISION = -1;
+
+  /**
+  * Store precision for the top level column, only applicable if the column is a decimal type.
+  * <p/>
+  * This variable is not designed to be used by any libcudf's APIs since libcudf does not support
+  * precisions for fixed point numbers.
+  * Instead, it is used only to pass down the precision values from Spark's DecimalType to the
+  * JNI level, where some JNI functions require these values to perform their operations.
+  */
+  private final int topLevelPrecision;
+
   private final List<String> childNames;
   private final List<Schema> childSchemas;
   private boolean flattened = false;
   private String[] flattenedNames;
   private DType[] flattenedTypes;
+  private int[] flattenedPrecisions;
   private int[] flattenedCounts;
 
   private Schema(DType topLevelType,
+                 int topLevelPrecision,
                  List<String> childNames,
                  List<Schema> childSchemas) {
     this.topLevelType = topLevelType;
+    this.topLevelPrecision = topLevelPrecision;
     this.childNames = childNames;
     this.childSchemas = childSchemas;
   }
 
+  private Schema(DType topLevelType,
+                 List<String> childNames,
+                 List<Schema> childSchemas) {
+    this(topLevelType, UNKNOWN_PRECISION, childNames, childSchemas);
+  }
+
   /**
    * Inferred schema.
    */
   private Schema() {
     topLevelType = null;
+    topLevelPrecision = UNKNOWN_PRECISION;
     childNames = null;
     childSchemas = null;
   }
@@ -104,14 +130,17 @@ private void flattenIfNeeded() {
       if (flatLen == 0) {
         flattenedNames = null;
         flattenedTypes = null;
+        flattenedPrecisions = null;
         flattenedCounts = null;
       } else {
         String[] names = new String[flatLen];
         DType[] types = new DType[flatLen];
+        int[] precisions = new int[flatLen];
         int[] counts = new int[flatLen];
-        collectFlattened(names, types, counts, 0);
+        collectFlattened(names, types, precisions, counts, 0);
         flattenedNames = names;
         flattenedTypes = types;
+        flattenedPrecisions = precisions;
         flattenedCounts = counts;
       }
       flattened = true;
@@ -128,19 +157,20 @@ private int flattenedLength(int startingLength) {
     return startingLength;
   }
 
-  private int collectFlattened(String[] names, DType[] types, int[] counts, int offset) {
+  private int collectFlattened(String[] names, DType[] types, int[] precisions, int[] counts, int offset) {
     if (childSchemas != null) {
       for (int i = 0; i < childSchemas.size(); i++) {
         Schema child = childSchemas.get(i);
         names[offset] = childNames.get(i);
         types[offset] = child.topLevelType;
+        precisions[offset] = child.topLevelPrecision;
         if (child.childNames != null) {
           counts[offset] = child.childNames.size();
         } else {
           counts[offset] = 0;
         }
         offset++;
-        offset = this.childSchemas.get(i).collectFlattened(names, types, counts, offset);
+        offset = this.childSchemas.get(i).collectFlattened(names, types, precisions, counts, offset);
       }
     }
     return offset;
@@ -226,6 +256,22 @@ public int[] getFlattenedTypeScales() {
     return ret;
   }
 
+  /**
+   * Get decimal precisions of the columns' types flattened from all levels in schema by
+   * depth-first traversal.
+   * <p/>
+   * This is used to pass down the decimal precisions from Spark to only the JNI layer, where
+   * some JNI functions require precision values to perform their operations.
+   * Decimal precisions should not be consumed by any libcudf's APIs since libcudf does not
+   * support precisions for fixed point numbers.
+   *
+   * @return An array containing decimal precision of all columns in schema.
+   */
+  public int[] getFlattenedDecimalPrecisions() {
+    flattenIfNeeded();
+    return flattenedPrecisions;
+  }
+
   /**
    * Get the types of the columns in schema flattened from all levels by depth-first traversal.
    * @return An array containing types of all columns in schema.
@@ -307,11 +353,13 @@ public HostColumnVector.DataType asHostDataType() {
 
   public static class Builder {
     private final DType topLevelType;
+    private final int topLevelPrecision;
     private final List<String> names;
     private final List<Builder> types;
 
-    private Builder(DType topLevelType) {
+    private Builder(DType topLevelType, int topLevelPrecision) {
       this.topLevelType = topLevelType;
+      this.topLevelPrecision = topLevelPrecision;
       if (topLevelType == DType.STRUCT || topLevelType == DType.LIST) {
         // There can be children
         names = new ArrayList<>();
@@ -322,14 +370,19 @@ private Builder(DType topLevelType) {
       }
     }
 
+    private Builder(DType topLevelType) {
+      this(topLevelType, UNKNOWN_PRECISION);
+    }
+
     /**
      * Add a new column
      * @param type the type of column to add
      * @param name the name of the column to add (Ignored for list types)
+     * @param precision the decimal precision, only applicable for decimal types
      * @return the builder for the new column. This should really only be used when the type
      * passed in is a LIST or a STRUCT.
      */
-    public Builder addColumn(DType type, String name) {
+    public Builder addColumn(DType type, String name, int precision) {
       if (names == null) {
         throw new IllegalStateException("A column of type " + topLevelType +
             " cannot have children");
@@ -340,21 +393,31 @@ public Builder addColumn(DType type, String name) {
       if (names.contains(name)) {
         throw new IllegalStateException("Cannot add duplicate names to a schema");
       }
-      Builder ret = new Builder(type);
+      Builder ret = new Builder(type, precision);
       types.add(ret);
       names.add(name);
       return ret;
     }
 
+    public Builder addColumn(DType type, String name) {
+      return addColumn(type, name, UNKNOWN_PRECISION);
+    }
+
     /**
      * Adds a single column to the current schema. addColumn is preferred as it can be used
      * to support nested types.
      * @param type the type of the column.
      * @param name the name of the column.
+     * @param precision the decimal precision, only applicable for decimal types.
      * @return this for chaining.
      */
+    public Builder column(DType type, String name, int precision) {
+      addColumn(type, name, precision);
+      return this;
+    }
+
     public Builder column(DType type, String name) {
-      addColumn(type, name);
+      addColumn(type, name, UNKNOWN_PRECISION);
       return this;
     }
 

From 63b773e73a9f582e2cfa75ae04bcad8608e8f03a Mon Sep 17 00:00:00 2001
From: "Robert (Bobby) Evans" <bobby@apache.org>
Date: Tue, 29 Oct 2024 13:25:55 -0500
Subject: [PATCH 23/26] Add in new java API for raw host memory allocation
 (#17197)

This is the first patch in a series of patches that should make it so that all java host memory allocations go through the DefaultHostMemoryAllocator unless another allocator is explicitly provided.

This is to make it simpler to track/control host memory usage.

Authors:
  - Robert (Bobby) Evans (https://github.com/revans2)

Approvers:
  - Jason Lowe (https://github.com/jlowe)
  - Alessandro Bellina (https://github.com/abellina)

URL: https://github.com/rapidsai/cudf/pull/17197
---
 .../main/java/ai/rapids/cudf/HostMemoryBuffer.java   | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java b/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java
index e4106574a19..d792459901c 100644
--- a/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java
+++ b/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java
@@ -1,6 +1,6 @@
 /*
  *
- *  Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ *  Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -155,6 +155,16 @@ public static HostMemoryBuffer allocate(long bytes) {
     return allocate(bytes, defaultPreferPinned);
   }
 
+  /**
+   * Allocate host memory bypassing the default allocator. This is intended to only be used by other allocators.
+   * Pinned memory will not be used for these allocations.
+   * @param bytes size in bytes to allocate
+   * @return the newly created buffer
+   */
+  public static HostMemoryBuffer allocateRaw(long bytes) {
+    return new HostMemoryBuffer(UnsafeMemoryAccessor.allocate(bytes), bytes);
+  }
+
   /**
    * Create a host buffer that is memory-mapped to a file.
    * @param path path to the file to map into host memory

From 52d7e638af366a2384868c41a7ece889d7ada30e Mon Sep 17 00:00:00 2001
From: Basit Ayantunde <rlamarrr@gmail.com>
Date: Tue, 29 Oct 2024 19:59:13 +0000
Subject: [PATCH 24/26] Unified binary_ops and ast benchmarks parameter names
 (#17200)

This merge request unifies the parameter names of the AST and BINARYOP benchmark suites and makes it easier to perform parameter sweeps and compare the outputs of both benchmarks.

Authors:
  - Basit Ayantunde (https://github.com/lamarrr)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/17200
---
 cpp/benchmarks/ast/transform.cpp              | 26 +++++++++----------
 cpp/benchmarks/binaryop/binaryop.cpp          | 26 +++++++++----------
 cpp/benchmarks/binaryop/compiled_binaryop.cpp | 12 ++++-----
 3 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/cpp/benchmarks/ast/transform.cpp b/cpp/benchmarks/ast/transform.cpp
index 7fe61054a26..2533ea9611c 100644
--- a/cpp/benchmarks/ast/transform.cpp
+++ b/cpp/benchmarks/ast/transform.cpp
@@ -52,14 +52,14 @@ enum class TreeType {
 template <typename key_type, TreeType tree_type, bool reuse_columns, bool Nullable>
 static void BM_ast_transform(nvbench::state& state)
 {
-  auto const table_size  = static_cast<cudf::size_type>(state.get_int64("table_size"));
+  auto const num_rows    = static_cast<cudf::size_type>(state.get_int64("num_rows"));
   auto const tree_levels = static_cast<cudf::size_type>(state.get_int64("tree_levels"));
 
   // Create table data
   auto const n_cols = reuse_columns ? 1 : tree_levels + 1;
   auto const source_table =
     create_sequence_table(cycle_dtypes({cudf::type_to_id<key_type>()}, n_cols),
-                          row_count{table_size},
+                          row_count{num_rows},
                           Nullable ? std::optional<double>{0.5} : std::nullopt);
   auto table = source_table->view();
 
@@ -99,8 +99,8 @@ static void BM_ast_transform(nvbench::state& state)
   auto const& expression_tree_root = expressions.back();
 
   // Use the number of bytes read from global memory
-  state.add_global_memory_reads<key_type>(static_cast<size_t>(table_size) * (tree_levels + 1));
-  state.add_global_memory_writes<key_type>(table_size);
+  state.add_global_memory_reads<key_type>(static_cast<size_t>(num_rows) * (tree_levels + 1));
+  state.add_global_memory_writes<key_type>(num_rows);
 
   state.exec(nvbench::exec_tag::sync,
              [&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); });
@@ -109,15 +109,15 @@ static void BM_ast_transform(nvbench::state& state)
 template <cudf::ast::ast_operator cmp_op, cudf::ast::ast_operator reduce_op>
 static void BM_string_compare_ast_transform(nvbench::state& state)
 {
-  auto const string_width    = static_cast<cudf::size_type>(state.get_int64("string_width"));
-  auto const num_rows        = static_cast<cudf::size_type>(state.get_int64("num_rows"));
-  auto const num_comparisons = static_cast<cudf::size_type>(state.get_int64("num_comparisons"));
-  auto const hit_rate        = static_cast<cudf::size_type>(state.get_int64("hit_rate"));
+  auto const string_width = static_cast<cudf::size_type>(state.get_int64("string_width"));
+  auto const num_rows     = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const tree_levels  = static_cast<cudf::size_type>(state.get_int64("tree_levels"));
+  auto const hit_rate     = static_cast<cudf::size_type>(state.get_int64("hit_rate"));
 
-  CUDF_EXPECTS(num_comparisons > 0, "benchmarks require 1 or more comparisons");
+  CUDF_EXPECTS(tree_levels > 0, "benchmarks require 1 or more comparisons");
 
   // Create table data
-  auto const num_cols = num_comparisons * 2;
+  auto const num_cols = tree_levels * 2;
   std::vector<std::unique_ptr<cudf::column>> columns;
   std::for_each(
     thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_cols), [&](size_t) {
@@ -150,7 +150,7 @@ static void BM_string_compare_ast_transform(nvbench::state& state)
   expressions.emplace_back(cudf::ast::operation(cmp_op, column_refs[0], column_refs[1]));
 
   std::for_each(thrust::make_counting_iterator(1),
-                thrust::make_counting_iterator(num_comparisons),
+                thrust::make_counting_iterator(tree_levels),
                 [&](size_t idx) {
                   auto const& lhs = expressions.back();
                   auto const& rhs = expressions.emplace_back(
@@ -177,7 +177,7 @@ static void BM_string_compare_ast_transform(nvbench::state& state)
   NVBENCH_BENCH(name)                                                                      \
     .set_name(#name)                                                                       \
     .add_int64_axis("tree_levels", {1, 5, 10})                                             \
-    .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000})
+    .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000})
 
 AST_TRANSFORM_BENCHMARK_DEFINE(
   ast_int32_imbalanced_unique, int32_t, TreeType::IMBALANCED_LEFT, false, false);
@@ -202,7 +202,7 @@ AST_TRANSFORM_BENCHMARK_DEFINE(
     .set_name(#name)                                                           \
     .add_int64_axis("string_width", {32, 64, 128, 256})                        \
     .add_int64_axis("num_rows", {32768, 262144, 2097152})                      \
-    .add_int64_axis("num_comparisons", {1, 2, 3, 4})                           \
+    .add_int64_axis("tree_levels", {1, 2, 3, 4})                               \
     .add_int64_axis("hit_rate", {50, 100})
 
 AST_STRING_COMPARE_TRANSFORM_BENCHMARK_DEFINE(ast_string_equal_logical_and,
diff --git a/cpp/benchmarks/binaryop/binaryop.cpp b/cpp/benchmarks/binaryop/binaryop.cpp
index 35e41c6c2a4..75c91d270a7 100644
--- a/cpp/benchmarks/binaryop/binaryop.cpp
+++ b/cpp/benchmarks/binaryop/binaryop.cpp
@@ -40,18 +40,18 @@ enum class TreeType {
 template <typename key_type, TreeType tree_type, bool reuse_columns>
 static void BM_binaryop_transform(nvbench::state& state)
 {
-  auto const table_size{static_cast<cudf::size_type>(state.get_int64("table_size"))};
+  auto const num_rows{static_cast<cudf::size_type>(state.get_int64("num_rows"))};
   auto const tree_levels{static_cast<cudf::size_type>(state.get_int64("tree_levels"))};
 
   // Create table data
   auto const n_cols       = reuse_columns ? 1 : tree_levels + 1;
   auto const source_table = create_sequence_table(
-    cycle_dtypes({cudf::type_to_id<key_type>()}, n_cols), row_count{table_size});
+    cycle_dtypes({cudf::type_to_id<key_type>()}, n_cols), row_count{num_rows});
   cudf::table_view table{*source_table};
 
   // Use the number of bytes read from global memory
-  state.add_global_memory_reads<key_type>(static_cast<size_t>(table_size) * (tree_levels + 1));
-  state.add_global_memory_writes<key_type>(table_size);
+  state.add_global_memory_reads<key_type>(static_cast<size_t>(num_rows) * (tree_levels + 1));
+  state.add_global_memory_writes<key_type>(num_rows);
 
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
     // Execute tree that chains additions like (((a + b) + c) + d)
@@ -74,15 +74,15 @@ static void BM_binaryop_transform(nvbench::state& state)
 template <cudf::binary_operator cmp_op, cudf::binary_operator reduce_op>
 static void BM_string_compare_binaryop_transform(nvbench::state& state)
 {
-  auto const string_width    = static_cast<cudf::size_type>(state.get_int64("string_width"));
-  auto const num_rows        = static_cast<cudf::size_type>(state.get_int64("num_rows"));
-  auto const num_comparisons = static_cast<cudf::size_type>(state.get_int64("num_comparisons"));
-  auto const hit_rate        = static_cast<cudf::size_type>(state.get_int64("hit_rate"));
+  auto const string_width = static_cast<cudf::size_type>(state.get_int64("string_width"));
+  auto const num_rows     = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const tree_levels  = static_cast<cudf::size_type>(state.get_int64("tree_levels"));
+  auto const hit_rate     = static_cast<cudf::size_type>(state.get_int64("hit_rate"));
 
-  CUDF_EXPECTS(num_comparisons > 0, "benchmarks require 1 or more comparisons");
+  CUDF_EXPECTS(tree_levels > 0, "benchmarks require 1 or more comparisons");
 
   // Create table data
-  auto const num_cols = num_comparisons * 2;
+  auto const num_cols = tree_levels * 2;
   std::vector<std::unique_ptr<cudf::column>> columns;
   std::for_each(
     thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_cols), [&](size_t) {
@@ -113,7 +113,7 @@ static void BM_string_compare_binaryop_transform(nvbench::state& state)
       cudf::binary_operation(table.get_column(0), table.get_column(1), cmp_op, bool_type, stream);
     std::for_each(
       thrust::make_counting_iterator(1),
-      thrust::make_counting_iterator(num_comparisons),
+      thrust::make_counting_iterator(tree_levels),
       [&](size_t idx) {
         std::unique_ptr<cudf::column> comparison = cudf::binary_operation(
           table.get_column(idx * 2), table.get_column(idx * 2 + 1), cmp_op, bool_type, stream);
@@ -133,7 +133,7 @@ static void BM_string_compare_binaryop_transform(nvbench::state& state)
   }                                                                                   \
   NVBENCH_BENCH(name)                                                                 \
     .add_int64_axis("tree_levels", {1, 2, 5, 10})                                     \
-    .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000})
+    .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000})
 
 BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_int32_imbalanced_unique,
                                     int32_t,
@@ -158,7 +158,7 @@ BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_double_imbalanced_unique,
     .set_name(#name)                                                                \
     .add_int64_axis("string_width", {32, 64, 128, 256})                             \
     .add_int64_axis("num_rows", {32768, 262144, 2097152})                           \
-    .add_int64_axis("num_comparisons", {1, 2, 3, 4})                                \
+    .add_int64_axis("tree_levels", {1, 2, 3, 4})                                    \
     .add_int64_axis("hit_rate", {50, 100})
 
 STRING_COMPARE_BINARYOP_TRANSFORM_BENCHMARK_DEFINE(string_compare_binaryop_transform,
diff --git a/cpp/benchmarks/binaryop/compiled_binaryop.cpp b/cpp/benchmarks/binaryop/compiled_binaryop.cpp
index cd3c3871a2e..426f44a4fa1 100644
--- a/cpp/benchmarks/binaryop/compiled_binaryop.cpp
+++ b/cpp/benchmarks/binaryop/compiled_binaryop.cpp
@@ -23,10 +23,10 @@
 template <typename TypeLhs, typename TypeRhs, typename TypeOut>
 void BM_compiled_binaryop(nvbench::state& state, cudf::binary_operator binop)
 {
-  auto const table_size = static_cast<cudf::size_type>(state.get_int64("table_size"));
+  auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
 
   auto const source_table = create_random_table(
-    {cudf::type_to_id<TypeLhs>(), cudf::type_to_id<TypeRhs>()}, row_count{table_size});
+    {cudf::type_to_id<TypeLhs>(), cudf::type_to_id<TypeRhs>()}, row_count{num_rows});
 
   auto lhs = cudf::column_view(source_table->get_column(0));
   auto rhs = cudf::column_view(source_table->get_column(1));
@@ -37,9 +37,9 @@ void BM_compiled_binaryop(nvbench::state& state, cudf::binary_operator binop)
   cudf::binary_operation(lhs, rhs, binop, output_dtype);
 
   // use number of bytes read and written to global memory
-  state.add_global_memory_reads<TypeLhs>(table_size);
-  state.add_global_memory_reads<TypeRhs>(table_size);
-  state.add_global_memory_writes<TypeOut>(table_size);
+  state.add_global_memory_reads<TypeLhs>(num_rows);
+  state.add_global_memory_reads<TypeRhs>(num_rows);
+  state.add_global_memory_writes<TypeOut>(num_rows);
 
   state.exec(nvbench::exec_tag::sync,
              [&](nvbench::launch&) { cudf::binary_operation(lhs, rhs, binop, output_dtype); });
@@ -55,7 +55,7 @@ void BM_compiled_binaryop(nvbench::state& state, cudf::binary_operator binop)
   }                                                                           \
   NVBENCH_BENCH(name)                                                         \
     .set_name("compiled_binary_op_" BM_STRINGIFY(name))                       \
-    .add_int64_axis("table_size", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000})
+    .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000})
 
 #define build_name(a, b, c, d) a##_##b##_##c##_##d
 

From 8d7b0d8bf0aebebde0a5036d2e51f5991ecbe63b Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Tue, 29 Oct 2024 16:31:27 -0400
Subject: [PATCH 25/26] [BUG] Replace `repo_token` with `github_token` in Auto
 Assign PR GHA (#17203)

The Auto Assign GHA workflow fails with this [error](https://github.com/rapidsai/cudf/actions/runs/11580081781). This PR fixes this error.
xref #16969

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/17203
---
 .github/workflows/auto-assign.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/auto-assign.yml b/.github/workflows/auto-assign.yml
index 673bebd4ecc..1bf4ac08b69 100644
--- a/.github/workflows/auto-assign.yml
+++ b/.github/workflows/auto-assign.yml
@@ -13,5 +13,5 @@ jobs:
     steps:
       - uses: actions-ecosystem/action-add-assignees@v1
         with:
-          repo_token: "${{ secrets.GITHUB_TOKEN }}"
+          github_token: "${{ secrets.GITHUB_TOKEN }}"
           assignees: ${{ github.actor }}

From eeb4d2780163794f4b705062e49dbdc3283ebce0 Mon Sep 17 00:00:00 2001
From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com>
Date: Tue, 29 Oct 2024 17:12:43 -0400
Subject: [PATCH 26/26] Parquet reader list microkernel (#16538)

This PR refactors fixed-width parquet list reader decoding into its own set of micro-kernels, templatizing the existing fixed-width microkernels. When skipping rows for lists, this will skip ahead the decoding of the definition, repetition, and dictionary rle_streams as well. The list kernel uses 128 threads per block and 71 registers per thread, so I've changed the launch_bounds to enforce a minimum of 8 blocks per SM.  This causes a small register spill but the benchmarks are still faster, as seen below:

DEVICE_BUFFER list benchmarks (decompress + decode, not bound by IO):
run_length 1,   cardinality 0,             no byte_limit: 24.7% faster
run_length 32, cardinality 1000,       no byte_limit: 18.3% faster
run_length 1,   cardinality 0,       500kb byte_limit: 57% faster
run_length 32, cardinality 1000, 500kb byte_limit: 53% faster

Compressed list of ints on hard drive: 5.5% faster
Sample real data on hard drive (many columns not lists): 0.5% faster

Authors:
  - Paul Mattione (https://github.com/pmattione-nvidia)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - https://github.com/nvdbaranec
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/16538
---
 cpp/src/io/parquet/decode_fixed.cu | 585 ++++++++++++++++++++++++-----
 cpp/src/io/parquet/page_hdr.cu     |  17 +-
 cpp/src/io/parquet/parquet_gpu.hpp |  10 +
 cpp/src/io/parquet/reader_impl.cpp |  45 +++
 cpp/src/io/parquet/rle_stream.cuh  |  81 ++--
 5 files changed, 615 insertions(+), 123 deletions(-)

diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu
index 4522ea7fe56..45380e6ea20 100644
--- a/cpp/src/io/parquet/decode_fixed.cu
+++ b/cpp/src/io/parquet/decode_fixed.cu
@@ -37,7 +37,14 @@ struct block_scan_results {
 };
 
 template <int decode_block_size>
-static __device__ void scan_block_exclusive_sum(int thread_bit, block_scan_results& results)
+using block_scan_temp_storage = int[decode_block_size / cudf::detail::warp_size];
+
+// Similar to CUB, must __syncthreads() after calling if reusing temp_storage
+template <int decode_block_size>
+__device__ inline static void scan_block_exclusive_sum(
+  int thread_bit,
+  block_scan_results& results,
+  block_scan_temp_storage<decode_block_size>& temp_storage)
 {
   int const t              = threadIdx.x;
   int const warp_index     = t / cudf::detail::warp_size;
@@ -45,15 +52,19 @@ static __device__ void scan_block_exclusive_sum(int thread_bit, block_scan_resul
   uint32_t const lane_mask = (uint32_t(1) << warp_lane) - 1;
 
   uint32_t warp_bits = ballot(thread_bit);
-  scan_block_exclusive_sum<decode_block_size>(warp_bits, warp_lane, warp_index, lane_mask, results);
+  scan_block_exclusive_sum<decode_block_size>(
+    warp_bits, warp_lane, warp_index, lane_mask, results, temp_storage);
 }
 
+// Similar to CUB, must __syncthreads() after calling if reusing temp_storage
 template <int decode_block_size>
-__device__ static void scan_block_exclusive_sum(uint32_t warp_bits,
-                                                int warp_lane,
-                                                int warp_index,
-                                                uint32_t lane_mask,
-                                                block_scan_results& results)
+__device__ static void scan_block_exclusive_sum(
+  uint32_t warp_bits,
+  int warp_lane,
+  int warp_index,
+  uint32_t lane_mask,
+  block_scan_results& results,
+  block_scan_temp_storage<decode_block_size>& temp_storage)
 {
   // Compute # warps
   constexpr int num_warps = decode_block_size / cudf::detail::warp_size;
@@ -64,49 +75,64 @@ __device__ static void scan_block_exclusive_sum(uint32_t warp_bits,
   results.thread_count_within_warp = __popc(results.warp_bits & lane_mask);
 
   // Share the warp counts amongst the block threads
-  __shared__ int warp_counts[num_warps];
-  if (warp_lane == 0) { warp_counts[warp_index] = results.warp_count; }
-  __syncthreads();
+  if (warp_lane == 0) { temp_storage[warp_index] = results.warp_count; }
+  __syncthreads();  // Sync to share counts between threads/warps
 
   // Compute block-wide results
   results.block_count               = 0;
   results.thread_count_within_block = results.thread_count_within_warp;
   for (int warp_idx = 0; warp_idx < num_warps; ++warp_idx) {
-    results.block_count += warp_counts[warp_idx];
-    if (warp_idx < warp_index) { results.thread_count_within_block += warp_counts[warp_idx]; }
+    results.block_count += temp_storage[warp_idx];
+    if (warp_idx < warp_index) { results.thread_count_within_block += temp_storage[warp_idx]; }
   }
 }
 
-template <int block_size, typename state_buf>
-__device__ inline void gpuDecodeFixedWidthValues(
+template <int block_size, bool has_lists_t, typename state_buf>
+__device__ void gpuDecodeFixedWidthValues(
   page_state_s* s, state_buf* const sb, int start, int end, int t)
 {
   constexpr int num_warps      = block_size / cudf::detail::warp_size;
   constexpr int max_batch_size = num_warps * cudf::detail::warp_size;
 
-  PageNestingDecodeInfo* nesting_info_base = s->nesting_info;
-  int const dtype                          = s->col.physical_type;
+  // nesting level that is storing actual leaf values
+  int const leaf_level_index = s->col.max_nesting_depth - 1;
+  auto const data_out        = s->nesting_info[leaf_level_index].data_out;
+
+  int const dtype          = s->col.physical_type;
+  uint32_t const dtype_len = s->dtype_len;
+
+  int const skipped_leaf_values = s->page.skipped_leaf_values;
 
   // decode values
   int pos = start;
   while (pos < end) {
     int const batch_size = min(max_batch_size, end - pos);
-
     int const target_pos = pos + batch_size;
-    int const src_pos    = pos + t;
+    int const thread_pos = pos + t;
 
-    // the position in the output column/buffer
-    int dst_pos = sb->nz_idx[rolling_index<state_buf::nz_buf_size>(src_pos)] - s->first_row;
+    // Index from value buffer (doesn't include nulls) to final array (has gaps for nulls)
+    int const dst_pos = [&]() {
+      int dst_pos = sb->nz_idx[rolling_index<state_buf::nz_buf_size>(thread_pos)];
+      if constexpr (!has_lists_t) { dst_pos -= s->first_row; }
+      return dst_pos;
+    }();
 
     // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values
     // before first_row) in the flat hierarchy case.
-    if (src_pos < target_pos && dst_pos >= 0) {
+    if (thread_pos < target_pos && dst_pos >= 0) {
       // nesting level that is storing actual leaf values
-      int const leaf_level_index = s->col.max_nesting_depth - 1;
 
-      uint32_t dtype_len = s->dtype_len;
-      void* dst =
-        nesting_info_base[leaf_level_index].data_out + static_cast<size_t>(dst_pos) * dtype_len;
+      // src_pos represents the logical row position we want to read from. But in the case of
+      // nested hierarchies (lists), there is no 1:1 mapping of rows to values. So src_pos
+      // has to take into account the # of values we have to skip in the page to get to the
+      // desired logical row.  For flat hierarchies, skipped_leaf_values will always be 0.
+      int const src_pos = [&]() {
+        if constexpr (has_lists_t) { return thread_pos + skipped_leaf_values; }
+        return thread_pos;
+      }();
+
+      void* const dst = data_out + (static_cast<size_t>(dst_pos) * dtype_len);
+
       if (s->col.logical_type.has_value() && s->col.logical_type->type == LogicalType::DECIMAL) {
         switch (dtype) {
           case INT32: gpuOutputFast(s, sb, src_pos, static_cast<uint32_t*>(dst)); break;
@@ -145,15 +171,15 @@ __device__ inline void gpuDecodeFixedWidthValues(
   }
 }
 
-template <int block_size, typename state_buf>
+template <int block_size, bool has_lists_t, typename state_buf>
 struct decode_fixed_width_values_func {
   __device__ inline void operator()(page_state_s* s, state_buf* const sb, int start, int end, int t)
   {
-    gpuDecodeFixedWidthValues<block_size, state_buf>(s, sb, start, end, t);
+    gpuDecodeFixedWidthValues<block_size, has_lists_t, state_buf>(s, sb, start, end, t);
   }
 };
 
-template <int block_size, typename state_buf>
+template <int block_size, bool has_lists_t, typename state_buf>
 __device__ inline void gpuDecodeFixedWidthSplitValues(
   page_state_s* s, state_buf* const sb, int start, int end, int t)
 {
@@ -161,10 +187,15 @@ __device__ inline void gpuDecodeFixedWidthSplitValues(
   constexpr int num_warps      = block_size / warp_size;
   constexpr int max_batch_size = num_warps * warp_size;
 
-  PageNestingDecodeInfo* nesting_info_base = s->nesting_info;
-  int const dtype                          = s->col.physical_type;
-  auto const data_len                      = thrust::distance(s->data_start, s->data_end);
-  auto const num_values                    = data_len / s->dtype_len_in;
+  // nesting level that is storing actual leaf values
+  int const leaf_level_index = s->col.max_nesting_depth - 1;
+  auto const data_out        = s->nesting_info[leaf_level_index].data_out;
+
+  int const dtype       = s->col.physical_type;
+  auto const data_len   = thrust::distance(s->data_start, s->data_end);
+  auto const num_values = data_len / s->dtype_len_in;
+
+  int const skipped_leaf_values = s->page.skipped_leaf_values;
 
   // decode values
   int pos = start;
@@ -172,21 +203,34 @@ __device__ inline void gpuDecodeFixedWidthSplitValues(
     int const batch_size = min(max_batch_size, end - pos);
 
     int const target_pos = pos + batch_size;
-    int const src_pos    = pos + t;
+    int const thread_pos = pos + t;
 
     // the position in the output column/buffer
-    int dst_pos = sb->nz_idx[rolling_index<state_buf::nz_buf_size>(src_pos)] - s->first_row;
+    // Index from value buffer (doesn't include nulls) to final array (has gaps for nulls)
+    int const dst_pos = [&]() {
+      int dst_pos = sb->nz_idx[rolling_index<state_buf::nz_buf_size>(thread_pos)];
+      if constexpr (!has_lists_t) { dst_pos -= s->first_row; }
+      return dst_pos;
+    }();
 
     // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values
     // before first_row) in the flat hierarchy case.
-    if (src_pos < target_pos && dst_pos >= 0) {
-      // nesting level that is storing actual leaf values
-      int const leaf_level_index = s->col.max_nesting_depth - 1;
+    if (thread_pos < target_pos && dst_pos >= 0) {
+      // src_pos represents the logical row position we want to read from. But in the case of
+      // nested hierarchies (lists), there is no 1:1 mapping of rows to values. So src_pos
+      // has to take into account the # of values we have to skip in the page to get to the
+      // desired logical row.  For flat hierarchies, skipped_leaf_values will always be 0.
+      int const src_pos = [&]() {
+        if constexpr (has_lists_t) {
+          return thread_pos + skipped_leaf_values;
+        } else {
+          return thread_pos;
+        }
+      }();
 
-      uint32_t dtype_len = s->dtype_len;
-      uint8_t const* src = s->data_start + src_pos;
-      uint8_t* dst =
-        nesting_info_base[leaf_level_index].data_out + static_cast<size_t>(dst_pos) * dtype_len;
+      uint32_t const dtype_len = s->dtype_len;
+      uint8_t const* const src = s->data_start + src_pos;
+      uint8_t* const dst       = data_out + static_cast<size_t>(dst_pos) * dtype_len;
       auto const is_decimal =
         s->col.logical_type.has_value() and s->col.logical_type->type == LogicalType::DECIMAL;
 
@@ -239,11 +283,11 @@ __device__ inline void gpuDecodeFixedWidthSplitValues(
   }
 }
 
-template <int block_size, typename state_buf>
+template <int block_size, bool has_lists_t, typename state_buf>
 struct decode_fixed_width_split_values_func {
   __device__ inline void operator()(page_state_s* s, state_buf* const sb, int start, int end, int t)
   {
-    gpuDecodeFixedWidthSplitValues<block_size, state_buf>(s, sb, start, end, t);
+    gpuDecodeFixedWidthSplitValues<block_size, has_lists_t, state_buf>(s, sb, start, end, t);
   }
 };
 
@@ -274,12 +318,14 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested(
     int const batch_size = min(max_batch_size, capped_target_value_count - value_count);
 
     // definition level
-    int d = 1;
-    if (t >= batch_size) {
-      d = -1;
-    } else if (def) {
-      d = static_cast<int>(def[rolling_index<state_buf::nz_buf_size>(value_count + t)]);
-    }
+    int const d = [&]() {
+      if (t >= batch_size) {
+        return -1;
+      } else if (def) {
+        return static_cast<int>(def[rolling_index<state_buf::nz_buf_size>(value_count + t)]);
+      }
+      return 1;
+    }();
 
     int const thread_value_count = t;
     int const block_value_count  = batch_size;
@@ -340,6 +386,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested(
         if (is_valid) {
           int const dst_pos = value_count + thread_value_count;
           int const src_pos = max_depth_valid_count + thread_valid_count;
+
           sb->nz_idx[rolling_index<state_buf::nz_buf_size>(src_pos)] = dst_pos;
         }
         // update stuff
@@ -396,16 +443,16 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat(
     int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row);
 
     // use definition level & row bounds to determine if is valid
-    int is_valid;
-    if (t >= batch_size) {
-      is_valid = 0;
-    } else if (def) {
-      int const def_level =
-        static_cast<int>(def[rolling_index<state_buf::nz_buf_size>(value_count + t)]);
-      is_valid = ((def_level > 0) && in_row_bounds) ? 1 : 0;
-    } else {
-      is_valid = in_row_bounds;
-    }
+    int const is_valid = [&]() {
+      if (t >= batch_size) {
+        return 0;
+      } else if (def) {
+        int const def_level =
+          static_cast<int>(def[rolling_index<state_buf::nz_buf_size>(value_count + t)]);
+        return ((def_level > 0) && in_row_bounds) ? 1 : 0;
+      }
+      return in_row_bounds;
+    }();
 
     // thread and block validity count
     using block_scan = cub::BlockScan<int, decode_block_size>;
@@ -447,8 +494,9 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat(
 
     // output offset
     if (is_valid) {
-      int const dst_pos                                          = value_count + thread_value_count;
-      int const src_pos                                          = valid_count + thread_valid_count;
+      int const dst_pos = value_count + thread_value_count;
+      int const src_pos = valid_count + thread_valid_count;
+
       sb->nz_idx[rolling_index<state_buf::nz_buf_size>(src_pos)] = dst_pos;
     }
 
@@ -460,7 +508,7 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat(
   if (t == 0) {
     // update valid value count for decoding and total # of values we've processed
     ni.valid_count       = valid_count;
-    ni.value_count       = value_count;  // TODO: remove? this is unused in the non-list path
+    ni.value_count       = value_count;
     s->nz_count          = valid_count;
     s->input_value_count = value_count;
     s->input_row_count   = value_count;
@@ -533,6 +581,239 @@ static __device__ int gpuUpdateValidityAndRowIndicesNonNullable(int32_t target_v
   return valid_count;
 }
 
+template <int decode_block_size, bool nullable, typename level_t, typename state_buf>
+static __device__ int gpuUpdateValidityAndRowIndicesLists(int32_t target_value_count,
+                                                          page_state_s* s,
+                                                          state_buf* sb,
+                                                          level_t const* const def,
+                                                          level_t const* const rep,
+                                                          int t)
+{
+  constexpr int num_warps      = decode_block_size / cudf::detail::warp_size;
+  constexpr int max_batch_size = num_warps * cudf::detail::warp_size;
+
+  // how many (input) values we've processed in the page so far, prior to this loop iteration
+  int value_count = s->input_value_count;
+
+  // how many rows we've processed in the page so far
+  int input_row_count = s->input_row_count;
+
+  // cap by last row so that we don't process any rows past what we want to output.
+  int const first_row = s->first_row;
+  int const last_row  = first_row + s->num_rows;
+
+  int const row_index_lower_bound = s->row_index_lower_bound;
+  int const max_depth             = s->col.max_nesting_depth - 1;
+  int max_depth_valid_count       = s->nesting_info[max_depth].valid_count;
+
+  int const warp_index     = t / cudf::detail::warp_size;
+  int const warp_lane      = t % cudf::detail::warp_size;
+  bool const is_first_lane = (warp_lane == 0);
+
+  __syncthreads();
+  __shared__ block_scan_temp_storage<decode_block_size> temp_storage;
+
+  while (value_count < target_value_count) {
+    bool const within_batch = value_count + t < target_value_count;
+
+    // get definition level, use repetition level to get start/end depth
+    // different for each thread, as each thread has a different r/d
+    auto const [def_level, start_depth, end_depth] = [&]() {
+      if (!within_batch) { return cuda::std::make_tuple(-1, -1, -1); }
+
+      int const level_index = rolling_index<state_buf::nz_buf_size>(value_count + t);
+      int const rep_level   = static_cast<int>(rep[level_index]);
+      int const start_depth = s->nesting_info[rep_level].start_depth;
+
+      if constexpr (!nullable) {
+        return cuda::std::make_tuple(-1, start_depth, max_depth);
+      } else {
+        if (def != nullptr) {
+          int const def_level = static_cast<int>(def[level_index]);
+          return cuda::std::make_tuple(
+            def_level, start_depth, s->nesting_info[def_level].end_depth);
+        } else {
+          return cuda::std::make_tuple(1, start_depth, max_depth);
+        }
+      }
+    }();
+
+    // Determine value count & row index
+    //  track (page-relative) row index for the thread so we can compare against input bounds
+    //  keep track of overall # of rows we've read.
+    int const is_new_row = start_depth == 0 ? 1 : 0;
+    int num_prior_new_rows, total_num_new_rows;
+    {
+      block_scan_results new_row_scan_results;
+      scan_block_exclusive_sum<decode_block_size>(is_new_row, new_row_scan_results, temp_storage);
+      __syncthreads();
+      num_prior_new_rows = new_row_scan_results.thread_count_within_block;
+      total_num_new_rows = new_row_scan_results.block_count;
+    }
+
+    int const row_index = input_row_count + ((num_prior_new_rows + is_new_row) - 1);
+    input_row_count += total_num_new_rows;
+    int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row);
+
+    // VALUE COUNT:
+    // in_nesting_bounds: if at a nesting level where we need to add value indices
+    // the bounds: from current rep to the rep AT the def depth
+    int in_nesting_bounds = ((0 >= start_depth && 0 <= end_depth) && in_row_bounds) ? 1 : 0;
+    int thread_value_count_within_warp, warp_value_count, thread_value_count, block_value_count;
+    {
+      block_scan_results value_count_scan_results;
+      scan_block_exclusive_sum<decode_block_size>(
+        in_nesting_bounds, value_count_scan_results, temp_storage);
+      __syncthreads();
+
+      thread_value_count_within_warp = value_count_scan_results.thread_count_within_warp;
+      warp_value_count               = value_count_scan_results.warp_count;
+      thread_value_count             = value_count_scan_results.thread_count_within_block;
+      block_value_count              = value_count_scan_results.block_count;
+    }
+
+    // iterate by depth
+    for (int d_idx = 0; d_idx <= max_depth; d_idx++) {
+      auto& ni = s->nesting_info[d_idx];
+
+      // everything up to the max_def_level is a non-null value
+      int const is_valid = [&](int input_def_level) {
+        if constexpr (nullable) {
+          return ((input_def_level >= ni.max_def_level) && in_nesting_bounds) ? 1 : 0;
+        } else {
+          return in_nesting_bounds;
+        }
+      }(def_level);
+
+      // VALID COUNT:
+      // Not all values visited by this block will represent a value at this nesting level.
+      // the validity bit for thread t might actually represent output value t-6.
+      // the correct position for thread t's bit is thread_value_count.
+      uint32_t const warp_valid_mask =
+        WarpReduceOr32((uint32_t)is_valid << thread_value_count_within_warp);
+      int thread_valid_count, block_valid_count;
+      {
+        auto thread_mask = (uint32_t(1) << thread_value_count_within_warp) - 1;
+
+        block_scan_results valid_count_scan_results;
+        scan_block_exclusive_sum<decode_block_size>(warp_valid_mask,
+                                                    warp_lane,
+                                                    warp_index,
+                                                    thread_mask,
+                                                    valid_count_scan_results,
+                                                    temp_storage);
+        __syncthreads();
+        thread_valid_count = valid_count_scan_results.thread_count_within_block;
+        block_valid_count  = valid_count_scan_results.block_count;
+      }
+
+      // compute warp and thread value counts for the -next- nesting level. we need to
+      // do this for lists so that we can emit an offset for the -current- nesting level.
+      // the offset for the current nesting level == current length of the next nesting level
+      int next_thread_value_count_within_warp = 0, next_warp_value_count = 0;
+      int next_thread_value_count = 0, next_block_value_count = 0;
+      int next_in_nesting_bounds = 0;
+      if (d_idx < max_depth) {
+        // NEXT DEPTH VALUE COUNT:
+        next_in_nesting_bounds =
+          ((d_idx + 1 >= start_depth) && (d_idx + 1 <= end_depth) && in_row_bounds) ? 1 : 0;
+        {
+          block_scan_results next_value_count_scan_results;
+          scan_block_exclusive_sum<decode_block_size>(
+            next_in_nesting_bounds, next_value_count_scan_results, temp_storage);
+          __syncthreads();
+
+          next_thread_value_count_within_warp =
+            next_value_count_scan_results.thread_count_within_warp;
+          next_warp_value_count   = next_value_count_scan_results.warp_count;
+          next_thread_value_count = next_value_count_scan_results.thread_count_within_block;
+          next_block_value_count  = next_value_count_scan_results.block_count;
+        }
+
+        // STORE OFFSET TO THE LIST LOCATION
+        // if we're -not- at a leaf column and we're within nesting/row bounds
+        // and we have a valid data_out pointer, it implies this is a list column, so
+        // emit an offset.
+        if (in_nesting_bounds && ni.data_out != nullptr) {
+          const auto& next_ni = s->nesting_info[d_idx + 1];
+          int const idx       = ni.value_count + thread_value_count;
+          cudf::size_type const ofs =
+            next_ni.value_count + next_thread_value_count + next_ni.page_start_value;
+
+          (reinterpret_cast<cudf::size_type*>(ni.data_out))[idx] = ofs;
+        }
+      }
+
+      // validity is processed per-warp (on lane 0's)
+      // this is because when atomic writes are needed, they are 32-bit operations
+      //
+      // lists always read and write to the same bounds
+      // (that is, read and write positions are already pre-bounded by first_row/num_rows).
+      // since we are about to write the validity vector
+      // here we need to adjust our computed mask to take into account the write row bounds.
+      if constexpr (nullable) {
+        if (is_first_lane && (ni.valid_map != nullptr) && (warp_value_count > 0)) {
+          // absolute bit offset into the output validity map
+          // is cumulative sum of warp_value_count at the given nesting depth
+          // DON'T subtract by first_row: since it's lists it's not 1-row-per-value
+          int const bit_offset = ni.valid_map_offset + thread_value_count;
+
+          store_validity(bit_offset, ni.valid_map, warp_valid_mask, warp_value_count);
+        }
+
+        if (t == 0) { ni.null_count += block_value_count - block_valid_count; }
+      }
+
+      // if this is valid and we're at the leaf, output dst_pos
+      // Read value_count before the sync, so that when thread 0 modifies it we've already read its
+      // value
+      int const current_value_count = ni.value_count;
+      __syncthreads();  // guard against modification of ni.value_count below
+      if (d_idx == max_depth) {
+        if (is_valid) {
+          int const dst_pos      = current_value_count + thread_value_count;
+          int const src_pos      = max_depth_valid_count + thread_valid_count;
+          int const output_index = rolling_index<state_buf::nz_buf_size>(src_pos);
+
+          // Index from rolling buffer of values (which doesn't include nulls) to final array (which
+          // includes gaps for nulls)
+          sb->nz_idx[output_index] = dst_pos;
+        }
+        max_depth_valid_count += block_valid_count;
+      }
+
+      // update stuff
+      if (t == 0) {
+        ni.value_count += block_value_count;
+        ni.valid_map_offset += block_value_count;
+      }
+      __syncthreads();  // sync modification of ni.value_count
+
+      // propagate value counts for the next depth level
+      block_value_count              = next_block_value_count;
+      thread_value_count             = next_thread_value_count;
+      in_nesting_bounds              = next_in_nesting_bounds;
+      warp_value_count               = next_warp_value_count;
+      thread_value_count_within_warp = next_thread_value_count_within_warp;
+    }  // END OF DEPTH LOOP
+
+    int const batch_size = min(max_batch_size, target_value_count - value_count);
+    value_count += batch_size;
+  }
+
+  if (t == 0) {
+    // update valid value count for decoding and total # of values we've processed
+    s->nesting_info[max_depth].valid_count = max_depth_valid_count;
+    s->nz_count                            = max_depth_valid_count;
+    s->input_value_count                   = value_count;
+
+    // If we have lists # rows != # values
+    s->input_row_count = input_row_count;
+  }
+
+  return max_depth_valid_count;
+}
+
 // is the page marked nullable or not
 __device__ inline bool is_nullable(page_state_s* s)
 {
@@ -560,6 +841,23 @@ __device__ inline bool maybe_has_nulls(page_state_s* s)
   return run_val != s->col.max_level[lvl];
 }
 
+template <int rolling_buf_size, typename stream_type>
+__device__ int skip_decode(stream_type& parquet_stream, int num_to_skip, int t)
+{
+  // it could be that (e.g.) we skip 5000 but starting at row 4000 we have a run of length 2000:
+  // in that case skip_decode() only skips 4000, and we have to process the remaining 1000 up front
+  // modulo 2 * block_size of course, since that's as many as we process at once
+  int num_skipped = parquet_stream.skip_decode(t, num_to_skip);
+  while (num_skipped < num_to_skip) {
+    // TODO: Instead of decoding, skip within the run to the appropriate location
+    auto const to_decode = min(rolling_buf_size, num_to_skip - num_skipped);
+    num_skipped += parquet_stream.decode_next(t, to_decode);
+    __syncthreads();
+  }
+
+  return num_skipped;
+}
+
 /**
  * @brief Kernel for computing fixed width non dictionary column data stored in the pages
  *
@@ -579,9 +877,10 @@ template <typename level_t,
           decode_kernel_mask kernel_mask_t,
           bool has_dict_t,
           bool has_nesting_t,
-          template <int block_size, typename state_buf>
+          bool has_lists_t,
+          template <int block_size, bool decode_has_lists_t, typename state_buf>
           typename DecodeValuesFunc>
-CUDF_KERNEL void __launch_bounds__(decode_block_size_t)
+CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8)
   gpuDecodePageDataGeneric(PageInfo* pages,
                            device_span<ColumnChunkDesc const> chunks,
                            size_t min_row,
@@ -621,31 +920,29 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t)
   // if we have no work to do (eg, in a skip_rows/num_rows case) in this page.
   if (s->num_rows == 0) { return; }
 
-  DecodeValuesFunc<decode_block_size_t, state_buf_t> decode_values;
+  DecodeValuesFunc<decode_block_size_t, has_lists_t, state_buf_t> decode_values;
 
-  bool const nullable             = is_nullable(s);
-  bool const should_process_nulls = nullable && maybe_has_nulls(s);
+  bool const should_process_nulls = is_nullable(s) && maybe_has_nulls(s);
 
   // shared buffer. all shared memory is suballocated out of here
-  // constexpr int shared_rep_size = has_lists_t ? cudf::util::round_up_unsafe(rle_run_buffer_size *
-  // sizeof(rle_run<level_t>), size_t{16}) : 0;
+  constexpr int shared_rep_size =
+    has_lists_t
+      ? cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run<level_t>), size_t{16})
+      : 0;
   constexpr int shared_dict_size =
     has_dict_t
       ? cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run<uint32_t>), size_t{16})
       : 0;
   constexpr int shared_def_size =
     cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run<level_t>), size_t{16});
-  constexpr int shared_buf_size = /*shared_rep_size +*/ shared_dict_size + shared_def_size;
+  constexpr int shared_buf_size = shared_rep_size + shared_dict_size + shared_def_size;
   __shared__ __align__(16) uint8_t shared_buf[shared_buf_size];
 
   // setup all shared memory buffers
-  int shared_offset = 0;
-  /*
-  rle_run<level_t> *rep_runs = reinterpret_cast<rle_run<level_t>*>(shared_buf + shared_offset);
-  if constexpr (has_lists_t){
-    shared_offset += shared_rep_size;
-  }
-  */
+  int shared_offset          = 0;
+  rle_run<level_t>* rep_runs = reinterpret_cast<rle_run<level_t>*>(shared_buf + shared_offset);
+  if constexpr (has_lists_t) { shared_offset += shared_rep_size; }
+
   rle_run<uint32_t>* dict_runs = reinterpret_cast<rle_run<uint32_t>*>(shared_buf + shared_offset);
   if constexpr (has_dict_t) { shared_offset += shared_dict_size; }
   rle_run<level_t>* def_runs = reinterpret_cast<rle_run<level_t>*>(shared_buf + shared_offset);
@@ -660,38 +957,51 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t)
                      def,
                      s->page.num_input_values);
   }
-  /*
+
   rle_stream<level_t, decode_block_size_t, rolling_buf_size> rep_decoder{rep_runs};
   level_t* const rep = reinterpret_cast<level_t*>(pp->lvl_decode_buf[level_type::REPETITION]);
-  if constexpr(has_lists_t){
+  if constexpr (has_lists_t) {
     rep_decoder.init(s->col.level_bits[level_type::REPETITION],
                      s->abs_lvl_start[level_type::REPETITION],
                      s->abs_lvl_end[level_type::REPETITION],
                      rep,
                      s->page.num_input_values);
   }
-  */
 
   rle_stream<uint32_t, decode_block_size_t, rolling_buf_size> dict_stream{dict_runs};
   if constexpr (has_dict_t) {
     dict_stream.init(
       s->dict_bits, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values);
   }
-  __syncthreads();
 
   // We use two counters in the loop below: processed_count and valid_count.
-  // - processed_count: number of rows out of num_input_values that we have decoded so far.
+  // - processed_count: number of values out of num_input_values that we have decoded so far.
   //   the definition stream returns the number of total rows it has processed in each call
   //   to decode_next and we accumulate in process_count.
-  // - valid_count: number of non-null rows we have decoded so far. In each iteration of the
+  // - valid_count: number of non-null values we have decoded so far. In each iteration of the
   //   loop below, we look at the number of valid items (which could be all for non-nullable),
   //   and valid_count is that running count.
   int processed_count = 0;
   int valid_count     = 0;
+
+  // Skip ahead in the decoding so that we don't repeat work (skipped_leaf_values = 0 for non-lists)
+  if constexpr (has_lists_t) {
+    auto const skipped_leaf_values = s->page.skipped_leaf_values;
+    if (skipped_leaf_values > 0) {
+      if (should_process_nulls) {
+        skip_decode<rolling_buf_size>(def_decoder, skipped_leaf_values, t);
+      }
+      processed_count = skip_decode<rolling_buf_size>(rep_decoder, skipped_leaf_values, t);
+      if constexpr (has_dict_t) {
+        skip_decode<rolling_buf_size>(dict_stream, skipped_leaf_values, t);
+      }
+    }
+  }
+
   // the core loop. decode batches of level stream data using rle_stream objects
   // and pass the results to gpuDecodeValues
   // For chunked reads we may not process all of the rows on the page; if not stop early
-  int last_row = s->first_row + s->num_rows;
+  int const last_row = s->first_row + s->num_rows;
   while ((s->error == 0) && (processed_count < s->page.num_input_values) &&
          (s->input_row_count <= last_row)) {
     int next_valid_count;
@@ -701,7 +1011,12 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t)
       processed_count += def_decoder.decode_next(t);
       __syncthreads();
 
-      if constexpr (has_nesting_t) {
+      if constexpr (has_lists_t) {
+        rep_decoder.decode_next(t);
+        __syncthreads();
+        next_valid_count = gpuUpdateValidityAndRowIndicesLists<decode_block_size_t, true, level_t>(
+          processed_count, s, sb, def, rep, t);
+      } else if constexpr (has_nesting_t) {
         next_valid_count = gpuUpdateValidityAndRowIndicesNested<decode_block_size_t, level_t>(
           processed_count, s, sb, def, t);
       } else {
@@ -713,9 +1028,16 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t)
     // this function call entirely since all it will ever generate is a mapping of (i -> i) for
     // nz_idx.  gpuDecodeFixedWidthValues would be the only work that happens.
     else {
-      processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count);
-      next_valid_count =
-        gpuUpdateValidityAndRowIndicesNonNullable<decode_block_size_t>(processed_count, s, sb, t);
+      if constexpr (has_lists_t) {
+        processed_count += rep_decoder.decode_next(t);
+        __syncthreads();
+        next_valid_count = gpuUpdateValidityAndRowIndicesLists<decode_block_size_t, false, level_t>(
+          processed_count, s, sb, nullptr, rep, t);
+      } else {
+        processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count);
+        next_valid_count =
+          gpuUpdateValidityAndRowIndicesNonNullable<decode_block_size_t>(processed_count, s, sb, t);
+      }
     }
     __syncthreads();
 
@@ -745,6 +1067,7 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span<PageInfo> pages,
                                   size_t min_row,
                                   int level_type_size,
                                   bool has_nesting,
+                                  bool is_list,
                                   kernel_error::pointer error_code,
                                   rmm::cuda_stream_view stream)
 {
@@ -754,12 +1077,23 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span<PageInfo> pages,
   dim3 dim_grid(pages.size(), 1);  // 1 threadblock per page
 
   if (level_type_size == 1) {
-    if (has_nesting) {
+    if (is_list) {
+      gpuDecodePageDataGeneric<uint8_t,
+                               decode_block_size,
+                               decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST,
+                               false,
+                               true,
+                               true,
+                               decode_fixed_width_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    } else if (has_nesting) {
       gpuDecodePageDataGeneric<uint8_t,
                                decode_block_size,
                                decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED,
                                false,
                                true,
+                               false,
                                decode_fixed_width_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
@@ -769,17 +1103,29 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span<PageInfo> pages,
                                decode_kernel_mask::FIXED_WIDTH_NO_DICT,
                                false,
                                false,
+                               false,
                                decode_fixed_width_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
     }
   } else {
-    if (has_nesting) {
+    if (is_list) {
+      gpuDecodePageDataGeneric<uint16_t,
+                               decode_block_size,
+                               decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST,
+                               false,
+                               true,
+                               true,
+                               decode_fixed_width_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    } else if (has_nesting) {
       gpuDecodePageDataGeneric<uint16_t,
                                decode_block_size,
                                decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED,
                                false,
                                true,
+                               false,
                                decode_fixed_width_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
@@ -789,6 +1135,7 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span<PageInfo> pages,
                                decode_kernel_mask::FIXED_WIDTH_NO_DICT,
                                false,
                                false,
+                               false,
                                decode_fixed_width_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
@@ -802,6 +1149,7 @@ void __host__ DecodePageDataFixedDict(cudf::detail::hostdevice_span<PageInfo> pa
                                       size_t min_row,
                                       int level_type_size,
                                       bool has_nesting,
+                                      bool is_list,
                                       kernel_error::pointer error_code,
                                       rmm::cuda_stream_view stream)
 {
@@ -811,12 +1159,23 @@ void __host__ DecodePageDataFixedDict(cudf::detail::hostdevice_span<PageInfo> pa
   dim3 dim_grid(pages.size(), 1);        // 1 thread block per page => # blocks
 
   if (level_type_size == 1) {
-    if (has_nesting) {
+    if (is_list) {
+      gpuDecodePageDataGeneric<uint8_t,
+                               decode_block_size,
+                               decode_kernel_mask::FIXED_WIDTH_DICT_LIST,
+                               true,
+                               true,
+                               true,
+                               decode_fixed_width_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    } else if (has_nesting) {
       gpuDecodePageDataGeneric<uint8_t,
                                decode_block_size,
                                decode_kernel_mask::FIXED_WIDTH_DICT_NESTED,
                                true,
                                true,
+                               false,
                                decode_fixed_width_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
@@ -826,17 +1185,29 @@ void __host__ DecodePageDataFixedDict(cudf::detail::hostdevice_span<PageInfo> pa
                                decode_kernel_mask::FIXED_WIDTH_DICT,
                                true,
                                false,
+                               false,
                                decode_fixed_width_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
     }
   } else {
-    if (has_nesting) {
+    if (is_list) {
+      gpuDecodePageDataGeneric<uint16_t,
+                               decode_block_size,
+                               decode_kernel_mask::FIXED_WIDTH_DICT_LIST,
+                               true,
+                               true,
+                               true,
+                               decode_fixed_width_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    } else if (has_nesting) {
       gpuDecodePageDataGeneric<uint16_t,
                                decode_block_size,
                                decode_kernel_mask::FIXED_WIDTH_DICT_NESTED,
                                true,
                                true,
+                               false,
                                decode_fixed_width_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
@@ -846,6 +1217,7 @@ void __host__ DecodePageDataFixedDict(cudf::detail::hostdevice_span<PageInfo> pa
                                decode_kernel_mask::FIXED_WIDTH_DICT,
                                true,
                                false,
+                               false,
                                decode_fixed_width_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
@@ -860,6 +1232,7 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span<PageInfo> pages,
                               size_t min_row,
                               int level_type_size,
                               bool has_nesting,
+                              bool is_list,
                               kernel_error::pointer error_code,
                               rmm::cuda_stream_view stream)
 {
@@ -869,12 +1242,23 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span<PageInfo> pages,
   dim3 dim_grid(pages.size(), 1);        // 1 thread block per page => # blocks
 
   if (level_type_size == 1) {
-    if (has_nesting) {
+    if (is_list) {
+      gpuDecodePageDataGeneric<uint8_t,
+                               decode_block_size,
+                               decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST,
+                               false,
+                               true,
+                               true,
+                               decode_fixed_width_split_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    } else if (has_nesting) {
       gpuDecodePageDataGeneric<uint8_t,
                                decode_block_size,
                                decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED,
                                false,
                                true,
+                               false,
                                decode_fixed_width_split_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
@@ -884,17 +1268,29 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span<PageInfo> pages,
                                decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT,
                                false,
                                false,
+                               false,
                                decode_fixed_width_split_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
     }
   } else {
-    if (has_nesting) {
+    if (is_list) {
+      gpuDecodePageDataGeneric<uint16_t,
+                               decode_block_size,
+                               decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST,
+                               false,
+                               true,
+                               true,
+                               decode_fixed_width_split_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    } else if (has_nesting) {
       gpuDecodePageDataGeneric<uint16_t,
                                decode_block_size,
                                decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED,
                                false,
                                true,
+                               false,
                                decode_fixed_width_split_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
@@ -904,6 +1300,7 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span<PageInfo> pages,
                                decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT,
                                false,
                                false,
+                               false,
                                decode_fixed_width_split_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
           pages.device_ptr(), chunks, min_row, num_rows, error_code);
diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu
index d604642be54..52d53cb8225 100644
--- a/cpp/src/io/parquet/page_hdr.cu
+++ b/cpp/src/io/parquet/page_hdr.cu
@@ -183,17 +183,20 @@ __device__ decode_kernel_mask kernel_mask_for_page(PageInfo const& page,
     return decode_kernel_mask::STRING;
   }
 
-  if (!is_list(chunk) && !is_byte_array(chunk) && !is_boolean(chunk)) {
+  if (!is_byte_array(chunk) && !is_boolean(chunk)) {
     if (page.encoding == Encoding::PLAIN) {
-      return is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED
-                              : decode_kernel_mask::FIXED_WIDTH_NO_DICT;
+      return is_list(chunk)     ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST
+             : is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED
+                                : decode_kernel_mask::FIXED_WIDTH_NO_DICT;
     } else if (page.encoding == Encoding::PLAIN_DICTIONARY ||
                page.encoding == Encoding::RLE_DICTIONARY) {
-      return is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_DICT_NESTED
-                              : decode_kernel_mask::FIXED_WIDTH_DICT;
+      return is_list(chunk)     ? decode_kernel_mask::FIXED_WIDTH_DICT_LIST
+             : is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_DICT_NESTED
+                                : decode_kernel_mask::FIXED_WIDTH_DICT;
     } else if (page.encoding == Encoding::BYTE_STREAM_SPLIT) {
-      return is_nested(chunk) ? decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED
-                              : decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT;
+      return is_list(chunk)     ? decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST
+             : is_nested(chunk) ? decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED
+                                : decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT;
     }
   }
 
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index be502b581af..dba24b553e6 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -220,6 +220,10 @@ enum class decode_kernel_mask {
     (1 << 9),                              // Same as above but for nested, fixed-width data
   FIXED_WIDTH_NO_DICT_NESTED = (1 << 10),  // Run decode kernel for fixed width non-dictionary pages
   FIXED_WIDTH_DICT_NESTED    = (1 << 11),  // Run decode kernel for fixed width dictionary pages
+  FIXED_WIDTH_DICT_LIST      = (1 << 12),  // Same as FIXED_WIDTH_DICT but for list columns
+  FIXED_WIDTH_NO_DICT_LIST   = (1 << 13),  // Same as FIXED_WIDTH_NO_DICT but for list columns
+  BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST =
+    (1 << 14),  // Run decode kernel for BYTE_STREAM_SPLIT encoded data for fixed width lists
 };
 
 // mask representing all the ways in which a string can be encoded
@@ -908,6 +912,7 @@ void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span<PageInfo> pages,
  * @param[in] min_row Minimum number of rows to read
  * @param[in] level_type_size Size in bytes of the type for level decoding
  * @param[in] has_nesting Whether or not the data contains nested (but not list) data.
+ * @param[in] is_list Whether or not the data contains list data.
  * @param[out] error_code Error code for kernel failures
  * @param[in] stream CUDA stream to use
  */
@@ -917,6 +922,7 @@ void DecodePageDataFixed(cudf::detail::hostdevice_span<PageInfo> pages,
                          size_t min_row,
                          int level_type_size,
                          bool has_nesting,
+                         bool is_list,
                          kernel_error::pointer error_code,
                          rmm::cuda_stream_view stream);
 
@@ -932,6 +938,7 @@ void DecodePageDataFixed(cudf::detail::hostdevice_span<PageInfo> pages,
  * @param[in] min_row Minimum number of rows to read
  * @param[in] level_type_size Size in bytes of the type for level decoding
  * @param[in] has_nesting Whether or not the data contains nested (but not list) data.
+ * @param[in] is_list Whether or not the data contains list data.
  * @param[out] error_code Error code for kernel failures
  * @param[in] stream CUDA stream to use
  */
@@ -941,6 +948,7 @@ void DecodePageDataFixedDict(cudf::detail::hostdevice_span<PageInfo> pages,
                              size_t min_row,
                              int level_type_size,
                              bool has_nesting,
+                             bool is_list,
                              kernel_error::pointer error_code,
                              rmm::cuda_stream_view stream);
 
@@ -956,6 +964,7 @@ void DecodePageDataFixedDict(cudf::detail::hostdevice_span<PageInfo> pages,
  * @param[in] min_row Minimum number of rows to read
  * @param[in] level_type_size Size in bytes of the type for level decoding
  * @param[in] has_nesting Whether or not the data contains nested (but not list) data.
+ * @param[in] is_list Whether or not the data contains list data.
  * @param[out] error_code Error code for kernel failures
  * @param[in] stream CUDA stream to use
  */
@@ -965,6 +974,7 @@ void DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span<PageInfo> pages
                                    size_t min_row,
                                    int level_type_size,
                                    bool has_nesting,
+                                   bool is_list,
                                    kernel_error::pointer error_code,
                                    rmm::cuda_stream_view stream);
 
diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp
index fed1a309064..689386b8957 100644
--- a/cpp/src/io/parquet/reader_impl.cpp
+++ b/cpp/src/io/parquet/reader_impl.cpp
@@ -272,6 +272,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
                                   skip_rows,
                                   level_type_size,
                                   false,
+                                  false,
                                   error_code.data(),
                                   streams[s_idx++]);
   }
@@ -284,6 +285,20 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
                                   skip_rows,
                                   level_type_size,
                                   true,
+                                  false,
+                                  error_code.data(),
+                                  streams[s_idx++]);
+  }
+
+  // launch byte stream split decoder, for list columns
+  if (BitAnd(kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_LIST) != 0) {
+    DecodeSplitPageFixedWidthData(subpass.pages,
+                                  pass.chunks,
+                                  num_rows,
+                                  skip_rows,
+                                  level_type_size,
+                                  true,
+                                  true,
                                   error_code.data(),
                                   streams[s_idx++]);
   }
@@ -307,6 +322,20 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
                         skip_rows,
                         level_type_size,
                         false,
+                        false,
+                        error_code.data(),
+                        streams[s_idx++]);
+  }
+
+  // launch fixed width type decoder for lists
+  if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_NO_DICT_LIST) != 0) {
+    DecodePageDataFixed(subpass.pages,
+                        pass.chunks,
+                        num_rows,
+                        skip_rows,
+                        level_type_size,
+                        true,
+                        true,
                         error_code.data(),
                         streams[s_idx++]);
   }
@@ -319,6 +348,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
                         skip_rows,
                         level_type_size,
                         true,
+                        false,
                         error_code.data(),
                         streams[s_idx++]);
   }
@@ -331,6 +361,20 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
                             skip_rows,
                             level_type_size,
                             false,
+                            false,
+                            error_code.data(),
+                            streams[s_idx++]);
+  }
+
+  // launch fixed width type decoder with dictionaries for lists
+  if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_DICT_LIST) != 0) {
+    DecodePageDataFixedDict(subpass.pages,
+                            pass.chunks,
+                            num_rows,
+                            skip_rows,
+                            level_type_size,
+                            true,
+                            true,
                             error_code.data(),
                             streams[s_idx++]);
   }
@@ -343,6 +387,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
                             skip_rows,
                             level_type_size,
                             true,
+                            false,
                             error_code.data(),
                             streams[s_idx++]);
   }
diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh
index 4a0791d5c54..69e783a89d0 100644
--- a/cpp/src/io/parquet/rle_stream.cuh
+++ b/cpp/src/io/parquet/rle_stream.cuh
@@ -19,6 +19,7 @@
 #include "parquet_gpu.hpp"
 
 #include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/detail/utilities/integer_utils.hpp>
 
 namespace cudf::io::parquet::detail {
 
@@ -216,6 +217,26 @@ struct rle_stream {
     decode_index = -1;  // signals the first iteration. Nothing to decode.
   }
 
+  __device__ inline int get_rle_run_info(rle_run<level_t>& run)
+  {
+    run.start     = cur;
+    run.level_run = get_vlq32(run.start, end);
+
+    // run_bytes includes the header size
+    int run_bytes = run.start - cur;
+    if (is_literal_run(run.level_run)) {
+      // from the parquet spec: literal runs always come in multiples of 8 values.
+      run.size = (run.level_run >> 1) * 8;
+      run_bytes += util::div_rounding_up_unsafe(run.size * level_bits, 8);
+    } else {
+      // repeated value run
+      run.size = (run.level_run >> 1);
+      run_bytes += util::div_rounding_up_unsafe(level_bits, 8);
+    }
+
+    return run_bytes;
+  }
+
   __device__ inline void fill_run_batch()
   {
     // decode_index == -1 means we are on the very first decode iteration for this stream.
@@ -226,31 +247,14 @@ struct rle_stream {
     while (((decode_index == -1 && fill_index < num_rle_stream_decode_warps) ||
             fill_index < decode_index + run_buffer_size) &&
            cur < end) {
-      auto& run = runs[rolling_index<run_buffer_size>(fill_index)];
-
       // Encoding::RLE
+      // Pass by reference to fill the runs shared memory with the run data
+      auto& run           = runs[rolling_index<run_buffer_size>(fill_index)];
+      int const run_bytes = get_rle_run_info(run);
 
-      // bytes for the varint header
-      uint8_t const* _cur = cur;
-      int const level_run = get_vlq32(_cur, end);
-      // run_bytes includes the header size
-      int run_bytes = _cur - cur;
-
-      // literal run
-      if (is_literal_run(level_run)) {
-        // from the parquet spec: literal runs always come in multiples of 8 values.
-        run.size = (level_run >> 1) * 8;
-        run_bytes += ((run.size * level_bits) + 7) >> 3;
-      }
-      // repeated value run
-      else {
-        run.size = (level_run >> 1);
-        run_bytes += ((level_bits) + 7) >> 3;
-      }
-      run.output_pos = output_pos;
-      run.start      = _cur;
-      run.level_run  = level_run;
       run.remaining  = run.size;
+      run.output_pos = output_pos;
+
       cur += run_bytes;
       output_pos += run.size;
       fill_index++;
@@ -372,6 +376,39 @@ struct rle_stream {
     return values_processed_shared;
   }
 
+  __device__ inline int skip_runs(int target_count)
+  {
+    // Advance past every run that ends at or before target_count, reading each run header into
+    // a local rle_run instead of the runs buffer. The first run that would extend past
+    // target_count is not consumed: we return with cur and output_pos still at its start, as if
+    // decoding had never begun, so fill_run_batch() can begin from that run on its first call.
+    // Returns the number of values skipped, which may be less than target_count.
+    while (cur < end) {
+      rle_run<level_t> run;
+      int run_bytes = get_rle_run_info(run);
+
+      if ((output_pos + run.size) > target_count) {
+        return output_pos;  // bail! we've reached the starting run
+      }
+
+      // skip this run
+      output_pos += run.size;
+      cur += run_bytes;
+    }
+
+    return output_pos;  // we skipped everything
+  }
+
+  __device__ inline int skip_decode(int t, int count)
+  {
+    int const output_count = min(count, total_values - cur_values);
+
+    // if level_bits == 0, there's nothing to do
+    // a very common case: columns with no nulls, especially if they are non-nested
+    cur_values = (level_bits == 0) ? output_count : skip_runs(output_count);
+    return cur_values;
+  }
+
   __device__ inline int decode_next(int t) { return decode_next(t, max_output_values); }
 };