From 246d017669cbeca3570106b4bb52a92f931ea2c1 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Thu, 13 Jun 2024 09:33:43 -0500 Subject: [PATCH 01/16] Plumb pylibcudf strings `contains_re` through cudf_polars (#15918) This PR adds cudf-polars code for evaluating the `StringFunction.Contains` expression node. Depends on https://github.com/rapidsai/cudf/pull/15880/ Authors: - https://github.com/brandon-b-miller - Lawrence Mitchell (https://github.com/wence-) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/15918 --- python/cudf_polars/cudf_polars/dsl/expr.py | 51 ++++++++++++++++++ python/cudf_polars/tests/test_string.py | 61 ++++++++++++++++++++++ 2 files changed, 112 insertions(+) create mode 100644 python/cudf_polars/tests/test_string.py diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 298ef5ab070..03c1db68dbd 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -644,13 +644,28 @@ def __init__( self.options = options self.name = name self.children = children + self._validate_input() + + def _validate_input(self): if self.name not in ( pl_expr.StringFunction.Lowercase, pl_expr.StringFunction.Uppercase, pl_expr.StringFunction.EndsWith, pl_expr.StringFunction.StartsWith, + pl_expr.StringFunction.Contains, ): raise NotImplementedError(f"String function {self.name}") + if self.name == pl_expr.StringFunction.Contains: + literal, strict = self.options + if not literal: + if not strict: + raise NotImplementedError( + "f{strict=} is not supported for regex contains" + ) + if not isinstance(self.children[1], Literal): + raise NotImplementedError( + "Regex contains only supports a scalar pattern" + ) def do_evaluate( self, @@ -660,6 +675,26 @@ def do_evaluate( mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" + if self.name == pl_expr.StringFunction.Contains: + child, arg = self.children + column = child.evaluate(df, context=context, mapping=mapping) + + literal, _ = self.options + if literal: + pat = arg.evaluate(df, context=context, mapping=mapping) + pattern = ( + pat.obj_scalar + if pat.is_scalar and pat.obj.size() != column.obj.size() + else pat.obj + ) + return Column(plc.strings.find.contains(column.obj, pattern)) + else: + assert isinstance(arg, Literal) + prog = plc.strings.regex_program.RegexProgram.create( + arg.value.as_py(), + flags=plc.strings.regex_flags.RegexFlags.DEFAULT, + ) + return Column(plc.strings.contains.contains_re(column.obj, prog)) columns = [ child.evaluate(df, context=context, mapping=mapping) for child in self.children @@ -691,6 +726,22 @@ def do_evaluate( ) ) else: + columns = [ + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ] + if self.name == pl_expr.StringFunction.Lowercase: + (column,) = columns + return Column(plc.strings.case.to_lower(column.obj)) + elif self.name == pl_expr.StringFunction.Uppercase: + (column,) = columns + return Column(plc.strings.case.to_upper(column.obj)) + elif self.name == pl_expr.StringFunction.EndsWith: + column, suffix = columns + return Column(plc.strings.find.ends_with(column.obj, suffix.obj)) + elif self.name == pl_expr.StringFunction.StartsWith: + column, suffix = columns + return Column(plc.strings.find.starts_with(column.obj, suffix.obj)) raise NotImplementedError( 
f"StringFunction {self.name}" ) # pragma: no cover; handled by init raising diff --git a/python/cudf_polars/tests/test_string.py b/python/cudf_polars/tests/test_string.py new file mode 100644 index 00000000000..f1a080d040f --- /dev/null +++ b/python/cudf_polars/tests/test_string.py @@ -0,0 +1,61 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +from functools import partial + +import pytest + +import polars as pl + +from cudf_polars.callback import execute_with_cudf +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.fixture +def ldf(): + return pl.DataFrame( + {"a": ["AbC", "de", "FGHI", "j", "kLm", "nOPq", None, "RsT", None, "uVw"]} + ).lazy() + + +@pytest.mark.parametrize( + "substr", + [ + "A", + "de", + ".*", + "^a", + "^A", + "[^a-z]", + "[a-z]{3,}", + "^[A-Z]{2,}", + "j|u", + ], +) +def test_contains_regex(ldf, substr): + query = ldf.select(pl.col("a").str.contains(substr)) + assert_gpu_result_equal(query) + + +@pytest.mark.parametrize( + "literal", ["A", "de", "FGHI", "j", "kLm", "nOPq", "RsT", "uVw"] +) +def test_contains_literal(ldf, literal): + query = ldf.select(pl.col("a").str.contains(pl.lit(literal), literal=True)) + assert_gpu_result_equal(query) + + +def test_contains_column(ldf): + query = ldf.select(pl.col("a").str.contains(pl.col("a"), literal=True)) + assert_gpu_result_equal(query) + + +@pytest.mark.parametrize("pat", ["["]) +def test_contains_invalid(ldf, pat): + query = ldf.select(pl.col("a").str.contains(pat)) + + with pytest.raises(pl.exceptions.ComputeError): + query.collect() + with pytest.raises(pl.exceptions.ComputeError): + query.collect(post_opt_callback=partial(execute_with_cudf, raise_on_fail=True)) From f651f12471edda51bf4c4071d74ff6720bd037fc Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 13 Jun 2024 16:05:44 +0100 Subject: [PATCH 02/16] Port start of datetime.hpp to pylibcudf (#15916) Start exposing datetime extraction functions. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15916 --- .../api_docs/pylibcudf/datetime.rst | 6 ++++ .../user_guide/api_docs/pylibcudf/index.rst | 1 + .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 1 + python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 4 ++- python/cudf/cudf/_lib/pylibcudf/__init__.py | 4 ++- python/cudf/cudf/_lib/pylibcudf/datetime.pxd | 8 +++++ python/cudf/cudf/_lib/pylibcudf/datetime.pyx | 33 +++++++++++++++++++ .../_lib/pylibcudf/libcudf/CMakeLists.txt | 2 +- python/cudf/cudf/pylibcudf_tests/conftest.py | 5 +++ .../cudf/pylibcudf_tests/test_datetime.py | 30 +++++++++++++++++ .../cudf/cudf/pylibcudf_tests/test_round.py | 9 ++--- 11 files changed, 93 insertions(+), 10 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst create mode 100644 python/cudf/cudf/_lib/pylibcudf/datetime.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/datetime.pyx create mode 100644 python/cudf/cudf/pylibcudf_tests/test_datetime.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst new file mode 100644 index 00000000000..ebf5fab3052 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst @@ -0,0 +1,6 @@ +======= +copying +======= + +.. 
automodule:: cudf._lib.pylibcudf.datetime + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 1e03fa80bb5..f98298ff052 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -14,6 +14,7 @@ This page provides API documentation for pylibcudf. column_factories concatenate copying + datetime filling gpumemoryview groupby diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index ed396208f98..0a198f431a7 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -19,6 +19,7 @@ set(cython_sources column_factories.pyx concatenate.pyx copying.pyx + datetime.pyx filling.pyx gpumemoryview.pyx groupby.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index a628ecdb038..5131df9a5cd 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -7,6 +7,7 @@ from . cimport ( column_factories, concatenate, copying, + datetime, filling, groupby, join, @@ -40,9 +41,10 @@ __all__ = [ "Table", "aggregation", "binaryop", + "column_factories", "concatenate", "copying", - "column_factories", + "datetime", "filling", "gpumemoryview", "groupby", diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 46d0fe13cd1..43a9e2aca31 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -6,6 +6,7 @@ column_factories, concatenate, copying, + datetime, filling, groupby, interop, @@ -39,9 +40,10 @@ "TypeId", "aggregation", "binaryop", + "column_factories", "concatenate", "copying", - "column_factories", + "datetime", "filling", "gpumemoryview", "groupby", diff --git a/python/cudf/cudf/_lib/pylibcudf/datetime.pxd b/python/cudf/cudf/_lib/pylibcudf/datetime.pxd new file mode 100644 index 00000000000..2fce48cf1b4 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/datetime.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from .column cimport Column + + +cpdef Column extract_year( + Column col +) diff --git a/python/cudf/cudf/_lib/pylibcudf/datetime.pyx b/python/cudf/cudf/_lib/pylibcudf/datetime.pyx new file mode 100644 index 00000000000..82351327de6 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/datetime.pyx @@ -0,0 +1,33 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.pylibcudf.libcudf.column.column cimport column +from cudf._lib.pylibcudf.libcudf.datetime cimport ( + extract_year as cpp_extract_year, +) + +from .column cimport Column + + +cpdef Column extract_year( + Column values +): + """ + Extract the year from a datetime column. + + Parameters + ---------- + values : Column + The column to extract the year from. + + Returns + ------- + Column + Column with the extracted years. 
+ """ + cdef unique_ptr[column] result + + with nogil: + result = move(cpp_extract_year(values.view())) + return Column.from_libcudf(move(result)) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt index ac56d42dda8..6c66d01ca57 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources aggregation.pyx binaryop.pyx copying.pyx replace.pyx reduce.pxd round.pyx +set(cython_sources aggregation.pyx binaryop.pyx copying.pyx reduce.pyx replace.pyx round.pyx stream_compaction.pyx types.pyx unary.pyx ) diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py index f3c6584ef8c..b169bbdee5b 100644 --- a/python/cudf/cudf/pylibcudf_tests/conftest.py +++ b/python/cudf/cudf/pylibcudf_tests/conftest.py @@ -58,3 +58,8 @@ def interp_opt(request): ) def sorted_opt(request): return request.param + + +@pytest.fixture(scope="session", params=[False, True]) +def has_nulls(request): + return request.param diff --git a/python/cudf/cudf/pylibcudf_tests/test_datetime.py b/python/cudf/cudf/pylibcudf_tests/test_datetime.py new file mode 100644 index 00000000000..75af0fa6ca1 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_datetime.py @@ -0,0 +1,30 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import datetime + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import cudf._lib.pylibcudf as plc + + +@pytest.fixture +def column(has_nulls): + values = [ + datetime.date(1999, 1, 1), + datetime.date(2024, 10, 12), + datetime.date(1, 1, 1), + datetime.date(9999, 1, 1), + ] + if has_nulls: + values[2] = None + return plc.interop.from_arrow(pa.array(values, type=pa.date32())) + + +def test_extract_year(column): + got = plc.datetime.extract_year(column) + # libcudf produces an int16, arrow produces an int64 + expect = pa.compute.year(plc.interop.to_arrow(column)).cast(pa.int16()) + + assert_column_eq(expect, got) diff --git a/python/cudf/cudf/pylibcudf_tests/test_round.py b/python/cudf/cudf/pylibcudf_tests/test_round.py index a234860477f..991e6ed310d 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_round.py +++ b/python/cudf/cudf/pylibcudf_tests/test_round.py @@ -7,16 +7,11 @@ import cudf._lib.pylibcudf as plc -@pytest.fixture(params=[False, True]) -def nullable(request): - return request.param - - @pytest.fixture(params=["float32", "float64"]) -def column(request, nullable): +def column(request, has_nulls): values = [2.5, 2.49, 1.6, 8, -1.5, -1.7, -0.5, 0.5] typ = {"float32": pa.float32(), "float64": pa.float64()}[request.param] - if nullable: + if has_nulls: values[2] = None return plc.interop.from_arrow(pa.array(values, type=typ)) From cb564da1204f0da7eaeb8a0e636a0f23c97c314f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 13 Jun 2024 05:11:37 -1000 Subject: [PATCH 03/16] Move some misc Frame methods to appropriate locations (#15963) * Move `Frame._is_sorted` to `MultiIndex._is_sorted` (the only class that uses this method) * Move `_apply_inverse_column` helper function to define `Column.__invert__` Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: 
https://github.com/rapidsai/cudf/pull/15963 --- python/cudf/cudf/core/column/column.py | 5 ++ python/cudf/cudf/core/column/numerical.py | 8 +++ python/cudf/cudf/core/frame.py | 61 +---------------------- python/cudf/cudf/core/multiindex.py | 49 +++++++++++++++++- 4 files changed, 62 insertions(+), 61 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 001e8996c19..75fc31ddbce 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1118,6 +1118,11 @@ def __cuda_array_interface__(self) -> abc.Mapping[str, Any]: def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): return _array_ufunc(self, ufunc, method, inputs, kwargs) + def __invert__(self): + raise TypeError( + f"Operation `~` not supported on {self.dtype.type.__name__}" + ) + def searchsorted( self, value, diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 6fb4f17b76d..1952d7eeb71 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -194,6 +194,14 @@ def unary_operator(self, unaryop: Union[str, Callable]) -> ColumnBase: unaryop = pylibcudf.unary.UnaryOperator[unaryop] return libcudf.unary.unary_operation(self, unaryop) + def __invert__(self): + if self.dtype.kind in "ui": + return self.unary_operator("invert") + elif self.dtype.kind == "b": + return self.unary_operator("not") + else: + return super().__invert__() + def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: int_float_dtype_mapping = { np.int8: np.float32, diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index af8886a44a6..01b56f1edc4 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -32,7 +32,7 @@ import cudf from cudf import _lib as libcudf from cudf._typing import Dtype -from cudf.api.types import is_bool_dtype, is_dtype_equal, is_scalar +from cudf.api.types import is_dtype_equal, is_scalar from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( ColumnBase, @@ -1455,51 +1455,6 @@ def _get_sorted_inds( stable=True, ) - @_cudf_nvtx_annotate - def _is_sorted(self, ascending=None, null_position=None): - """ - Returns a boolean indicating whether the data of the Frame are sorted - based on the parameters given. Does not account for the index. - - Parameters - ---------- - self : Frame - Frame whose columns are to be checked for sort order - ascending : None or list-like of booleans - None or list-like of boolean values indicating expected sort order - of each column. If list-like, size of list-like must be - len(columns). If None, all columns expected sort order is set to - ascending. False (0) - ascending, True (1) - descending. - null_position : None or list-like of booleans - None or list-like of boolean values indicating desired order of - nulls compared to other elements. If list-like, size of list-like - must be len(columns). If None, null order is set to before. False - (0) - before, True (1) - after. - - Returns - ------- - returns : boolean - Returns True, if sorted as expected by ``ascending`` and - ``null_position``, False otherwise. 
- """ - if ascending is not None and not cudf.api.types.is_list_like( - ascending - ): - raise TypeError( - f"Expected a list-like or None for `ascending`, got " - f"{type(ascending)}" - ) - if null_position is not None and not cudf.api.types.is_list_like( - null_position - ): - raise TypeError( - f"Expected a list-like or None for `null_position`, got " - f"{type(null_position)}" - ) - return libcudf.sort.is_sorted( - [*self._columns], ascending=ascending, null_position=null_position - ) - @_cudf_nvtx_annotate def _split(self, splits): """Split a frame with split points in ``splits``. Returns a list of @@ -1920,7 +1875,7 @@ def __invert__(self): """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" return self._from_data_like_self( self._data._from_columns_like_self( - (_apply_inverse_column(col) for col in self._data.columns) + (~col for col in self._data.columns) ) ) @@ -1970,15 +1925,3 @@ def __dask_tokenize__(self): str(dict(self._dtypes)), normalize_token(self.to_pandas()), ] - - -def _apply_inverse_column(col: ColumnBase) -> ColumnBase: - """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" - if np.issubdtype(col.dtype, np.integer): - return col.unary_operator("invert") - elif is_bool_dtype(col.dtype): - return col.unary_operator("not") - else: - raise TypeError( - f"Operation `~` not supported on {col.dtype.type.__name__}" - ) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 11b4b9154a2..6d3520e33cf 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1636,9 +1636,54 @@ def is_unique(self): def dtype(self): return np.dtype("O") + @_cudf_nvtx_annotate + def _is_sorted(self, ascending=None, null_position=None) -> bool: + """ + Returns a boolean indicating whether the data of the MultiIndex are sorted + based on the parameters given. Does not account for the index. + + Parameters + ---------- + self : MultiIndex + MultiIndex whose columns are to be checked for sort order + ascending : None or list-like of booleans + None or list-like of boolean values indicating expected sort order + of each column. If list-like, size of list-like must be + len(columns). If None, all columns expected sort order is set to + ascending. False (0) - ascending, True (1) - descending. + null_position : None or list-like of booleans + None or list-like of boolean values indicating desired order of + nulls compared to other elements. If list-like, size of list-like + must be len(columns). If None, null order is set to before. False + (0) - before, True (1) - after. + + Returns + ------- + returns : boolean + Returns True, if sorted as expected by ``ascending`` and + ``null_position``, False otherwise. + """ + if ascending is not None and not cudf.api.types.is_list_like( + ascending + ): + raise TypeError( + f"Expected a list-like or None for `ascending`, got " + f"{type(ascending)}" + ) + if null_position is not None and not cudf.api.types.is_list_like( + null_position + ): + raise TypeError( + f"Expected a list-like or None for `null_position`, got " + f"{type(null_position)}" + ) + return libcudf.sort.is_sorted( + [*self._columns], ascending=ascending, null_position=null_position + ) + @cached_property # type: ignore @_cudf_nvtx_annotate - def is_monotonic_increasing(self): + def is_monotonic_increasing(self) -> bool: """ Return if the index is monotonic increasing (only equal or increasing) values. 
@@ -1647,7 +1692,7 @@ def is_monotonic_increasing(self): @cached_property # type: ignore @_cudf_nvtx_annotate - def is_monotonic_decreasing(self): + def is_monotonic_decreasing(self) -> bool: """ Return if the index is monotonic decreasing (only equal or decreasing) values. From 3cb3df3255efaec4a5ebb6cb7606067f753e3554 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 13 Jun 2024 11:54:55 -0500 Subject: [PATCH 04/16] Add ability to enable rmm pool on `cudf.pandas` import (#15628) This PR enables allocating of rmm memory pool on `cudf.pandas` import using the following environment variables: ``` export CUDF_PANDAS_RMM_MODE="pool" ``` Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Mark Harris (https://github.com/harrism) - Mads R. B. Kristensen (https://github.com/madsbk) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15628 --- python/cudf/cudf/pandas/__init__.py | 43 +++++++++++++++++++ .../cudf_pandas_tests/test_cudf_pandas.py | 28 ++++++++++++ 2 files changed, 71 insertions(+) diff --git a/python/cudf/cudf/pandas/__init__.py b/python/cudf/cudf/pandas/__init__.py index 5b3785531d3..59a88f85dda 100644 --- a/python/cudf/cudf/pandas/__init__.py +++ b/python/cudf/cudf/pandas/__init__.py @@ -2,6 +2,9 @@ # All rights reserved. # SPDX-License-Identifier: Apache-2.0 + +import warnings + from .fast_slow_proxy import is_proxy_object from .magics import load_ipython_extension from .profiler import Profiler @@ -19,6 +22,46 @@ def install(): loader = ModuleAccelerator.install("pandas", "cudf", "pandas") global LOADED LOADED = loader is not None + import os + + if (rmm_mode := os.getenv("CUDF_PANDAS_RMM_MODE", None)) is not None: + import rmm.mr + from rmm.mr import available_device_memory + + # Check if a non-default memory resource is set + current_mr = rmm.mr.get_current_device_resource() + if not isinstance(current_mr, rmm.mr.CudaMemoryResource): + warnings.warn( + f"cudf.pandas detected an already configured memory resource, ignoring 'CUDF_PANDAS_RMM_MODE'={str(rmm_mode)}", + UserWarning, + ) + free_memory, _ = available_device_memory() + free_memory = int(round(float(free_memory) * 0.80 / 256) * 256) + + if rmm_mode == "cuda": + mr = rmm.mr.CudaMemoryResource() + rmm.mr.set_current_device_resource(mr) + elif rmm_mode == "pool": + rmm.mr.set_current_device_resource( + rmm.mr.PoolMemoryResource( + rmm.mr.get_current_device_resource(), + initial_pool_size=free_memory, + ) + ) + elif rmm_mode == "async": + mr = rmm.mr.CudaAsyncMemoryResource(initial_pool_size=free_memory) + rmm.mr.set_current_device_resource(mr) + elif rmm_mode == "managed": + mr = rmm.mr.ManagedMemoryResource() + rmm.mr.set_current_device_resource(mr) + elif rmm_mode == "managed_pool": + rmm.reinitialize( + managed_memory=True, + pool_allocator=True, + initial_pool_size=free_memory, + ) + else: + raise TypeError(f"Unsupported rmm mode: {rmm_mode}") def pytest_load_initial_conftests(early_config, parser, args): diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 515a4714a5a..c251e4a197e 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -9,6 +9,7 @@ import os import pathlib import pickle +import subprocess import tempfile import types from io import BytesIO, StringIO @@ -1425,6 +1426,33 @@ def test_holidays_within_dates(holiday, start, expected): ) == [utc.localize(dt) for dt in 
expected] +@pytest.mark.parametrize( + "env_value", + ["", "cuda", "pool", "async", "managed", "managed_pool", "abc"], +) +def test_rmm_option_on_import(env_value): + data_directory = os.path.dirname(os.path.abspath(__file__)) + # Create a copy of the current environment variables + env = os.environ.copy() + env["CUDF_PANDAS_RMM_MODE"] = env_value + + sp_completed = subprocess.run( + [ + "python", + "-m", + "cudf.pandas", + data_directory + "/data/profile_basic.py", + ], + capture_output=True, + text=True, + env=env, + ) + if env_value in {"cuda", "pool", "async", "managed", "managed_pool"}: + assert sp_completed.returncode == 0 + else: + assert sp_completed.returncode == 1 + + def test_cudf_pandas_debugging_different_results(monkeypatch): cudf_mean = cudf.Series.mean From 3f8f2149129f97947223611e2709d235e889389b Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 13 Jun 2024 17:04:45 -0500 Subject: [PATCH 05/16] Refactor rmm usage in `cudf.pandas` (#16021) This PR addresses review comments made by @bdice here: https://github.com/rapidsai/cudf/pull/15628#pullrequestreview-2116067037 Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16021 --- python/cudf/cudf/pandas/__init__.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/pandas/__init__.py b/python/cudf/cudf/pandas/__init__.py index 59a88f85dda..ff445a63f74 100644 --- a/python/cudf/cudf/pandas/__init__.py +++ b/python/cudf/cudf/pandas/__init__.py @@ -2,9 +2,11 @@ # All rights reserved. # SPDX-License-Identifier: Apache-2.0 - +import os import warnings +import rmm.mr + from .fast_slow_proxy import is_proxy_object from .magics import load_ipython_extension from .profiler import Profiler @@ -22,12 +24,8 @@ def install(): loader = ModuleAccelerator.install("pandas", "cudf", "pandas") global LOADED LOADED = loader is not None - import os if (rmm_mode := os.getenv("CUDF_PANDAS_RMM_MODE", None)) is not None: - import rmm.mr - from rmm.mr import available_device_memory - # Check if a non-default memory resource is set current_mr = rmm.mr.get_current_device_resource() if not isinstance(current_mr, rmm.mr.CudaMemoryResource): @@ -35,7 +33,7 @@ def install(): f"cudf.pandas detected an already configured memory resource, ignoring 'CUDF_PANDAS_RMM_MODE'={str(rmm_mode)}", UserWarning, ) - free_memory, _ = available_device_memory() + free_memory, _ = rmm.mr.available_device_memory() free_memory = int(round(float(free_memory) * 0.80 / 256) * 256) if rmm_mode == "cuda": @@ -55,13 +53,13 @@ def install(): mr = rmm.mr.ManagedMemoryResource() rmm.mr.set_current_device_resource(mr) elif rmm_mode == "managed_pool": - rmm.reinitialize( - managed_memory=True, - pool_allocator=True, + mr = rmm.mr.PoolMemoryResource( + rmm.mr.ManagedMemoryResource(), initial_pool_size=free_memory, ) + rmm.mr.set_current_device_resource(mr) else: - raise TypeError(f"Unsupported rmm mode: {rmm_mode}") + raise ValueError(f"Unsupported rmm mode: {rmm_mode}") def pytest_load_initial_conftests(early_config, parser, args): From 31d909b0af9bcf9cf804ca1c3893ea71fbd5d765 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 13 Jun 2024 13:27:05 -1000 Subject: [PATCH 06/16] Support IntervalDtype in cudf.from_pandas (#16014) Noticed while running the pandas test suite against `cudf.pandas` Authors: - Matthew Roeschke 
(https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) - Lawrence Mitchell (https://github.com/wence-) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16014 --- python/cudf/cudf/core/dataframe.py | 6 +++--- python/cudf/cudf/tests/test_interval.py | 7 +++++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index e1b6cc45dd3..7438b0237d5 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -8072,11 +8072,11 @@ def from_pandas(obj, nan_as_null=no_default): return cudf.Index.from_pandas(obj, nan_as_null=nan_as_null) elif isinstance(obj, pd.CategoricalDtype): return cudf.CategoricalDtype.from_pandas(obj) + elif isinstance(obj, pd.IntervalDtype): + return cudf.IntervalDtype.from_pandas(obj) else: raise TypeError( - "from_pandas only accepts Pandas Dataframes, Series, " - "Index, RangeIndex and MultiIndex objects. " - "Got %s" % type(obj) + f"from_pandas unsupported for object of type {type(obj).__name__}" ) diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py index 7b923af1f75..013f4439ad5 100644 --- a/python/cudf/cudf/tests/test_interval.py +++ b/python/cudf/cudf/tests/test_interval.py @@ -181,3 +181,10 @@ def test_interval_with_datetime(tz, box): else: with pytest.raises(NotImplementedError): cudf.from_pandas(pobj) + + +def test_from_pandas_intervaldtype(): + dtype = pd.IntervalDtype("int64", closed="left") + result = cudf.from_pandas(dtype) + expected = cudf.IntervalDtype("int64", closed="left") + assert_eq(result, expected) From 987879ca4bdcae0d959266fd39196123007fa45e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 13 Jun 2024 19:27:11 -0700 Subject: [PATCH 07/16] Fix the pool size alignment issue (#16024) This PR fixes a pool size alignment bug. 
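The pinned-memory pool requires its size to be a multiple of 256 bytes. Before this change the rounding happened outside the resource constructor, and it used `rmm::RMM_DEFAULT_HOST_ALIGNMENT` rather than the 256-byte `rmm::CUDA_ALLOCATION_ALIGNMENT`; the diff below moves the `rmm::align_up` call into the constructor so every construction path receives a properly aligned size. A minimal, self-contained sketch of the rounding rule (the constant and helper only mirror what `rmm::align_up` and `rmm::CUDA_ALLOCATION_ALIGNMENT` do in the real code, they are not the cudf implementation):

```cpp
#include <cstddef>

// Mirrors rmm::CUDA_ALLOCATION_ALIGNMENT (256 bytes); illustrative constant only.
constexpr std::size_t kCudaAllocationAlignment = 256;

// Round `value` up to the next multiple of `alignment`, as rmm::align_up does.
constexpr std::size_t align_up(std::size_t value, std::size_t alignment)
{
  return (value + alignment - 1) / alignment * alignment;
}

// A requested size of 1000 bytes becomes a valid 1024-byte pool size;
// an already-aligned size is left unchanged.
static_assert(align_up(1000, kCudaAllocationAlignment) == 1024);
static_assert(align_up(1024, kCudaAllocationAlignment) == 1024);
```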
Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Mark Harris (https://github.com/harrism) - Vukasin Milovanovic (https://github.com/vuule) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/16024 --- cpp/src/utilities/pinned_memory.cpp | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/pinned_memory.cpp index 5d2e3ac332a..e90b7969b4d 100644 --- a/cpp/src/utilities/pinned_memory.cpp +++ b/cpp/src/utilities/pinned_memory.cpp @@ -43,9 +43,11 @@ class fixed_pinned_pool_memory_resource { public: fixed_pinned_pool_memory_resource(size_t size) - : pool_size_{size}, pool_{new host_pooled_mr(upstream_mr_, size, size)} + : // rmm requires the pool size to be a multiple of 256 bytes + pool_size_{rmm::align_up(size, rmm::CUDA_ALLOCATION_ALIGNMENT)}, + pool_{new host_pooled_mr(upstream_mr_, pool_size_, pool_size_)} { - if (pool_size_ == 0) { return; } + CUDF_LOG_INFO("Pinned pool size = {}", pool_size_); // Allocate full size from the pinned pool to figure out the beginning and end address pool_begin_ = pool_->allocate_async(pool_size_, stream_); @@ -145,12 +147,8 @@ CUDF_EXPORT rmm::host_device_async_resource_ref& make_default_pinned_mr( return std::min(total / 200, size_t{100} * 1024 * 1024); }(); - // rmm requires the pool size to be a multiple of 256 bytes - auto const aligned_size = rmm::align_up(size, rmm::RMM_DEFAULT_HOST_ALIGNMENT); - CUDF_LOG_INFO("Pinned pool size = {}", aligned_size); - // make the pool with max size equal to the initial size - return fixed_pinned_pool_memory_resource{aligned_size}; + return fixed_pinned_pool_memory_resource{size}; }(); static rmm::host_device_async_resource_ref mr_ref{mr}; From 829b3a959cc5f0d41fe51dca9a4335dba0da69a5 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 13 Jun 2024 20:40:56 -0700 Subject: [PATCH 08/16] Fix the int32 overflow when computing page fragment sizes for large string columns (#16028) This PR fixes the possible `int32` overflow when computing page fragment sizes for large (2B+ char) string columns. Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/16028 --- cpp/src/io/parquet/writer_impl.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 6d466748c17..ca15b532d07 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1763,10 +1763,10 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, // for multiple fragments per page to smooth things out. using 2 was too // unbalanced in final page sizes, so using 4 which seems to be a good // compromise at smoothing things out without getting fragment sizes too small. - auto frag_size_fn = [&](auto const& col, size_type col_size) { + auto frag_size_fn = [&](auto const& col, size_t col_size) { int const target_frags_per_page = is_col_fixed_width(col) ? 
1 : 4; auto const avg_len = - target_frags_per_page * util::div_rounding_up_safe(col_size, input.num_rows()); + target_frags_per_page * util::div_rounding_up_safe(col_size, input.num_rows()); if (avg_len > 0) { auto const frag_size = util::div_rounding_up_safe(max_page_size_bytes, avg_len); return std::min(max_page_fragment_size, frag_size); From 34227d3cb687d465f1d4a5f12cbb37a47b97866e Mon Sep 17 00:00:00 2001 From: Zach Puller Date: Thu, 13 Jun 2024 23:45:35 -0700 Subject: [PATCH 09/16] orc multithreaded benchmark (#16009) Addresses: https://github.com/rapidsai/cudf/issues/15973 Adds multithreaded benchmarks for the ORC reader. Based off of the parquet equivalent in https://github.com/rapidsai/cudf/pull/15585 ``` # Benchmark Results ## orc_multithreaded_read_decode_mixed ### [0] NVIDIA RTX 5880 Ada Generation | cardinality | total_data_size | num_threads | num_cols | run_length | Samples | CPU Time | Noise | GPU Time | Noise | bytes_per_second | peak_memory_usage | encoded_file_size | |-------------|-----------------|-------------|----------|------------|---------|-----------|-------|-----------|-------|------------------|-------------------|-------------------| | 1000 | 536870912 | 1 | 4 | 8 | 338x | 44.348 ms | 1.18% | 44.343 ms | 1.18% | 12107185968 | 939.341 MiB | 39.557 MiB | | 1000 | 1073741824 | 1 | 4 | 8 | 80x | 77.634 ms | 0.65% | 77.629 ms | 0.65% | 13831742649 | 1.834 GiB | 79.072 MiB | | 1000 | 536870912 | 2 | 4 | 8 | 341x | 43.921 ms | 1.20% | 43.916 ms | 1.20% | 12224889363 | 825.333 MiB | 39.568 MiB | | 1000 | 1073741824 | 2 | 4 | 8 | 80x | 75.418 ms | 0.70% | 75.414 ms | 0.70% | 14237999015 | 1.611 GiB | 79.113 MiB | | 1000 | 536870912 | 4 | 4 | 8 | 80x | 42.682 ms | 1.18% | 42.678 ms | 1.18% | 12579566132 | 883.436 MiB | 39.587 MiB | | 1000 | 1073741824 | 4 | 4 | 8 | 9x | 74.056 ms | 0.48% | 74.052 ms | 0.48% | 14499873867 | 1.724 GiB | 79.136 MiB | | 1000 | 536870912 | 8 | 4 | 8 | 25x | 42.198 ms | 0.50% | 42.194 ms | 0.49% | 12723960975 | 940.562 MiB | 39.600 MiB | | 1000 | 1073741824 | 8 | 4 | 8 | 8x | 73.933 ms | 0.49% | 73.929 ms | 0.49% | 14524042443 | 1.781 GiB | 79.175 MiB | ## orc_multithreaded_read_decode_fixed_width ### [0] NVIDIA RTX 5880 Ada Generation | cardinality | total_data_size | num_threads | num_cols | run_length | Samples | CPU Time | Noise | GPU Time | Noise | bytes_per_second | peak_memory_usage | encoded_file_size | |-------------|-----------------|-------------|----------|------------|---------|-----------|-------|-----------|-------|------------------|-------------------|-------------------| | 1000 | 536870912 | 1 | 4 | 8 | 13x | 40.149 ms | 0.04% | 40.144 ms | 0.04% | 13373482726 | 643.390 MiB | 59.821 MiB | | 1000 | 1073741824 | 1 | 4 | 8 | 211x | 71.216 ms | 0.67% | 71.211 ms | 0.67% | 15078297784 | 1.257 GiB | 119.650 MiB | | 1000 | 536870912 | 2 | 4 | 8 | 378x | 39.662 ms | 1.31% | 39.658 ms | 1.31% | 13537590893 | 643.392 MiB | 59.833 MiB | | 1000 | 1073741824 | 2 | 4 | 8 | 209x | 71.693 ms | 0.71% | 71.688 ms | 0.71% | 14978085376 | 1.257 GiB | 119.642 MiB | | 1000 | 536870912 | 4 | 4 | 8 | 377x | 39.731 ms | 1.30% | 39.726 ms | 1.30% | 13514305239 | 643.394 MiB | 59.856 MiB | | 1000 | 1073741824 | 4 | 4 | 8 | 8x | 70.766 ms | 0.08% | 70.761 ms | 0.08% | 15174115364 | 1.030 GiB | 119.665 MiB | | 1000 | 536870912 | 8 | 4 | 8 | 379x | 39.486 ms | 1.27% | 39.482 ms | 1.27% | 13597888468 | 647.399 MiB | 59.928 MiB | | 1000 | 1073741824 | 8 | 4 | 8 | 207x | 72.686 ms | 2.04% | 72.681 ms | 2.04% | 14773317833 | 1.143 GiB | 
119.711 MiB | ## orc_multithreaded_read_decode_string ### [0] NVIDIA RTX 5880 Ada Generation | cardinality | total_data_size | num_threads | num_cols | run_length | Samples | CPU Time | Noise | GPU Time | Noise | bytes_per_second | peak_memory_usage | encoded_file_size | |-------------|-----------------|-------------|----------|------------|---------|-----------|-------|-----------|-------|------------------|-------------------|-------------------| | 1000 | 536870912 | 1 | 4 | 8 | 80x | 22.933 ms | 2.13% | 22.928 ms | 2.13% | 23415352877 | 661.948 MiB | 10.879 MiB | | 1000 | 1073741824 | 1 | 4 | 8 | 160x | 34.167 ms | 1.41% | 34.162 ms | 1.41% | 31430436877 | 1.293 GiB | 21.757 MiB | | 1000 | 536870912 | 2 | 4 | 8 | 560x | 22.533 ms | 2.18% | 22.528 ms | 2.18% | 23830839172 | 609.407 MiB | 10.941 MiB | | 1000 | 1073741824 | 2 | 4 | 8 | 80x | 34.311 ms | 1.54% | 34.307 ms | 1.54% | 31298288990 | 1.188 GiB | 21.758 MiB | | 1000 | 536870912 | 4 | 4 | 8 | 23x | 22.179 ms | 0.11% | 22.175 ms | 0.11% | 24211151047 | 624.177 MiB | 10.947 MiB | | 1000 | 1073741824 | 4 | 4 | 8 | 15x | 33.793 ms | 0.08% | 33.789 ms | 0.08% | 31777989791 | 1.190 GiB | 21.881 MiB | | 1000 | 536870912 | 8 | 4 | 8 | 679x | 22.006 ms | 1.74% | 22.002 ms | 1.74% | 24401381631 | 624.524 MiB | 10.951 MiB | | 1000 | 1073741824 | 8 | 4 | 8 | 160x | 33.320 ms | 1.57% | 33.316 ms | 1.57% | 32229227026 | 1.207 GiB | 21.894 MiB | ## orc_multithreaded_read_decode_list ### [0] NVIDIA RTX 5880 Ada Generation | cardinality | total_data_size | num_threads | num_cols | run_length | Samples | CPU Time | Noise | GPU Time | Noise | bytes_per_second | peak_memory_usage | encoded_file_size | |-------------|-----------------|-------------|----------|------------|---------|------------|--------|------------|--------|------------------|-------------------|-------------------| | 1000 | 536870912 | 1 | 4 | 8 | 96x | 74.437 ms | 0.68% | 74.433 ms | 0.68% | 7212831148 | 600.751 MiB | 60.245 MiB | | 1000 | 1073741824 | 1 | 4 | 8 | 7x | 80.994 ms | 0.49% | 80.990 ms | 0.49% | 13257745936 | 1.173 GiB | 120.549 MiB | | 1000 | 536870912 | 2 | 4 | 8 | 80x | 79.234 ms | 4.57% | 79.229 ms | 4.57% | 6776190522 | 600.950 MiB | 60.250 MiB | | 1000 | 1073741824 | 2 | 4 | 8 | 166x | 90.437 ms | 17.19% | 90.432 ms | 17.19% | 11873413959 | 1.173 GiB | 120.489 MiB | | 1000 | 536870912 | 4 | 4 | 8 | 80x | 78.613 ms | 2.98% | 78.608 ms | 2.98% | 6829702014 | 602.764 MiB | 60.323 MiB | | 1000 | 1073741824 | 4 | 4 | 8 | 127x | 118.629 ms | 22.67% | 118.624 ms | 22.67% | 9051644873 | 1.174 GiB | 120.499 MiB | | 1000 | 536870912 | 8 | 4 | 8 | 112x | 133.950 ms | 4.45% | 133.945 ms | 4.45% | 4008135293 | 603.471 MiB | 60.353 MiB | | 1000 | 1073741824 | 8 | 4 | 8 | 90x | 167.850 ms | 15.93% | 167.844 ms | 15.93% | 6397248426 | 1.177 GiB | 120.646 MiB | ## orc_multithreaded_read_decode_chunked_mixed ### [0] NVIDIA RTX 5880 Ada Generation | cardinality | total_data_size | num_threads | num_cols | run_length | input_limit | output_limit | Samples | CPU Time | Noise | GPU Time | Noise | bytes_per_second | peak_memory_usage | encoded_file_size | |-------------|-----------------|-------------|----------|------------|-------------|--------------|---------|-----------|-------|-----------|-------|------------------|-------------------|-------------------| | 1000 | 536870912 | 1 | 4 | 8 | 671088640 | 671088640 | 333x | 45.009 ms | 1.10% | 45.005 ms | 1.10% | 11929261073 | 939.341 MiB | 39.557 MiB | | 1000 | 1073741824 | 1 | 4 | 8 | 671088640 | 671088640 | 96x | 81.524 ms | 0.61% | 
81.519 ms | 0.61% | 13171640865 | 1.834 GiB | 79.072 MiB | | 1000 | 536870912 | 2 | 4 | 8 | 671088640 | 671088640 | 339x | 44.183 ms | 0.96% | 44.179 ms | 0.96% | 12152252271 | 825.333 MiB | 39.568 MiB | | 1000 | 1073741824 | 2 | 4 | 8 | 671088640 | 671088640 | 7x | 79.051 ms | 0.02% | 79.046 ms | 0.02% | 13583676002 | 1.611 GiB | 79.113 MiB | | 1000 | 536870912 | 4 | 4 | 8 | 671088640 | 671088640 | 12x | 43.276 ms | 0.09% | 43.272 ms | 0.09% | 12407024794 | 883.436 MiB | 39.587 MiB | | 1000 | 1073741824 | 4 | 4 | 8 | 671088640 | 671088640 | 19x | 78.019 ms | 0.49% | 78.014 ms | 0.49% | 13763433041 | 1.724 GiB | 79.136 MiB | | 1000 | 536870912 | 8 | 4 | 8 | 671088640 | 671088640 | 80x | 42.803 ms | 1.22% | 42.799 ms | 1.22% | 12543864010 | 911.993 MiB | 39.600 MiB | | 1000 | 1073741824 | 8 | 4 | 8 | 671088640 | 671088640 | 193x | 77.856 ms | 0.59% | 77.852 ms | 0.59% | 13792063986 | 1.837 GiB | 79.175 MiB | ## orc_multithreaded_read_decode_chunked_fixed_width ### [0] NVIDIA RTX 5880 Ada Generation | cardinality | total_data_size | num_threads | num_cols | run_length | input_limit | output_limit | Samples | CPU Time | Noise | GPU Time | Noise | bytes_per_second | peak_memory_usage | encoded_file_size | |-------------|-----------------|-------------|----------|------------|-------------|--------------|---------|-----------|-------|-----------|-------|------------------|-------------------|-------------------| | 1000 | 536870912 | 1 | 4 | 8 | 671088640 | 671088640 | 112x | 40.497 ms | 1.23% | 40.493 ms | 1.23% | 13258480947 | 643.390 MiB | 59.821 MiB | | 1000 | 1073741824 | 1 | 4 | 8 | 671088640 | 671088640 | 7x | 75.440 ms | 0.09% | 75.435 ms | 0.09% | 14234033611 | 1.648 GiB | 119.651 MiB | | 1000 | 536870912 | 2 | 4 | 8 | 671088640 | 671088640 | 80x | 39.793 ms | 1.36% | 39.789 ms | 1.36% | 13493067216 | 643.392 MiB | 59.833 MiB | | 1000 | 1073741824 | 2 | 4 | 8 | 671088640 | 671088640 | 69x | 74.499 ms | 0.50% | 74.494 ms | 0.50% | 14413864845 | 1.336 GiB | 119.642 MiB | | 1000 | 536870912 | 4 | 4 | 8 | 671088640 | 671088640 | 381x | 39.273 ms | 1.11% | 39.269 ms | 1.11% | 13671742653 | 643.394 MiB | 59.856 MiB | | 1000 | 1073741824 | 4 | 4 | 8 | 671088640 | 671088640 | 204x | 73.755 ms | 0.60% | 73.751 ms | 0.60% | 14559012350 | 1.648 GiB | 119.665 MiB | | 1000 | 536870912 | 8 | 4 | 8 | 671088640 | 671088640 | 80x | 39.490 ms | 1.31% | 39.486 ms | 1.31% | 13596333864 | 631.980 MiB | 59.928 MiB | | 1000 | 1073741824 | 8 | 4 | 8 | 671088640 | 671088640 | 203x | 73.907 ms | 1.34% | 73.903 ms | 1.34% | 14529071322 | 1.454 GiB | 119.711 MiB | ## orc_multithreaded_read_decode_chunked_string ### [0] NVIDIA RTX 5880 Ada Generation | cardinality | total_data_size | num_threads | num_cols | run_length | input_limit | output_limit | Samples | CPU Time | Noise | GPU Time | Noise | bytes_per_second | peak_memory_usage | encoded_file_size | |-------------|-----------------|-------------|----------|------------|-------------|--------------|---------|-----------|-------|-----------|-------|------------------|-------------------|-------------------| | 1000 | 536870912 | 1 | 4 | 8 | 671088640 | 671088640 | 80x | 23.022 ms | 1.96% | 23.017 ms | 1.96% | 23324556592 | 661.948 MiB | 10.879 MiB | | 1000 | 1073741824 | 1 | 4 | 8 | 671088640 | 671088640 | 80x | 37.687 ms | 1.37% | 37.682 ms | 1.37% | 28494755419 | 1.659 GiB | 21.757 MiB | | 1000 | 536870912 | 2 | 4 | 8 | 671088640 | 671088640 | 80x | 22.703 ms | 2.30% | 22.699 ms | 2.30% | 23652118769 | 609.407 MiB | 10.941 MiB | | 1000 | 1073741824 | 2 | 4 | 8 
| 671088640 | 671088640 | 80x | 37.581 ms | 1.42% | 37.577 ms | 1.42% | 28574723179 | 1.658 GiB | 21.758 MiB | | 1000 | 536870912 | 4 | 4 | 8 | 671088640 | 671088640 | 544x | 22.296 ms | 1.56% | 22.293 ms | 1.56% | 24082840350 | 631.319 MiB | 10.947 MiB | | 1000 | 1073741824 | 4 | 4 | 8 | 671088640 | 671088640 | 14x | 36.990 ms | 0.14% | 36.985 ms | 0.14% | 29031484389 | 1.554 GiB | 21.881 MiB | | 1000 | 536870912 | 8 | 4 | 8 | 671088640 | 671088640 | 676x | 22.114 ms | 1.22% | 22.110 ms | 1.22% | 24281965280 | 627.616 MiB | 10.951 MiB | | 1000 | 1073741824 | 8 | 4 | 8 | 671088640 | 671088640 | 80x | 37.409 ms | 1.40% | 37.405 ms | 1.40% | 28706077426 | 1.562 GiB | 21.894 MiB | ## orc_multithreaded_read_decode_chunked_list ### [0] NVIDIA RTX 5880 Ada Generation | cardinality | total_data_size | num_threads | num_cols | run_length | input_limit | output_limit | Samples | CPU Time | Noise | GPU Time | Noise | bytes_per_second | peak_memory_usage | encoded_file_size | |-------------|-----------------|-------------|----------|------------|-------------|--------------|---------|------------|--------|------------|--------|------------------|-------------------|-------------------| | 1000 | 536870912 | 1 | 4 | 8 | 671088640 | 671088640 | 80x | 74.780 ms | 0.67% | 74.776 ms | 0.67% | 7179747067 | 600.751 MiB | 60.245 MiB | | 1000 | 1073741824 | 1 | 4 | 8 | 671088640 | 671088640 | 175x | 86.040 ms | 0.56% | 86.035 ms | 0.56% | 12480222210 | 1.576 GiB | 120.549 MiB | | 1000 | 536870912 | 2 | 4 | 8 | 671088640 | 671088640 | 186x | 80.668 ms | 4.14% | 80.664 ms | 4.14% | 6655685080 | 600.951 MiB | 60.250 MiB | | 1000 | 1073741824 | 2 | 4 | 8 | 671088640 | 671088640 | 143x | 105.217 ms | 21.56% | 105.212 ms | 21.56% | 10205531345 | 1.576 GiB | 120.489 MiB | | 1000 | 536870912 | 4 | 4 | 8 | 671088640 | 671088640 | 128x | 80.087 ms | 3.05% | 80.082 ms | 3.05% | 6704042147 | 602.764 MiB | 60.323 MiB | | 1000 | 1073741824 | 4 | 4 | 8 | 671088640 | 671088640 | 135x | 111.556 ms | 21.88% | 111.551 ms | 21.88% | 9625546746 | 1.489 GiB | 120.499 MiB | | 1000 | 536870912 | 8 | 4 | 8 | 671088640 | 671088640 | 112x | 134.677 ms | 4.14% | 134.672 ms | 4.14% | 3986513604 | 603.471 MiB | 60.353 MiB | | 1000 | 1073741824 | 8 | 4 | 8 | 671088640 | 671088640 | 80x | 178.735 ms | 14.17% | 178.730 ms | 14.17% | 6007630497 | 1.520 GiB | 120.646 MiB | ``` Authors: - Zach Puller (https://github.com/zpuller) - Vukasin Milovanovic (https://github.com/vuule) - MithunR (https://github.com/mythrocks) Approvers: - Yunsong Wang (https://github.com/PointKernel) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/16009 --- cpp/benchmarks/CMakeLists.txt | 5 + .../io/orc/orc_reader_multithreaded.cpp | 335 ++++++++++++++++++ 2 files changed, 340 insertions(+) create mode 100644 cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 49504e53424..8a48126e195 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -267,6 +267,11 @@ ConfigureNVBench(PARQUET_MULTITHREAD_READER_NVBENCH io/parquet/parquet_reader_mu # * orc reader benchmark -------------------------------------------------------------------------- ConfigureNVBench(ORC_READER_NVBENCH io/orc/orc_reader_input.cpp io/orc/orc_reader_options.cpp) +# ################################################################################################## +# * orc multithreaded benchmark +# 
-------------------------------------------------------------------------- +ConfigureNVBench(ORC_MULTITHREADED_NVBENCH io/orc/orc_reader_multithreaded.cpp) + # ################################################################################################## # * csv reader benchmark -------------------------------------------------------------------------- ConfigureNVBench(CSV_READER_NVBENCH io/csv/csv_reader_input.cpp io/csv/csv_reader_options.cpp) diff --git a/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp b/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp new file mode 100644 index 00000000000..ffbbc6f8464 --- /dev/null +++ b/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp @@ -0,0 +1,335 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include + +size_t get_num_read_threads(nvbench::state const& state) { return state.get_int64("num_threads"); } + +size_t get_read_size(nvbench::state const& state) +{ + auto const num_reads = get_num_read_threads(state); + return state.get_int64("total_data_size") / num_reads; +} + +std::string get_label(std::string const& test_name, nvbench::state const& state) +{ + auto const num_cols = state.get_int64("num_cols"); + size_t const read_size_mb = get_read_size(state) / (1024 * 1024); + return {test_name + ", " + std::to_string(num_cols) + " columns, " + + std::to_string(get_num_read_threads(state)) + " threads " + " (" + + std::to_string(read_size_mb) + " MB each)"}; +} + +std::tuple, size_t, size_t> write_file_data( + nvbench::state& state, std::vector const& d_types) +{ + auto const cardinality = state.get_int64("cardinality"); + auto const run_length = state.get_int64("run_length"); + auto const num_cols = state.get_int64("num_cols"); + size_t const num_files = get_num_read_threads(state); + size_t const per_file_data_size = get_read_size(state); + + std::vector source_sink_vector; + + size_t total_file_size = 0; + + for (size_t i = 0; i < num_files; ++i) { + cuio_source_sink_pair source_sink{io_type::HOST_BUFFER}; + + auto const tbl = create_random_table( + cycle_dtypes(d_types, num_cols), + table_size_bytes{per_file_data_size}, + data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + auto const view = tbl->view(); + + cudf::io::orc_writer_options const write_opts = + cudf::io::orc_writer_options::builder(source_sink.make_sink_info(), view) + .compression(cudf::io::compression_type::SNAPPY); + + cudf::io::write_orc(write_opts); + total_file_size += source_sink.size(); + + source_sink_vector.push_back(std::move(source_sink)); + } + + return {std::move(source_sink_vector), total_file_size, num_files}; +} + +void BM_orc_multithreaded_read_common(nvbench::state& state, + std::vector const& d_types, + std::string const& label) +{ + auto const data_size = state.get_int64("total_data_size"); + auto const num_threads = 
state.get_int64("num_threads"); + + auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); + cudf::detail::thread_pool threads(num_threads); + + auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); + std::vector source_info_vector; + std::transform(source_sink_vector.begin(), + source_sink_vector.end(), + std::back_inserter(source_info_vector), + [](auto& source_sink) { return source_sink.make_source_info(); }); + + auto mem_stats_logger = cudf::memory_stats_logger(); + + { + cudf::scoped_range range{("(read) " + label).c_str()}; + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, + [&](nvbench::launch& launch, auto& timer) { + auto read_func = [&](int index) { + auto const stream = streams[index % num_threads]; + cudf::io::orc_reader_options read_opts = + cudf::io::orc_reader_options::builder(source_info_vector[index]); + cudf::io::read_orc(read_opts, stream, rmm::mr::get_current_device_resource()); + }; + + threads.paused = true; + for (size_t i = 0; i < num_files; ++i) { + threads.submit(read_func, i); + } + timer.start(); + threads.paused = false; + threads.wait_for_tasks(); + cudf::detail::join_streams(streams, cudf::get_default_stream()); + timer.stop(); + }); + } + + auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + state.add_element_count(static_cast(data_size) / time, "bytes_per_second"); + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); + state.add_buffer_size(total_file_size, "encoded_file_size", "encoded_file_size"); +} + +void BM_orc_multithreaded_read_mixed(nvbench::state& state) +{ + auto label = get_label("mixed", state); + cudf::scoped_range range{label.c_str()}; + BM_orc_multithreaded_read_common( + state, {cudf::type_id::INT32, cudf::type_id::DECIMAL64, cudf::type_id::STRING}, label); +} + +void BM_orc_multithreaded_read_fixed_width(nvbench::state& state) +{ + auto label = get_label("fixed width", state); + cudf::scoped_range range{label.c_str()}; + BM_orc_multithreaded_read_common(state, {cudf::type_id::INT32}, label); +} + +void BM_orc_multithreaded_read_string(nvbench::state& state) +{ + auto label = get_label("string", state); + cudf::scoped_range range{label.c_str()}; + BM_orc_multithreaded_read_common(state, {cudf::type_id::STRING}, label); +} + +void BM_orc_multithreaded_read_list(nvbench::state& state) +{ + auto label = get_label("list", state); + cudf::scoped_range range{label.c_str()}; + BM_orc_multithreaded_read_common(state, {cudf::type_id::LIST}, label); +} + +void BM_orc_multithreaded_read_chunked_common(nvbench::state& state, + std::vector const& d_types, + std::string const& label) +{ + size_t const data_size = state.get_int64("total_data_size"); + auto const num_threads = state.get_int64("num_threads"); + size_t const input_limit = state.get_int64("input_limit"); + size_t const output_limit = state.get_int64("output_limit"); + + auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); + cudf::detail::thread_pool threads(num_threads); + auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); + std::vector source_info_vector; + std::transform(source_sink_vector.begin(), + source_sink_vector.end(), + std::back_inserter(source_info_vector), + [](auto& source_sink) { return source_sink.make_source_info(); }); + + auto mem_stats_logger = cudf::memory_stats_logger(); + + { + cudf::scoped_range range{("(read) " + label).c_str()}; + 
std::vector chunks; + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, + [&](nvbench::launch& launch, auto& timer) { + auto read_func = [&](int index) { + auto const stream = streams[index % num_threads]; + cudf::io::orc_reader_options read_opts = + cudf::io::orc_reader_options::builder(source_info_vector[index]); + // divide chunk limits by number of threads so the number of chunks produced is the + // same for all cases. this seems better than the alternative, which is to keep the + // limits the same. if we do that, as the number of threads goes up, the number of + // chunks goes down - so are actually benchmarking the same thing in that case? + auto reader = cudf::io::chunked_orc_reader( + output_limit / num_threads, input_limit / num_threads, read_opts, stream); + + // read all the chunks + do { + auto table = reader.read_chunk(); + } while (reader.has_next()); + }; + + threads.paused = true; + for (size_t i = 0; i < num_files; ++i) { + threads.submit(read_func, i); + } + timer.start(); + threads.paused = false; + threads.wait_for_tasks(); + cudf::detail::join_streams(streams, cudf::get_default_stream()); + timer.stop(); + }); + } + + auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + state.add_element_count(static_cast(data_size) / time, "bytes_per_second"); + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); + state.add_buffer_size(total_file_size, "encoded_file_size", "encoded_file_size"); +} + +void BM_orc_multithreaded_read_chunked_mixed(nvbench::state& state) +{ + auto label = get_label("mixed", state); + cudf::scoped_range range{label.c_str()}; + BM_orc_multithreaded_read_chunked_common( + state, {cudf::type_id::INT32, cudf::type_id::DECIMAL64, cudf::type_id::STRING}, label); +} + +void BM_orc_multithreaded_read_chunked_fixed_width(nvbench::state& state) +{ + auto label = get_label("fixed width", state); + cudf::scoped_range range{label.c_str()}; + BM_orc_multithreaded_read_chunked_common(state, {cudf::type_id::INT32}, label); +} + +void BM_orc_multithreaded_read_chunked_string(nvbench::state& state) +{ + auto label = get_label("string", state); + cudf::scoped_range range{label.c_str()}; + BM_orc_multithreaded_read_chunked_common(state, {cudf::type_id::STRING}, label); +} + +void BM_orc_multithreaded_read_chunked_list(nvbench::state& state) +{ + auto label = get_label("list", state); + cudf::scoped_range range{label.c_str()}; + BM_orc_multithreaded_read_chunked_common(state, {cudf::type_id::LIST}, label); +} +auto const thread_range = std::vector{1, 2, 4, 8}; +auto const total_data_size = std::vector{512 * 1024 * 1024, 1024 * 1024 * 1024}; + +// mixed data types: fixed width and strings +NVBENCH_BENCH(BM_orc_multithreaded_read_mixed) + .set_name("orc_multithreaded_read_decode_mixed") + .set_min_samples(4) + .add_int64_axis("cardinality", {1000}) + .add_int64_axis("total_data_size", total_data_size) + .add_int64_axis("num_threads", thread_range) + .add_int64_axis("num_cols", {4}) + .add_int64_axis("run_length", {8}); + +NVBENCH_BENCH(BM_orc_multithreaded_read_fixed_width) + .set_name("orc_multithreaded_read_decode_fixed_width") + .set_min_samples(4) + .add_int64_axis("cardinality", {1000}) + .add_int64_axis("total_data_size", total_data_size) + .add_int64_axis("num_threads", thread_range) + .add_int64_axis("num_cols", {4}) + .add_int64_axis("run_length", {8}); + +NVBENCH_BENCH(BM_orc_multithreaded_read_string) + .set_name("orc_multithreaded_read_decode_string") + 
.set_min_samples(4) + .add_int64_axis("cardinality", {1000}) + .add_int64_axis("total_data_size", total_data_size) + .add_int64_axis("num_threads", thread_range) + .add_int64_axis("num_cols", {4}) + .add_int64_axis("run_length", {8}); + +NVBENCH_BENCH(BM_orc_multithreaded_read_list) + .set_name("orc_multithreaded_read_decode_list") + .set_min_samples(4) + .add_int64_axis("cardinality", {1000}) + .add_int64_axis("total_data_size", total_data_size) + .add_int64_axis("num_threads", thread_range) + .add_int64_axis("num_cols", {4}) + .add_int64_axis("run_length", {8}); + +// mixed data types: fixed width, strings +NVBENCH_BENCH(BM_orc_multithreaded_read_chunked_mixed) + .set_name("orc_multithreaded_read_decode_chunked_mixed") + .set_min_samples(4) + .add_int64_axis("cardinality", {1000}) + .add_int64_axis("total_data_size", total_data_size) + .add_int64_axis("num_threads", thread_range) + .add_int64_axis("num_cols", {4}) + .add_int64_axis("run_length", {8}) + .add_int64_axis("input_limit", {640 * 1024 * 1024}) + .add_int64_axis("output_limit", {640 * 1024 * 1024}); + +NVBENCH_BENCH(BM_orc_multithreaded_read_chunked_fixed_width) + .set_name("orc_multithreaded_read_decode_chunked_fixed_width") + .set_min_samples(4) + .add_int64_axis("cardinality", {1000}) + .add_int64_axis("total_data_size", total_data_size) + .add_int64_axis("num_threads", thread_range) + .add_int64_axis("num_cols", {4}) + .add_int64_axis("run_length", {8}) + .add_int64_axis("input_limit", {640 * 1024 * 1024}) + .add_int64_axis("output_limit", {640 * 1024 * 1024}); + +NVBENCH_BENCH(BM_orc_multithreaded_read_chunked_string) + .set_name("orc_multithreaded_read_decode_chunked_string") + .set_min_samples(4) + .add_int64_axis("cardinality", {1000}) + .add_int64_axis("total_data_size", total_data_size) + .add_int64_axis("num_threads", thread_range) + .add_int64_axis("num_cols", {4}) + .add_int64_axis("run_length", {8}) + .add_int64_axis("input_limit", {640 * 1024 * 1024}) + .add_int64_axis("output_limit", {640 * 1024 * 1024}); + +NVBENCH_BENCH(BM_orc_multithreaded_read_chunked_list) + .set_name("orc_multithreaded_read_decode_chunked_list") + .set_min_samples(4) + .add_int64_axis("cardinality", {1000}) + .add_int64_axis("total_data_size", total_data_size) + .add_int64_axis("num_threads", thread_range) + .add_int64_axis("num_cols", {4}) + .add_int64_axis("run_length", {8}) + .add_int64_axis("input_limit", {640 * 1024 * 1024}) + .add_int64_axis("output_limit", {640 * 1024 * 1024}); From 24fe359425b080594b05bab040699a1468483474 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Fri, 14 Jun 2024 09:35:13 -0400 Subject: [PATCH 10/16] Remove CCCL 2.2 patches as we now always use 2.5+ (#15969) Now that https://github.com/rapidsai/rapids-cmake/pull/607 has been merged we can drop support for patching CCCL 2.2 Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Bradley Dice (https://github.com/bdice) - Paul Taylor (https://github.com/trxcllnt) URL: https://github.com/rapidsai/cudf/pull/15969 --- .../thirdparty/patches/cccl_override.json | 35 -------------- .../patches/revert_pr_211_cccl_2.5.0.diff | 47 ------------------- .../thrust_disable_64bit_dispatching.diff | 38 +++++++-------- ..._disable_64bit_dispatching_cccl_2.5.0.diff | 25 ---------- .../thrust_faster_scan_compile_times.diff | 30 ++++++------ ..._faster_scan_compile_times_cccl_2.5.0.diff | 39 --------------- .../thrust_faster_sort_compile_times.diff | 32 ++++++------- ..._faster_sort_compile_times_cccl_2.5.0.diff | 39 
--------------- 8 files changed, 50 insertions(+), 235 deletions(-) delete mode 100644 cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff delete mode 100644 cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff delete mode 100644 cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff delete mode 100644 cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff diff --git a/cpp/cmake/thirdparty/patches/cccl_override.json b/cpp/cmake/thirdparty/patches/cccl_override.json index 059f713e7a5..e61102dffac 100644 --- a/cpp/cmake/thirdparty/patches/cccl_override.json +++ b/cpp/cmake/thirdparty/patches/cccl_override.json @@ -3,60 +3,25 @@ "packages" : { "CCCL" : { "patches" : [ - { - "file" : "cccl/bug_fixes.diff", - "issue" : "CCCL installs header-search.cmake files in nondeterministic order and has a typo in checking target creation that leads to duplicates", - "fixed_in" : "2.3" - }, - { - "file" : "cccl/hide_kernels.diff", - "issue" : "Mark all cub and thrust kernels with hidden visibility [https://github.com/nvidia/cccl/pulls/443]", - "fixed_in" : "2.3" - }, { "file" : "cccl/revert_pr_211.diff", "issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.", "fixed_in" : "" }, - { - "file" : "${current_json_dir}/revert_pr_211_cccl_2.5.0.diff", - "issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.", - "fixed_in" : "" - }, - { - "file": "cccl/kernel_pointer_hiding.diff", - "issue": "Hide APIs that accept kernel pointers [https://github.com/NVIDIA/cccl/pull/1395]", - "fixed_in": "2.4" - }, { "file" : "${current_json_dir}/thrust_disable_64bit_dispatching.diff", "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]", "fixed_in" : "" }, - { - "file" : "${current_json_dir}/thrust_disable_64bit_dispatching_cccl_2.5.0.diff", - "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]", - "fixed_in" : "" - }, { "file" : "${current_json_dir}/thrust_faster_sort_compile_times.diff", "issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]", "fixed_in" : "" }, - { - "file" : "${current_json_dir}/thrust_faster_sort_compile_times_cccl_2.5.0.diff", - "issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]", - "fixed_in" : "" - }, { "file" : "${current_json_dir}/thrust_faster_scan_compile_times.diff", "issue" : "Improve Thrust scan compile times by reducing the number of kernels generated [https://github.com/rapidsai/cudf/pull/8183]", "fixed_in" : "" - }, - { - "file" : "${current_json_dir}/thrust_faster_scan_compile_times_cccl_2.5.0.diff", - "issue" : "Improve Thrust scan compile times by reducing the number of kernels generated [https://github.com/rapidsai/cudf/pull/8183]", - "fixed_in" : "" } ] } diff --git a/cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff deleted file mode 100644 index 27ff16744f5..00000000000 --- a/cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff +++ /dev/null @@ -1,47 +0,0 @@ -diff --git 
a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h -index 046eb83c0..8047c9701 100644 ---- a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h -+++ b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h -@@ -53,41 +53,15 @@ namespace cuda_cub - - namespace __copy - { --template --OutputIt THRUST_RUNTIME_FUNCTION device_to_device( -- execution_policy& policy, InputIt first, InputIt last, OutputIt result, thrust::detail::true_type) --{ -- typedef typename thrust::iterator_traits::value_type InputTy; -- const auto n = thrust::distance(first, last); -- if (n > 0) -- { -- cudaError status; -- status = trivial_copy_device_to_device( -- policy, -- reinterpret_cast(thrust::raw_pointer_cast(&*result)), -- reinterpret_cast(thrust::raw_pointer_cast(&*first)), -- n); -- cuda_cub::throw_on_error(status, "__copy:: D->D: failed"); -- } -- -- return result + n; --} - - template - OutputIt THRUST_RUNTIME_FUNCTION device_to_device( -- execution_policy& policy, InputIt first, InputIt last, OutputIt result, thrust::detail::false_type) -+ execution_policy& policy, InputIt first, InputIt last, OutputIt result) - { - typedef typename thrust::iterator_traits::value_type InputTy; - return cuda_cub::transform(policy, first, last, result, thrust::identity()); - } - --template --OutputIt THRUST_RUNTIME_FUNCTION --device_to_device(execution_policy& policy, InputIt first, InputIt last, OutputIt result) --{ -- return device_to_device( -- policy, first, last, result, typename is_indirectly_trivially_relocatable_to::type()); --} - } // namespace __copy - - } // namespace cuda_cub diff --git a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff index d3f1a26781f..6ae1e1c917b 100644 --- a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff +++ b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff @@ -1,25 +1,25 @@ diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h -index d0e3f94ec..5c32a9c60 100644 +index 2a3cc4e33..8fb337b26 100644 --- a/thrust/thrust/system/cuda/detail/dispatch.h +++ b/thrust/thrust/system/cuda/detail/dispatch.h -@@ -32,8 +32,7 @@ - status = call arguments; \ - } \ - else { \ -- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ -- status = call arguments; \ -+ throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ - } - +@@ -44,8 +44,7 @@ + } \ + else \ + { \ +- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ +- status = call arguments; \ ++ throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ + } + /** -@@ -52,9 +51,7 @@ - status = call arguments; \ - } \ - else { \ -- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ -- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ -- status = call arguments; \ -+ throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ - } +@@ -66,9 +65,7 @@ + } \ + else \ + { \ +- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ +- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ +- status = call arguments; \ ++ throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ + } /** * Dispatch between 32-bit and 64-bit index based versions of the same algorithm diff --git 
a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff deleted file mode 100644 index 6ae1e1c917b..00000000000 --- a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff +++ /dev/null @@ -1,25 +0,0 @@ -diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h -index 2a3cc4e33..8fb337b26 100644 ---- a/thrust/thrust/system/cuda/detail/dispatch.h -+++ b/thrust/thrust/system/cuda/detail/dispatch.h -@@ -44,8 +44,7 @@ - } \ - else \ - { \ -- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ -- status = call arguments; \ -+ throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ - } - - /** -@@ -66,9 +65,7 @@ - } \ - else \ - { \ -- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ -- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ -- status = call arguments; \ -+ throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ - } - /** - * Dispatch between 32-bit and 64-bit index based versions of the same algorithm diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times.diff b/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times.diff index a606e21b92d..fee46046194 100644 --- a/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times.diff +++ b/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times.diff @@ -1,23 +1,23 @@ diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh -index 84b6ccffd..25a237f93 100644 +index 0606485bb..dbb99ff13 100644 --- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh -@@ -808,7 +808,7 @@ struct DeviceRadixSortPolicy - - - /// SM60 (GP100) -- struct Policy600 : ChainedPolicy<600, Policy600, Policy500> -+ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> +@@ -1085,7 +1085,7 @@ struct DeviceRadixSortPolicy + }; + + /// SM60 (GP100) +- struct Policy600 : ChainedPolicy<600, Policy600, Policy500> ++ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> + { + enum { - enum { - PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 
7 : 5, // 6.9B 32b keys/s (Quadro P100) diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh -index 994adc095..d3e6719a7 100644 +index f39613adb..75bd16ff9 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh -@@ -479,7 +479,7 @@ struct DeviceReducePolicy +@@ -488,7 +488,7 @@ struct DeviceReducePolicy }; - + /// SM60 - struct Policy600 : ChainedPolicy<600, Policy600, Policy350> + struct Policy600 : ChainedPolicy<600, Policy600, Policy600> @@ -25,15 +25,15 @@ index 994adc095..d3e6719a7 100644 static constexpr int threads_per_block = 256; static constexpr int items_per_thread = 16; diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh -index 0ea5c41ad..1bcd8a111 100644 +index 419908c4e..6ab0840e1 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh -@@ -303,7 +303,7 @@ struct DeviceScanPolicy +@@ -339,7 +339,7 @@ struct DeviceScanPolicy /// SM600 struct Policy600 : DefaultTuning - , ChainedPolicy<600, Policy600, Policy520> + , ChainedPolicy<600, Policy600, Policy600> {}; - + /// SM800 diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff deleted file mode 100644 index fee46046194..00000000000 --- a/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff +++ /dev/null @@ -1,39 +0,0 @@ -diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh -index 0606485bb..dbb99ff13 100644 ---- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh -+++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh -@@ -1085,7 +1085,7 @@ struct DeviceRadixSortPolicy - }; - - /// SM60 (GP100) -- struct Policy600 : ChainedPolicy<600, Policy600, Policy500> -+ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> - { - enum - { -diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh -index f39613adb..75bd16ff9 100644 ---- a/cub/cub/device/dispatch/dispatch_reduce.cuh -+++ b/cub/cub/device/dispatch/dispatch_reduce.cuh -@@ -488,7 +488,7 @@ struct DeviceReducePolicy - }; - - /// SM60 -- struct Policy600 : ChainedPolicy<600, Policy600, Policy350> -+ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> - { - static constexpr int threads_per_block = 256; - static constexpr int items_per_thread = 16; -diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh -index 419908c4e..6ab0840e1 100644 ---- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh -+++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh -@@ -339,7 +339,7 @@ struct DeviceScanPolicy - /// SM600 - struct Policy600 - : DefaultTuning -- , ChainedPolicy<600, Policy600, Policy520> -+ , ChainedPolicy<600, Policy600, Policy600> - {}; - - /// SM800 diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff index c34b6433d10..cb0cc55f4d2 100644 --- a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff +++ b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff @@ -1,39 +1,39 @@ diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh -index dc07ef6c2..a066c14da 100644 +index eb76ebb0b..c6c529a50 100644 --- 
a/cub/cub/block/block_merge_sort.cuh +++ b/cub/cub/block/block_merge_sort.cuh -@@ -91,7 +91,7 @@ __device__ __forceinline__ void SerialMerge(KeyT *keys_shared, +@@ -95,7 +95,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void SerialMerge( KeyT key1 = keys_shared[keys1_beg]; KeyT key2 = keys_shared[keys2_beg]; - + -#pragma unroll +#pragma unroll 1 for (int item = 0; item < ITEMS_PER_THREAD; ++item) { - bool p = (keys2_beg < keys2_end) && -@@ -383,7 +383,7 @@ public: + bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1)); +@@ -376,7 +376,7 @@ public: // KeyT max_key = oob_default; - -- #pragma unroll -+ #pragma unroll 1 + +-#pragma unroll ++#pragma unroll 1 for (int item = 1; item < ITEMS_PER_THREAD; ++item) { if (ITEMS_PER_THREAD * linear_tid + item < valid_items) diff --git a/cub/cub/thread/thread_sort.cuh b/cub/cub/thread/thread_sort.cuh -index 5d4867896..b42fb5f00 100644 +index 7d9e8622f..da5627306 100644 --- a/cub/cub/thread/thread_sort.cuh +++ b/cub/cub/thread/thread_sort.cuh -@@ -83,10 +83,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], +@@ -87,10 +87,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THRE { - constexpr bool KEYS_ONLY = std::is_same::value; - -- #pragma unroll -+ #pragma unroll 1 + constexpr bool KEYS_ONLY = ::cuda::std::is_same::value; + +-#pragma unroll ++#pragma unroll 1 for (int i = 0; i < ITEMS_PER_THREAD; ++i) { -- #pragma unroll -+ #pragma unroll 1 +-#pragma unroll ++#pragma unroll 1 for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2) { if (compare_op(keys[j + 1], keys[j])) diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff deleted file mode 100644 index cb0cc55f4d2..00000000000 --- a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff +++ /dev/null @@ -1,39 +0,0 @@ -diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh -index eb76ebb0b..c6c529a50 100644 ---- a/cub/cub/block/block_merge_sort.cuh -+++ b/cub/cub/block/block_merge_sort.cuh -@@ -95,7 +95,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void SerialMerge( - KeyT key1 = keys_shared[keys1_beg]; - KeyT key2 = keys_shared[keys2_beg]; - --#pragma unroll -+#pragma unroll 1 - for (int item = 0; item < ITEMS_PER_THREAD; ++item) - { - bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1)); -@@ -376,7 +376,7 @@ public: - // - KeyT max_key = oob_default; - --#pragma unroll -+#pragma unroll 1 - for (int item = 1; item < ITEMS_PER_THREAD; ++item) - { - if (ITEMS_PER_THREAD * linear_tid + item < valid_items) -diff --git a/cub/cub/thread/thread_sort.cuh b/cub/cub/thread/thread_sort.cuh -index 7d9e8622f..da5627306 100644 ---- a/cub/cub/thread/thread_sort.cuh -+++ b/cub/cub/thread/thread_sort.cuh -@@ -87,10 +87,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THRE - { - constexpr bool KEYS_ONLY = ::cuda::std::is_same::value; - --#pragma unroll -+#pragma unroll 1 - for (int i = 0; i < ITEMS_PER_THREAD; ++i) - { --#pragma unroll -+#pragma unroll 1 - for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2) - { - if (compare_op(keys[j + 1], keys[j])) From 374ee13adaf18503ee671b652f76a3ccb9dc118b Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 14 Jun 2024 15:28:53 +0100 Subject: [PATCH 11/16] Fix exclude regex in pre-commit clang-format hook (#16030) The clang-tidy changes in #15894 introduce a new exclude regex list to 
the pre-commit clang-format hook. However, it was a single character too long, ending with a |. Consequently, the exclude regex matched the empty string, and hence excluded every C++ file. Fix this, and apply formatting changes to the files that were modified in the interim and were not clang-format compatible. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - David Wendt (https://github.com/davidwendt) - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16030 --- .pre-commit-config.yaml | 2 +- .../io/orc/orc_reader_multithreaded.cpp | 107 +++++++++--------- cpp/tests/interop/from_arrow_test.cpp | 5 +- 3 files changed, 58 insertions(+), 56 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index cc08b832e69..f8c4f4b9143 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -60,7 +60,7 @@ repos: (?x)^( ^cpp/src/io/parquet/ipc/Schema_generated.h| ^cpp/src/io/parquet/ipc/Message_generated.h| - ^cpp/include/cudf_test/cxxopts.hpp| + ^cpp/include/cudf_test/cxxopts.hpp ) - repo: https://github.com/sirosen/texthooks rev: 0.6.6 diff --git a/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp b/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp index ffbbc6f8464..aa0ee39a179 100644 --- a/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp @@ -50,11 +50,11 @@ std::string get_label(std::string const& test_name, nvbench::state const& state) std::tuple, size_t, size_t> write_file_data( nvbench::state& state, std::vector const& d_types) { - auto const cardinality = state.get_int64("cardinality"); - auto const run_length = state.get_int64("run_length"); - auto const num_cols = state.get_int64("num_cols"); - size_t const num_files = get_num_read_threads(state); - size_t const per_file_data_size = get_read_size(state); + auto const cardinality = state.get_int64("cardinality"); + auto const run_length = state.get_int64("run_length"); + auto const num_cols = state.get_int64("num_cols"); + size_t const num_files = get_num_read_threads(state); + size_t const per_file_data_size = get_read_size(state); std::vector source_sink_vector; @@ -86,7 +86,7 @@ void BM_orc_multithreaded_read_common(nvbench::state& state, std::vector const& d_types, std::string const& label) { - auto const data_size = state.get_int64("total_data_size"); + auto const data_size = state.get_int64("total_data_size"); auto const num_threads = state.get_int64("num_threads"); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); @@ -104,24 +104,24 @@ void BM_orc_multithreaded_read_common(nvbench::state& state, { cudf::scoped_range range{("(read) " + label).c_str()}; state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, - [&](nvbench::launch& launch, auto& timer) { - auto read_func = [&](int index) { - auto const stream = streams[index % num_threads]; - cudf::io::orc_reader_options read_opts = - cudf::io::orc_reader_options::builder(source_info_vector[index]); - cudf::io::read_orc(read_opts, stream, rmm::mr::get_current_device_resource()); - }; - - threads.paused = true; - for (size_t i = 0; i < num_files; ++i) { - threads.submit(read_func, i); - } - timer.start(); - threads.paused = false; - threads.wait_for_tasks(); - cudf::detail::join_streams(streams, cudf::get_default_stream()); - timer.stop(); - }); + [&](nvbench::launch& launch, auto& timer) { + auto read_func 
= [&](int index) { + auto const stream = streams[index % num_threads]; + cudf::io::orc_reader_options read_opts = + cudf::io::orc_reader_options::builder(source_info_vector[index]); + cudf::io::read_orc(read_opts, stream, rmm::mr::get_current_device_resource()); + }; + + threads.paused = true; + for (size_t i = 0; i < num_files; ++i) { + threads.submit(read_func, i); + } + timer.start(); + threads.paused = false; + threads.wait_for_tasks(); + cudf::detail::join_streams(streams, cudf::get_default_stream()); + timer.stop(); + }); } auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); @@ -184,34 +184,35 @@ void BM_orc_multithreaded_read_chunked_common(nvbench::state& state, cudf::scoped_range range{("(read) " + label).c_str()}; std::vector chunks; state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, - [&](nvbench::launch& launch, auto& timer) { - auto read_func = [&](int index) { - auto const stream = streams[index % num_threads]; - cudf::io::orc_reader_options read_opts = - cudf::io::orc_reader_options::builder(source_info_vector[index]); - // divide chunk limits by number of threads so the number of chunks produced is the - // same for all cases. this seems better than the alternative, which is to keep the - // limits the same. if we do that, as the number of threads goes up, the number of - // chunks goes down - so are actually benchmarking the same thing in that case? - auto reader = cudf::io::chunked_orc_reader( - output_limit / num_threads, input_limit / num_threads, read_opts, stream); - - // read all the chunks - do { - auto table = reader.read_chunk(); - } while (reader.has_next()); - }; - - threads.paused = true; - for (size_t i = 0; i < num_files; ++i) { - threads.submit(read_func, i); - } - timer.start(); - threads.paused = false; - threads.wait_for_tasks(); - cudf::detail::join_streams(streams, cudf::get_default_stream()); - timer.stop(); - }); + [&](nvbench::launch& launch, auto& timer) { + auto read_func = [&](int index) { + auto const stream = streams[index % num_threads]; + cudf::io::orc_reader_options read_opts = + cudf::io::orc_reader_options::builder(source_info_vector[index]); + // divide chunk limits by number of threads so the number of chunks produced is + // the same for all cases. this seems better than the alternative, which is to + // keep the limits the same. if we do that, as the number of threads goes up, the + // number of chunks goes down - so are actually benchmarking the same thing in + // that case? 
+ auto reader = cudf::io::chunked_orc_reader( + output_limit / num_threads, input_limit / num_threads, read_opts, stream); + + // read all the chunks + do { + auto table = reader.read_chunk(); + } while (reader.has_next()); + }; + + threads.paused = true; + for (size_t i = 0; i < num_files; ++i) { + threads.submit(read_func, i); + } + timer.start(); + threads.paused = false; + threads.wait_for_tasks(); + cudf::detail::join_streams(streams, cudf::get_default_stream()); + timer.stop(); + }); } auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); @@ -249,7 +250,7 @@ void BM_orc_multithreaded_read_chunked_list(nvbench::state& state) cudf::scoped_range range{label.c_str()}; BM_orc_multithreaded_read_chunked_common(state, {cudf::type_id::LIST}, label); } -auto const thread_range = std::vector{1, 2, 4, 8}; +auto const thread_range = std::vector{1, 2, 4, 8}; auto const total_data_size = std::vector{512 * 1024 * 1024, 1024 * 1024 * 1024}; // mixed data types: fixed width and strings diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index af20a5c772f..6eaa1a07e08 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -50,7 +50,8 @@ std::unique_ptr get_cudf_table() {true, false, true, true, true}); columns.emplace_back(std::move(cudf::dictionary::encode(col4))); columns.emplace_back(cudf::test::fixed_width_column_wrapper( - {true, false, true, false, true}, {true, false, true, true, false}).release()); + {true, false, true, false, true}, {true, false, true, true, false}) + .release()); columns.emplace_back(cudf::test::strings_column_wrapper( { "", @@ -338,7 +339,7 @@ TEST_F(FromArrowTest, ChunkedArray) std::vector>{dict_array1, dict_array2}); auto boolean_array = get_arrow_array({true, false, true, false, true}, {true, false, true, true, false}); - auto boolean_chunked_array = std::make_shared(boolean_array); + auto boolean_chunked_array = std::make_shared(boolean_array); auto large_string_chunked_array = std::make_shared( std::vector>{large_string_array_1}); From 2297f9a61e2f4153ab2e8a0631f7cfe7971ead14 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 14 Jun 2024 17:43:17 +0100 Subject: [PATCH 12/16] Fix initialization error in to_arrow for empty string views (#16033) When converting an empty string view to arrow, we don't bother with copies from device, but rather create the arrow arrays directly. The offset buffer is therefore a singleton int32 array with zero in it. Previously, the initialization of this array was incorrect, since mutable_data() returns a uint8_t pointer, and so setting the single element could leave 24 of the 32 bits uninitialized. Fix this by using memset instead to zero out the full buffer. 
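
A minimal, self-contained sketch of the failure mode described above (ordinary C++, not the actual cudf/Arrow code; the starting byte values are made up to stand in for uninitialized allocator memory, and `bytes` stands in for the Arrow offset buffer):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main()
    {
      // Stand-in for the freshly allocated 4-byte offset buffer, viewed through a uint8_t*.
      std::uint8_t bytes[sizeof(std::int32_t)] = {0xAB, 0xCD, 0xEF, 0x01};
      std::int32_t offset;

      bytes[0] = 0;  // old behavior: a single-element store clears only 8 of the 32 bits
      std::memcpy(&offset, bytes, sizeof(offset));
      std::printf("single-byte store: %d\n", static_cast<int>(offset));  // generally nonzero

      std::memset(bytes, 0, sizeof(std::int32_t));  // the fix: clears the whole 4-byte entry
      std::memcpy(&offset, bytes, sizeof(offset));
      std::printf("memset:            %d\n", static_cast<int>(offset));  // prints 0
      return 0;
    }

With the memset, the single offset entry is fully zero-initialized regardless of what the allocation originally contained.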
Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - David Wendt (https://github.com/davidwendt) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16033 --- cpp/src/interop/to_arrow.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 47aee982c32..2b3aa2f08f1 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -292,9 +292,9 @@ std::shared_ptr dispatch_to_arrow::operator()( auto child_arrays = fetch_child_array(input_view, {{}, {}}, ar_mr, stream); if (child_arrays.empty()) { // Empty string will have only one value in offset of 4 bytes - auto tmp_offset_buffer = allocate_arrow_buffer(4, ar_mr); - auto tmp_data_buffer = allocate_arrow_buffer(0, ar_mr); - tmp_offset_buffer->mutable_data()[0] = 0; + auto tmp_offset_buffer = allocate_arrow_buffer(sizeof(int32_t), ar_mr); + auto tmp_data_buffer = allocate_arrow_buffer(0, ar_mr); + memset(tmp_offset_buffer->mutable_data(), 0, sizeof(int32_t)); return std::make_shared( 0, std::move(tmp_offset_buffer), std::move(tmp_data_buffer)); From 5facc8cde15cc8301adb0c06fc682f558828fbc8 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 14 Jun 2024 07:12:09 -1000 Subject: [PATCH 13/16] Enable ruff TCH: typing imports under if TYPE_CHECKING (#16015) Reduces some unnecessary imports for running cudf and nicely delineates which imports are meant for typing purposes Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16015 --- docs/cudf/source/conf.py | 6 ++++ pyproject.toml | 2 +- python/cudf/cudf/_typing.py | 3 +- python/cudf/cudf/core/_base_index.py | 9 ++++-- python/cudf/cudf/core/buffer/spill_manager.py | 6 ++-- python/cudf/cudf/core/column/categorical.py | 20 +++++++++---- python/cudf/cudf/core/column/column.py | 8 +++-- python/cudf/cudf/core/column/datetime.py | 16 +++++----- python/cudf/cudf/core/column/decimal.py | 6 ++-- python/cudf/cudf/core/column/lists.py | 6 ++-- python/cudf/cudf/core/column/numerical.py | 29 +++++++++++++------ .../cudf/cudf/core/column/numerical_base.py | 6 ++-- python/cudf/cudf/core/column/string.py | 9 +++--- python/cudf/cudf/core/column/struct.py | 5 +++- python/cudf/cudf/core/column/timedelta.py | 6 ++-- python/cudf/cudf/core/dataframe.py | 5 +++- python/cudf/cudf/core/dtypes.py | 6 ++-- python/cudf/cudf/core/frame.py | 10 +++++-- python/cudf/cudf/core/index.py | 5 +++- python/cudf/cudf/core/indexed_frame.py | 15 ++++++---- python/cudf/cudf/core/multiindex.py | 9 ++++-- python/cudf/cudf/core/series.py | 15 ++++++---- python/cudf/cudf/core/single_column_frame.py | 13 +++++---- 23 files changed, 143 insertions(+), 72 deletions(-) diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index e9c760e288e..108f12bc099 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -554,6 +554,12 @@ def on_missing_reference(app, env, node, contnode): nitpick_ignore = [ ("py:class", "SeriesOrIndex"), ("py:class", "Dtype"), + # The following are erroneously warned due to + # https://github.com/sphinx-doc/sphinx/issues/11225 + ("py:class", "pa.Array"), + ("py:class", "ScalarLike"), + ("py:class", "ParentType"), + ("py:class", "ColumnLike"), # TODO: Remove this when we figure out why typing_extensions doesn't seem # to map types correctly 
for intersphinx ("py:class", "typing_extensions.Self"), diff --git a/pyproject.toml b/pyproject.toml index d343b237ee7..c602240a0b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ quiet-level = 3 line-length = 79 [tool.ruff.lint] -select = ["E", "F", "W", "D201", "D204", "D206", "D207", "D208", "D209", "D210", "D211", "D214", "D215", "D300", "D301", "D403", "D405", "D406", "D407", "D408", "D409", "D410", "D411", "D412", "D414", "D418"] +select = ["E", "F", "W", "D201", "D204", "D206", "D207", "D208", "D209", "D210", "D211", "D214", "D215", "D300", "D301", "D403", "D405", "D406", "D407", "D408", "D409", "D410", "D411", "D412", "D414", "D418", "TCH"] ignore = [ # whitespace before : "E203", diff --git a/python/cudf/cudf/_typing.py b/python/cudf/cudf/_typing.py index 206173919e1..34c96cc8cb3 100644 --- a/python/cudf/cudf/_typing.py +++ b/python/cudf/cudf/_typing.py @@ -5,9 +5,10 @@ import numpy as np from pandas import Period, Timedelta, Timestamp -from pandas.api.extensions import ExtensionDtype if TYPE_CHECKING: + from pandas.api.extensions import ExtensionDtype + import cudf # Backwards compat: mypy >= 0.790 rejects Type[NotImplemented], but diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 5d0f7c4ede4..b29fc475b29 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -4,9 +4,8 @@ import pickle import warnings -from collections.abc import Generator from functools import cached_property -from typing import Any, Literal, Set, Tuple +from typing import TYPE_CHECKING, Any, Literal, Set, Tuple import pandas as pd from typing_extensions import Self @@ -31,12 +30,16 @@ ) from cudf.core.abc import Serializable from cudf.core.column import ColumnBase, column -from cudf.core.column_accessor import ColumnAccessor from cudf.errors import MixedTypeError from cudf.utils import ioutils from cudf.utils.dtypes import can_convert_to_column, is_mixed_with_object_dtype from cudf.utils.utils import _is_same_name +if TYPE_CHECKING: + from collections.abc import Generator + + from cudf.core.column_accessor import ColumnAccessor + class BaseIndex(Serializable): """Base class for all cudf Index types.""" diff --git a/python/cudf/cudf/core/buffer/spill_manager.py b/python/cudf/cudf/core/buffer/spill_manager.py index cd81149bdb8..7bcf97302aa 100644 --- a/python/cudf/cudf/core/buffer/spill_manager.py +++ b/python/cudf/cudf/core/buffer/spill_manager.py @@ -13,15 +13,17 @@ from contextlib import contextmanager from dataclasses import dataclass from functools import partial -from typing import Dict, List, Optional, Tuple +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple import rmm.mr -from cudf.core.buffer.spillable_buffer import SpillableBufferOwner from cudf.options import get_option from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.string import format_bytes +if TYPE_CHECKING: + from cudf.core.buffer.spillable_buffer import SpillableBufferOwner + _spill_cudf_nvtx_annotate = partial( _cudf_nvtx_annotate, domain="cudf_python-spill" ) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index de20b2ace1d..97c2ce5cf1f 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -3,21 +3,17 @@ from __future__ import annotations import warnings -from collections import abc from functools import cached_property from typing import TYPE_CHECKING, Any, Mapping, Optional, Sequence, Tuple, 
cast import numpy as np import pandas as pd import pyarrow as pa -from numba import cuda from typing_extensions import Self import cudf from cudf import _lib as libcudf from cudf._lib.transform import bools_to_mask -from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike -from cudf.core.buffer import Buffer from cudf.core.column import column from cudf.core.column.methods import ColumnMethods from cudf.core.dtypes import CategoricalDtype, IntervalDtype @@ -29,7 +25,19 @@ ) if TYPE_CHECKING: - from cudf._typing import SeriesOrIndex, SeriesOrSingleColumnIndex + from collections import abc + + import numba.cuda + + from cudf._typing import ( + ColumnBinaryOperand, + ColumnLike, + Dtype, + ScalarLike, + SeriesOrIndex, + SeriesOrSingleColumnIndex, + ) + from cudf.core.buffer import Buffer from cudf.core.column import ( ColumnBase, DatetimeColumn, @@ -868,7 +876,7 @@ def clip(self, lo: ScalarLike, hi: ScalarLike) -> "column.ColumnBase": def data_array_view( self, *, mode="write" - ) -> cuda.devicearray.DeviceNDArray: + ) -> numba.cuda.devicearray.DeviceNDArray: return self.codes.data_array_view(mode=mode) def unique(self) -> CategoricalColumn: diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 75fc31ddbce..dc937dc0469 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2,13 +2,13 @@ from __future__ import annotations -import builtins import pickle from collections import abc from functools import cached_property from itertools import chain from types import SimpleNamespace from typing import ( + TYPE_CHECKING, Any, Dict, List, @@ -49,7 +49,6 @@ ) from cudf._lib.transform import bools_to_mask from cudf._lib.types import size_type_dtype -from cudf._typing import ColumnLike, Dtype, ScalarLike from cudf.api.types import ( _is_non_decimal_numeric_dtype, _is_pandas_nullable_extension_dtype, @@ -89,6 +88,11 @@ ) from cudf.utils.utils import _array_ufunc, mask_dtype +if TYPE_CHECKING: + import builtins + + from cudf._typing import ColumnLike, Dtype, ScalarLike + if PANDAS_GE_210: NumpyExtensionArray = pd.arrays.NumpyExtensionArray else: diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 057169aa7e1..e24d85bfedf 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -19,22 +19,22 @@ from cudf import _lib as libcudf from cudf._lib.labeling import label_bins from cudf._lib.search import search_sorted -from cudf._typing import ( - ColumnBinaryOperand, - DatetimeLikeScalar, - Dtype, - DtypeObj, - ScalarLike, -) from cudf.api.types import is_datetime64_dtype, is_scalar, is_timedelta64_dtype from cudf.core._compat import PANDAS_GE_220 -from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase, as_column, column, string from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion from cudf.utils.dtypes import _get_base_dtype from cudf.utils.utils import _all_bools_with_nulls if TYPE_CHECKING: + from cudf._typing import ( + ColumnBinaryOperand, + DatetimeLikeScalar, + Dtype, + DtypeObj, + ScalarLike, + ) + from cudf.core.buffer import Buffer from cudf.core.column.numerical import NumericalColumn if PANDAS_GE_220: diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 3a0f6649e21..9c1bedc9926 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -4,7 +4,7 @@ import warnings from 
decimal import Decimal -from typing import Any, Optional, Sequence, Union, cast +from typing import TYPE_CHECKING, Any, Optional, Sequence, Union, cast import cupy as cp import numpy as np @@ -16,7 +16,6 @@ from cudf._lib.strings.convert.convert_fixed_point import ( from_decimal as cpp_from_decimal, ) -from cudf._typing import ColumnBinaryOperand, Dtype from cudf.api.types import is_integer_dtype, is_scalar from cudf.core.buffer import as_buffer from cudf.core.column import ColumnBase @@ -31,6 +30,9 @@ from .numerical_base import NumericalBaseColumn +if TYPE_CHECKING: + from cudf._typing import ColumnBinaryOperand, Dtype + class DecimalBaseColumn(NumericalBaseColumn): """Base column for decimal32, decimal64 or decimal128 columns""" diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 8f8ee46c796..080ba949d62 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -3,7 +3,7 @@ from __future__ import annotations from functools import cached_property -from typing import List, Optional, Sequence, Tuple, Union +from typing import TYPE_CHECKING, List, Optional, Sequence, Tuple, Union import numpy as np import pandas as pd @@ -26,13 +26,15 @@ ) from cudf._lib.strings.convert.convert_lists import format_list_column from cudf._lib.types import size_type_dtype -from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar from cudf.core.column import ColumnBase, as_column, column from cudf.core.column.methods import ColumnMethods, ParentType from cudf.core.dtypes import ListDtype from cudf.core.missing import NA +if TYPE_CHECKING: + from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike + class ListColumn(ColumnBase): dtype: ListDtype diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 1952d7eeb71..6af67e02bb4 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -3,7 +3,16 @@ from __future__ import annotations import functools -from typing import Any, Callable, Optional, Sequence, Tuple, Union, cast +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Optional, + Sequence, + Tuple, + Union, + cast, +) import cupy as cp import numpy as np @@ -14,13 +23,6 @@ from cudf import _lib as libcudf from cudf._lib import pylibcudf from cudf._lib.types import size_type_dtype -from cudf._typing import ( - ColumnBinaryOperand, - ColumnLike, - Dtype, - DtypeObj, - ScalarLike, -) from cudf.api.types import ( is_bool_dtype, is_float_dtype, @@ -28,7 +30,6 @@ is_integer_dtype, is_scalar, ) -from cudf.core.buffer import Buffer from cudf.core.column import ( ColumnBase, as_column, @@ -48,6 +49,16 @@ from .numerical_base import NumericalBaseColumn +if TYPE_CHECKING: + from cudf._typing import ( + ColumnBinaryOperand, + ColumnLike, + Dtype, + DtypeObj, + ScalarLike, + ) + from cudf.core.buffer import Buffer + _unaryop_map = { "ASIN": "ARCSIN", "ACOS": "ARCCOS", diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index d38ec9cf30f..bd48054a951 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -3,17 +3,19 @@ from __future__ import annotations -from typing import Optional, cast +from typing import TYPE_CHECKING, Optional, cast import numpy as np import cudf from cudf import _lib as libcudf -from cudf._typing 
import ScalarLike from cudf.core.column import ColumnBase from cudf.core.missing import NA from cudf.core.mixins import Scannable +if TYPE_CHECKING: + from cudf._typing import ScalarLike + class NumericalBaseColumn(ColumnBase, Scannable): """A column composed of numerical data. diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index ad7dbe5e52e..87df2d2f1f1 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -16,11 +16,9 @@ overload, ) -import cupy import numpy as np import pandas as pd import pyarrow as pa -from numba import cuda from typing_extensions import Self import cudf @@ -30,7 +28,6 @@ from cudf._lib.column import Column from cudf._lib.types import size_type_dtype from cudf.api.types import is_integer, is_scalar, is_string_dtype -from cudf.core.buffer import Buffer from cudf.core.column import column, datetime from cudf.core.column.column import ColumnBase from cudf.core.column.methods import ColumnMethods @@ -46,6 +43,9 @@ def str_to_boolean(column: StringColumn): if TYPE_CHECKING: + import cupy + import numba.cuda + from cudf._typing import ( ColumnBinaryOperand, ColumnLike, @@ -53,6 +53,7 @@ def str_to_boolean(column: StringColumn): ScalarLike, SeriesOrIndex, ) + from cudf.core.buffer import Buffer _str_to_numeric_typecast_functions = { @@ -5598,7 +5599,7 @@ def any(self, skipna: bool = True) -> bool: def data_array_view( self, *, mode="write" - ) -> cuda.devicearray.DeviceNDArray: + ) -> numba.cuda.devicearray.DeviceNDArray: raise ValueError("Cannot get an array view of a StringColumn") @property diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 6dd35570b95..c2ce787eeae 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -2,17 +2,20 @@ from __future__ import annotations from functools import cached_property +from typing import TYPE_CHECKING import pandas as pd import pyarrow as pa import cudf -from cudf._typing import Dtype from cudf.core.column import ColumnBase from cudf.core.column.methods import ColumnMethods from cudf.core.dtypes import StructDtype from cudf.core.missing import NA +if TYPE_CHECKING: + from cudf._typing import Dtype + class StructColumn(ColumnBase): """ diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index c6af052b56f..0af847f38af 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -4,7 +4,7 @@ import datetime import functools -from typing import Any, Optional, Sequence, cast +from typing import TYPE_CHECKING, Any, Optional, Sequence, cast import numpy as np import pandas as pd @@ -13,13 +13,15 @@ import cudf from cudf import _lib as libcudf -from cudf._typing import ColumnBinaryOperand, DatetimeLikeScalar, Dtype from cudf.api.types import is_scalar, is_timedelta64_dtype from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column import ColumnBase, column, string from cudf.utils.dtypes import np_to_pa_dtype from cudf.utils.utils import _all_bools_with_nulls +if TYPE_CHECKING: + from cudf._typing import ColumnBinaryOperand, DatetimeLikeScalar, Dtype + _unit_to_nanoseconds_conversion = { "ns": 1, "us": 1_000, diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 7438b0237d5..70820fa8e00 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -15,6 +15,7 @@ from collections import 
abc, defaultdict from collections.abc import Iterator from typing import ( + TYPE_CHECKING, Any, Callable, Dict, @@ -41,7 +42,6 @@ import cudf import cudf.core.common from cudf import _lib as libcudf -from cudf._typing import ColumnLike, Dtype, NotImplementedType from cudf.api.extensions import no_default from cudf.api.types import ( _is_scalar_or_zero_d_array, @@ -99,6 +99,9 @@ from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.utils import GetAttrGetItemMixin, _external_only_api +if TYPE_CHECKING: + from cudf._typing import ColumnLike, Dtype, NotImplementedType + _cupy_nan_methods_map = { "min": "nanmin", "max": "nanmax", diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 4729233ee6e..b1282040e60 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -6,7 +6,7 @@ import textwrap import warnings from functools import cached_property -from typing import Any, Callable, Dict, List, Tuple, Type, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Tuple, Type, Union import numpy as np import pandas as pd @@ -19,9 +19,11 @@ from cudf._typing import Dtype from cudf.core._compat import PANDAS_LT_300 from cudf.core.abc import Serializable -from cudf.core.buffer import Buffer from cudf.utils.docutils import doc_apply +if TYPE_CHECKING: + from cudf.core.buffer import Buffer + def dtype(arbitrary): """ diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 01b56f1edc4..ffaa90ef915 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -6,10 +6,10 @@ import itertools import operator import pickle -import types import warnings from collections import abc from typing import ( + TYPE_CHECKING, Any, Callable, Dict, @@ -31,7 +31,6 @@ import cudf from cudf import _lib as libcudf -from cudf._typing import Dtype from cudf.api.types import is_dtype_equal, is_scalar from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( @@ -48,6 +47,11 @@ from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.utils import _array_ufunc, _warn_no_dask_cudf +if TYPE_CHECKING: + from types import ModuleType + + from cudf._typing import Dtype + # TODO: It looks like Frame is missing a declaration of `copy`, need to add class Frame(BinaryOperand, Scannable): @@ -410,7 +414,7 @@ def __arrow_array__(self, type=None): def _to_array( self, get_array: Callable, - module: types.ModuleType, + module: ModuleType, copy: bool, dtype: Union[Dtype, None] = None, na_value=None, diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 732e5cdb01a..655f7607b37 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -5,10 +5,10 @@ import operator import pickle import warnings -from collections.abc import Generator from functools import cache, cached_property from numbers import Number from typing import ( + TYPE_CHECKING, Any, List, Literal, @@ -71,6 +71,9 @@ from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.utils import _warn_no_dask_cudf, search_range +if TYPE_CHECKING: + from collections.abc import Generator + class IndexMeta(type): """Custom metaclass for Index that overrides instance/subclass tests.""" diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index fdc78005996..75614fa46c7 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -9,6 +9,7 @@ import warnings from 
collections import Counter, abc from typing import ( + TYPE_CHECKING, Any, Callable, Dict, @@ -31,12 +32,6 @@ import cudf import cudf._lib as libcudf -from cudf._typing import ( - ColumnLike, - DataFrameOrSeries, - Dtype, - NotImplementedType, -) from cudf.api.extensions import no_default from cudf.api.types import ( _is_non_decimal_numeric_dtype, @@ -70,6 +65,14 @@ from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.utils import _warn_no_dask_cudf +if TYPE_CHECKING: + from cudf._typing import ( + ColumnLike, + DataFrameOrSeries, + Dtype, + NotImplementedType, + ) + doc_reset_index_template = """ Reset the index of the {klass}, or a level of it. diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 6d3520e33cf..865d9660b1d 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -8,10 +8,9 @@ import pickle import warnings from collections import abc -from collections.abc import Generator from functools import cached_property from numbers import Integral -from typing import Any, List, MutableMapping, Tuple, Union +from typing import TYPE_CHECKING, Any, List, MutableMapping, Tuple, Union import cupy as cp import numpy as np @@ -20,7 +19,6 @@ import cudf import cudf._lib as libcudf from cudf._lib.types import size_type_dtype -from cudf._typing import DataFrameOrSeries from cudf.api.extensions import no_default from cudf.api.types import is_integer, is_list_like, is_object_dtype from cudf.core import column @@ -36,6 +34,11 @@ from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name +if TYPE_CHECKING: + from collections.abc import Generator + + from cudf._typing import DataFrameOrSeries + def _maybe_indices_to_slice(indices: cp.ndarray) -> Union[slice, cp.ndarray]: """Makes best effort to convert an array of indices into a python slice. 
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index a52b583d3b4..1b1e82333cf 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -10,6 +10,7 @@ from collections import abc from shutil import get_terminal_size from typing import ( + TYPE_CHECKING, Any, Dict, Literal, @@ -27,12 +28,6 @@ import cudf from cudf import _lib as libcudf -from cudf._typing import ( - ColumnLike, - DataFrameOrSeries, - NotImplementedType, - ScalarLike, -) from cudf.api.extensions import no_default from cudf.api.types import ( _is_non_decimal_numeric_dtype, @@ -85,6 +80,14 @@ ) from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +if TYPE_CHECKING: + from cudf._typing import ( + ColumnLike, + DataFrameOrSeries, + NotImplementedType, + ScalarLike, + ) + def _format_percentile_names(percentiles): return [f"{int(x * 100)}%" for x in percentiles] diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index acc74129a29..6fd4e857e02 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -3,15 +3,11 @@ from __future__ import annotations -from typing import Any, Dict, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union -import cupy -import numpy -import pyarrow as pa from typing_extensions import Self import cudf -from cudf._typing import NotImplementedType, ScalarLike from cudf.api.extensions import no_default from cudf.api.types import ( _is_scalar_or_zero_d_array, @@ -25,6 +21,13 @@ from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.utils import NotIterable +if TYPE_CHECKING: + import cupy + import numpy + import pyarrow as pa + + from cudf._typing import NotImplementedType, ScalarLike + class SingleColumnFrame(Frame, NotIterable): """A one-dimensional frame. 
From 9225633e83ca09592c5a144c523f46e95c6e9d75 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 14 Jun 2024 07:13:00 -1000 Subject: [PATCH 14/16] Avoid redefining Frame._get_columns_by_label in subclasses (#15912) `Frame._get_columns_by_label` was redefined in `Series` and `DataFrame` to handle some special edge cases in `DataFrame.__getitem__` and empty `Series` By making `_from_data_like_self` more consistent in preserving external properties and moving special casing, we can only define `Frame._get_columns_by_label` once Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Charles Blackmon-Luca (https://github.com/charlesbluca) URL: https://github.com/rapidsai/cudf/pull/15912 --- python/cudf/cudf/core/dataframe.py | 36 +++++++------------------- python/cudf/cudf/core/frame.py | 28 +++++++++++--------- python/cudf/cudf/core/indexed_frame.py | 4 +-- python/cudf/cudf/core/series.py | 20 +++++--------- 4 files changed, 34 insertions(+), 54 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 70820fa8e00..80260c7699b 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1348,7 +1348,16 @@ def __getitem__(self, arg): 8 8 8 8 """ if _is_scalar_or_zero_d_array(arg) or isinstance(arg, tuple): - return self._get_columns_by_label(arg, downcast=True) + out = self._get_columns_by_label(arg) + if is_scalar(arg): + nlevels = 1 + elif isinstance(arg, tuple): + nlevels = len(arg) + if self._data.multiindex is False or nlevels == self._data.nlevels: + out = self._constructor_sliced._from_data(out._data) + out.index = self.index + out.name = arg + return out elif isinstance(arg, slice): return self._slice(arg) @@ -1993,31 +2002,6 @@ def _repr_html_(self): def _repr_latex_(self): return self._get_renderable_dataframe().to_pandas()._repr_latex_() - @_cudf_nvtx_annotate - def _get_columns_by_label( - self, labels, *, downcast=False - ) -> Self | Series: - """ - Return columns of dataframe by `labels` - - If downcast is True, try and downcast from a DataFrame to a Series - """ - ca = self._data.select_by_label(labels) - if downcast: - if is_scalar(labels): - nlevels = 1 - elif isinstance(labels, tuple): - nlevels = len(labels) - if self._data.multiindex is False or nlevels == self._data.nlevels: - out = self._constructor_sliced._from_data( - ca, index=self.index, name=labels - ) - return out - out = self.__class__._from_data( - ca, index=self.index, columns=ca.to_pandas_index() - ) - return out - def _make_operands_and_index_for_binop( self, other: Any, diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index ffaa90ef915..ee310cfcb58 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -136,12 +136,19 @@ def deserialize(cls, header, frames): @classmethod @_cudf_nvtx_annotate def _from_data(cls, data: MutableMapping) -> Self: + """ + Construct cls from a ColumnAccessor-like mapping. + """ obj = cls.__new__(cls) Frame.__init__(obj, data) return obj @_cudf_nvtx_annotate def _from_data_like_self(self, data: MutableMapping) -> Self: + """ + Return type(self) from a ColumnAccessor-like mapping but + with the external properties, e.g. .index, .name, of self. 
+ """ return self._from_data(data) @_cudf_nvtx_annotate @@ -355,12 +362,13 @@ def equals(self, other) -> bool: ) @_cudf_nvtx_annotate - def _get_columns_by_label(self, labels, *, downcast=False) -> Self: + def _get_columns_by_label(self, labels) -> Self: """ - Returns columns of the Frame specified by `labels` + Returns columns of the Frame specified by `labels`. + Akin to cudf.DataFrame(...).loc[:, labels] """ - return self.__class__._from_data(self._data.select_by_label(labels)) + return self._from_data_like_self(self._data.select_by_label(labels)) @property @_cudf_nvtx_annotate @@ -1438,14 +1446,10 @@ def _get_sorted_inds( Get the indices required to sort self according to the columns specified in by. """ - - to_sort = [ - *( - self - if by is None - else self._get_columns_by_label(list(by), downcast=False) - )._columns - ] + if by is None: + to_sort = self._columns + else: + to_sort = self._get_columns_by_label(list(by))._columns if is_scalar(ascending): ascending_lst = [ascending] * len(to_sort) @@ -1453,7 +1457,7 @@ def _get_sorted_inds( ascending_lst = list(ascending) return libcudf.sort.order_by( - to_sort, + list(to_sort), ascending_lst, na_position, stable=True, diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 75614fa46c7..3a4f4874e35 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -309,8 +309,8 @@ def _from_data( @_cudf_nvtx_annotate def _from_data_like_self(self, data: MutableMapping): - out = self._from_data(data, self.index) - out._data._level_names = self._data._level_names + out = super()._from_data_like_self(data) + out.index = self.index return out @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 1b1e82333cf..ebf6910ca5f 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -685,6 +685,12 @@ def _from_data( out.name = name return out + @_cudf_nvtx_annotate + def _from_data_like_self(self, data: MutableMapping): + out = super()._from_data_like_self(data) + out.name = self.name + return out + @_cudf_nvtx_annotate def __contains__(self, item): return item in self.index @@ -859,20 +865,6 @@ def deserialize(cls, header, frames): return obj - def _get_columns_by_label(self, labels, *, downcast=False) -> Self: - """Return the column specified by `labels` - - For cudf.Series, either the column, or an empty series is returned. - Parameter `downcast` does not have effects. - """ - ca = self._data.select_by_label(labels) - - return ( - self.__class__._from_data(data=ca, index=self.index) - if len(ca) > 0 - else self.__class__(dtype=self.dtype, name=self.name) - ) - @_cudf_nvtx_annotate def drop( self, From 9dc5e8c2836fa2e54831d25b7f051e031bf553b9 Mon Sep 17 00:00:00 2001 From: Ben Jarmak <104460670+jarmak-nv@users.noreply.github.com> Date: Fri, 14 Jun 2024 13:31:29 -0400 Subject: [PATCH 15/16] Project automation update: skip if not in project (#16035) This PR adds another condition to when we should run the automation work. PRs aren't always in the cuDF Python project so when this is the case we should skip the job rather than attempting to run it and have it throw an error. 
Authors: - Ben Jarmak (https://github.com/jarmak-nv) Approvers: - James Lamb (https://github.com/jameslamb) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16035 --- .github/workflows/pr_issue_status_automation.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml index 837963c3286..8ca971dc28d 100644 --- a/.github/workflows/pr_issue_status_automation.yml +++ b/.github/workflows/pr_issue_status_automation.yml @@ -35,7 +35,7 @@ jobs: update-status: # This job sets the PR and its linked issues to "In Progress" status uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.08 - if: github.event.pull_request.state == 'open' + if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: PROJECT_ID: "PVT_kwDOAp2shc4AiNzl" @@ -51,7 +51,7 @@ jobs: update-sprint: # This job sets the PR and its linked issues to the current "Weekly Sprint" uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.08 - if: github.event.pull_request.state == 'open' + if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: PROJECT_ID: "PVT_kwDOAp2shc4AiNzl" From f89cc07b50d3f89e7da8f98afb5fe8f9d9cf33c6 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Fri, 14 Jun 2024 13:22:49 -0500 Subject: [PATCH 16/16] Add `codecov` coverage for `pandas_tests` (#14513) Fixes: #14496 This PR enables code-coverage for `pandas` tests that are run in cudf CI in pandas accelerator mode. 
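In practice the change amounts to threading pytest-cov options through the existing pytest invocation (see the run_tests.sh hunk below). Roughly the same invocation driven from Python rather than the shell wrapper, assuming pytest-cov is installed; paths and the coverage output location here are illustrative, not the CI values:

    import pytest

    exit_code = pytest.main(
        [
            "-p", "cudf.pandas",   # load the accelerator plugin
            "--cov=cudf",          # measure coverage of the cudf package
            "--cov-report=xml:coverage/cudf-pandas-coverage.xml",
            "--cov-report=term",
            "python/cudf/cudf_pandas_tests/",
        ]
    )
    raise SystemExit(exit_code)
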
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) - Lawrence Mitchell (https://github.com/wence-) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/14513 --- ci/cudf_pandas_scripts/run_tests.sh | 11 ++++++++++- python/cudf/cudf_pandas_tests/test_cudf_pandas.py | 3 +++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh index 78945d37f22..1c3b99953fb 100755 --- a/ci/cudf_pandas_scripts/run_tests.sh +++ b/ci/cudf_pandas_scripts/run_tests.sh @@ -5,6 +5,10 @@ set -eoxu pipefail +RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"} +RAPIDS_COVERAGE_DIR=${RAPIDS_COVERAGE_DIR:-"${PWD}/coverage-results"} +mkdir -p "${RAPIDS_TESTS_DIR}" "${RAPIDS_COVERAGE_DIR}" + # Function to display script usage function display_usage { echo "Usage: $0 [--no-cudf]" @@ -36,4 +40,9 @@ else python -m pip install $(ls ./local-cudf-dep/cudf*.whl)[test,cudf-pandas-tests] fi -python -m pytest -p cudf.pandas ./python/cudf/cudf_pandas_tests/ +python -m pytest -p cudf.pandas \ + --cov-config=./python/cudf/.coveragerc \ + --cov=cudf \ + --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-pandas-coverage.xml" \ + --cov-report=term \ + ./python/cudf/cudf_pandas_tests/ diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index c251e4a197e..5be4d350c0b 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -464,6 +464,9 @@ def test_options_mode(): assert xpd.options.mode.copy_on_write == pd.options.mode.copy_on_write +# Codecov and Profiler interfere with each-other, +# hence we don't want to run code-cov on this test. +@pytest.mark.no_cover def test_profiler(): pytest.importorskip("cudf")
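The `no_cover` marker added above is the per-test exclusion hook, assuming it is handled by pytest-cov, which provides a `no_cover` marker for exactly this purpose: the marked test still runs, but no coverage data is collected while it executes. A tiny self-contained illustration:

    import pytest


    def test_measured():
        # Runs under coverage as usual.
        assert 1 + 1 == 2


    @pytest.mark.no_cover  # pytest-cov: skip coverage measurement for this test
    def test_unmeasured():
        # Useful when a test exercises tooling (e.g. a profiler) that
        # conflicts with the coverage tracer.
        assert sum(range(5)) == 10
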