From a71c249f9f320ecb61aa8135bbda300122e43491 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 27 Jun 2024 14:29:31 -0500 Subject: [PATCH 01/27] Fix dtype errors in `StringArrays` (#16111) This PR adds proxy classes for `ArrowStringArray` and `ArrowStringArrayNumpySemantics` that will increase the pandas test pass rate by 1%. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/16111 --- python/cudf/cudf/pandas/_wrappers/pandas.py | 16 +++++++++++++ .../cudf/pandas/scripts/run-pandas-tests.sh | 3 ++- .../cudf_pandas_tests/test_cudf_pandas.py | 23 +++++++++++++++++++ 3 files changed, 41 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index 0ba432d6d0e..a64bf7772fe 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -522,6 +522,22 @@ def Index__new__(cls, *args, **kwargs): }, ) +ArrowStringArrayNumpySemantics = make_final_proxy_type( + "ArrowStringArrayNumpySemantics", + _Unusable, + pd.core.arrays.string_arrow.ArrowStringArrayNumpySemantics, + fast_to_slow=_Unusable(), + slow_to_fast=_Unusable(), +) + +ArrowStringArray = make_final_proxy_type( + "ArrowStringArray", + _Unusable, + pd.core.arrays.string_arrow.ArrowStringArray, + fast_to_slow=_Unusable(), + slow_to_fast=_Unusable(), +) + StringDtype = make_final_proxy_type( "StringDtype", _Unusable, diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh index cd9f90d50fe..a66f63c09b3 100755 --- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh +++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh @@ -133,7 +133,8 @@ and not test_s3_roundtrip" TEST_THAT_CRASH_PYTEST_WORKERS="not test_bitmasks_pyarrow \ and not test_large_string_pyarrow \ and not test_interchange_from_corrected_buffer_dtypes \ -and not test_eof_states" +and not test_eof_states \ +and not test_array_tz" # TODO: Remove "not db" once a postgres & mysql container is set up on the CI PANDAS_CI="1" timeout 30m python -m pytest -p cudf.pandas \ diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index eed5037cbea..0d46e2e9311 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -1533,3 +1533,26 @@ def test_is_proxy_object(): assert is_proxy_object(np_arr_proxy) assert is_proxy_object(s1) assert not is_proxy_object(s2) + + +def test_arrow_string_arrays(): + cu_s = xpd.Series(["a", "b", "c"]) + pd_s = pd.Series(["a", "b", "c"]) + + cu_arr = xpd.arrays.ArrowStringArray._from_sequence( + cu_s, dtype=xpd.StringDtype("pyarrow") + ) + pd_arr = pd.arrays.ArrowStringArray._from_sequence( + pd_s, dtype=pd.StringDtype("pyarrow") + ) + + tm.assert_equal(cu_arr, pd_arr) + + cu_arr = xpd.core.arrays.string_arrow.ArrowStringArray._from_sequence( + cu_s, dtype=xpd.StringDtype("pyarrow_numpy") + ) + pd_arr = pd.core.arrays.string_arrow.ArrowStringArray._from_sequence( + pd_s, dtype=pd.StringDtype("pyarrow_numpy") + ) + + tm.assert_equal(cu_arr, pd_arr) From 2ed69c9e830d90a8e565ea23ba1813e594a9f4d9 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 27 Jun 2024 10:11:09 -1000 Subject: [PATCH 02/27] Ensure MultiIndex.to_frame deep copies columns (#16110) Additionally, 
this allows simplification in `MultiIndex.__repr__` which avoids a shallow copy and also caught a bug where `NaT` was not supposed to be quoted Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16110 --- python/cudf/cudf/core/multiindex.py | 88 ++++++++++------------------- python/cudf/cudf/tests/test_repr.py | 10 ++-- 2 files changed, 35 insertions(+), 63 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index a01242d957d..547c14cdc99 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -23,6 +23,7 @@ from cudf.api.types import is_integer, is_list_like, is_object_dtype from cudf.core import column from cudf.core._base_index import _return_get_indexer_result +from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import Frame from cudf.core.index import ( BaseIndex, @@ -446,45 +447,26 @@ def __repr__(self): ) preprocess = self.take(indices) else: - preprocess = self.copy(deep=False) - - if any(col.has_nulls() for col in preprocess._data.columns): - preprocess_df = preprocess.to_frame(index=False) - for name, col in preprocess._data.items(): - if isinstance( - col, - ( - column.datetime.DatetimeColumn, - column.timedelta.TimeDeltaColumn, - ), - ): - preprocess_df[name] = col.astype("str").fillna( - str(cudf.NaT) - ) + preprocess = self - tuples_list = list( - zip( - *list( - map(lambda val: pd.NA if val is None else val, col) - for col in preprocess_df.to_arrow() - .to_pydict() - .values() - ) - ) - ) + arrays = [] + for name, col in zip(self.names, preprocess._columns): + try: + pd_idx = col.to_pandas(nullable=True) + except NotImplementedError: + pd_idx = col.to_pandas(nullable=False) + pd_idx.name = name + arrays.append(pd_idx) - preprocess = preprocess.to_pandas(nullable=True) - preprocess.values[:] = tuples_list - else: - preprocess = preprocess.to_pandas(nullable=True) + preprocess_pd = pd.MultiIndex.from_arrays(arrays) - output = repr(preprocess) + output = repr(preprocess_pd) output_prefix = self.__class__.__name__ + "(" output = output.lstrip(output_prefix) lines = output.split("\n") if len(lines) > 1: - if "length=" in lines[-1] and len(self) != len(preprocess): + if "length=" in lines[-1] and len(self) != len(preprocess_pd): last_line = lines[-1] length_index = last_line.index("length=") last_line = last_line[:length_index] + f"length={len(self)})" @@ -1022,42 +1004,32 @@ def to_frame(self, index=True, name=no_default, allow_duplicates=False): a c a c b d b d """ - # TODO: Currently this function makes a shallow copy, which is - # incorrect. We want to make a deep copy, otherwise further - # modifications of the resulting DataFrame will affect the MultiIndex. if name is no_default: column_names = [ level if name is None else name for level, name in enumerate(self.names) ] + elif not is_list_like(name): + raise TypeError( + "'name' must be a list / sequence of column names." + ) + elif len(name) != len(self.levels): + raise ValueError( + "'name' should have the same length as " + "number of levels on index." + ) else: - if not is_list_like(name): - raise TypeError( - "'name' must be a list / sequence of column names." - ) - if len(name) != len(self.levels): - raise ValueError( - "'name' should have the same length as " - "number of levels on index." 
- ) column_names = name - all_none_names = None - if not ( - all_none_names := all(x is None for x in column_names) - ) and len(column_names) != len(set(column_names)): + if len(column_names) != len(set(column_names)): raise ValueError("Duplicate column names are not allowed") - df = cudf.DataFrame._from_data( - data=self._data, - columns=column_names - if name is not no_default and not all_none_names - else None, + ca = ColumnAccessor( + dict(zip(column_names, (col.copy() for col in self._columns))), + verify=False, + ) + return cudf.DataFrame._from_data( + data=ca, index=self if index else None ) - - if index: - df = df.set_index(self) - - return df @_cudf_nvtx_annotate def get_level_values(self, level): @@ -1243,7 +1215,7 @@ def values(self): @classmethod @_cudf_nvtx_annotate - def from_frame(cls, df, names=None): + def from_frame(cls, df: pd.DataFrame | cudf.DataFrame, names=None): """ Make a MultiIndex from a DataFrame. diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 8f65bd26bd1..193d64a9e7f 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -1210,7 +1210,7 @@ def test_multiindex_repr(pmi, max_seq_items): .index, textwrap.dedent( """ - MultiIndex([('abc', 'NaT', 0.345), + MultiIndex([('abc', NaT, 0.345), ( , '0 days 00:00:00.000000001', ), ('xyz', '0 days 00:00:00.000000002', 100.0), ( , '0 days 00:00:00.000000003', 10.0)], @@ -1252,10 +1252,10 @@ def test_multiindex_repr(pmi, max_seq_items): .index, textwrap.dedent( """ - MultiIndex([('NaT', ), - ('NaT', ), - ('NaT', ), - ('NaT', )], + MultiIndex([(NaT, ), + (NaT, ), + (NaT, ), + (NaT, )], names=['b', 'a']) """ ), From c847b98291bd41f98ac417becf0c53293a392ce3 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 27 Jun 2024 21:33:29 +0100 Subject: [PATCH 03/27] Finish implementation of cudf-polars boolean function handlers (#16098) The missing nodes were `is_in`, `not` (both easy), `is_finite` and `is_infinite` (obtained by translating to `contains` calls). While here, remove the implementation of `IsBetween` and just translate to an expression with binary operations. This removes the need for special-casing scalar arguments to `IsBetween` and reproducing the code for binop evaluation. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16098 --- python/cudf_polars/cudf_polars/dsl/expr.py | 67 +++++++++++-------- .../cudf_polars/cudf_polars/dsl/translate.py | 10 +++ .../tests/expressions/test_booleanfunction.py | 48 +++++++++++-- 3 files changed, 90 insertions(+), 35 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 871134665af..97325161650 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -443,12 +443,12 @@ def __init__( ): # With ignore_nulls == False, polars uses Kleene logic raise NotImplementedError(f"Kleene logic for {self.name}") - if self.name in ( - pl_expr.BooleanFunction.IsFinite, - pl_expr.BooleanFunction.IsInfinite, - pl_expr.BooleanFunction.IsIn, + if self.name == pl_expr.BooleanFunction.IsIn and not all( + c.dtype == self.children[0].dtype for c in self.children ): - raise NotImplementedError(f"{self.name}") + # TODO: If polars IR doesn't put the casts in, we need to + # mimic the supertype promotion rules. 
+ raise NotImplementedError("IsIn doesn't support supertype casting") @staticmethod def _distinct( @@ -506,6 +506,33 @@ def do_evaluate( mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" + if self.name in ( + pl_expr.BooleanFunction.IsFinite, + pl_expr.BooleanFunction.IsInfinite, + ): + # Avoid evaluating the child if the dtype tells us it's unnecessary. + (child,) = self.children + is_finite = self.name == pl_expr.BooleanFunction.IsFinite + if child.dtype.id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64): + value = plc.interop.from_arrow( + pa.scalar(value=is_finite, type=plc.interop.to_arrow(self.dtype)) + ) + return Column(plc.Column.from_scalar(value, df.num_rows)) + needles = child.evaluate(df, context=context, mapping=mapping) + to_search = [-float("inf"), float("inf")] + if is_finite: + # NaN is neither finite not infinite + to_search.append(float("nan")) + haystack = plc.interop.from_arrow( + pa.array( + to_search, + type=plc.interop.to_arrow(needles.obj.type()), + ) + ) + result = plc.search.contains(haystack, needles.obj) + if is_finite: + result = plc.unary.unary_operation(result, plc.unary.UnaryOperator.NOT) + return Column(result) columns = [ child.evaluate(df, context=context, mapping=mapping) for child in self.children @@ -612,31 +639,13 @@ def do_evaluate( (c.obj for c in columns), ) ) - elif self.name == pl_expr.BooleanFunction.IsBetween: - column, lo, hi = columns - (closed,) = self.options - lop, rop = self._BETWEEN_OPS[closed] - lo_obj = ( - lo.obj_scalar - if lo.is_scalar and lo.obj.size() != column.obj.size() - else lo.obj - ) - hi_obj = ( - hi.obj_scalar - if hi.is_scalar and hi.obj.size() != column.obj.size() - else hi.obj - ) + elif self.name == pl_expr.BooleanFunction.IsIn: + needles, haystack = columns + return Column(plc.search.contains(haystack.obj, needles.obj)) + elif self.name == pl_expr.BooleanFunction.Not: + (column,) = columns return Column( - plc.binaryop.binary_operation( - plc.binaryop.binary_operation( - column.obj, lo_obj, lop, output_type=self.dtype - ), - plc.binaryop.binary_operation( - column.obj, hi_obj, rop, output_type=self.dtype - ), - plc.binaryop.BinaryOperator.LOGICAL_AND, - self.dtype, - ) + plc.unary.unary_operation(column.obj, plc.unary.UnaryOperator.NOT) ) else: raise NotImplementedError( diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 5d289885f47..742e5a591ee 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -342,6 +342,16 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex *(translate_expr(visitor, n=n) for n in node.input), ) elif isinstance(name, pl_expr.BooleanFunction): + if name == pl_expr.BooleanFunction.IsBetween: + column, lo, hi = (translate_expr(visitor, n=n) for n in node.input) + (closed,) = options + lop, rop = expr.BooleanFunction._BETWEEN_OPS[closed] + return expr.BinOp( + dtype, + plc.binaryop.BinaryOperator.LOGICAL_AND, + expr.BinOp(dtype, lop, column, lo), + expr.BinOp(dtype, rop, column, hi), + ) return expr.BooleanFunction( dtype, name, diff --git a/python/cudf_polars/tests/expressions/test_booleanfunction.py b/python/cudf_polars/tests/expressions/test_booleanfunction.py index a52fba26528..97421008669 100644 --- a/python/cudf_polars/tests/expressions/test_booleanfunction.py +++ b/python/cudf_polars/tests/expressions/test_booleanfunction.py @@ -6,7 +6,10 @@ import polars as pl 
-from cudf_polars.testing.asserts import assert_gpu_result_equal +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) @pytest.fixture(params=[False, True], ids=["no_nulls", "nulls"]) @@ -67,23 +70,26 @@ def test_boolean_function_unary(request, expr, has_nans, has_nulls): df = pl.LazyFrame({"a": pl.Series(values, dtype=pl.Float32())}) - q = df.select(expr(pl.col("a"))) + q = df.select(expr(pl.col("a")), expr(pl.col("a")).not_().alias("b")) assert_gpu_result_equal(q) -@pytest.mark.xfail(reason="Evaluation handlers not yet implemented") @pytest.mark.parametrize( "expr", [ pl.col("a").is_finite(), pl.col("a").is_infinite(), - pl.col("a").is_in(pl.col("b")), + [pl.col("a").is_infinite(), pl.col("b").is_finite()], ], ) -def test_unsupported_boolean_function(expr): +def test_boolean_finite(expr): df = pl.LazyFrame( - {"a": pl.Series([1, float("nan"), 2, 4], dtype=pl.Float64()), "b": [1, 2, 3, 4]} + { + "a": pl.Series([1, float("nan"), 2, float("inf")], dtype=pl.Float64()), + "b": [1, 2, 3, 4], + "c": pl.Series([1, 2, 3, 4], dtype=pl.Float64()), + } ) q = df.select(expr) @@ -133,3 +139,33 @@ def test_boolean_horizontal(request, expr, has_nulls, wide): q = ldf.select(expr) assert_gpu_result_equal(q) + + +@pytest.mark.parametrize( + "expr", + [ + pl.col("a").is_in(pl.col("b")), + pl.col("a").is_in(pl.col("c")), + pl.col("c").is_in(pl.col("d")), + ], +) +def test_boolean_is_in(expr): + ldf = pl.LazyFrame( + { + "a": pl.Series([1, 2, 3], dtype=pl.Int64()), + "b": pl.Series([3, 4, 2], dtype=pl.Int64()), + "c": pl.Series([1, None, 3], dtype=pl.Int64()), + "d": pl.Series([10, None, 11], dtype=pl.Int64()), + } + ) + + q = ldf.select(expr) + + assert_gpu_result_equal(q) + + +def test_boolean_is_in_raises_unsupported(): + ldf = pl.LazyFrame({"a": pl.Series([1, 2, 3], dtype=pl.Int64)}) + q = ldf.select(pl.col("a").is_in(pl.lit(1, dtype=pl.Int32()))) + + assert_ir_translation_raises(q, NotImplementedError) From e35da6b3df55bfa7b8d5df12c35039740566cb21 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 28 Jun 2024 09:54:03 +0100 Subject: [PATCH 04/27] Implement Ternary copy_if_else (#16114) A straightforward evaluation using `copy_if_else`. 
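As a rough illustration (not part of this PR's changes), the user-facing pattern the new `Ternary` node covers is sketched below; the node evaluates the predicate and the two branches, then selects values with a single `copy_if_else`, using scalar `then`/`otherwise` operands where the operand is scalar. The polars query here is only an assumed usage example; the test added in this PR exercises the same shape through `assert_gpu_result_equal`.

```python
# Illustrative sketch only: a plain polars when/then/otherwise expression.
# Under cudf-polars, the Ternary node evaluates the boolean predicate and
# picks row values from the `then`/`otherwise` operands via copy_if_else.
import polars as pl

ldf = pl.LazyFrame({"a": [1, 2, 3, 4], "b": [10, 20, 30, 40]})
q = ldf.select(pl.when(pl.col("a") > 2).then(pl.col("b")).otherwise(-1))
print(q.collect())
```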
Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/16114 --- python/cudf_polars/cudf_polars/dsl/expr.py | 29 +++++++++++++++++++ .../cudf_polars/cudf_polars/dsl/translate.py | 10 +++++++ .../tests/expressions/test_when_then.py | 27 +++++++++++++++++ 3 files changed, 66 insertions(+) create mode 100644 python/cudf_polars/tests/expressions/test_when_then.py diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 97325161650..17d7d15e4e5 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -51,6 +51,7 @@ "GroupedRollingWindow", "Cast", "Agg", + "Ternary", "BinOp", ] @@ -1112,6 +1113,34 @@ def do_evaluate( return self.op(child.evaluate(df, context=context, mapping=mapping)) +class Ternary(Expr): + __slots__ = ("children",) + _non_child = ("dtype",) + children: tuple[Expr, Expr, Expr] + + def __init__( + self, dtype: plc.DataType, when: Expr, then: Expr, otherwise: Expr + ) -> None: + super().__init__(dtype) + self.children = (when, then, otherwise) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + when, then, otherwise = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + then_obj = then.obj_scalar if then.is_scalar else then.obj + otherwise_obj = otherwise.obj_scalar if otherwise.is_scalar else otherwise.obj + return Column(plc.copying.copy_if_else(then_obj, otherwise_obj, when.obj)) + + class BinOp(Expr): __slots__ = ("op", "children") _non_child = ("dtype", "op") diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 742e5a591ee..953ff636cce 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -446,6 +446,16 @@ def _(node: pl_expr.Agg, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Ex ) +@_translate_expr.register +def _(node: pl_expr.Ternary, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: + return expr.Ternary( + dtype, + translate_expr(visitor, n=node.predicate), + translate_expr(visitor, n=node.truthy), + translate_expr(visitor, n=node.falsy), + ) + + @_translate_expr.register def _( node: pl_expr.BinaryExpr, visitor: NodeTraverser, dtype: plc.DataType diff --git a/python/cudf_polars/tests/expressions/test_when_then.py b/python/cudf_polars/tests/expressions/test_when_then.py new file mode 100644 index 00000000000..cf1c0fe7fce --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_when_then.py @@ -0,0 +1,27 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.mark.parametrize("then_scalar", [False, True]) +@pytest.mark.parametrize("otherwise_scalar", [False, True]) +@pytest.mark.parametrize("expr", [pl.col("c"), pl.col("c").is_not_null()]) +def test_when_then(then_scalar, otherwise_scalar, expr): + ldf = pl.LazyFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [10, 13, 11, 15, 16, 11, 10], + "c": [None, True, False, False, True, True, False], + } + ) + + then = pl.lit(10) if then_scalar else pl.col("a") + otherwise = pl.lit(-2) if otherwise_scalar else pl.col("b") + q = ldf.select(pl.when(expr).then(then).otherwise(otherwise)) + assert_gpu_result_equal(q) From 6b04fd3b704efdae7d39d09beba026fcbca5f996 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Fri, 28 Jun 2024 12:31:18 +0200 Subject: [PATCH 05/27] Memory Profiling (#15866) Use [RMM's new memory profiler](https://github.com/rapidsai/rmm/pull/1563) to profile all functions already decorated with `_cudf_nvtx_annotate`. Example ```python import cudf from cudf.utils.performance_tracking import print_memory_report cudf.set_option("memory_profiling", True) df1 = cudf.DataFrame({"a": [1, 2, 3]}) df2 = cudf.DataFrame({"a": [2, 2, 3]}) df3 = df1.merge(df2) print_memory_report() ``` Output: ``` Memory Profiling ================ Ordered by: memory_peak ncalls memory_peak memory_total filename:lineno(function) 1 272 688 /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/dataframe.py:4072(DataFrame.merge) 2 32 64 /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/dataframe.py:1043(DataFrame._init_from_dict_like) 2 32 64 /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/dataframe.py:690(DataFrame.__init__) 2 0 0 /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/dataframe.py:1131(DataFrame._align_input_series_indices) 7 0 0 /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/index.py:214(RangeIndex.__init__) 6 0 0 /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/index.py:424(RangeIndex.__len__) 4 0 0 /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/frame.py:271(Frame.__len__) 2 0 0 /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/dataframe.py:3195(DataFrame._insert) 2 0 0 /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/index.py:270(RangeIndex.name) 2 0 0 /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/index.py:369(RangeIndex.copy) 5 0 0 /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/frame.py:134(Frame._from_data) 2 0 0 /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/frame.py:1039(Frame._copy_type_metadata) 2 0 0 /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/indexed_frame.py:315(IndexedFrame._from_columns_like_self) ``` Authors: - Mads R. B. 
Kristensen (https://github.com/madsbk) Approvers: - Mark Harris (https://github.com/harrism) - Lawrence Mitchell (https://github.com/wence-) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15866 --- .../cudf/source/user_guide/api_docs/index.rst | 1 + .../api_docs/performance_tracking.rst | 12 + docs/cudf/source/user_guide/index.md | 1 + .../source/user_guide/memory-profiling.md | 44 ++++ python/cudf/cudf/core/buffer/spill_manager.py | 4 +- .../cudf/cudf/core/buffer/spillable_buffer.py | 7 +- python/cudf/cudf/core/dataframe.py | 180 +++++++------- python/cudf/cudf/core/frame.py | 110 ++++----- python/cudf/cudf/core/groupby/groupby.py | 60 ++--- python/cudf/cudf/core/index.py | 228 +++++++++--------- python/cudf/cudf/core/indexed_frame.py | 144 +++++------ python/cudf/cudf/core/multiindex.py | 130 +++++----- python/cudf/cudf/core/series.py | 228 +++++++++--------- python/cudf/cudf/core/single_column_frame.py | 42 ++-- python/cudf/cudf/core/udf/groupby_utils.py | 4 +- python/cudf/cudf/core/udf/utils.py | 6 +- python/cudf/cudf/io/csv.py | 6 +- python/cudf/cudf/io/parquet.py | 28 +-- python/cudf/cudf/io/text.py | 6 +- python/cudf/cudf/options.py | 14 ++ .../cudf/tests/test_performance_tracking.py | 41 ++++ python/cudf/cudf/utils/nvtx_annotation.py | 30 --- .../cudf/cudf/utils/performance_tracking.py | 82 +++++++ python/cudf/cudf/utils/utils.py | 5 +- python/dask_cudf/dask_cudf/backends.py | 40 +-- python/dask_cudf/dask_cudf/core.py | 62 ++--- python/dask_cudf/dask_cudf/groupby.py | 72 +++--- python/dask_cudf/dask_cudf/sorting.py | 16 +- 28 files changed, 885 insertions(+), 718 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/performance_tracking.rst create mode 100644 docs/cudf/source/user_guide/memory-profiling.md create mode 100644 python/cudf/cudf/tests/test_performance_tracking.py delete mode 100644 python/cudf/cudf/utils/nvtx_annotation.py create mode 100644 python/cudf/cudf/utils/performance_tracking.py diff --git a/docs/cudf/source/user_guide/api_docs/index.rst b/docs/cudf/source/user_guide/api_docs/index.rst index 5f26a921012..d05501f4a4a 100644 --- a/docs/cudf/source/user_guide/api_docs/index.rst +++ b/docs/cudf/source/user_guide/api_docs/index.rst @@ -26,3 +26,4 @@ This page provides a list of all publicly accessible modules, methods and classe options extension_dtypes pylibcudf/index.rst + performance_tracking diff --git a/docs/cudf/source/user_guide/api_docs/performance_tracking.rst b/docs/cudf/source/user_guide/api_docs/performance_tracking.rst new file mode 100644 index 00000000000..9da79e69fb2 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/performance_tracking.rst @@ -0,0 +1,12 @@ +.. _api.performance_tracking: + +==================== +Performance Tracking +==================== + +.. currentmodule:: cudf.utils.performance_tracking +.. 
autosummary:: + :toctree: api/ + + get_memory_records + print_memory_report diff --git a/docs/cudf/source/user_guide/index.md b/docs/cudf/source/user_guide/index.md index 486368c3b8b..df4e4795a08 100644 --- a/docs/cudf/source/user_guide/index.md +++ b/docs/cudf/source/user_guide/index.md @@ -16,5 +16,6 @@ options performance-comparisons/index PandasCompat copy-on-write +memory-profiling pandas-2.0-breaking-changes ``` diff --git a/docs/cudf/source/user_guide/memory-profiling.md b/docs/cudf/source/user_guide/memory-profiling.md new file mode 100644 index 00000000000..ab5433685e6 --- /dev/null +++ b/docs/cudf/source/user_guide/memory-profiling.md @@ -0,0 +1,44 @@ +(memory-profiling-user-doc)= + +# Memory Profiling + +Peak memory usage is a common concern in GPU programming because GPU memory is typically smaller than available CPU memory. To easily identify memory hotspots, cuDF provides a memory profiler. It comes with an overhead so avoid using it in performance-sensitive code. + +## Enabling Memory Profiling + +First, enable memory profiling in RMM by calling {py:func}`rmm.statistics.enable_statistics()`. This adds a statistics resource adaptor to the current RMM memory resource, which enables cuDF to access memory profiling information. See the [RMM documentation](https://docs.rapids.ai/api/rmm/stable/guide/#memory-statistics-and-profiling) for more details. + +Second, enable memory profiling in cuDF by setting the `memory_profiling` option to `True`. Use {py:func}`cudf.set_option` or set the environment variable ``CUDF_MEMORY_PROFILING=1`` prior to the launch of the Python interpreter. + +To get the result of the profiling, use {py:func}`cudf.utils.performance_tracking.print_memory_report` or access the raw profiling data by using: {py:func}`cudf.utils.performance_tracking.get_memory_records`. 
+ +### Example +In the following, we enable profiling, do some work, and then print the profiling results: + +```python +>>> import cudf +>>> from cudf.utils.performance_tracking import print_memory_report +>>> from rmm.statistics import enable_statistics +>>> enable_statistics() +>>> cudf.set_option("memory_profiling", True) +>>> cudf.DataFrame({"a": [1, 2, 3]}) # Some work + a +0 1 +1 2 +2 3 +>>> print_memory_report() # Pretty print the result of the profiling +Memory Profiling +================ + +Legends: +ncalls - number of times the function or code block was called +memory_peak - peak memory allocated in function or code block (in bytes) +memory_total - total memory allocated in function or code block (in bytes) + +Ordered by: memory_peak + +ncalls memory_peak memory_total filename:lineno(function) + 1 32 32 cudf/core/dataframe.py:690(DataFrame.__init__) + 2 0 0 cudf/core/index.py:214(RangeIndex.__init__) + 6 0 0 cudf/core/index.py:424(RangeIndex.__len__) +``` diff --git a/python/cudf/cudf/core/buffer/spill_manager.py b/python/cudf/cudf/core/buffer/spill_manager.py index 762cd7f9e86..ed351a6b107 100644 --- a/python/cudf/cudf/core/buffer/spill_manager.py +++ b/python/cudf/cudf/core/buffer/spill_manager.py @@ -18,14 +18,14 @@ import rmm.mr from cudf.options import get_option -from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.string import format_bytes if TYPE_CHECKING: from cudf.core.buffer.spillable_buffer import SpillableBufferOwner _spill_cudf_nvtx_annotate = partial( - _cudf_nvtx_annotate, domain="cudf_python-spill" + _performance_tracking, domain="cudf_python-spill" ) diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index eb57a371965..4c9e524ee05 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -10,6 +10,7 @@ from typing import TYPE_CHECKING, Any, Literal import numpy +import nvtx from typing_extensions import Self import rmm @@ -21,7 +22,7 @@ host_memory_allocation, ) from cudf.core.buffer.exposure_tracked_buffer import ExposureTrackedBuffer -from cudf.utils.nvtx_annotation import _get_color_for_nvtx, annotate +from cudf.utils.performance_tracking import _get_color_for_nvtx from cudf.utils.string import format_bytes if TYPE_CHECKING: @@ -200,7 +201,7 @@ def spill(self, target: str = "cpu") -> None: ) if (ptr_type, target) == ("gpu", "cpu"): - with annotate( + with nvtx.annotate( message="SpillDtoH", color=_get_color_for_nvtx("SpillDtoH"), domain="cudf_python-spill", @@ -218,7 +219,7 @@ def spill(self, target: str = "cpu") -> None: # trigger a new call to this buffer's `spill()`. # Therefore, it is important that spilling-on-demand doesn't # try to unspill an already locked buffer! 
- with annotate( + with nvtx.annotate( message="SpillHtoD", color=_get_color_for_nvtx("SpillHtoD"), domain="cudf_python-spill", diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index f7f5ef792d6..3fc29582c4c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -83,7 +83,7 @@ min_scalar_type, numeric_normalize_types, ) -from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import GetAttrGetItemMixin, _external_only_api if TYPE_CHECKING: @@ -145,7 +145,7 @@ def __setitem__(self, key, value): key = (key, slice(None)) return self._setitem_tuple_arg(key, value) - @_cudf_nvtx_annotate + @_performance_tracking def _can_downcast_to_series(self, df, arg): """ This method encapsulates the logic used @@ -188,7 +188,7 @@ def _can_downcast_to_series(self, df, arg): return True return False - @_cudf_nvtx_annotate + @_performance_tracking def _downcast_to_series(self, df, arg): """ "Downcast" from a DataFrame to a Series @@ -233,11 +233,11 @@ class _DataFrameLocIndexer(_DataFrameIndexer): For selection by label. """ - @_cudf_nvtx_annotate + @_performance_tracking def _getitem_scalar(self, arg): return self._frame[arg[1]].loc[arg[0]] - @_cudf_nvtx_annotate + @_performance_tracking def _getitem_tuple_arg(self, arg): from uuid import uuid4 @@ -363,7 +363,7 @@ def _getitem_tuple_arg(self, arg): return self._downcast_to_series(df, arg) return df - @_cudf_nvtx_annotate + @_performance_tracking def _setitem_tuple_arg(self, key, value): if ( isinstance(self._frame.index, MultiIndex) @@ -532,7 +532,7 @@ def __getitem__(self, arg): return frame._empty_like(keep_index=True) assert_never(row_spec) - @_cudf_nvtx_annotate + @_performance_tracking def _setitem_tuple_arg(self, key, value): columns_df = self._frame._from_data( self._frame._data.select_by_index(key[1]), self._frame.index @@ -677,7 +677,7 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): _groupby = DataFrameGroupBy _resampler = DataFrameResampler - @_cudf_nvtx_annotate + @_performance_tracking def __init__( self, data=None, @@ -859,7 +859,7 @@ def __init__( columns, pd.MultiIndex ) - @_cudf_nvtx_annotate + @_performance_tracking def _init_from_series_list(self, data, columns, index): if index is None: # When `index` is `None`, the final index of @@ -972,7 +972,7 @@ def _init_from_series_list(self, data, columns, index): else: self._data.rangeindex = True - @_cudf_nvtx_annotate + @_performance_tracking def _init_from_list_like(self, data, index=None, columns=None): if index is None: index = RangeIndex(start=0, stop=len(data)) @@ -1030,7 +1030,7 @@ def _init_from_list_like(self, data, index=None, columns=None): ) self._data.label_dtype = getattr(columns, "dtype", None) - @_cudf_nvtx_annotate + @_performance_tracking def _init_from_dict_like( self, data, index=None, columns=None, nan_as_null=None ): @@ -1119,7 +1119,7 @@ def _from_data( return out @staticmethod - @_cudf_nvtx_annotate + @_performance_tracking def _align_input_series_indices(data, index): input_series = [ Series(val) @@ -1187,7 +1187,7 @@ def deserialize(cls, header, frames): return obj @property - @_cudf_nvtx_annotate + @_performance_tracking def shape(self): """Returns a tuple representing the dimensionality of the DataFrame.""" return self._num_rows, self._num_columns @@ -1270,7 +1270,7 @@ def __setattr__(self, key, col): else: super().__setattr__(key, col) - @_cudf_nvtx_annotate + @_performance_tracking 
def __getitem__(self, arg): """ If *arg* is a ``str`` or ``int`` type, return the column Series. @@ -1364,7 +1364,7 @@ def __getitem__(self, arg): f"__getitem__ on type {type(arg)} is not supported" ) - @_cudf_nvtx_annotate + @_performance_tracking def __setitem__(self, arg, value): """Add/set column by *arg or DataFrame*""" if isinstance(arg, DataFrame): @@ -1482,7 +1482,7 @@ def __setitem__(self, arg, value): def __delitem__(self, name): self._drop_column(name) - @_cudf_nvtx_annotate + @_performance_tracking def memory_usage(self, index=True, deep=False): mem_usage = [col.memory_usage for col in self._data.columns] names = [str(name) for name in self._data.names] @@ -1494,7 +1494,7 @@ def memory_usage(self, index=True, deep=False): index=as_index(names), ) - @_cudf_nvtx_annotate + @_performance_tracking def __array_function__(self, func, types, args, kwargs): if "out" in kwargs or not all( issubclass(t, (Series, DataFrame)) for t in types @@ -1528,7 +1528,7 @@ def __array_function__(self, func, types, args, kwargs): return NotImplemented # The _get_numeric_data method is necessary for dask compatibility. - @_cudf_nvtx_annotate + @_performance_tracking def _get_numeric_data(self): """Return a dataframe with only numeric data types""" columns = [ @@ -1538,7 +1538,7 @@ def _get_numeric_data(self): ] return self[columns] - @_cudf_nvtx_annotate + @_performance_tracking def assign(self, **kwargs: Callable[[Self], Any] | Any): """ Assign columns to DataFrame from keyword arguments. @@ -1571,7 +1571,7 @@ def assign(self, **kwargs: Callable[[Self], Any] | Any): return new_df @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def _concat( cls, objs, axis=0, join="outer", ignore_index=False, sort=False ): @@ -1963,12 +1963,12 @@ def _get_renderable_dataframe(self): return output - @_cudf_nvtx_annotate + @_performance_tracking def __repr__(self): output = self._get_renderable_dataframe() return self._clean_renderable_dataframe(output) - @_cudf_nvtx_annotate + @_performance_tracking def _repr_html_(self): lines = ( self._get_renderable_dataframe() @@ -1984,7 +1984,7 @@ def _repr_html_(self): lines.append("") return "\n".join(lines) - @_cudf_nvtx_annotate + @_performance_tracking def _repr_latex_(self): return self._get_renderable_dataframe().to_pandas()._repr_latex_() @@ -2098,7 +2098,7 @@ def _make_operands_and_index_for_binop( return operands, index, can_use_self_column_name @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def from_dict( cls, data: dict, @@ -2233,7 +2233,7 @@ def from_dict( f"parameter. Got '{orient}' instead" ) - @_cudf_nvtx_annotate + @_performance_tracking def to_dict( self, orient: str = "dict", @@ -2354,7 +2354,7 @@ def to_dict( return self.to_pandas().to_dict(orient=orient, into=into) - @_cudf_nvtx_annotate + @_performance_tracking def scatter_by_map( self, map_index, map_size=None, keep_index=True, debug: bool = False ): @@ -2447,7 +2447,7 @@ def scatter_by_map( return result - @_cudf_nvtx_annotate + @_performance_tracking def update( self, other, @@ -2542,23 +2542,23 @@ def update( self._mimic_inplace(source_df, inplace=True) - @_cudf_nvtx_annotate + @_performance_tracking def __iter__(self): return iter(self._column_names) - @_cudf_nvtx_annotate + @_performance_tracking def __contains__(self, item): # This must check against containment in the pandas Index and not # self._column_names to handle NA, None, nan, etc. correctly. 
return item in self._data.to_pandas_index() - @_cudf_nvtx_annotate + @_performance_tracking def items(self): """Iterate over column names and series pairs""" for k in self: yield (k, self[k]) - @_cudf_nvtx_annotate + @_performance_tracking def equals(self, other) -> bool: ret = super().equals(other) # If all other checks matched, validate names. @@ -2591,13 +2591,13 @@ def at(self): "index is absolutely necessary. For checking if the columns are a " "MultiIndex, use _data.multiindex." ) - @_cudf_nvtx_annotate + @_performance_tracking def columns(self): """Returns a tuple of columns""" return self._data.to_pandas_index() @columns.setter # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def columns(self, columns): multiindex = False rangeindex = False @@ -2665,7 +2665,7 @@ def _set_columns_like(self, other: ColumnAccessor) -> None: verify=False, ) - @_cudf_nvtx_annotate + @_performance_tracking def reindex( self, labels=None, @@ -2813,7 +2813,7 @@ def reindex( fill_value=fill_value, ) - @_cudf_nvtx_annotate + @_performance_tracking def set_index( self, keys, @@ -2980,7 +2980,7 @@ def set_index( df.index = idx return df if not inplace else None - @_cudf_nvtx_annotate + @_performance_tracking def fillna( self, value=None, method=None, axis=None, inplace=False, limit=None ): # noqa: D102 @@ -3006,7 +3006,7 @@ def fillna( value=value, method=method, axis=axis, inplace=inplace, limit=limit ) - @_cudf_nvtx_annotate + @_performance_tracking def where(self, cond, other=None, inplace=False): from cudf.core._internals.where import ( _check_and_cast_columns_with_other, @@ -3163,7 +3163,7 @@ def reset_index( inplace=inplace, ) - @_cudf_nvtx_annotate + @_performance_tracking def insert(self, loc, name, value, nan_as_null=no_default): """Add a column to DataFrame at the index specified by loc. @@ -3189,7 +3189,7 @@ def insert(self, loc, name, value, nan_as_null=no_default): ignore_index=False, ) - @_cudf_nvtx_annotate + @_performance_tracking def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): """ Same as `insert`, with additional `ignore_index` param. @@ -3271,7 +3271,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): self._data.insert(name, value, loc=loc) @property # type:ignore - @_cudf_nvtx_annotate + @_performance_tracking def axes(self): """ Return a list representing the axes of the DataFrame. 
@@ -3363,7 +3363,7 @@ def diff(self, periods=1, axis=0): return self - self.shift(periods=periods) - @_cudf_nvtx_annotate + @_performance_tracking def drop_duplicates( self, subset=None, @@ -3451,14 +3451,14 @@ def drop_duplicates( return self._mimic_inplace(outdf, inplace=inplace) - @_cudf_nvtx_annotate + @_performance_tracking def pop(self, item): """Return a column and drop it from the DataFrame.""" popped = self[item] del self[item] return popped - @_cudf_nvtx_annotate + @_performance_tracking def rename( self, mapper=None, @@ -3616,7 +3616,7 @@ def rename( return result - @_cudf_nvtx_annotate + @_performance_tracking def add_prefix(self, prefix): # TODO: Change to deep=False when copy-on-write is default out = self.copy(deep=True) @@ -3625,7 +3625,7 @@ def add_prefix(self, prefix): ] return out - @_cudf_nvtx_annotate + @_performance_tracking def add_suffix(self, suffix): # TODO: Change to deep=False when copy-on-write is default out = self.copy(deep=True) @@ -3634,7 +3634,7 @@ def add_suffix(self, suffix): ] return out - @_cudf_nvtx_annotate + @_performance_tracking def agg(self, aggs, axis=None): """ Aggregate using one or more operations over the specified axis. @@ -3770,7 +3770,7 @@ def agg(self, aggs, axis=None): else: raise ValueError("argument must be a string, list or dict") - @_cudf_nvtx_annotate + @_performance_tracking def nlargest(self, n, columns, keep="first"): """Return the first *n* rows ordered by *columns* in descending order. @@ -3910,7 +3910,7 @@ def nsmallest(self, n, columns, keep="first"): """ return self._n_largest_or_smallest(False, n, columns, keep) - @_cudf_nvtx_annotate + @_performance_tracking def swaplevel(self, i=-2, j=-1, axis=0): """ Swap level i with level j. @@ -3977,7 +3977,7 @@ def swaplevel(self, i=-2, j=-1, axis=0): return result - @_cudf_nvtx_annotate + @_performance_tracking def transpose(self): """Transpose index and columns. @@ -4041,7 +4041,7 @@ def transpose(self): T = property(transpose, doc=transpose.__doc__) - @_cudf_nvtx_annotate + @_performance_tracking def melt(self, **kwargs): """Unpivots a DataFrame from wide format to long format, optionally leaving identifier variables set. @@ -4071,7 +4071,7 @@ def melt(self, **kwargs): return melt(self, **kwargs) - @_cudf_nvtx_annotate + @_performance_tracking def merge( self, right, @@ -4224,7 +4224,7 @@ def merge( suffixes=suffixes, ).perform_merge() - @_cudf_nvtx_annotate + @_performance_tracking def join( self, other, @@ -4273,7 +4273,7 @@ def join( ) return df - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( groupby_doc_template.format( ret=textwrap.dedent( @@ -4407,7 +4407,7 @@ def query(self, expr, local_dict=None): BooleanMask.from_column_unchecked(boolmask) ) - @_cudf_nvtx_annotate + @_performance_tracking def apply( self, func, axis=1, raw=False, result_type=None, args=(), **kwargs ): @@ -4691,7 +4691,7 @@ def _func(x): # pragma: no cover return DataFrame._from_data(result, index=self.index) - @_cudf_nvtx_annotate + @_performance_tracking @applyutils.doc_apply() def apply_rows( self, @@ -4770,7 +4770,7 @@ def apply_rows( cache_key=cache_key, ) - @_cudf_nvtx_annotate + @_performance_tracking @applyutils.doc_applychunks() def apply_chunks( self, @@ -4837,7 +4837,7 @@ def apply_chunks( tpb=tpb, ) - @_cudf_nvtx_annotate + @_performance_tracking def partition_by_hash(self, columns, nparts, keep_index=True): """Partition the dataframe by the hashed value of data in *columns*. 
@@ -5181,7 +5181,7 @@ def _sizeof_fmt(num, size_qualifier): cudf.utils.ioutils.buffer_write_lines(buf, lines) - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_describe() def describe( self, @@ -5243,7 +5243,7 @@ def describe( ) return res - @_cudf_nvtx_annotate + @_performance_tracking def to_pandas( self, *, nullable: bool = False, arrow_type: bool = False ) -> pd.DataFrame: @@ -5333,7 +5333,7 @@ def to_pandas( return out_df @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def from_pandas(cls, dataframe, nan_as_null=no_default): """ Convert from a Pandas DataFrame. @@ -5406,7 +5406,7 @@ def from_pandas(cls, dataframe, nan_as_null=no_default): ) @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def from_arrow(cls, table): """ Convert from PyArrow Table to DataFrame. @@ -5492,7 +5492,7 @@ def from_arrow(cls, table): return out - @_cudf_nvtx_annotate + @_performance_tracking def to_arrow(self, preserve_index=None): """ Convert to a PyArrow Table. @@ -5582,7 +5582,7 @@ def to_arrow(self, preserve_index=None): return out.replace_schema_metadata(metadata) - @_cudf_nvtx_annotate + @_performance_tracking def to_records(self, index=True): """Convert to a numpy recarray @@ -5606,7 +5606,7 @@ def to_records(self, index=True): return ret @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def from_records(cls, data, index=None, columns=None, nan_as_null=False): """ Convert structured or record ndarray to DataFrame. @@ -5685,7 +5685,7 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): return df @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): """Convert a numpy/cupy array to DataFrame. @@ -5763,7 +5763,7 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): index=index, ) - @_cudf_nvtx_annotate + @_performance_tracking def interpolate( self, method="linear", @@ -5793,7 +5793,7 @@ def interpolate( **kwargs, ) - @_cudf_nvtx_annotate + @_performance_tracking def quantile( self, q=0.5, @@ -5936,7 +5936,7 @@ def quantile( result.index = cudf.Index(list(map(float, qs)), dtype="float64") return result - @_cudf_nvtx_annotate + @_performance_tracking def isin(self, values): """ Whether each element in the DataFrame is contained in values. @@ -6080,7 +6080,7 @@ def make_false_column_like_self(): # # Stats # - @_cudf_nvtx_annotate + @_performance_tracking def _prepare_for_rowwise_op(self, method, skipna, numeric_only): """Prepare a DataFrame for CuPy-based row-wise operations.""" @@ -6132,7 +6132,7 @@ def _prepare_for_rowwise_op(self, method, skipna, numeric_only): coerced = coerced.astype("int64", copy=False) return coerced, mask, common_dtype - @_cudf_nvtx_annotate + @_performance_tracking def count(self, axis=0, numeric_only=False): """ Count ``non-NA`` cells for each column or row. @@ -6184,7 +6184,7 @@ def count(self, axis=0, numeric_only=False): "columns": 1, } - @_cudf_nvtx_annotate + @_performance_tracking def _reduce( self, op, @@ -6308,7 +6308,7 @@ def _reduce( else: raise ValueError(f"Invalid value of {axis=} received for {op}") - @_cudf_nvtx_annotate + @_performance_tracking def _scan( self, op, @@ -6325,7 +6325,7 @@ def _scan( elif axis == 1: return self._apply_cupy_method_axis_1(op, **kwargs) - @_cudf_nvtx_annotate + @_performance_tracking def mode(self, axis=0, numeric_only=False, dropna=True): """ Get the mode(s) of each element along the selected axis. 
@@ -6432,17 +6432,17 @@ def mode(self, axis=0, numeric_only=False, dropna=True): return df - @_cudf_nvtx_annotate + @_performance_tracking def all(self, axis=0, bool_only=None, skipna=True, **kwargs): obj = self.select_dtypes(include="bool") if bool_only else self return super(DataFrame, obj).all(axis, skipna, **kwargs) - @_cudf_nvtx_annotate + @_performance_tracking def any(self, axis=0, bool_only=None, skipna=True, **kwargs): obj = self.select_dtypes(include="bool") if bool_only else self return super(DataFrame, obj).any(axis, skipna, **kwargs) - @_cudf_nvtx_annotate + @_performance_tracking def _apply_cupy_method_axis_1(self, method, *args, **kwargs): # This method uses cupy to perform scans and reductions along rows of a # DataFrame. Since cuDF is designed around columnar storage and @@ -6542,7 +6542,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): result_df._set_columns_like(prepared._data) return result_df - @_cudf_nvtx_annotate + @_performance_tracking def _columns_view(self, columns): """ Return a subset of the DataFrame's columns as a view. @@ -6551,7 +6551,7 @@ def _columns_view(self, columns): {col: self._data[col] for col in columns}, index=self.index ) - @_cudf_nvtx_annotate + @_performance_tracking def select_dtypes(self, include=None, exclude=None): """Return a subset of the DataFrame's columns based on the column dtypes. @@ -6816,7 +6816,7 @@ def to_orc( index=index, ) - @_cudf_nvtx_annotate + @_performance_tracking def stack(self, level=-1, dropna=no_default, future_stack=False): """Stack the prescribed level(s) from columns to index @@ -7161,7 +7161,7 @@ def unnamed_group_generator(): else: return result - @_cudf_nvtx_annotate + @_performance_tracking def cov(self, **kwargs): """Compute the covariance matrix of a DataFrame. @@ -7216,7 +7216,7 @@ def corr(self, method="pearson", min_periods=None): df._set_columns_like(self._data) return df - @_cudf_nvtx_annotate + @_performance_tracking def to_struct(self, name=None): """ Return a struct Series composed of the columns of the DataFrame. @@ -7250,7 +7250,7 @@ def to_struct(self, name=None): name=name, ) - @_cudf_nvtx_annotate + @_performance_tracking def keys(self): """ Get the columns. @@ -7310,14 +7310,14 @@ def iterrows(self): "if you wish to iterate over each row." ) - @_cudf_nvtx_annotate + @_performance_tracking @copy_docstring(reshape.pivot) def pivot(self, *, columns, index=no_default, values=no_default): return cudf.core.reshape.pivot( self, index=index, columns=columns, values=values ) - @_cudf_nvtx_annotate + @_performance_tracking @copy_docstring(reshape.pivot_table) def pivot_table( self, @@ -7346,14 +7346,14 @@ def pivot_table( sort=sort, ) - @_cudf_nvtx_annotate + @_performance_tracking @copy_docstring(reshape.unstack) def unstack(self, level=-1, fill_value=None): return cudf.core.reshape.unstack( self, level=level, fill_value=fill_value ) - @_cudf_nvtx_annotate + @_performance_tracking def explode(self, column, ignore_index=False): """ Transform each element of a list-like to a row, replicating index @@ -7549,7 +7549,7 @@ def _from_columns_like_self( result._set_columns_like(self._data) return result - @_cudf_nvtx_annotate + @_performance_tracking def interleave_columns(self): """ Interleave Series columns of a table into a single column. 
@@ -7597,7 +7597,7 @@ def interleave_columns(self): {None: libcudf.reshape.interleave_columns([*self._columns])} ) - @_cudf_nvtx_annotate + @_performance_tracking def eval(self, expr: str, inplace: bool = False, **kwargs): """Evaluate a string describing operations on DataFrame columns. @@ -7953,7 +7953,7 @@ def func(left, right, output): ) -@_cudf_nvtx_annotate +@_performance_tracking def from_pandas(obj, nan_as_null=no_default): """ Convert certain Pandas objects into the cudf equivalent. @@ -8080,7 +8080,7 @@ def from_pandas(obj, nan_as_null=no_default): ) -@_cudf_nvtx_annotate +@_performance_tracking def merge(left, right, *args, **kwargs): if isinstance(left, Series): left = left.to_frame() diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 8ca71180c00..9bac75dc6ac 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -32,7 +32,7 @@ from cudf.core.mixins import BinaryOperand, Scannable from cudf.utils import ioutils from cudf.utils.dtypes import find_common_type -from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import _array_ufunc, _warn_no_dask_cudf if TYPE_CHECKING: @@ -86,7 +86,7 @@ def _dtypes(self) -> abc.Iterable: def ndim(self) -> int: raise NotImplementedError() - @_cudf_nvtx_annotate + @_performance_tracking def serialize(self): # TODO: See if self._data can be serialized outright header = { @@ -101,7 +101,7 @@ def serialize(self): return header, frames @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def deserialize(cls, header, frames): cls_deserialize = pickle.loads(header["type-serialized"]) column_names = pickle.loads(header["column_names"]) @@ -122,7 +122,7 @@ def deserialize(cls, header, frames): return cls_deserialize._from_data(col_accessor) @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def _from_data(cls, data: MutableMapping) -> Self: """ Construct cls from a ColumnAccessor-like mapping. @@ -131,7 +131,7 @@ def _from_data(cls, data: MutableMapping) -> Self: Frame.__init__(obj, data) return obj - @_cudf_nvtx_annotate + @_performance_tracking def _from_data_like_self(self, data: MutableMapping) -> Self: """ Return type(self) from a ColumnAccessor-like mapping but @@ -139,7 +139,7 @@ def _from_data_like_self(self, data: MutableMapping) -> Self: """ return self._from_data(data) - @_cudf_nvtx_annotate + @_performance_tracking def _from_columns_like_self( self, columns: list[ColumnBase], @@ -155,7 +155,7 @@ def _from_columns_like_self( frame = self.__class__._from_data(data) return frame._copy_type_metadata(self) - @_cudf_nvtx_annotate + @_performance_tracking def _mimic_inplace( self, result: Self, inplace: bool = False ) -> Self | None: @@ -171,7 +171,7 @@ def _mimic_inplace( return result @property - @_cudf_nvtx_annotate + @_performance_tracking def size(self) -> int: """ Return the number of elements in the underlying data. 
@@ -263,11 +263,11 @@ def memory_usage(self, deep=False): """ raise NotImplementedError - @_cudf_nvtx_annotate + @_performance_tracking def __len__(self) -> int: return self._num_rows - @_cudf_nvtx_annotate + @_performance_tracking def astype(self, dtype: dict[Any, Dtype], copy: bool = False) -> Self: casted = ( col.astype(dtype.get(col_name, col.dtype), copy=copy) @@ -276,7 +276,7 @@ def astype(self, dtype: dict[Any, Dtype], copy: bool = False) -> Self: ca = self._data._from_columns_like_self(casted, verify=False) return self._from_data_like_self(ca) - @_cudf_nvtx_annotate + @_performance_tracking def equals(self, other) -> bool: """ Test whether two objects contain the same elements. @@ -347,7 +347,7 @@ def equals(self, other) -> bool: ) ) - @_cudf_nvtx_annotate + @_performance_tracking def _get_columns_by_label(self, labels) -> Self: """ Returns columns of the Frame specified by `labels`. @@ -357,7 +357,7 @@ def _get_columns_by_label(self, labels) -> Self: return self._from_data_like_self(self._data.select_by_label(labels)) @property - @_cudf_nvtx_annotate + @_performance_tracking def values(self) -> cupy.ndarray: """ Return a CuPy representation of the DataFrame. @@ -373,7 +373,7 @@ def values(self) -> cupy.ndarray: return self.to_cupy() @property - @_cudf_nvtx_annotate + @_performance_tracking def values_host(self) -> np.ndarray: """ Return a NumPy representation of the data. @@ -388,7 +388,7 @@ def values_host(self) -> np.ndarray: """ return self.to_numpy() - @_cudf_nvtx_annotate + @_performance_tracking def __array__(self, dtype=None): raise TypeError( "Implicit conversion to a host NumPy array via __array__ is not " @@ -397,14 +397,14 @@ def __array__(self, dtype=None): "using .to_numpy()." ) - @_cudf_nvtx_annotate + @_performance_tracking def __arrow_array__(self, type=None): raise TypeError( "Implicit conversion to a host PyArrow object via __arrow_array__ " "is not allowed. Consider using .to_arrow()" ) - @_cudf_nvtx_annotate + @_performance_tracking def _to_array( self, get_array: Callable, @@ -468,7 +468,7 @@ def to_array( # particular, we need to benchmark how much of the overhead is coming from # (potentially unavoidable) local copies in to_cupy and how much comes from # inefficiencies in the implementation. - @_cudf_nvtx_annotate + @_performance_tracking def to_cupy( self, dtype: Dtype | None = None, @@ -502,7 +502,7 @@ def to_cupy( na_value, ) - @_cudf_nvtx_annotate + @_performance_tracking def to_numpy( self, dtype: Dtype | None = None, @@ -537,7 +537,7 @@ def to_numpy( lambda col: col.values_host, numpy, copy, dtype, na_value ) - @_cudf_nvtx_annotate + @_performance_tracking def where(self, cond, other=None, inplace: bool = False) -> Self | None: """ Replace values where the condition is False. 
@@ -610,7 +610,7 @@ def where(self, cond, other=None, inplace: bool = False) -> Self | None: """ raise NotImplementedError - @_cudf_nvtx_annotate + @_performance_tracking def fillna( self, value: None | ScalarLike | cudf.Series = None, @@ -767,14 +767,14 @@ def fillna( inplace=inplace, ) - @_cudf_nvtx_annotate + @_performance_tracking def _drop_column(self, name): """Drop a column by *name*""" if name not in self._data: raise KeyError(f"column '{name}' does not exist") del self._data[name] - @_cudf_nvtx_annotate + @_performance_tracking def _quantile_table( self, q: float, @@ -808,7 +808,7 @@ def _quantile_table( ) @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def from_arrow(cls, data: pa.Table) -> Self: """Convert from PyArrow Table to Frame @@ -968,7 +968,7 @@ def from_arrow(cls, data: pa.Table) -> Self: return cls._from_data({name: result[name] for name in column_names}) - @_cudf_nvtx_annotate + @_performance_tracking def to_arrow(self): """ Convert to arrow Table @@ -992,7 +992,7 @@ def to_arrow(self): {str(name): col.to_arrow() for name, col in self._data.items()} ) - @_cudf_nvtx_annotate + @_performance_tracking def _positions_from_column_names(self, column_names) -> list[int]: """Map each column name into their positions in the frame. @@ -1005,7 +1005,7 @@ def _positions_from_column_names(self, column_names) -> list[int]: if name in set(column_names) ] - @_cudf_nvtx_annotate + @_performance_tracking def _copy_type_metadata(self: Self, other: Self) -> Self: """ Copy type metadata from each column of `other` to the corresponding @@ -1020,7 +1020,7 @@ def _copy_type_metadata(self: Self, other: Self) -> Self: return self - @_cudf_nvtx_annotate + @_performance_tracking def isna(self): """ Identify missing values. @@ -1101,7 +1101,7 @@ def isna(self): # Alias for isna isnull = isna - @_cudf_nvtx_annotate + @_performance_tracking def notna(self): """ Identify non-missing values. @@ -1182,7 +1182,7 @@ def notna(self): # Alias for notna notnull = notna - @_cudf_nvtx_annotate + @_performance_tracking def searchsorted( self, values, @@ -1296,7 +1296,7 @@ def searchsorted( else: return result - @_cudf_nvtx_annotate + @_performance_tracking def argsort( self, by=None, @@ -1383,7 +1383,7 @@ def argsort( by=by, ascending=ascending, na_position=na_position ).values - @_cudf_nvtx_annotate + @_performance_tracking def _get_sorted_inds( self, by=None, @@ -1411,7 +1411,7 @@ def _get_sorted_inds( stable=True, ) - @_cudf_nvtx_annotate + @_performance_tracking def _split(self, splits): """Split a frame with split points in ``splits``. Returns a list of Frames of length `len(splits) + 1`. 
@@ -1426,13 +1426,13 @@ def _split(self, splits): for split_idx in range(len(splits) + 1) ] - @_cudf_nvtx_annotate + @_performance_tracking def _encode(self): columns, indices = libcudf.transform.table_encode([*self._columns]) keys = self._from_columns_like_self(columns) return keys, indices - @_cudf_nvtx_annotate + @_performance_tracking def _unaryop(self, op): data_columns = (col.unary_operator(op) for col in self._columns) return self._from_data_like_self( @@ -1440,7 +1440,7 @@ def _unaryop(self, op): ) @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def _colwise_binop( cls, operands: dict[str | None, tuple[ColumnBase, Any, bool, Any]], @@ -1519,11 +1519,11 @@ def _colwise_binop( return output - @_cudf_nvtx_annotate + @_performance_tracking def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): return _array_ufunc(self, ufunc, method, inputs, kwargs) - @_cudf_nvtx_annotate + @_performance_tracking @acquire_spill_lock() def _apply_cupy_ufunc_to_operands( self, ufunc, cupy_func, operands, **kwargs @@ -1565,7 +1565,7 @@ def _apply_cupy_ufunc_to_operands( return data # Unary logical operators - @_cudf_nvtx_annotate + @_performance_tracking def __neg__(self): """Negate for integral dtypes, logical NOT for bools.""" return self._from_data_like_self( @@ -1579,30 +1579,30 @@ def __neg__(self): ) ) - @_cudf_nvtx_annotate + @_performance_tracking def __pos__(self): return self.copy(deep=True) - @_cudf_nvtx_annotate + @_performance_tracking def __abs__(self): return self._unaryop("abs") # Reductions @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def _get_axis_from_axis_arg(cls, axis): try: return cls._SUPPORT_AXIS_LOOKUP[axis] except KeyError: raise ValueError(f"No axis named {axis} for object type {cls}") - @_cudf_nvtx_annotate + @_performance_tracking def _reduce(self, *args, **kwargs): raise NotImplementedError( f"Reductions are not supported for objects of type {type(self)}." ) - @_cudf_nvtx_annotate + @_performance_tracking def min( self, axis=0, @@ -1653,7 +1653,7 @@ def min( **kwargs, ) - @_cudf_nvtx_annotate + @_performance_tracking def max( self, axis=0, @@ -1701,7 +1701,7 @@ def max( **kwargs, ) - @_cudf_nvtx_annotate + @_performance_tracking def all(self, axis=0, skipna=True, **kwargs): """ Return whether all elements are True in DataFrame. @@ -1754,7 +1754,7 @@ def all(self, axis=0, skipna=True, **kwargs): **kwargs, ) - @_cudf_nvtx_annotate + @_performance_tracking def any(self, axis=0, skipna=True, **kwargs): """ Return whether any elements is True in DataFrame. 
@@ -1807,26 +1807,26 @@ def any(self, axis=0, skipna=True, **kwargs): **kwargs, ) - @_cudf_nvtx_annotate + @_performance_tracking @ioutils.doc_to_dlpack() def to_dlpack(self): """{docstring}""" return cudf.io.dlpack.to_dlpack(self) - @_cudf_nvtx_annotate + @_performance_tracking def __str__(self): return repr(self) - @_cudf_nvtx_annotate + @_performance_tracking def __deepcopy__(self, memo): return self.copy(deep=True) - @_cudf_nvtx_annotate + @_performance_tracking def __copy__(self): return self.copy(deep=False) - @_cudf_nvtx_annotate + @_performance_tracking def __invert__(self): """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" return self._from_data_like_self( @@ -1835,7 +1835,7 @@ def __invert__(self): ) ) - @_cudf_nvtx_annotate + @_performance_tracking def nunique(self, dropna: bool = True): """ Returns a per column mapping with counts of unique values for @@ -1856,7 +1856,7 @@ def nunique(self, dropna: bool = True): ) @staticmethod - @_cudf_nvtx_annotate + @_performance_tracking def _repeat( columns: list[ColumnBase], repeats, axis=None ) -> list[ColumnBase]: @@ -1870,7 +1870,7 @@ def _repeat( return libcudf.filling.repeat(columns, repeats) - @_cudf_nvtx_annotate + @_performance_tracking @_warn_no_dask_cudf def __dask_tokenize__(self): from dask.base import normalize_token diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 77b54a583d3..eccb3acabf6 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -31,7 +31,7 @@ from cudf.core.mixins import Reducible, Scannable from cudf.core.multiindex import MultiIndex from cudf.core.udf.groupby_utils import _can_be_jitted, jit_groupby_apply -from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import GetAttrGetItemMixin if TYPE_CHECKING: @@ -392,7 +392,7 @@ def indices(self): zip(index.to_pandas(), cp.split(indices.values, offsets[1:-1])) ) - @_cudf_nvtx_annotate + @_performance_tracking def get_group(self, name, obj=None): """ Construct DataFrame from group with provided name. @@ -436,7 +436,7 @@ def get_group(self, name, obj=None): ) return obj.iloc[self.indices[name]] - @_cudf_nvtx_annotate + @_performance_tracking def size(self): """ Return the size of each group. @@ -451,7 +451,7 @@ def size(self): .agg("size") ) - @_cudf_nvtx_annotate + @_performance_tracking def cumcount(self): """ Return the cumulative count of keys in each group. @@ -467,7 +467,7 @@ def cumcount(self): .agg("cumcount") ) - @_cudf_nvtx_annotate + @_performance_tracking def rank( self, method="average", @@ -521,7 +521,7 @@ def _groupby(self): [*self.grouping.keys._columns], dropna=self._dropna ) - @_cudf_nvtx_annotate + @_performance_tracking def agg(self, func): """ Apply aggregation(s) to the groups. 
@@ -821,7 +821,7 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool): else: return result - @_cudf_nvtx_annotate + @_performance_tracking def head(self, n: int = 5, *, preserve_order: bool = True): """Return first n rows of each group @@ -874,7 +874,7 @@ def head(self, n: int = 5, *, preserve_order: bool = True): n, take_head=True, preserve_order=preserve_order ) - @_cudf_nvtx_annotate + @_performance_tracking def tail(self, n: int = 5, *, preserve_order: bool = True): """Return last n rows of each group @@ -928,7 +928,7 @@ def tail(self, n: int = 5, *, preserve_order: bool = True): n, take_head=False, preserve_order=preserve_order ) - @_cudf_nvtx_annotate + @_performance_tracking def nth(self, n): """ Return the nth row from each group. @@ -949,7 +949,7 @@ def nth(self, n): del self.obj._data["__groupbynth_order__"] return result - @_cudf_nvtx_annotate + @_performance_tracking def ngroup(self, ascending=True): """ Number each group from 0 to the number of groups - 1. @@ -1261,7 +1261,7 @@ def _normalize_aggs( ] return column_names, columns, normalized_aggs - @_cudf_nvtx_annotate + @_performance_tracking def pipe(self, func, *args, **kwargs): """ Apply a function `func` with arguments to this GroupBy @@ -1316,7 +1316,7 @@ def pipe(self, func, *args, **kwargs): """ return cudf.core.common.pipe(self, func, *args, **kwargs) - @_cudf_nvtx_annotate + @_performance_tracking def _jit_groupby_apply( self, function, group_names, offsets, group_keys, grouped_values, *args ): @@ -1327,7 +1327,7 @@ def _jit_groupby_apply( chunk_results, group_names, group_keys, grouped_values ) - @_cudf_nvtx_annotate + @_performance_tracking def _iterative_groupby_apply( self, function, group_names, offsets, group_keys, grouped_values, *args ): @@ -1415,7 +1415,7 @@ def _post_process_chunk_results( result.index = cudf.MultiIndex._from_data(index_data) return result - @_cudf_nvtx_annotate + @_performance_tracking def apply( self, function, *args, engine="auto", include_groups: bool = True ): @@ -1573,7 +1573,7 @@ def mult(df): result = result.reset_index() return result - @_cudf_nvtx_annotate + @_performance_tracking def apply_grouped(self, function, **kwargs): """Apply a transformation function over the grouped chunk. @@ -1712,7 +1712,7 @@ def rolling_avg(val, avg): kwargs.update({"chunks": offsets}) return grouped_values.apply_chunks(function, **kwargs) - @_cudf_nvtx_annotate + @_performance_tracking def _broadcast(self, values): """ Broadcast the results of an aggregation to the group @@ -1736,7 +1736,7 @@ def _broadcast(self, values): values.index = self.obj.index return values - @_cudf_nvtx_annotate + @_performance_tracking def transform(self, function): """Apply an aggregation, then broadcast the result to the group size. @@ -1801,7 +1801,7 @@ def rolling(self, *args, **kwargs): """ return cudf.core.window.rolling.RollingGroupby(self, *args, **kwargs) - @_cudf_nvtx_annotate + @_performance_tracking def count(self, dropna=True): """Compute the number of values in each column. @@ -1816,7 +1816,7 @@ def func(x): return self.agg(func) - @_cudf_nvtx_annotate + @_performance_tracking def describe(self, include=None, exclude=None): """ Generate descriptive statistics that summarizes the central tendency, @@ -1888,7 +1888,7 @@ def describe(self, include=None, exclude=None): ) return res - @_cudf_nvtx_annotate + @_performance_tracking def corr(self, method="pearson", min_periods=1): """ Compute pairwise correlation of columns, excluding NA/null values. 
@@ -1950,7 +1950,7 @@ def corr(self, method="pearson", min_periods=1): lambda x: x.corr(method, min_periods), "Correlation" ) - @_cudf_nvtx_annotate + @_performance_tracking def cov(self, min_periods=0, ddof=1): """ Compute the pairwise covariance among the columns of a DataFrame, @@ -2129,7 +2129,7 @@ def _cov_or_corr(self, func, method_name): return res - @_cudf_nvtx_annotate + @_performance_tracking def var(self, ddof=1): """Compute the column-wise variance of the values in each group. @@ -2145,7 +2145,7 @@ def func(x): return self.agg(func) - @_cudf_nvtx_annotate + @_performance_tracking def std(self, ddof=1): """Compute the column-wise std of the values in each group. @@ -2161,7 +2161,7 @@ def func(x): return self.agg(func) - @_cudf_nvtx_annotate + @_performance_tracking def quantile(self, q=0.5, interpolation="linear"): """Compute the column-wise quantiles of the values in each group. @@ -2179,18 +2179,18 @@ def func(x): return self.agg(func) - @_cudf_nvtx_annotate + @_performance_tracking def collect(self): """Get a list of all the values for each column in each group.""" _deprecate_collect() return self.agg(list) - @_cudf_nvtx_annotate + @_performance_tracking def unique(self): """Get a list of the unique values for each column in each group.""" return self.agg("unique") - @_cudf_nvtx_annotate + @_performance_tracking def diff(self, periods=1, axis=0): """Get the difference between the values in each group. @@ -2258,7 +2258,7 @@ def bfill(self, limit=None): return self._scan_fill("bfill", limit) - @_cudf_nvtx_annotate + @_performance_tracking def fillna( self, value=None, @@ -2325,7 +2325,7 @@ def fillna( value=value, inplace=inplace, axis=axis, limit=limit ) - @_cudf_nvtx_annotate + @_performance_tracking def shift(self, periods=1, freq=None, axis=0, fill_value=None): """ Shift each group by ``periods`` positions. @@ -2388,7 +2388,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): result = self._mimic_pandas_order(result) return result._copy_type_metadata(values) - @_cudf_nvtx_annotate + @_performance_tracking def pct_change( self, periods=1, diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 71658695b80..e069f8d0ea6 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -58,7 +58,7 @@ is_mixed_with_object_dtype, numeric_normalize_types, ) -from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import _warn_no_dask_cudf, search_range if TYPE_CHECKING: @@ -204,7 +204,7 @@ class RangeIndex(BaseIndex, BinaryOperand): _range: range - @_cudf_nvtx_annotate + @_performance_tracking def __init__( self, start, stop=None, step=1, dtype=None, copy=False, name=None ): @@ -259,17 +259,17 @@ def factorize(self, sort: bool = False, use_na_sentinel: bool = True): return codes, uniques @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def name(self): return self._name @name.setter # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def name(self, value): self._name = value @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def start(self) -> int: """ The value of the `start` parameter (0 if this was not supplied). @@ -277,7 +277,7 @@ def start(self) -> int: return self._range.start @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def stop(self) -> int: """ The value of the stop parameter. 
@@ -285,7 +285,7 @@ def stop(self) -> int: return self._range.stop @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def step(self) -> int: """ The value of the step parameter. @@ -293,12 +293,12 @@ def step(self) -> int: return self._range.step @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def _num_rows(self) -> int: return len(self) @cached_property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def _values(self): if len(self) > 0: return column.as_column(self._range, dtype=self.dtype) @@ -330,18 +330,18 @@ def _is_interval(self) -> bool: return False @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def hasnans(self) -> bool: return False @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def _data(self): return cudf.core.column_accessor.ColumnAccessor( {self.name: self._values} ) - @_cudf_nvtx_annotate + @_performance_tracking def __contains__(self, item): hash(item) if isinstance(item, bool) or not isinstance( @@ -357,7 +357,7 @@ def __contains__(self, item): except (ValueError, OverflowError): return False - @_cudf_nvtx_annotate + @_performance_tracking def copy(self, name=None, deep=False): """ Make a copy of this object. @@ -377,7 +377,7 @@ def copy(self, name=None, deep=False): return RangeIndex(self._range, name=name) - @_cudf_nvtx_annotate + @_performance_tracking def astype(self, dtype, copy: bool = True): if is_dtype_equal(dtype, self.dtype): return self @@ -386,15 +386,15 @@ def astype(self, dtype, copy: bool = True): def fillna(self, value, downcast=None): return self.copy() - @_cudf_nvtx_annotate + @_performance_tracking def drop_duplicates(self, keep="first"): return self - @_cudf_nvtx_annotate + @_performance_tracking def duplicated(self, keep="first") -> cupy.ndarray: return cupy.zeros(len(self), dtype=bool) - @_cudf_nvtx_annotate + @_performance_tracking def __repr__(self): return ( f"{self.__class__.__name__}(start={self.start}, stop={self.stop}" @@ -408,15 +408,15 @@ def __repr__(self): ) @property - @_cudf_nvtx_annotate + @_performance_tracking def size(self) -> int: return len(self) - @_cudf_nvtx_annotate + @_performance_tracking def __len__(self): return len(self._range) - @_cudf_nvtx_annotate + @_performance_tracking def __getitem__(self, index): if isinstance(index, slice): sl_start, sl_stop, sl_step = index.indices(len(self)) @@ -435,13 +435,13 @@ def __getitem__(self, index): return self.start + index * self.step return self._as_int_index()[index] - @_cudf_nvtx_annotate + @_performance_tracking def equals(self, other) -> bool: if isinstance(other, RangeIndex): return self._range == other._range return self._as_int_index().equals(other) - @_cudf_nvtx_annotate + @_performance_tracking def serialize(self): header = {} header["index_column"] = {} @@ -462,7 +462,7 @@ def serialize(self): return header, frames @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def deserialize(cls, header, frames): h = header["index_column"] name = pickle.loads(header["name"]) @@ -472,7 +472,7 @@ def deserialize(cls, header, frames): return RangeIndex(start=start, stop=stop, step=step, name=name) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def dtype(self): """ `dtype` of the range of values in RangeIndex. 
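One detail worth noting in the hunks above: when the decorator is stacked with `@property` or `@cached_property`, it stays closest to the `def`, because the descriptor has to be built from the already-wrapped getter. A self-contained toy example of that ordering (the no-op `_tracking` stand-in is illustrative only, not the real decorator):

    import functools

    def _tracking(func):
        # Stand-in for a performance-tracking decorator: wraps the raw getter.
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            return func(*args, **kwargs)
        return wrapper

    class Example:
        @property        # outermost: builds the read-only descriptor
        @_tracking       # innermost: wraps the plain getter function
        def step(self) -> int:
            return 2

    assert Example().step == 2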
@@ -487,7 +487,7 @@ def dtype(self): def _dtypes(self) -> Iterable: return [(self.name, self.dtype)] - @_cudf_nvtx_annotate + @_performance_tracking def to_pandas( self, *, nullable: bool = False, arrow_type: bool = False ) -> pd.RangeIndex: @@ -508,16 +508,16 @@ def is_unique(self) -> bool: return True @cached_property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_monotonic_increasing(self) -> bool: return self.step > 0 or len(self) <= 1 @cached_property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_monotonic_decreasing(self): return self.step < 0 or len(self) <= 1 - @_cudf_nvtx_annotate + @_performance_tracking def memory_usage(self, deep: bool = False) -> int: if deep: warnings.warn( @@ -530,7 +530,7 @@ def unique(self) -> Self: # RangeIndex always has unique values return self.copy() - @_cudf_nvtx_annotate + @_performance_tracking def __mul__(self, other): # Multiplication by raw ints must return a RangeIndex to match pandas. if isinstance(other, cudf.Scalar) and other.dtype.kind in "iu": @@ -547,24 +547,24 @@ def __mul__(self, other): ) return self._as_int_index().__mul__(other) - @_cudf_nvtx_annotate + @_performance_tracking def __rmul__(self, other): # Multiplication is commutative. return self.__mul__(other) - @_cudf_nvtx_annotate + @_performance_tracking def _as_int_index(self): # Convert self to an integer index. This method is used to perform ops # that are not defined directly on RangeIndex. return cudf.Index._from_data(self._data) - @_cudf_nvtx_annotate + @_performance_tracking def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): return self._as_int_index().__array_ufunc__( ufunc, method, *inputs, **kwargs ) - @_cudf_nvtx_annotate + @_performance_tracking def get_indexer(self, target, limit=None, method=None, tolerance=None): target_col = cudf.core.column.as_column(target) if method is not None or not isinstance( @@ -594,7 +594,7 @@ def get_indexer(self, target, limit=None, method=None, tolerance=None): locs[valid] = len(self) - 1 - locs[valid] return locs - @_cudf_nvtx_annotate + @_performance_tracking def get_loc(self, key): if not is_scalar(key): raise TypeError("Should be a scalar-like") @@ -608,7 +608,7 @@ def get_loc(self, key): raise KeyError(key) return idx_int - @_cudf_nvtx_annotate + @_performance_tracking def _union(self, other, sort=None): if isinstance(other, RangeIndex): # Variable suffixes are of the @@ -685,7 +685,7 @@ def _union(self, other, sort=None): self._as_int_index()._union(other, sort=sort) ) - @_cudf_nvtx_annotate + @_performance_tracking def _intersection(self, other, sort=None): if not isinstance(other, RangeIndex): return self._try_reconstruct_range_index( @@ -733,7 +733,7 @@ def _intersection(self, other, sort=None): return self._try_reconstruct_range_index(new_index) - @_cudf_nvtx_annotate + @_performance_tracking def difference(self, other, sort=None): if isinstance(other, RangeIndex) and self.equals(other): return self[:0]._get_reconciled_name_object(other) @@ -785,14 +785,14 @@ def sort_values( else: return sorted_index - @_cudf_nvtx_annotate + @_performance_tracking def _gather(self, gather_map, nullify=False, check_bounds=True): gather_map = cudf.core.column.as_column(gather_map) return cudf.Index._from_data( {self.name: self._values.take(gather_map, nullify, check_bounds)} ) - @_cudf_nvtx_annotate + @_performance_tracking def _apply_boolean_mask(self, boolean_mask): return cudf.Index._from_data( {self.name: self._values.apply_boolean_mask(boolean_mask)} @@ -838,21 +838,21 @@ def join( 
) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def _column(self): return self._as_int_index()._column @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def _columns(self): return self._as_int_index()._columns @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def values_host(self) -> np.ndarray: return np.arange(start=self.start, stop=self.stop, step=self.step) - @_cudf_nvtx_annotate + @_performance_tracking def argsort( self, ascending=True, @@ -865,19 +865,19 @@ def argsort( else: return cupy.arange(len(self)) - @_cudf_nvtx_annotate + @_performance_tracking def where(self, cond, other=None, inplace=False): return self._as_int_index().where(cond, other, inplace) - @_cudf_nvtx_annotate + @_performance_tracking def to_numpy(self) -> np.ndarray: return self.values_host - @_cudf_nvtx_annotate + @_performance_tracking def to_cupy(self) -> cupy.ndarray: return self.values - @_cudf_nvtx_annotate + @_performance_tracking def to_arrow(self) -> pa.Array: return pa.array(self._range, type=pa.from_numpy_dtype(self.dtype)) @@ -889,23 +889,23 @@ def __array__(self, dtype=None): "using .to_numpy()." ) - @_cudf_nvtx_annotate + @_performance_tracking def nunique(self, dropna: bool = True) -> int: return len(self) - @_cudf_nvtx_annotate + @_performance_tracking def isna(self) -> cupy.ndarray: return cupy.zeros(len(self), dtype=bool) isnull = isna - @_cudf_nvtx_annotate + @_performance_tracking def notna(self) -> cupy.ndarray: return cupy.ones(len(self), dtype=bool) notnull = isna - @_cudf_nvtx_annotate + @_performance_tracking def _minmax(self, meth: str): no_steps = len(self) - 1 if no_steps == -1: @@ -1004,12 +1004,12 @@ class Index(SingleColumnFrame, BaseIndex, metaclass=IndexMeta): Column's, the data Column will be cloned to adopt this name. 
""" - @_cudf_nvtx_annotate + @_performance_tracking def __init__(self, data, **kwargs): name = _getdefault_name(data, name=kwargs.get("name")) super().__init__({name: data}) - @_cudf_nvtx_annotate + @_performance_tracking def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): ret = super().__array_ufunc__(ufunc, method, *inputs, **kwargs) @@ -1046,7 +1046,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): return NotImplemented @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def _from_data(cls, data: MutableMapping, name: Any = no_default) -> Self: out = super()._from_data(data=data) if name is not no_default: @@ -1054,7 +1054,7 @@ def _from_data(cls, data: MutableMapping, name: Any = no_default) -> Self: return out @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def _from_data_like_self( cls, data: MutableMapping, name: Any = no_default ) -> Self: @@ -1064,7 +1064,7 @@ def _from_data_like_self( return out @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def from_arrow(cls, obj): try: return cls(ColumnBase.from_arrow(obj)) @@ -1118,12 +1118,12 @@ def _binaryop( return ret @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def _values(self): return self._column @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def _concat(cls, objs): non_empties = [index for index in objs if len(index)] if len(objs) != len(non_empties): @@ -1166,16 +1166,16 @@ def _concat(cls, objs): result.name = name return result - @_cudf_nvtx_annotate + @_performance_tracking def memory_usage(self, deep=False): return self._column.memory_usage @cached_property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_unique(self): return self._column.is_unique - @_cudf_nvtx_annotate + @_performance_tracking def equals(self, other) -> bool: if not isinstance(other, BaseIndex) or len(self) != len(other): return False @@ -1198,7 +1198,7 @@ def equals(self, other) -> bool: except TypeError: return False - @_cudf_nvtx_annotate + @_performance_tracking def copy(self, name=None, deep=False): """ Make a copy of this object. @@ -1221,11 +1221,11 @@ def copy(self, name=None, deep=False): {name: self._values.copy(True) if deep else self._values} ) - @_cudf_nvtx_annotate + @_performance_tracking def astype(self, dtype, copy: bool = True): return super().astype({self.name: dtype}, copy) - @_cudf_nvtx_annotate + @_performance_tracking def get_indexer(self, target, method=None, limit=None, tolerance=None): if is_scalar(target): raise TypeError("Should be a sequence") @@ -1297,7 +1297,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): return _return_get_indexer_result(result_series.to_cupy()) - @_cudf_nvtx_annotate + @_performance_tracking def get_loc(self, key): if not is_scalar(key): raise TypeError("Should be a scalar-like") @@ -1333,7 +1333,7 @@ def get_loc(self, key): mask[true_inds] = True return mask - @_cudf_nvtx_annotate + @_performance_tracking def __repr__(self): max_seq_items = pd.get_option("max_seq_items") or len(self) mr = 0 @@ -1419,7 +1419,7 @@ def __repr__(self): lines.append(f"{prior_to_dtype} {keywords})") return "\n".join(lines) - @_cudf_nvtx_annotate + @_performance_tracking def __getitem__(self, index): res = self._get_elements_from_column(index) if isinstance(res, ColumnBase): @@ -1427,20 +1427,20 @@ def __getitem__(self, index): return res @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def dtype(self): """ `dtype` of the underlying values in Index. 
""" return self._values.dtype - @_cudf_nvtx_annotate + @_performance_tracking def isna(self): return self._column.isnull().values isnull = isna - @_cudf_nvtx_annotate + @_performance_tracking def notna(self): return self._column.notnull().values @@ -1470,11 +1470,11 @@ def _is_interval(self): return False @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def hasnans(self): return self._column.has_nulls(include_nan=True) - @_cudf_nvtx_annotate + @_performance_tracking def argsort( self, axis=0, @@ -1518,7 +1518,7 @@ def repeat(self, repeats, axis=None): Frame._repeat([*self._columns], repeats, axis), self._column_names ) - @_cudf_nvtx_annotate + @_performance_tracking def where(self, cond, other=None, inplace=False): result_col = super().where(cond, other, inplace) return self._mimic_inplace( @@ -1615,7 +1615,7 @@ def _indices_of(self, value): @copy_docstring(StringMethods) # type: ignore @property - @_cudf_nvtx_annotate + @_performance_tracking def str(self): if is_string_dtype(self.dtype): return StringMethods(parent=self) @@ -1698,7 +1698,7 @@ class DatetimeIndex(Index): dtype='datetime64[ns]', name='a') """ - @_cudf_nvtx_annotate + @_performance_tracking def __init__( self, data=None, @@ -1761,7 +1761,7 @@ def __init__( ): raise ValueError("No unique frequency found") - @_cudf_nvtx_annotate + @_performance_tracking def _copy_type_metadata(self: Self, other: Self) -> Self: super()._copy_type_metadata(other) self._freq = _validate_freq(other._freq) @@ -1783,7 +1783,7 @@ def __getitem__(self, index): return pd.Timestamp(value) return value - @_cudf_nvtx_annotate + @_performance_tracking def copy(self, name=None, deep=False): idx_copy = super().copy(name=name, deep=deep) return idx_copy._copy_type_metadata(self) @@ -1801,7 +1801,7 @@ def searchsorted( ) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def year(self): """ The year of the datetime. @@ -1820,7 +1820,7 @@ def year(self): return self._get_dt_field("year") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def month(self): """ The month as January=1, December=12. @@ -1839,7 +1839,7 @@ def month(self): return self._get_dt_field("month") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def day(self): """ The day of the datetime. @@ -1858,7 +1858,7 @@ def day(self): return self._get_dt_field("day") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def hour(self): """ The hours of the datetime. @@ -1879,7 +1879,7 @@ def hour(self): return self._get_dt_field("hour") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def minute(self): """ The minutes of the datetime. @@ -1900,7 +1900,7 @@ def minute(self): return self._get_dt_field("minute") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def second(self): """ The seconds of the datetime. @@ -1921,7 +1921,7 @@ def second(self): return self._get_dt_field("second") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def microsecond(self): """ The microseconds of the datetime. @@ -1952,7 +1952,7 @@ def microsecond(self): ) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def nanosecond(self): """ The nanoseconds of the datetime. @@ -1974,7 +1974,7 @@ def nanosecond(self): return self._get_dt_field("nanosecond") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def weekday(self): """ The day of the week with Monday=0, Sunday=6. 
@@ -1996,7 +1996,7 @@ def weekday(self): return self._get_dt_field("weekday") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def dayofweek(self): """ The day of the week with Monday=0, Sunday=6. @@ -2018,7 +2018,7 @@ def dayofweek(self): return self._get_dt_field("weekday") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def dayofyear(self): """ The day of the year, from 1-365 in non-leap years and @@ -2041,7 +2041,7 @@ def dayofyear(self): return self._get_dt_field("day_of_year") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def day_of_year(self): """ The day of the year, from 1-365 in non-leap years and @@ -2064,7 +2064,7 @@ def day_of_year(self): return self._get_dt_field("day_of_year") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_leap_year(self): """ Boolean indicator if the date belongs to a leap year. @@ -2083,7 +2083,7 @@ def is_leap_year(self): return cupy.asarray(res) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def quarter(self): """ Integer indicator for which quarter of the year the date belongs in. @@ -2108,7 +2108,7 @@ def quarter(self): res = extract_quarter(self._values) return Index(res, dtype="int8") - @_cudf_nvtx_annotate + @_performance_tracking def day_name(self, locale: str | None = None) -> Index: """ Return the day names. Currently supports English locale only. @@ -2128,7 +2128,7 @@ def day_name(self, locale: str | None = None) -> Index: day_names = self._column.get_day_names(locale) return Index._from_data({self.name: day_names}) - @_cudf_nvtx_annotate + @_performance_tracking def month_name(self, locale: str | None = None) -> Index: """ Return the month names. Currently supports English locale only. @@ -2147,7 +2147,7 @@ def month_name(self, locale: str | None = None) -> Index: month_names = self._column.get_month_names(locale) return Index._from_data({self.name: month_names}) - @_cudf_nvtx_annotate + @_performance_tracking def isocalendar(self) -> cudf.DataFrame: """ Returns a DataFrame with the year, week, and day @@ -2172,7 +2172,7 @@ def isocalendar(self) -> cudf.DataFrame: ) return cudf.DataFrame._from_data(ca, index=self) - @_cudf_nvtx_annotate + @_performance_tracking def to_pandas( self, *, nullable: bool = False, arrow_type: bool = False ) -> pd.DatetimeIndex: @@ -2181,7 +2181,7 @@ def to_pandas( result.freq = self._freq._maybe_as_fast_pandas_offset() return result - @_cudf_nvtx_annotate + @_performance_tracking def _get_dt_field(self, field): out_column = self._values.get_dt_field(field) # column.column_empty_like always returns a Column object @@ -2198,7 +2198,7 @@ def _get_dt_field(self, field): def _is_boolean(self): return False - @_cudf_nvtx_annotate + @_performance_tracking def ceil(self, freq): """ Perform ceil operation on the data to the specified freq. @@ -2231,7 +2231,7 @@ def ceil(self, freq): return self.__class__._from_data({self.name: out_column}) - @_cudf_nvtx_annotate + @_performance_tracking def floor(self, freq): """ Perform floor operation on the data to the specified freq. @@ -2264,7 +2264,7 @@ def floor(self, freq): return self.__class__._from_data({self.name: out_column}) - @_cudf_nvtx_annotate + @_performance_tracking def round(self, freq): """ Perform round operation on the data to the specified freq. 
@@ -2452,7 +2452,7 @@ class TimedeltaIndex(Index): dtype='timedelta64[s]', name='delta-index') """ - @_cudf_nvtx_annotate + @_performance_tracking def __init__( self, data=None, @@ -2500,7 +2500,7 @@ def __getitem__(self, index): return value @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def days(self): """ Number of days for each element. @@ -2509,7 +2509,7 @@ def days(self): return Index(self._values.days, name=self.name, dtype="int64") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def seconds(self): """ Number of seconds (>= 0 and less than 1 day) for each element. @@ -2517,7 +2517,7 @@ def seconds(self): return Index(self._values.seconds, name=self.name, dtype="int32") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def microseconds(self): """ Number of microseconds (>= 0 and less than 1 second) for each element. @@ -2525,7 +2525,7 @@ def microseconds(self): return Index(self._values.microseconds, name=self.name, dtype="int32") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def nanoseconds(self): """ Number of nanoseconds (>= 0 and less than 1 microsecond) for each @@ -2534,7 +2534,7 @@ def nanoseconds(self): return Index(self._values.nanoseconds, name=self.name, dtype="int32") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def components(self): """ Return a dataframe of the components (days, hours, minutes, @@ -2612,7 +2612,7 @@ class CategoricalIndex(Index): CategoricalIndex([1, 2, 3, ], categories=[1, 2, 3], ordered=False, dtype='category', name='a') """ # noqa: E501 - @_cudf_nvtx_annotate + @_performance_tracking def __init__( self, data=None, @@ -2667,7 +2667,7 @@ def __init__( super().__init__(data, name=name) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def codes(self): """ The category codes of this categorical. @@ -2675,7 +2675,7 @@ def codes(self): return Index(self._values.codes) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def categories(self): """ The categories of this categorical. @@ -2689,7 +2689,7 @@ def _is_categorical(self): return True -@_cudf_nvtx_annotate +@_performance_tracking def interval_range( start=None, end=None, @@ -2841,7 +2841,7 @@ class IntervalIndex(Index): IntervalIndex """ - @_cudf_nvtx_annotate + @_performance_tracking def __init__( self, data, @@ -2900,7 +2900,7 @@ def closed(self): return self.dtype.closed @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def from_breaks( cls, breaks, @@ -2975,7 +2975,7 @@ def _clean_nulls_from_index(self): return self -@_cudf_nvtx_annotate +@_performance_tracking def as_index( arbitrary, nan_as_null=no_default, copy=False, name=no_default, dtype=None ) -> BaseIndex: @@ -3090,7 +3090,7 @@ def _getdefault_name(values, name): return name -@_cudf_nvtx_annotate +@_performance_tracking def _concat_range_index(indexes: list[RangeIndex]) -> BaseIndex: """ An internal Utility function to concat RangeIndex objects. 
@@ -3131,7 +3131,7 @@ def _concat_range_index(indexes: list[RangeIndex]) -> BaseIndex: return RangeIndex(start, stop, step) -@_cudf_nvtx_annotate +@_performance_tracking def _extended_gcd(a: int, b: int) -> tuple[int, int, int]: """ Extended Euclidean algorithms to solve Bezout's identity: diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 280a6e92eab..72bd3c45fa6 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -56,7 +56,7 @@ from cudf.utils import docutils, ioutils from cudf.utils._numba import _CUDFNumbaConfig from cudf.utils.docutils import copy_docstring -from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import _warn_no_dask_cudf if TYPE_CHECKING: @@ -301,13 +301,13 @@ def _from_data( out._index = RangeIndex(out._data.nrows) if index is None else index return out - @_cudf_nvtx_annotate + @_performance_tracking def _from_data_like_self(self, data: MutableMapping): out = super()._from_data_like_self(data) out.index = self.index return out - @_cudf_nvtx_annotate + @_performance_tracking def _from_columns_like_self( self, columns: list[ColumnBase], @@ -363,7 +363,7 @@ def _mimic_inplace( self._index = result.index return super()._mimic_inplace(result, inplace) - @_cudf_nvtx_annotate + @_performance_tracking def _scan(self, op, axis=None, skipna=True): """ Return {op_name} of the {cls}. @@ -439,7 +439,7 @@ def _check_data_index_length_match(self) -> None: ) @property - @_cudf_nvtx_annotate + @_performance_tracking def empty(self): """ Indicator whether DataFrame or Series is empty. @@ -501,7 +501,7 @@ def empty(self): """ return self.size == 0 - @_cudf_nvtx_annotate + @_performance_tracking @ioutils.doc_to_json() def to_json(self, path_or_buf=None, *args, **kwargs): """{docstring}""" @@ -510,14 +510,14 @@ def to_json(self, path_or_buf=None, *args, **kwargs): self, path_or_buf=path_or_buf, *args, **kwargs ) - @_cudf_nvtx_annotate + @_performance_tracking @ioutils.doc_to_hdf() def to_hdf(self, path_or_buf, key, *args, **kwargs): """{docstring}""" cudf.io.hdf.to_hdf(path_or_buf, key, self, *args, **kwargs) - @_cudf_nvtx_annotate + @_performance_tracking def to_string(self): r""" Convert to string @@ -606,7 +606,7 @@ def copy(self, deep: bool = True) -> Self: self.index.copy(deep=False), ) - @_cudf_nvtx_annotate + @_performance_tracking def equals(self, other) -> bool: # noqa: D102 return super().equals(other) and self.index.equals(other.index) @@ -632,7 +632,7 @@ def index(self, value): self._index = value - @_cudf_nvtx_annotate + @_performance_tracking def replace( self, to_replace=None, @@ -900,7 +900,7 @@ def replace( return self._mimic_inplace(result, inplace=inplace) - @_cudf_nvtx_annotate + @_performance_tracking def clip(self, lower=None, upper=None, inplace=False, axis=1): """ Trim values at input threshold(s). @@ -1026,7 +1026,7 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1): ) return self._mimic_inplace(output, inplace=inplace) - @_cudf_nvtx_annotate + @_performance_tracking def abs(self): """ Return a Series/DataFrame with absolute numeric value of each element. @@ -1052,7 +1052,7 @@ def abs(self): """ return self._unaryop("abs") - @_cudf_nvtx_annotate + @_performance_tracking def dot(self, other, reflect=False): """ Get dot product of frame and other, (binary operator `dot`). 
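Assuming the new decorator, like the NVTX-based one it replaces, emits ranges that are only visible from a profiler, a common way to correlate them with application code is to open a range of your own around the workload. A hedged, illustrative example using the nvtx Python package directly (the `run_workload` function and the "my-etl-step" label are placeholders):

    import nvtx

    def run_workload():
        # Placeholder for code that calls into the decorated cudf APIs.
        return sum(range(1000))

    with nvtx.annotate(message="my-etl-step", domain="my_app"):
        run_workload()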
@@ -1159,15 +1159,15 @@ def dot(self, other, reflect=False): ) return result.item() - @_cudf_nvtx_annotate + @_performance_tracking def __matmul__(self, other): return self.dot(other) - @_cudf_nvtx_annotate + @_performance_tracking def __rmatmul__(self, other): return self.dot(other, reflect=True) - @_cudf_nvtx_annotate + @_performance_tracking def head(self, n=5): """ Return the first `n` rows. @@ -1246,7 +1246,7 @@ def head(self, n=5): """ return self.iloc[:n] - @_cudf_nvtx_annotate + @_performance_tracking def tail(self, n=5): """ Returns the last n rows as a new DataFrame or Series @@ -1277,7 +1277,7 @@ def tail(self, n=5): return self.iloc[-n:] - @_cudf_nvtx_annotate + @_performance_tracking def pipe(self, func, *args, **kwargs): """ Apply ``func(self, *args, **kwargs)``. @@ -1324,7 +1324,7 @@ def pipe(self, func, *args, **kwargs): """ return cudf.core.common.pipe(self, func, *args, **kwargs) - @_cudf_nvtx_annotate + @_performance_tracking def sum( self, axis=no_default, @@ -1385,7 +1385,7 @@ def sum( **kwargs, ) - @_cudf_nvtx_annotate + @_performance_tracking def product( self, axis=no_default, @@ -1452,7 +1452,7 @@ def product( # Alias for pandas compatibility. prod = product - @_cudf_nvtx_annotate + @_performance_tracking def mean(self, axis=0, skipna=True, numeric_only=False, **kwargs): """ Return the mean of the values for the requested axis. @@ -1541,7 +1541,7 @@ def median( **kwargs, ) - @_cudf_nvtx_annotate + @_performance_tracking def std( self, axis=no_default, @@ -1600,7 +1600,7 @@ def std( **kwargs, ) - @_cudf_nvtx_annotate + @_performance_tracking def var( self, axis=no_default, @@ -1658,7 +1658,7 @@ def var( **kwargs, ) - @_cudf_nvtx_annotate + @_performance_tracking def kurtosis(self, axis=0, skipna=True, numeric_only=False, **kwargs): """ Return Fisher's unbiased kurtosis of a sample. @@ -1718,7 +1718,7 @@ def kurtosis(self, axis=0, skipna=True, numeric_only=False, **kwargs): # Alias for kurtosis. kurt = kurtosis - @_cudf_nvtx_annotate + @_performance_tracking def skew(self, axis=0, skipna=True, numeric_only=False, **kwargs): """ Return unbiased Fisher-Pearson skew of a sample. @@ -1777,7 +1777,7 @@ def skew(self, axis=0, skipna=True, numeric_only=False, **kwargs): **kwargs, ) - @_cudf_nvtx_annotate + @_performance_tracking def mask(self, cond, other=None, inplace: bool = False) -> Self | None: """ Replace values where the condition is True. 
@@ -1839,7 +1839,7 @@ def mask(self, cond, other=None, inplace: bool = False) -> Self | None: return self.where(cond=~cond, other=other, inplace=inplace) - @_cudf_nvtx_annotate + @_performance_tracking @copy_docstring(Rolling) def rolling( self, window, min_periods=None, center=False, axis=0, win_type=None @@ -1879,7 +1879,7 @@ def ewm( times=times, ) - @_cudf_nvtx_annotate + @_performance_tracking def nans_to_nulls(self): """ Convert nans (if any) to nulls @@ -1935,7 +1935,7 @@ def nans_to_nulls(self): self._data._from_columns_like_self(result) ) - @_cudf_nvtx_annotate + @_performance_tracking def interpolate( self, method="linear", @@ -2034,7 +2034,7 @@ def interpolate( ) ) - @_cudf_nvtx_annotate + @_performance_tracking def shift(self, periods=1, freq=None, axis=0, fill_value=None): """Shift values by `periods` positions.""" axis = self._get_axis_from_axis_arg(axis) @@ -2050,7 +2050,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): self._data._from_columns_like_self(data_columns) ) - @_cudf_nvtx_annotate + @_performance_tracking def truncate(self, before=None, after=None, axis=0, copy=True): """ Truncate a Series or DataFrame before and after some index value. @@ -2398,7 +2398,7 @@ def iloc(self): return self._iloc_indexer_type(self) @property # type:ignore - @_cudf_nvtx_annotate + @_performance_tracking def axes(self): """ Return a list representing the axes of the Series. @@ -2530,7 +2530,7 @@ def squeeze(self, axis: Literal["index", "columns", 0, 1, None] = None): ) return self.iloc[indexer] - @_cudf_nvtx_annotate + @_performance_tracking def scale(self): """ Scale values to [0, 1] in float64 @@ -2565,7 +2565,7 @@ def scale(self): scaled.index = self.index.copy(deep=False) return scaled - @_cudf_nvtx_annotate + @_performance_tracking def sort_index( self, axis=0, @@ -3070,7 +3070,7 @@ def drop_duplicates( self.index.names if not ignore_index else None, ) - @_cudf_nvtx_annotate + @_performance_tracking def duplicated(self, subset=None, keep="first"): """ Return boolean Series denoting duplicate rows. @@ -3180,7 +3180,7 @@ def duplicated(self, subset=None, keep="first"): ) return cudf.Series(result, index=self.index) - @_cudf_nvtx_annotate + @_performance_tracking def _empty_like(self, keep_index=True) -> Self: result = self._from_columns_like_self( libcudf.copying.columns_empty_like( @@ -3217,7 +3217,7 @@ def _split(self, splits, keep_index=True): for i in range(len(splits) + 1) ] - @_cudf_nvtx_annotate + @_performance_tracking def bfill(self, value=None, axis=None, inplace=None, limit=None): """ Synonym for :meth:`Series.fillna` with ``method='bfill'``. @@ -3236,7 +3236,7 @@ def bfill(self, value=None, axis=None, inplace=None, limit=None): limit=limit, ) - @_cudf_nvtx_annotate + @_performance_tracking def backfill(self, value=None, axis=None, inplace=None, limit=None): """ Synonym for :meth:`Series.fillna` with ``method='bfill'``. @@ -3256,7 +3256,7 @@ def backfill(self, value=None, axis=None, inplace=None, limit=None): ) return self.bfill(value=value, axis=axis, inplace=inplace, limit=limit) - @_cudf_nvtx_annotate + @_performance_tracking def ffill(self, value=None, axis=None, inplace=None, limit=None): """ Synonym for :meth:`Series.fillna` with ``method='ffill'``. @@ -3275,7 +3275,7 @@ def ffill(self, value=None, axis=None, inplace=None, limit=None): limit=limit, ) - @_cudf_nvtx_annotate + @_performance_tracking def pad(self, value=None, axis=None, inplace=None, limit=None): """ Synonym for :meth:`Series.fillna` with ``method='ffill'``. 
@@ -3415,7 +3415,7 @@ def add_suffix(self, suffix): raise NotImplementedError @acquire_spill_lock() - @_cudf_nvtx_annotate + @_performance_tracking def _apply(self, func, kernel_getter, *args, **kwargs): """Apply `func` across the rows of the frame.""" if kwargs: @@ -3626,7 +3626,7 @@ def _align_to_index( out.index.names = self.index.names return out - @_cudf_nvtx_annotate + @_performance_tracking def _reindex( self, column_names, @@ -4154,7 +4154,7 @@ def dropna( return self._mimic_inplace(result, inplace=inplace) - @_cudf_nvtx_annotate + @_performance_tracking def _drop_na_columns(self, how="any", subset=None, thresh=None): """ Drop columns containing nulls @@ -4471,7 +4471,7 @@ def last(self, offset): slice_func=lambda i: self.iloc[i:], ) - @_cudf_nvtx_annotate + @_performance_tracking def sample( self, n=None, @@ -4751,7 +4751,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): return NotImplemented - @_cudf_nvtx_annotate + @_performance_tracking def repeat(self, repeats, axis=None): """Repeats elements consecutively. @@ -4949,7 +4949,7 @@ def astype( raise e return self - @_cudf_nvtx_annotate + @_performance_tracking def drop( self, labels=None, @@ -5161,7 +5161,7 @@ def drop( if not inplace: return out - @_cudf_nvtx_annotate + @_performance_tracking def _explode(self, explode_column: Any, ignore_index: bool): # Helper function for `explode` in `Series` and `Dataframe`, explodes a # specified nested column. Other columns' corresponding rows are @@ -5200,7 +5200,7 @@ def _explode(self, explode_column: Any, ignore_index: bool): self.index.names if not ignore_index else None, ) - @_cudf_nvtx_annotate + @_performance_tracking def tile(self, count): """Repeats the rows `count` times to form a new Frame. @@ -5233,7 +5233,7 @@ def tile(self, count): index_names=self._index_names, ) - @_cudf_nvtx_annotate + @_performance_tracking def groupby( self, by=None, @@ -5283,7 +5283,7 @@ def groupby( ) ) - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Addition", @@ -5324,7 +5324,7 @@ def add(self, other, axis, level=None, fill_value=None): # noqa: D102 return self._binaryop(other, "__add__", fill_value) - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Addition", @@ -5365,7 +5365,7 @@ def radd(self, other, axis, level=None, fill_value=None): # noqa: D102 return self._binaryop(other, "__radd__", fill_value) - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Subtraction", @@ -5408,7 +5408,7 @@ def subtract(self, other, axis, level=None, fill_value=None): # noqa: D102 sub = subtract - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Subtraction", @@ -5449,7 +5449,7 @@ def rsub(self, other, axis, level=None, fill_value=None): # noqa: D102 return self._binaryop(other, "__rsub__", fill_value) - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Multiplication", @@ -5492,7 +5492,7 @@ def multiply(self, other, axis, level=None, fill_value=None): # noqa: D102 mul = multiply - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Multiplication", @@ -5533,7 +5533,7 @@ def rmul(self, other, axis, level=None, fill_value=None): # noqa: D102 return self._binaryop(other, "__rmul__", fill_value) - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( 
doc_binop_template.format( operation="Modulo", @@ -5574,7 +5574,7 @@ def mod(self, other, axis, level=None, fill_value=None): # noqa: D102 return self._binaryop(other, "__mod__", fill_value) - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Modulo", @@ -5615,7 +5615,7 @@ def rmod(self, other, axis, level=None, fill_value=None): # noqa: D102 return self._binaryop(other, "__rmod__", fill_value) - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Exponential", @@ -5656,7 +5656,7 @@ def pow(self, other, axis, level=None, fill_value=None): # noqa: D102 return self._binaryop(other, "__pow__", fill_value) - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Exponential", @@ -5697,7 +5697,7 @@ def rpow(self, other, axis, level=None, fill_value=None): # noqa: D102 return self._binaryop(other, "__rpow__", fill_value) - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Integer division", @@ -5738,7 +5738,7 @@ def floordiv(self, other, axis, level=None, fill_value=None): # noqa: D102 return self._binaryop(other, "__floordiv__", fill_value) - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Integer division", @@ -5779,7 +5779,7 @@ def rfloordiv(self, other, axis, level=None, fill_value=None): # noqa: D102 return self._binaryop(other, "__rfloordiv__", fill_value) - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Floating division", @@ -5824,7 +5824,7 @@ def truediv(self, other, axis, level=None, fill_value=None): # noqa: D102 div = truediv divide = truediv - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Floating division", @@ -5868,7 +5868,7 @@ def rtruediv(self, other, axis, level=None, fill_value=None): # noqa: D102 # Alias for rtruediv rdiv = rtruediv - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Equal to", @@ -5908,7 +5908,7 @@ def eq(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 other=other, op="__eq__", fill_value=fill_value, can_reindex=True ) - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Not equal to", @@ -5948,7 +5948,7 @@ def ne(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 other=other, op="__ne__", fill_value=fill_value, can_reindex=True ) - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Less than", @@ -5988,7 +5988,7 @@ def lt(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 other=other, op="__lt__", fill_value=fill_value, can_reindex=True ) - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Less than or equal to", @@ -6028,7 +6028,7 @@ def le(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 other=other, op="__le__", fill_value=fill_value, can_reindex=True ) - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Greater than", @@ -6068,7 +6068,7 @@ def gt(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 other=other, op="__gt__", fill_value=fill_value, can_reindex=True ) - @_cudf_nvtx_annotate + 
@_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Greater than or equal to", @@ -6123,7 +6123,7 @@ def _preprocess_subset(self, subset): raise KeyError(f"columns {diff} do not exist") return subset - @_cudf_nvtx_annotate + @_performance_tracking def rank( self, axis=0, @@ -6291,7 +6291,7 @@ def _check_duplicate_level_names(specified, level_names): ) -@_cudf_nvtx_annotate +@_performance_tracking def _get_replacement_values_for_columns( to_replace: Any, value: Any, columns_dtype_map: dict[Any, Any] ) -> tuple[dict[Any, bool], dict[Any, Any], dict[Any, Any]]: @@ -6458,7 +6458,7 @@ def _is_series(obj): return isinstance(obj, Frame) and obj.ndim == 1 and obj.index is not None -@_cudf_nvtx_annotate +@_performance_tracking def _drop_rows_by_labels( obj: DataFrameOrSeries, labels: ColumnLike | abc.Iterable | str, diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 547c14cdc99..7657fa9e234 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -32,7 +32,7 @@ ) from cudf.core.join._join_helpers import _match_join_keys from cudf.utils.dtypes import is_column_like -from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name if TYPE_CHECKING: @@ -126,7 +126,7 @@ class MultiIndex(Frame, BaseIndex, NotIterable): ) """ - @_cudf_nvtx_annotate + @_performance_tracking def __init__( self, levels=None, @@ -211,12 +211,12 @@ def __init__( self.names = names @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def names(self): return self._names @names.setter # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def names(self, value): if value is None: value = [None] * self.nlevels @@ -242,13 +242,13 @@ def names(self, value): ) self._names = pd.core.indexes.frozen.FrozenList(value) - @_cudf_nvtx_annotate + @_performance_tracking def to_series(self, index=None, name=None): raise NotImplementedError( "MultiIndex.to_series isn't implemented yet." 
) - @_cudf_nvtx_annotate + @_performance_tracking def astype(self, dtype, copy: bool = True): if not is_object_dtype(dtype): raise TypeError( @@ -257,7 +257,7 @@ def astype(self, dtype, copy: bool = True): ) return self - @_cudf_nvtx_annotate + @_performance_tracking def rename(self, names, inplace=False): """ Alter MultiIndex level names @@ -304,7 +304,7 @@ def rename(self, names, inplace=False): """ return self.set_names(names, level=None, inplace=inplace) - @_cudf_nvtx_annotate + @_performance_tracking def set_names(self, names, level=None, inplace=False): names_is_list_like = is_list_like(names) level_is_list_like = is_list_like(level) @@ -342,7 +342,7 @@ def set_names(self, names, level=None, inplace=False): return self._set_names(names=names, inplace=inplace) @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def _from_data( cls, data: MutableMapping, @@ -354,16 +354,16 @@ def _from_data( return obj @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def name(self): return self._name @name.setter # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def name(self, value): self._name = value - @_cudf_nvtx_annotate + @_performance_tracking def copy( self, names=None, @@ -432,7 +432,7 @@ def copy( return mi - @_cudf_nvtx_annotate + @_performance_tracking def __repr__(self): max_seq_items = pd.get_option("display.max_seq_items") or len(self) @@ -484,7 +484,7 @@ def _codes_frame(self): @property # type: ignore @_external_only_api("Use ._codes_frame instead") - @_cudf_nvtx_annotate + @_performance_tracking def codes(self): """ Returns the codes of the underlying MultiIndex. @@ -510,13 +510,13 @@ def get_slice_bound(self, label, side, kind=None): raise NotImplementedError() @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def nlevels(self): """Integer number of levels in this MultiIndex.""" return self._num_columns @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def levels(self): """ Returns list of levels in the MultiIndex @@ -548,12 +548,12 @@ def levels(self): return self._levels @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def ndim(self) -> int: """Dimension of the data. For MultiIndex ndim is always 2.""" return 2 - @_cudf_nvtx_annotate + @_performance_tracking def _get_level_label(self, level): """Get name of the level. @@ -570,7 +570,7 @@ def _get_level_label(self, level): else: return self._data.names[level] - @_cudf_nvtx_annotate + @_performance_tracking def isin(self, values, level=None): """Return a boolean array where the index values are in values. 
@@ -669,7 +669,7 @@ def where(self, cond, other=None, inplace=False): ".where is not supported for MultiIndex operations" ) - @_cudf_nvtx_annotate + @_performance_tracking def _compute_levels_and_codes(self): levels = [] @@ -683,7 +683,7 @@ def _compute_levels_and_codes(self): self._levels = levels self._codes = cudf.DataFrame._from_data(codes) - @_cudf_nvtx_annotate + @_performance_tracking def _compute_validity_mask(self, index, row_tuple, max_length): """Computes the valid set of indices of values in the lookup""" lookup = cudf.DataFrame() @@ -731,7 +731,7 @@ def _compute_validity_mask(self, index, row_tuple, max_length): raise KeyError(row) return result - @_cudf_nvtx_annotate + @_performance_tracking def _get_valid_indices_by_tuple(self, index, row_tuple, max_length): # Instructions for Slicing # if tuple, get first and last elements of tuple @@ -761,7 +761,7 @@ def _get_valid_indices_by_tuple(self, index, row_tuple, max_length): return row_tuple return self._compute_validity_mask(index, row_tuple, max_length) - @_cudf_nvtx_annotate + @_performance_tracking def _index_and_downcast(self, result, index, index_key): if isinstance(index_key, (numbers.Number, slice)): index_key = [index_key] @@ -829,7 +829,7 @@ def _index_and_downcast(self, result, index, index_key): result.index = index return result - @_cudf_nvtx_annotate + @_performance_tracking def _get_row_major( self, df: DataFrameOrSeries, @@ -856,7 +856,7 @@ def _get_row_major( final = self._index_and_downcast(result, result.index, row_tuple) return final - @_cudf_nvtx_annotate + @_performance_tracking def _validate_indexer( self, indexer: numbers.Number @@ -884,7 +884,7 @@ def _validate_indexer( for i in indexer: self._validate_indexer(i) - @_cudf_nvtx_annotate + @_performance_tracking def __eq__(self, other): if isinstance(other, MultiIndex): return np.array( @@ -898,12 +898,12 @@ def __eq__(self, other): return NotImplemented @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def size(self): # The size of a MultiIndex is only dependent on the number of rows. return self._num_rows - @_cudf_nvtx_annotate + @_performance_tracking def take(self, indices): if isinstance(indices, cudf.Series) and indices.has_nulls: raise ValueError("Column must have no nulls.") @@ -911,7 +911,7 @@ def take(self, indices): obj.names = self.names return obj - @_cudf_nvtx_annotate + @_performance_tracking def serialize(self): header, frames = super().serialize() # Overwrite the names in _data with the true names. @@ -919,7 +919,7 @@ def serialize(self): return header, frames @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def deserialize(cls, header, frames): # Spoof the column names to construct the frame, then set manually. column_names = pickle.loads(header["column_names"]) @@ -927,7 +927,7 @@ def deserialize(cls, header, frames): obj = super().deserialize(header, frames) return obj._set_names(column_names) - @_cudf_nvtx_annotate + @_performance_tracking def __getitem__(self, index): flatten = isinstance(index, int) @@ -954,7 +954,7 @@ def __getitem__(self, index): result._levels = self._levels return result - @_cudf_nvtx_annotate + @_performance_tracking def to_frame(self, index=True, name=no_default, allow_duplicates=False): """ Create a DataFrame with the levels of the MultiIndex as columns. 
@@ -1031,7 +1031,7 @@ def to_frame(self, index=True, name=no_default, allow_duplicates=False): data=ca, index=self if index else None ) - @_cudf_nvtx_annotate + @_performance_tracking def get_level_values(self, level): """ Return the values at the requested level @@ -1087,7 +1087,7 @@ def _is_interval(self): return False @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def _concat(cls, objs): source_data = [o.to_frame(index=False) for o in objs] @@ -1107,7 +1107,7 @@ def _concat(cls, objs): return cudf.MultiIndex.from_frame(source_data, names=names) @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def from_tuples(cls, tuples, names=None): """ Convert list of tuples to MultiIndex. @@ -1145,12 +1145,12 @@ def from_tuples(cls, tuples, names=None): pdi = pd.MultiIndex.from_tuples(tuples, names=names) return cls.from_pandas(pdi) - @_cudf_nvtx_annotate + @_performance_tracking def to_numpy(self): return self.values_host @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def values_host(self): """ Return a numpy representation of the MultiIndex. @@ -1178,7 +1178,7 @@ def values_host(self): return self.to_pandas().values @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def values(self): """ Return a CuPy representation of the MultiIndex. @@ -1214,7 +1214,7 @@ def values(self): return self.to_frame(index=False).values @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def from_frame(cls, df: pd.DataFrame | cudf.DataFrame, names=None): """ Make a MultiIndex from a DataFrame. @@ -1289,7 +1289,7 @@ def from_frame(cls, df: pd.DataFrame | cudf.DataFrame, names=None): return obj @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def from_product(cls, arrays, names=None): """ Make a MultiIndex from the cartesian product of multiple iterables. @@ -1331,7 +1331,7 @@ def from_product(cls, arrays, names=None): return cls.from_pandas(pdi) @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def from_arrays( cls, arrays, @@ -1390,7 +1390,7 @@ def from_arrays( codes=codes, levels=levels, sortorder=sortorder, names=names ) - @_cudf_nvtx_annotate + @_performance_tracking def _poplevels(self, level): """ Remove and return the specified levels from self. @@ -1441,7 +1441,7 @@ def _poplevels(self, level): return popped - @_cudf_nvtx_annotate + @_performance_tracking def swaplevel(self, i=-2, j=-1): """ Swap level i with level j. @@ -1492,7 +1492,7 @@ def swaplevel(self, i=-2, j=-1): midx = midx.set_names(self.names) return midx - @_cudf_nvtx_annotate + @_performance_tracking def droplevel(self, level=-1): """ Removes the specified levels from the MultiIndex. 
@@ -1555,7 +1555,7 @@ def droplevel(self, level=-1): else: return mi - @_cudf_nvtx_annotate + @_performance_tracking def to_pandas( self, *, nullable: bool = False, arrow_type: bool = False ) -> pd.MultiIndex: @@ -1572,7 +1572,7 @@ def to_pandas( ) @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def from_pandas(cls, multiindex: pd.MultiIndex, nan_as_null=no_default): """ Convert from a Pandas MultiIndex @@ -1607,7 +1607,7 @@ def from_pandas(cls, multiindex: pd.MultiIndex, nan_as_null=no_default): ) @cached_property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_unique(self): return len(self) == len(self.unique()) @@ -1615,7 +1615,7 @@ def is_unique(self): def dtype(self): return np.dtype("O") - @_cudf_nvtx_annotate + @_performance_tracking def _is_sorted(self, ascending=None, null_position=None) -> bool: """ Returns a boolean indicating whether the data of the MultiIndex are sorted @@ -1661,7 +1661,7 @@ def _is_sorted(self, ascending=None, null_position=None) -> bool: ) @cached_property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_monotonic_increasing(self) -> bool: """ Return if the index is monotonic increasing @@ -1670,7 +1670,7 @@ def is_monotonic_increasing(self) -> bool: return self._is_sorted(ascending=None, null_position=None) @cached_property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_monotonic_decreasing(self) -> bool: """ Return if the index is monotonic decreasing @@ -1680,7 +1680,7 @@ def is_monotonic_decreasing(self) -> bool: ascending=[False] * len(self.levels), null_position=None ) - @_cudf_nvtx_annotate + @_performance_tracking def fillna(self, value): """ Fill null values with the specified value. @@ -1721,11 +1721,11 @@ def fillna(self, value): return super().fillna(value=value) - @_cudf_nvtx_annotate + @_performance_tracking def unique(self): return self.drop_duplicates(keep="first") - @_cudf_nvtx_annotate + @_performance_tracking def nunique(self, dropna: bool = True) -> int: mi = self.dropna(how="all") if dropna else self return len(mi.unique()) @@ -1740,7 +1740,7 @@ def _clean_nulls_from_index(self): index_df._clean_nulls_from_dataframe(index_df), names=self.names ) - @_cudf_nvtx_annotate + @_performance_tracking def memory_usage(self, deep=False): usage = sum(col.memory_usage for col in self._data.columns) if self.levels: @@ -1751,13 +1751,13 @@ def memory_usage(self, deep=False): usage += col.memory_usage return usage - @_cudf_nvtx_annotate + @_performance_tracking def difference(self, other, sort=None): if hasattr(other, "to_pandas"): other = other.to_pandas() return cudf.from_pandas(self.to_pandas().difference(other, sort)) - @_cudf_nvtx_annotate + @_performance_tracking def append(self, other): """ Append a collection of MultiIndex objects together @@ -1820,7 +1820,7 @@ def append(self, other): return MultiIndex._concat(to_concat) - @_cudf_nvtx_annotate + @_performance_tracking def __array_function__(self, func, types, args, kwargs): cudf_df_module = MultiIndex @@ -1867,7 +1867,7 @@ def _level_index_from_level(self, level): ) from None return level - @_cudf_nvtx_annotate + @_performance_tracking def get_indexer(self, target, method=None, limit=None, tolerance=None): if tolerance is not None: raise NotImplementedError( @@ -1926,7 +1926,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): return _return_get_indexer_result(result_series.to_cupy()) - @_cudf_nvtx_annotate + @_performance_tracking def get_loc(self, key): is_sorted = ( 
self.is_monotonic_increasing or self.is_monotonic_decreasing @@ -2000,7 +2000,7 @@ def _maybe_match_names(self, other): for self_name, other_name in zip(self.names, other.names) ] - @_cudf_nvtx_annotate + @_performance_tracking def union(self, other, sort=None): if not isinstance(other, MultiIndex): msg = "other must be a MultiIndex or a list of tuples" @@ -2024,7 +2024,7 @@ def union(self, other, sort=None): return self._union(other, sort=sort) - @_cudf_nvtx_annotate + @_performance_tracking def _union(self, other, sort=None): # TODO: When to_frame is refactored to return a # deep copy in future, we should push most of the common @@ -2050,7 +2050,7 @@ def _union(self, other, sort=None): return midx.sort_values() return midx - @_cudf_nvtx_annotate + @_performance_tracking def _intersection(self, other, sort=None): if self.names != other.names: deep = True @@ -2073,14 +2073,14 @@ def _intersection(self, other, sort=None): return midx.sort_values() return midx - @_cudf_nvtx_annotate + @_performance_tracking def _copy_type_metadata(self: Self, other: Self) -> Self: res = super()._copy_type_metadata(other) if isinstance(other, MultiIndex): res._names = other._names return res - @_cudf_nvtx_annotate + @_performance_tracking def _split_columns_by_levels( self, levels: tuple, *, in_levels: bool ) -> Generator[tuple[Any, column.ColumnBase], None, None]: @@ -2099,7 +2099,7 @@ def _split_columns_by_levels( elif not in_levels and i not in level_indices: yield name, col - @_cudf_nvtx_annotate + @_performance_tracking def _new_index_for_reset_index( self, levels: tuple | None, name ) -> None | BaseIndex: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index ea25d482578..9acf5294b72 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -68,7 +68,7 @@ is_mixed_with_object_dtype, to_cudf_compatible_scalar, ) -from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.performance_tracking import _performance_tracking if TYPE_CHECKING: from cudf._typing import ( @@ -179,7 +179,7 @@ class _SeriesIlocIndexer(_FrameIndexer): _frame: cudf.Series - @_cudf_nvtx_annotate + @_performance_tracking def __getitem__(self, arg): indexing_spec = indexing_utils.parse_row_iloc_indexer( indexing_utils.destructure_series_iloc_indexer(arg, self._frame), @@ -187,7 +187,7 @@ def __getitem__(self, arg): ) return self._frame._getitem_preprocessed(indexing_spec) - @_cudf_nvtx_annotate + @_performance_tracking def __setitem__(self, key, value): if isinstance(key, tuple): key = list(key) @@ -274,7 +274,7 @@ class _SeriesLocIndexer(_FrameIndexer): Label-based selection """ - @_cudf_nvtx_annotate + @_performance_tracking def __getitem__(self, arg: Any) -> ScalarLike | DataFrameOrSeries: if isinstance(arg, pd.MultiIndex): arg = cudf.from_pandas(arg) @@ -301,7 +301,7 @@ def __getitem__(self, arg: Any) -> ScalarLike | DataFrameOrSeries: return self._frame.iloc[arg] - @_cudf_nvtx_annotate + @_performance_tracking def __setitem__(self, key, value): try: key = self._loc_to_iloc(key) @@ -476,7 +476,7 @@ def _constructor_expanddim(self): return cudf.DataFrame @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def from_categorical(cls, categorical, codes=None): """Creates from a pandas.Categorical @@ -517,7 +517,7 @@ def from_categorical(cls, categorical, codes=None): return Series(data=col) @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def from_masked_array(cls, data, mask, null_count=None): """Create a Series with null-mask. 
This is equivalent to: @@ -566,7 +566,7 @@ def from_masked_array(cls, data, mask, null_count=None): col = as_column(data).set_mask(mask) return cls(data=col) - @_cudf_nvtx_annotate + @_performance_tracking def __init__( self, data=None, @@ -663,7 +663,7 @@ def __init__( self._check_data_index_length_match() @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def _from_data( cls, data: MutableMapping, @@ -675,18 +675,18 @@ def _from_data( out.name = name return out - @_cudf_nvtx_annotate + @_performance_tracking def _from_data_like_self(self, data: MutableMapping): out = super()._from_data_like_self(data) out.name = self.name return out - @_cudf_nvtx_annotate + @_performance_tracking def __contains__(self, item): return item in self.index @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def from_pandas(cls, s: pd.Series, nan_as_null=no_default): """ Convert from a Pandas Series. @@ -735,7 +735,7 @@ def from_pandas(cls, s: pd.Series, nan_as_null=no_default): return result @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_unique(self): """Return boolean if values in the object are unique. @@ -746,7 +746,7 @@ def is_unique(self): return self._column.is_unique @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def dt(self): """ Accessor object for datetime-like properties of the Series values. @@ -788,7 +788,7 @@ def dt(self): ) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def hasnans(self): """ Return True if there are any NaNs or nulls. @@ -829,7 +829,7 @@ def hasnans(self): """ return self._column.has_nulls(include_nan=True) - @_cudf_nvtx_annotate + @_performance_tracking def serialize(self): header, frames = super().serialize() @@ -842,7 +842,7 @@ def serialize(self): return header, frames @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def deserialize(cls, header, frames): index_nframes = header["index_frame_count"] obj = super().deserialize( @@ -855,7 +855,7 @@ def deserialize(cls, header, frames): return obj - @_cudf_nvtx_annotate + @_performance_tracking def drop( self, labels=None, @@ -884,7 +884,7 @@ def tolist(self): # noqa: D102 to_list = tolist - @_cudf_nvtx_annotate + @_performance_tracking def to_dict(self, into: type[dict] = dict) -> dict: """ Convert Series to {label -> value} dict or dict-like object. @@ -923,7 +923,7 @@ def to_dict(self, into: type[dict] = dict) -> dict: """ return self.to_pandas().to_dict(into=into) - @_cudf_nvtx_annotate + @_performance_tracking def reindex(self, *args, **kwargs): """ Conform Series to new index. 
@@ -996,7 +996,7 @@ def reindex(self, *args, **kwargs): series.name = self.name return series - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_reset_index_template.format( klass="Series", @@ -1081,7 +1081,7 @@ def reset_index( inplace=inplace, ) - @_cudf_nvtx_annotate + @_performance_tracking def to_frame(self, name=None): """Convert Series into a DataFrame @@ -1124,13 +1124,13 @@ def to_frame(self, name=None): return cudf.DataFrame({col: self._column}, index=self.index) - @_cudf_nvtx_annotate + @_performance_tracking def memory_usage(self, index=True, deep=False): return self._column.memory_usage + ( self.index.memory_usage() if index else 0 ) - @_cudf_nvtx_annotate + @_performance_tracking def __array_function__(self, func, types, args, kwargs): if "out" in kwargs or not all(issubclass(t, Series) for t in types): return NotImplemented @@ -1191,7 +1191,7 @@ def __array_function__(self, func, types, args, kwargs): return NotImplemented - @_cudf_nvtx_annotate + @_performance_tracking def map(self, arg, na_action=None) -> "Series": """ Map values of Series according to input correspondence. @@ -1333,7 +1333,7 @@ def _getitem_preprocessed( return self._empty_like(keep_index=True) assert_never(spec) - @_cudf_nvtx_annotate + @_performance_tracking def __getitem__(self, arg): if isinstance(arg, slice): return self.iloc[arg] @@ -1344,7 +1344,7 @@ def __getitem__(self, arg): items = SingleColumnFrame.__iter__ - @_cudf_nvtx_annotate + @_performance_tracking def __setitem__(self, key, value): if isinstance(key, slice): self.iloc[key] = value @@ -1495,36 +1495,36 @@ def _make_operands_and_index_for_binop( @copy_docstring(CategoricalAccessor) # type: ignore @property - @_cudf_nvtx_annotate + @_performance_tracking def cat(self): return CategoricalAccessor(parent=self) @copy_docstring(StringMethods) # type: ignore @property - @_cudf_nvtx_annotate + @_performance_tracking def str(self): return StringMethods(parent=self) @copy_docstring(ListMethods) # type: ignore @property - @_cudf_nvtx_annotate + @_performance_tracking def list(self): return ListMethods(parent=self) @copy_docstring(StructMethods) # type: ignore @property - @_cudf_nvtx_annotate + @_performance_tracking def struct(self): return StructMethods(parent=self) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def dtype(self): """The dtype of the Series.""" return self._column.dtype @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def _concat(cls, objs, axis=0, index=True): # Concatenate index if not provided if index is True: @@ -1590,25 +1590,25 @@ def _concat(cls, objs, axis=0, index=True): return cls(data=col, index=index, name=name) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def valid_count(self): """Number of non-null values""" return len(self) - self._column.null_count @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def null_count(self): """Number of null values""" return self._column.null_count @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def nullable(self): """A boolean indicating whether a null-mask is needed""" return self._column.nullable @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def has_nulls(self): """ Indicator whether Series contains null values. 
@@ -1637,7 +1637,7 @@ def has_nulls(self): """ return self._column.has_nulls() - @_cudf_nvtx_annotate + @_performance_tracking def dropna(self, axis=0, inplace=False, how=None): """ Return a Series with null values removed. @@ -1717,7 +1717,7 @@ def dropna(self, axis=0, inplace=False, how=None): return self._mimic_inplace(result, inplace=inplace) - @_cudf_nvtx_annotate + @_performance_tracking def drop_duplicates(self, keep="first", inplace=False, ignore_index=False): """ Return Series with duplicate values removed. @@ -1791,7 +1791,7 @@ def drop_duplicates(self, keep="first", inplace=False, ignore_index=False): return self._mimic_inplace(result, inplace=inplace) - @_cudf_nvtx_annotate + @_performance_tracking def fillna( self, value=None, method=None, axis=None, inplace=False, limit=None ): @@ -1896,7 +1896,7 @@ def between(self, left, right, inclusive="both") -> Series: ) return self._from_data({self.name: lmask & rmask}, self.index) - @_cudf_nvtx_annotate + @_performance_tracking def all(self, axis=0, bool_only=None, skipna=True, **kwargs): if bool_only not in (None, True): raise NotImplementedError( @@ -1904,7 +1904,7 @@ def all(self, axis=0, bool_only=None, skipna=True, **kwargs): ) return super().all(axis, skipna, **kwargs) - @_cudf_nvtx_annotate + @_performance_tracking def any(self, axis=0, bool_only=None, skipna=True, **kwargs): if bool_only not in (None, True): raise NotImplementedError( @@ -1912,7 +1912,7 @@ def any(self, axis=0, bool_only=None, skipna=True, **kwargs): ) return super().any(axis, skipna, **kwargs) - @_cudf_nvtx_annotate + @_performance_tracking def to_pandas( self, *, @@ -2004,7 +2004,7 @@ def to_pandas( ) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def data(self): """The gpu buffer for the data @@ -2029,12 +2029,12 @@ def data(self): return self._column.data @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def nullmask(self): """The gpu buffer for the null-mask""" return cudf.Series(self._column.nullmask) - @_cudf_nvtx_annotate + @_performance_tracking def astype( self, dtype, @@ -2051,13 +2051,13 @@ def astype( dtype = {self.name: dtype} return super().astype(dtype, copy, errors) - @_cudf_nvtx_annotate + @_performance_tracking def sort_index(self, axis=0, *args, **kwargs): if axis not in (0, "index"): raise ValueError("Only axis=0 is valid for Series.") return super().sort_index(axis=axis, *args, **kwargs) - @_cudf_nvtx_annotate + @_performance_tracking def sort_values( self, axis=0, @@ -2112,7 +2112,7 @@ def sort_values( ignore_index=ignore_index, ) - @_cudf_nvtx_annotate + @_performance_tracking def nlargest(self, n=5, keep="first"): """Returns a new Series of the *n* largest element. @@ -2175,7 +2175,7 @@ def nlargest(self, n=5, keep="first"): """ return self._n_largest_or_smallest(True, n, [self.name], keep) - @_cudf_nvtx_annotate + @_performance_tracking def nsmallest(self, n=5, keep="first"): """ Returns a new Series of the *n* smallest element. 
@@ -2251,7 +2251,7 @@ def nsmallest(self, n=5, keep="first"): """ return self._n_largest_or_smallest(False, n, [self.name], keep) - @_cudf_nvtx_annotate + @_performance_tracking def argsort( self, axis=0, @@ -2274,7 +2274,7 @@ def argsort( obj.name = self.name return obj - @_cudf_nvtx_annotate + @_performance_tracking def replace(self, to_replace=None, value=no_default, *args, **kwargs): if is_dict_like(to_replace) and value not in {None, no_default}: raise ValueError( @@ -2284,7 +2284,7 @@ def replace(self, to_replace=None, value=no_default, *args, **kwargs): return super().replace(to_replace, value, *args, **kwargs) - @_cudf_nvtx_annotate + @_performance_tracking def update(self, other): """ Modify Series in place using values from passed Series. @@ -2390,7 +2390,7 @@ def update(self, other): self.mask(mask, other, inplace=True) # UDF related - @_cudf_nvtx_annotate + @_performance_tracking def apply(self, func, convert_dtype=True, args=(), **kwargs): """ Apply a scalar function to the values of a Series. @@ -2535,7 +2535,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs): # # Stats # - @_cudf_nvtx_annotate + @_performance_tracking def count(self): """ Return number of non-NA/null observations in the Series @@ -2559,7 +2559,7 @@ def count(self): """ return self.valid_count - @_cudf_nvtx_annotate + @_performance_tracking def mode(self, dropna=True): """ Return the mode(s) of the dataset. @@ -2630,7 +2630,7 @@ def mode(self, dropna=True): {self.name: val_counts.index.sort_values()}, name=self.name ) - @_cudf_nvtx_annotate + @_performance_tracking def round(self, decimals=0, how="half_even"): if not is_integer(decimals): raise ValueError( @@ -2639,7 +2639,7 @@ def round(self, decimals=0, how="half_even"): decimals = int(decimals) return super().round(decimals, how) - @_cudf_nvtx_annotate + @_performance_tracking def cov(self, other, min_periods=None): """ Compute covariance with Series, excluding missing values. @@ -2690,7 +2690,7 @@ def cov(self, other, min_periods=None): f"{other.dtype}" ) - @_cudf_nvtx_annotate + @_performance_tracking def transpose(self): """Return the transpose, which is by definition self.""" @@ -2698,7 +2698,7 @@ def transpose(self): T = property(transpose, doc=transpose.__doc__) - @_cudf_nvtx_annotate + @_performance_tracking def duplicated(self, keep="first"): """ Indicate duplicate Series values. @@ -2778,7 +2778,7 @@ def duplicated(self, keep="first"): """ return super().duplicated(keep=keep) - @_cudf_nvtx_annotate + @_performance_tracking def corr(self, other, method="pearson", min_periods=None): """Calculates the sample correlation between two Series, excluding missing values. @@ -2830,7 +2830,7 @@ def corr(self, other, method="pearson", min_periods=None): f"cannot perform corr with types {self.dtype}, {other.dtype}" ) - @_cudf_nvtx_annotate + @_performance_tracking def autocorr(self, lag=1): """Compute the lag-N autocorrelation. This method computes the Pearson correlation between the Series and its shifted self. @@ -2856,7 +2856,7 @@ def autocorr(self, lag=1): """ return self.corr(self.shift(lag)) - @_cudf_nvtx_annotate + @_performance_tracking def isin(self, values): """Check whether values are contained in Series. @@ -2926,7 +2926,7 @@ def isin(self, values): {self.name: self._column.isin(values)}, index=self.index ) - @_cudf_nvtx_annotate + @_performance_tracking def unique(self): """ Returns unique values of this Series. 
@@ -2961,7 +2961,7 @@ def unique(self): return res.values return Series(res, name=self.name) - @_cudf_nvtx_annotate + @_performance_tracking def value_counts( self, normalize=False, @@ -3116,7 +3116,7 @@ def value_counts( res.name = result_name return res - @_cudf_nvtx_annotate + @_performance_tracking def quantile( self, q=0.5, interpolation="linear", exact=True, quant_index=True ): @@ -3195,7 +3195,7 @@ def quantile( ) @docutils.doc_describe() - @_cudf_nvtx_annotate + @_performance_tracking def describe( self, percentiles=None, @@ -3240,7 +3240,7 @@ def describe( name=self.name, ) - @_cudf_nvtx_annotate + @_performance_tracking def digitize(self, bins, right=False): """Return the indices of the bins to which each value belongs. @@ -3276,7 +3276,7 @@ def digitize(self, bins, right=False): cudf.core.column.numerical.digitize(self._column, bins, right) ) - @_cudf_nvtx_annotate + @_performance_tracking def diff(self, periods=1): """First discrete difference of element. @@ -3347,7 +3347,7 @@ def diff(self, periods=1): return self - self.shift(periods=periods) - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( groupby_doc_template.format( ret=textwrap.dedent( @@ -3385,7 +3385,7 @@ def groupby( dropna, ) - @_cudf_nvtx_annotate + @_performance_tracking def rename(self, index=None, copy=True): """ Alter Series name @@ -3431,7 +3431,7 @@ def rename(self, index=None, copy=True): out_data = self._data.copy(deep=copy) return Series._from_data(out_data, self.index, name=index) - @_cudf_nvtx_annotate + @_performance_tracking def add_prefix(self, prefix): return Series._from_data( # TODO: Change to deep=False when copy-on-write is default @@ -3439,7 +3439,7 @@ def add_prefix(self, prefix): index=prefix + self.index.astype(str), ) - @_cudf_nvtx_annotate + @_performance_tracking def add_suffix(self, suffix): return Series._from_data( # TODO: Change to deep=False when copy-on-write is default @@ -3447,7 +3447,7 @@ def add_suffix(self, suffix): index=self.index.astype(str) + suffix, ) - @_cudf_nvtx_annotate + @_performance_tracking def keys(self): """ Return alias for index. @@ -3491,7 +3491,7 @@ def keys(self): """ return self.index - @_cudf_nvtx_annotate + @_performance_tracking def explode(self, ignore_index=False): """ Transform each element of a list-like to a row, replicating index @@ -3528,7 +3528,7 @@ def explode(self, ignore_index=False): """ return super()._explode(self.name, ignore_index) - @_cudf_nvtx_annotate + @_performance_tracking def pct_change( self, periods=1, fill_method=no_default, limit=no_default, freq=None ): @@ -3602,7 +3602,7 @@ def pct_change( change = diff / data.shift(periods=periods, freq=freq) return change - @_cudf_nvtx_annotate + @_performance_tracking def where(self, cond, other=None, inplace=False): result_col = super().where(cond, other, inplace) return self._mimic_inplace( @@ -3736,7 +3736,7 @@ class DatetimeProperties(BaseDatelikeProperties): """ @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def year(self) -> Series: """ The year of the datetime. @@ -3761,7 +3761,7 @@ def year(self) -> Series: return self._get_dt_field("year") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def month(self) -> Series: """ The month as January=1, December=12. @@ -3786,7 +3786,7 @@ def month(self) -> Series: return self._get_dt_field("month") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def day(self) -> Series: """ The day of the datetime. 
@@ -3811,7 +3811,7 @@ def day(self) -> Series: return self._get_dt_field("day") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def hour(self) -> Series: """ The hours of the datetime. @@ -3836,7 +3836,7 @@ def hour(self) -> Series: return self._get_dt_field("hour") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def minute(self) -> Series: """ The minutes of the datetime. @@ -3861,7 +3861,7 @@ def minute(self) -> Series: return self._get_dt_field("minute") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def second(self) -> Series: """ The seconds of the datetime. @@ -3886,7 +3886,7 @@ def second(self) -> Series: return self._get_dt_field("second") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def microsecond(self) -> Series: """ The microseconds of the datetime. @@ -3918,7 +3918,7 @@ def microsecond(self) -> Series: return self._return_result_like_self(micro + extra) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def nanosecond(self) -> Series: """ The nanoseconds of the datetime. @@ -3943,7 +3943,7 @@ def nanosecond(self) -> Series: return self._get_dt_field("nanosecond") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def weekday(self) -> Series: """ The day of the week with Monday=0, Sunday=6. @@ -3980,7 +3980,7 @@ def weekday(self) -> Series: return self._get_dt_field("weekday") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def dayofweek(self) -> Series: """ The day of the week with Monday=0, Sunday=6. @@ -4017,7 +4017,7 @@ def dayofweek(self) -> Series: return self._get_dt_field("weekday") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def dayofyear(self) -> Series: """ The day of the year, from 1-365 in non-leap years and @@ -4055,7 +4055,7 @@ def dayofyear(self) -> Series: return self._get_dt_field("day_of_year") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def day_of_year(self) -> Series: """ The day of the year, from 1-365 in non-leap years and @@ -4093,7 +4093,7 @@ def day_of_year(self) -> Series: return self._get_dt_field("day_of_year") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_leap_year(self) -> Series: """ Boolean indicator if the date belongs to a leap year. @@ -4148,7 +4148,7 @@ def is_leap_year(self) -> Series: return self._return_result_like_self(res) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def quarter(self) -> Series: """ Integer indicator for which quarter of the year the date belongs in. @@ -4177,7 +4177,7 @@ def quarter(self) -> Series: ) return self._return_result_like_self(res) - @_cudf_nvtx_annotate + @_performance_tracking def day_name(self, locale: str | None = None) -> Series: """ Return the day names. Currently supports English locale only. @@ -4213,7 +4213,7 @@ def day_name(self, locale: str | None = None) -> Series: self.series._column.get_day_names(locale) ) - @_cudf_nvtx_annotate + @_performance_tracking def month_name(self, locale: str | None = None) -> Series: """ Return the month names. Currently supports English locale only. 
@@ -4243,7 +4243,7 @@ def month_name(self, locale: str | None = None) -> Series: self.series._column.get_month_names(locale) ) - @_cudf_nvtx_annotate + @_performance_tracking def isocalendar(self) -> cudf.DataFrame: """ Returns a DataFrame with the year, week, and day @@ -4291,7 +4291,7 @@ def isocalendar(self) -> cudf.DataFrame: ) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_month_start(self) -> Series: """ Booleans indicating if dates are the first day of the month. @@ -4299,7 +4299,7 @@ def is_month_start(self) -> Series: return (self.day == 1).fillna(False) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def days_in_month(self) -> Series: """ Get the total number of days in the month that the date falls on. @@ -4348,7 +4348,7 @@ def days_in_month(self) -> Series: ) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_month_end(self) -> Series: """ Boolean indicator if the date is the last day of the month. @@ -4391,7 +4391,7 @@ def is_month_end(self) -> Series: return (self.day == last_day.dt.day).fillna(False) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_quarter_start(self) -> Series: """ Boolean indicator if the date is the first day of a quarter. @@ -4436,7 +4436,7 @@ def is_quarter_start(self) -> Series: return self._return_result_like_self(result) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_quarter_end(self) -> Series: """ Boolean indicator if the date is the last day of a quarter. @@ -4483,7 +4483,7 @@ def is_quarter_end(self) -> Series: return self._return_result_like_self(result) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_year_start(self) -> Series: """ Boolean indicator if the date is the first day of the year. @@ -4514,7 +4514,7 @@ def is_year_start(self) -> Series: return self._return_result_like_self(outcol.fillna(False)) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_year_end(self) -> Series: """ Boolean indicator if the date is the last day of the year. @@ -4547,13 +4547,13 @@ def is_year_end(self) -> Series: result = cudf._lib.copying.copy_if_else(leap, non_leap, leap_dates) return self._return_result_like_self(result.fillna(False)) - @_cudf_nvtx_annotate + @_performance_tracking def _get_dt_field(self, field: str) -> Series: return self._return_result_like_self( self.series._column.get_dt_field(field) ) - @_cudf_nvtx_annotate + @_performance_tracking def ceil(self, freq: str) -> Series: """ Perform ceil operation on the data to the specified freq. @@ -4586,7 +4586,7 @@ def ceil(self, freq: str) -> Series: """ return self._return_result_like_self(self.series._column.ceil(freq)) - @_cudf_nvtx_annotate + @_performance_tracking def floor(self, freq: str) -> Series: """ Perform floor operation on the data to the specified freq. @@ -4619,7 +4619,7 @@ def floor(self, freq: str) -> Series: """ return self._return_result_like_self(self.series._column.floor(freq)) - @_cudf_nvtx_annotate + @_performance_tracking def round(self, freq: str) -> Series: """ Perform round operation on the data to the specified freq. @@ -4655,7 +4655,7 @@ def round(self, freq: str) -> Series: """ return self._return_result_like_self(self.series._column.round(freq)) - @_cudf_nvtx_annotate + @_performance_tracking def strftime(self, date_format: str, *args, **kwargs) -> Series: """ Convert to Series using specified ``date_format``. 
@@ -4832,7 +4832,7 @@ class TimedeltaProperties(BaseDatelikeProperties): """ @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def days(self) -> Series: """ Number of days. @@ -4864,7 +4864,7 @@ def days(self) -> Series: return self._get_td_field("days") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def seconds(self) -> Series: """ Number of seconds (>= 0 and less than 1 day). @@ -4903,7 +4903,7 @@ def seconds(self) -> Series: return self._get_td_field("seconds") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def microseconds(self) -> Series: """ Number of microseconds (>= 0 and less than 1 second). @@ -4935,7 +4935,7 @@ def microseconds(self) -> Series: return self._get_td_field("microseconds") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def nanoseconds(self) -> Series: """ Return the number of nanoseconds (n), where 0 <= n < 1 microsecond. @@ -4967,7 +4967,7 @@ def nanoseconds(self) -> Series: return self._get_td_field("nanoseconds") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def components(self) -> cudf.DataFrame: """ Return a Dataframe of the components of the Timedeltas. @@ -4999,14 +4999,14 @@ def components(self) -> cudf.DataFrame: ca, index=self.series.index ) - @_cudf_nvtx_annotate + @_performance_tracking def _get_td_field(self, field: str) -> Series: return self._return_result_like_self( getattr(self.series._column, field) ) -@_cudf_nvtx_annotate +@_performance_tracking def _align_indices(series_list, how="outer", allow_non_unique=False): """ Internal util to align the indices of a list of Series objects @@ -5069,7 +5069,7 @@ def _align_indices(series_list, how="outer", allow_non_unique=False): @acquire_spill_lock() -@_cudf_nvtx_annotate +@_performance_tracking def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): r"""Returns a boolean array where two arrays are equal within a tolerance. 
diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 23a2c828a04..f9555aee6a2 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -18,7 +18,7 @@ ) from cudf.core.column import ColumnBase, as_column from cudf.core.frame import Frame -from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import NotIterable if TYPE_CHECKING: @@ -41,7 +41,7 @@ class SingleColumnFrame(Frame, NotIterable): "index": 0, } - @_cudf_nvtx_annotate + @_performance_tracking def _reduce( self, op, @@ -62,7 +62,7 @@ def _reduce( except AttributeError: raise TypeError(f"cannot perform {op} with type {self.dtype}") - @_cudf_nvtx_annotate + @_performance_tracking def _scan(self, op, axis=None, *args, **kwargs): if axis not in (None, 0): raise NotImplementedError("axis parameter is not implemented yet") @@ -70,24 +70,24 @@ def _scan(self, op, axis=None, *args, **kwargs): return super()._scan(op, axis=axis, *args, **kwargs) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def name(self): """Get the name of this object.""" return next(iter(self._column_names)) @name.setter # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def name(self, value): self._data[value] = self._data.pop(self.name) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def ndim(self) -> int: # noqa: D401 """Number of dimensions of the underlying data, by definition 1.""" return 1 @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def shape(self) -> tuple[int]: """Get a tuple representing the dimensionality of the Index.""" return (len(self),) @@ -99,27 +99,27 @@ def __bool__(self): ) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def _num_columns(self) -> int: return 1 @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def _column(self) -> ColumnBase: return next(iter(self._columns)) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def values(self) -> cupy.ndarray: # noqa: D102 return self._column.values @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def values_host(self) -> numpy.ndarray: # noqa: D102 return self._column.values_host @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def from_arrow(cls, array) -> Self: """Create from PyArrow Array/ChunkedArray. @@ -150,7 +150,7 @@ def from_arrow(cls, array) -> Self: """ return cls(ColumnBase.from_arrow(array)) - @_cudf_nvtx_annotate + @_performance_tracking def to_arrow(self) -> pa.Array: """ Convert to a PyArrow Array. @@ -182,7 +182,7 @@ def to_arrow(self) -> pa.Array: return self._column.to_arrow() @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_unique(self) -> bool: """Return boolean if values in the object are unique. @@ -193,7 +193,7 @@ def is_unique(self) -> bool: return self._column.is_unique @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_monotonic_increasing(self) -> bool: """Return boolean if values in the object are monotonically increasing. @@ -204,7 +204,7 @@ def is_monotonic_increasing(self) -> bool: return self._column.is_monotonic_increasing @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_monotonic_decreasing(self) -> bool: """Return boolean if values in the object are monotonically decreasing. 
@@ -215,7 +215,7 @@ def is_monotonic_decreasing(self) -> bool: return self._column.is_monotonic_decreasing @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def __cuda_array_interface__(self): # While the parent column class has a `__cuda_array_interface__` method # defined, it is not implemented for all column types. When it is not @@ -229,7 +229,7 @@ def __cuda_array_interface__(self): "'__cuda_array_interface__'" ) - @_cudf_nvtx_annotate + @_performance_tracking def factorize( self, sort: bool = False, use_na_sentinel: bool = True ) -> tuple[cupy.ndarray, cudf.Index]: @@ -268,7 +268,7 @@ def factorize( use_na_sentinel=use_na_sentinel, ) - @_cudf_nvtx_annotate + @_performance_tracking def _make_operands_for_binop( self, other: Any, @@ -323,7 +323,7 @@ def _make_operands_for_binop( return {result_name: (self._column, other, reflect, fill_value)} - @_cudf_nvtx_annotate + @_performance_tracking def nunique(self, dropna: bool = True) -> int: """ Return count of unique values for the column. @@ -369,7 +369,7 @@ def _get_elements_from_column(self, arg) -> ScalarLike | ColumnBase: return self._column.apply_boolean_mask(arg) raise NotImplementedError(f"Unknown indexer {type(arg)}") - @_cudf_nvtx_annotate + @_performance_tracking def where(self, cond, other=None, inplace=False): from cudf.core._internals.where import ( _check_and_cast_columns_with_other, diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index 06d9296ca0f..265b87350ae 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -30,7 +30,7 @@ _supported_dtypes_from_frame, ) from cudf.utils._numba import _CUDFNumbaConfig -from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.performance_tracking import _performance_tracking def _get_frame_groupby_type(dtype, index_dtype): @@ -126,7 +126,7 @@ def _get_groupby_apply_kernel(frame, func, args): return kernel, return_type -@_cudf_nvtx_annotate +@_performance_tracking def jit_groupby_apply(offsets, grouped_values, function, *args): """ Main entrypoint for JIT Groupby.apply via Numba. diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index f1704e4ea78..d616761cb3b 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -38,7 +38,7 @@ STRING_TYPES, TIMEDELTA_TYPES, ) -from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import initfunc # Maximum size of a string column is 2 GiB @@ -71,7 +71,7 @@ def _ptx_file(): ) -@_cudf_nvtx_annotate +@_performance_tracking def _get_udf_return_type(argty, func: Callable, args=()): """ Get the return type of a masked UDF for a given set of argument dtypes. 
It @@ -236,7 +236,7 @@ def _generate_cache_key(frame, func: Callable, args, suffix="__APPLY_UDF"): ) -@_cudf_nvtx_annotate +@_performance_tracking def _compile_or_get( frame, func, args, kernel_getter=None, suffix="__APPLY_UDF" ): diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index f07764e2ce4..e909d96309e 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -12,10 +12,10 @@ from cudf.api.types import is_scalar from cudf.utils import ioutils from cudf.utils.dtypes import _maybe_convert_to_default_type -from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.performance_tracking import _performance_tracking -@_cudf_nvtx_annotate +@_performance_tracking @ioutils.doc_read_csv() def read_csv( filepath_or_buffer, @@ -151,7 +151,7 @@ def read_csv( return df -@_cudf_nvtx_annotate +@_performance_tracking @ioutils.doc_to_csv() def to_csv( df, diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 2a838ca7417..7733e770d99 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -22,7 +22,7 @@ from cudf.api.types import is_list_like from cudf.core.column import as_column, build_categorical_column, column_empty from cudf.utils import ioutils -from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.performance_tracking import _performance_tracking BYTE_SIZES = { "kb": 1000, @@ -50,7 +50,7 @@ } -@_cudf_nvtx_annotate +@_performance_tracking def _write_parquet( df, paths, @@ -130,7 +130,7 @@ def _write_parquet( # Logic chosen to match: https://arrow.apache.org/ # docs/_modules/pyarrow/parquet.html#write_to_dataset -@_cudf_nvtx_annotate +@_performance_tracking def write_to_dataset( df, root_path, @@ -318,7 +318,7 @@ def write_to_dataset( @ioutils.doc_read_parquet_metadata() -@_cudf_nvtx_annotate +@_performance_tracking def read_parquet_metadata(filepath_or_buffer): """{docstring}""" # Multiple sources are passed as a list. 
If a single source is passed, @@ -360,7 +360,7 @@ def read_parquet_metadata(filepath_or_buffer): return libparquet.read_parquet_metadata(filepaths_or_buffers) -@_cudf_nvtx_annotate +@_performance_tracking def _process_dataset( paths, fs, @@ -515,7 +515,7 @@ def _process_dataset( @ioutils.doc_read_parquet() -@_cudf_nvtx_annotate +@_performance_tracking def read_parquet( filepath_or_buffer, engine="cudf", @@ -785,7 +785,7 @@ def _handle_is(column: cudf.Series, value, *, negate) -> cudf.Series: return df -@_cudf_nvtx_annotate +@_performance_tracking def _parquet_to_frame( paths_or_buffers, *args, @@ -885,7 +885,7 @@ def _parquet_to_frame( return dfs[0] -@_cudf_nvtx_annotate +@_performance_tracking def _read_parquet( filepaths_or_buffers, engine, @@ -941,7 +941,7 @@ def _read_parquet( @ioutils.doc_to_parquet() -@_cudf_nvtx_annotate +@_performance_tracking def to_parquet( df, path, @@ -1107,7 +1107,7 @@ def _get_estimated_file_size(df): return file_size -@_cudf_nvtx_annotate +@_performance_tracking def _get_partitioned( df, root_path, @@ -1145,7 +1145,7 @@ def _get_partitioned( return full_paths, metadata_file_paths, grouped_df, part_offsets, filename -@_cudf_nvtx_annotate +@_performance_tracking def _get_groups_and_offsets( df, partition_cols, @@ -1305,7 +1305,7 @@ class ParquetDatasetWriter: """ - @_cudf_nvtx_annotate + @_performance_tracking def __init__( self, path, @@ -1355,7 +1355,7 @@ def __init__( self._file_sizes: dict[str, int] = {} - @_cudf_nvtx_annotate + @_performance_tracking def write_table(self, df): """ Write a dataframe to the file/dataset @@ -1486,7 +1486,7 @@ def write_table(self, df): self.path_cw_map.update({k: new_cw_idx for k in new_paths}) self._chunked_writers[-1][0].write_table(grouped_df, part_info) - @_cudf_nvtx_annotate + @_performance_tracking def close(self, return_metadata=False): """ Close all open files and optionally return footer metadata as a binary diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py index 0e19972f6e0..4329480bb2c 100644 --- a/python/cudf/cudf/io/text.py +++ b/python/cudf/cudf/io/text.py @@ -1,14 +1,14 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. from io import BytesIO, StringIO import cudf from cudf._lib import text as libtext from cudf.utils import ioutils -from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.performance_tracking import _performance_tracking -@_cudf_nvtx_annotate +@_performance_tracking @ioutils.doc_read_text() def read_text( filepath_or_buffer, diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index fb5a963f008..1f539e7f266 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -311,6 +311,20 @@ def _integer_and_none_validator(val): _make_contains_validator([False, True]), ) +_register_option( + "memory_profiling", + _env_get_bool("CUDF_MEMORY_PROFILING", False), + textwrap.dedent( + """ + If set to `False`, disables memory profiling. + If set to `True`, enables memory profiling. + Read more at: :ref:`memory-profiling-user-doc` + \tValid values are True or False. Default is False. + """ + ), + _make_contains_validator([False, True]), +) + class option_context(ContextDecorator): """ diff --git a/python/cudf/cudf/tests/test_performance_tracking.py b/python/cudf/cudf/tests/test_performance_tracking.py new file mode 100644 index 00000000000..e886b77af3f --- /dev/null +++ b/python/cudf/cudf/tests/test_performance_tracking.py @@ -0,0 +1,41 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from io import StringIO + +import pytest + +import rmm.mr +import rmm.statistics + +import cudf +from cudf.utils.performance_tracking import ( + get_memory_records, + print_memory_report, +) + + +@pytest.fixture +def rmm_reset(): + """Fixture to reset the RMM resource before and after the test""" + mr = rmm.mr.get_current_device_resource() + try: + rmm.mr.set_current_device_resource(rmm.mr.CudaMemoryResource()) + yield + finally: + rmm.mr.set_current_device_resource(mr) + + +def test_memory_profiling(rmm_reset): + df1 = cudf.DataFrame({"a": [1, 2, 3]}) + assert len(get_memory_records()) == 0 + + rmm.statistics.enable_statistics() + cudf.set_option("memory_profiling", True) + + df1.merge(df1) + + assert len(get_memory_records()) > 0 + + out = StringIO() + print_memory_report(file=out) + assert "DataFrame.merge" in out.getvalue() diff --git a/python/cudf/cudf/utils/nvtx_annotation.py b/python/cudf/cudf/utils/nvtx_annotation.py deleted file mode 100644 index a4404e51232..00000000000 --- a/python/cudf/cudf/utils/nvtx_annotation.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. - -import hashlib -from functools import partial - -from nvtx import annotate - -_NVTX_COLORS = ["green", "blue", "purple", "rapids"] - - -def _get_color_for_nvtx(name): - m = hashlib.sha256() - m.update(name.encode()) - hash_value = int(m.hexdigest(), 16) - idx = hash_value % len(_NVTX_COLORS) - return _NVTX_COLORS[idx] - - -def _cudf_nvtx_annotate(func, domain="cudf_python"): - """Decorator for applying nvtx annotations to methods in cudf.""" - return annotate( - message=func.__qualname__, - color=_get_color_for_nvtx(func.__qualname__), - domain=domain, - )(func) - - -_dask_cudf_nvtx_annotate = partial( - _cudf_nvtx_annotate, domain="dask_cudf_python" -) diff --git a/python/cudf/cudf/utils/performance_tracking.py b/python/cudf/cudf/utils/performance_tracking.py new file mode 100644 index 00000000000..30c891d0d5a --- /dev/null +++ b/python/cudf/cudf/utils/performance_tracking.py @@ -0,0 +1,82 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from __future__ import annotations + +import contextlib +import functools +import hashlib +import sys + +import nvtx + +import rmm.statistics + +from cudf.options import get_option + +_NVTX_COLORS = ["green", "blue", "purple", "rapids"] + + +def _get_color_for_nvtx(name): + m = hashlib.sha256() + m.update(name.encode()) + hash_value = int(m.hexdigest(), 16) + idx = hash_value % len(_NVTX_COLORS) + return _NVTX_COLORS[idx] + + +def _performance_tracking(func, domain="cudf_python"): + """Decorator for applying performance tracking (if enabled).""" + + @functools.wraps(func) + def wrapper(*args, **kwargs): + with contextlib.ExitStack() as stack: + if get_option("memory_profiling"): + # NB: the user still needs to call `rmm.statistics.enable_statistics()` + # to enable memory profiling. + stack.enter_context( + rmm.statistics.profiler( + name=rmm.statistics._get_descriptive_name_of_object( + func + ) + ) + ) + if nvtx.enabled(): + stack.enter_context( + nvtx.annotate( + message=func.__qualname__, + color=_get_color_for_nvtx(func.__qualname__), + domain=domain, + ) + ) + return func(*args, **kwargs) + + return wrapper + + +_dask_cudf_performance_tracking = functools.partial( + _performance_tracking, domain="dask_cudf_python" +) + + +def get_memory_records() -> ( + dict[str, rmm.statistics.ProfilerRecords.MemoryRecord] +): + """Get the memory records from the memory profiling + + Returns + ------- + Dict that maps function names to memory records. 
Empty if + memory profiling is disabled + """ + return rmm.statistics.default_profiler_records.records + + +def print_memory_report(file=sys.stdout) -> None: + """Pretty print the result of the memory profiling + + Parameters + ---------- + file + The output stream + """ + print(rmm.statistics.default_profiler_records.report(), file=file) diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 2e4dfc4bb14..7347ec7866a 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -159,8 +159,9 @@ def _external_only_api(func, alternative=""): @functools.wraps(func) def wrapper(*args, **kwargs): # Check the immediately preceding frame to see if it's in cudf. - frame, lineno = next(traceback.walk_stack(None)) - fn = frame.f_code.co_filename + pre_frame = traceback.extract_stack(limit=2)[0] + fn = pre_frame.filename + lineno = pre_frame.lineno if _cudf_root in fn and _tests_root not in fn: raise RuntimeError( f"External-only API called in {fn} at line {lineno}. " diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index d250589e389..1f55a59ea55 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -43,7 +43,7 @@ import cudf from cudf.api.types import is_string_dtype -from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate +from cudf.utils.performance_tracking import _dask_cudf_performance_tracking from .core import DataFrame, Index, Series @@ -53,7 +53,7 @@ @meta_nonempty.register(cudf.BaseIndex) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def _nonempty_index(idx): if isinstance(idx, cudf.core.index.RangeIndex): return cudf.core.index.RangeIndex(2, name=idx.name) @@ -100,7 +100,7 @@ def _nest_list_data(data, leaf_type): return data -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def _get_non_empty_data(s): if isinstance(s, cudf.core.column.CategoricalColumn): categories = ( @@ -147,7 +147,7 @@ def _get_non_empty_data(s): @meta_nonempty.register(cudf.Series) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def _nonempty_series(s, idx=None): if idx is None: idx = _nonempty_index(s.index) @@ -157,7 +157,7 @@ def _nonempty_series(s, idx=None): @meta_nonempty.register(cudf.DataFrame) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def meta_nonempty_cudf(x): idx = meta_nonempty(x.index) columns_with_dtype = dict() @@ -182,18 +182,18 @@ def meta_nonempty_cudf(x): @make_meta_dispatch.register((cudf.Series, cudf.DataFrame)) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def make_meta_cudf(x, index=None): return x.head(0) @make_meta_dispatch.register(cudf.BaseIndex) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def make_meta_cudf_index(x, index=None): return x[:0] -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def _empty_series(name, dtype, index=None): if isinstance(dtype, str) and dtype == "category": return cudf.Series( @@ -203,7 +203,7 @@ def _empty_series(name, dtype, index=None): @make_meta_obj.register(object) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def make_meta_object_cudf(x, index=None): """Create an empty cudf object containing the desired metadata. 
@@ -274,7 +274,7 @@ def make_meta_object_cudf(x, index=None): @concat_dispatch.register((cudf.DataFrame, cudf.Series, cudf.BaseIndex)) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def concat_cudf( dfs, axis=0, @@ -299,13 +299,13 @@ def concat_cudf( @categorical_dtype_dispatch.register( (cudf.DataFrame, cudf.Series, cudf.BaseIndex) ) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def categorical_dtype_cudf(categories=None, ordered=False): return cudf.CategoricalDtype(categories=categories, ordered=ordered) @tolist_dispatch.register((cudf.Series, cudf.BaseIndex)) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def tolist_cudf(obj): return obj.to_pandas().tolist() @@ -313,7 +313,7 @@ def tolist_cudf(obj): @is_categorical_dtype_dispatch.register( (cudf.Series, cudf.BaseIndex, cudf.CategoricalDtype, Series) ) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def is_categorical_dtype_cudf(obj): return cudf.api.types._is_categorical_dtype(obj) @@ -324,7 +324,7 @@ def get_grouper_cudf(obj): @percentile_lookup.register((cudf.Series, cp.ndarray, cudf.BaseIndex)) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def percentile_cudf(a, q, interpolation="linear"): # Cudf dispatch to the equivalent of `np.percentile`: # https://numpy.org/doc/stable/reference/generated/numpy.percentile.html @@ -400,7 +400,7 @@ def _table_to_cudf(obj, table, self_destruct=None, **kwargs): @union_categoricals_dispatch.register((cudf.Series, cudf.BaseIndex)) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def union_categoricals_cudf( to_union, sort_categories=False, ignore_order=False ): @@ -410,7 +410,7 @@ def union_categoricals_cudf( @hash_object_dispatch.register((cudf.DataFrame, cudf.Series)) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def hash_object_cudf(frame, index=True): if index: frame = frame.reset_index() @@ -418,7 +418,7 @@ def hash_object_cudf(frame, index=True): @hash_object_dispatch.register(cudf.BaseIndex) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def hash_object_cudf_index(ind, index=None): if isinstance(ind, cudf.MultiIndex): return ind.to_frame(index=False).hash_values() @@ -428,7 +428,7 @@ def hash_object_cudf_index(ind, index=None): @group_split_dispatch.register((cudf.Series, cudf.DataFrame)) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def group_split_cudf(df, c, k, ignore_index=False): return dict( zip( @@ -443,7 +443,7 @@ def group_split_cudf(df, c, k, ignore_index=False): @sizeof_dispatch.register(cudf.DataFrame) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def sizeof_cudf_dataframe(df): return int( sum(col.memory_usage for col in df._data.columns) @@ -452,7 +452,7 @@ def sizeof_cudf_dataframe(df): @sizeof_dispatch.register((cudf.Series, cudf.BaseIndex)) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def sizeof_cudf_series_index(obj): return obj.memory_usage() diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 3bd455a3a57..aab56e3a1b0 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -22,7 +22,7 @@ import cudf from cudf import _lib as libcudf -from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate +from cudf.utils.performance_tracking import _dask_cudf_performance_tracking from dask_cudf import sorting from dask_cudf.accessors import ListMethods, StructMethods @@ -53,7 +53,7 @@ def __repr__(self): s = "" return s % (type(self).__name__, 
len(self.dask), self.npartitions) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def to_dask_dataframe(self, **kwargs): """Create a dask.dataframe object from a dask_cudf object @@ -92,7 +92,7 @@ class DataFrame(_Frame, dd.core.DataFrame): _partition_type = cudf.DataFrame - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def _assign_column(self, k, v): def assigner(df, k, v): out = df.copy() @@ -102,7 +102,7 @@ def assigner(df, k, v): meta = assigner(self._meta, k, dask_make_meta(v)) return self.map_partitions(assigner, k, v, meta=meta) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def apply_rows(self, func, incols, outcols, kwargs=None, cache_key=None): import uuid @@ -123,7 +123,7 @@ def do_apply_rows(df, func, incols, outcols, kwargs): ) @_deprecate_shuffle_kwarg - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def merge(self, other, shuffle_method=None, **kwargs): on = kwargs.pop("on", None) if isinstance(on, tuple): @@ -136,7 +136,7 @@ def merge(self, other, shuffle_method=None, **kwargs): ) @_deprecate_shuffle_kwarg - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def join(self, other, shuffle_method=None, **kwargs): # CuDF doesn't support "right" join yet how = kwargs.pop("how", "left") @@ -155,7 +155,7 @@ def join(self, other, shuffle_method=None, **kwargs): ) @_deprecate_shuffle_kwarg - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def set_index( self, other, @@ -237,7 +237,7 @@ def set_index( ) @_deprecate_shuffle_kwarg - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def sort_values( self, by, @@ -275,14 +275,14 @@ def sort_values( return df.reset_index(drop=True) return df - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def to_parquet(self, path, *args, **kwargs): """Calls dask.dataframe.io.to_parquet with CudfEngine backend""" from dask_cudf.io import to_parquet return to_parquet(self, path, *args, **kwargs) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def to_orc(self, path, **kwargs): """Calls dask_cudf.io.to_orc""" from dask_cudf.io import to_orc @@ -290,7 +290,7 @@ def to_orc(self, path, **kwargs): return to_orc(self, path, **kwargs) @derived_from(pd.DataFrame) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def var( self, axis=None, @@ -324,28 +324,28 @@ def var( return _parallel_var(self, meta, skipna, split_every, out) @_deprecate_shuffle_kwarg - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def shuffle(self, *args, shuffle_method=None, **kwargs): """Wraps dask.dataframe DataFrame.shuffle method""" return super().shuffle( *args, shuffle_method=_get_shuffle_method(shuffle_method), **kwargs ) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def groupby(self, by=None, **kwargs): from .groupby import CudfDataFrameGroupBy return CudfDataFrameGroupBy(self, by=by, **kwargs) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def sum_of_squares(x): x = x.astype("f8")._column outcol = libcudf.reduce.reduce("sum_of_squares", x) return cudf.Series(outcol) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def var_aggregate(x2, x, n, ddof): try: with warnings.catch_warnings(record=True): @@ -358,12 +358,12 @@ def var_aggregate(x2, x, n, ddof): return np.float64(np.nan) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def nlargest_agg(x, **kwargs): return cudf.concat(x).nlargest(**kwargs) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def nsmallest_agg(x, 
**kwargs): return cudf.concat(x).nsmallest(**kwargs) @@ -371,7 +371,7 @@ def nsmallest_agg(x, **kwargs): class Series(_Frame, dd.core.Series): _partition_type = cudf.Series - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def count(self, split_every=False): return reduction( [self], @@ -381,14 +381,14 @@ def count(self, split_every=False): meta="i8", ) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def mean(self, split_every=False): sum = self.sum(split_every=split_every) n = self.count(split_every=split_every) return sum / n @derived_from(pd.DataFrame) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def var( self, axis=None, @@ -417,19 +417,19 @@ def var( else: return _parallel_var(self, meta, skipna, split_every, out) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def groupby(self, *args, **kwargs): from .groupby import CudfSeriesGroupBy return CudfSeriesGroupBy(self, *args, **kwargs) @property # type: ignore - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def list(self): return ListMethods(self) @property # type: ignore - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def struct(self): return StructMethods(self) @@ -438,7 +438,7 @@ class Index(Series, dd.core.Index): _partition_type = cudf.Index # type: ignore -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def _naive_var(ddf, meta, skipna, ddof, split_every, out): num = ddf._get_numeric_data() x = 1.0 * num.sum(skipna=skipna, split_every=split_every) @@ -453,7 +453,7 @@ def _naive_var(ddf, meta, skipna, ddof, split_every, out): return handle_out(out, result) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def _parallel_var(ddf, meta, skipna, split_every, out): def _local_var(x, skipna): if skipna: @@ -520,7 +520,7 @@ def _finalize_var(vals): return handle_out(out, result) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def _extract_meta(x): """ Extract internal cache data (``_meta``) from dask_cudf objects @@ -536,7 +536,7 @@ def _extract_meta(x): return x -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def _emulate(func, *args, **kwargs): """ Apply a function using args / kwargs. If arguments contain dd.DataFrame / @@ -546,7 +546,7 @@ def _emulate(func, *args, **kwargs): return func(*_extract_meta(args), **_extract_meta(kwargs)) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def align_partitions(args): """Align partitions between dask_cudf objects. 
@@ -563,7 +563,7 @@ def align_partitions(args): return args -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def reduction( args, chunk=None, @@ -702,7 +702,7 @@ def reduction( return dd.core.new_dd_object(graph, b, meta, (None, None)) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): from dask_cudf import QUERY_PLANNING_ON @@ -746,7 +746,7 @@ def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): ) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def from_dask_dataframe(df): """ Convert a Dask :class:`dask.dataframe.DataFrame` to a Dask-cuDF diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index 2e72461b43d..bbbcde17b51 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -16,7 +16,7 @@ import cudf from cudf.core.groupby.groupby import _deprecate_collect -from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate +from cudf.utils.performance_tracking import _dask_cudf_performance_tracking from dask_cudf.sorting import _deprecate_shuffle_kwarg @@ -56,13 +56,13 @@ def wrapper(*args, **kwargs): class CudfDataFrameGroupBy(DataFrameGroupBy): - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def __init__(self, *args, sort=None, **kwargs): self.sep = kwargs.pop("sep", "___") self.as_index = kwargs.pop("as_index", True) super().__init__(*args, sort=sort, **kwargs) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def __getitem__(self, key): if isinstance(key, list): g = CudfDataFrameGroupBy( @@ -84,7 +84,7 @@ def __getitem__(self, key): g._meta = g._meta[key] return g - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def _make_groupby_method_aggs(self, agg_name): """Create aggs dictionary for aggregation methods""" @@ -92,7 +92,7 @@ def _make_groupby_method_aggs(self, agg_name): return {c: agg_name for c in self.obj.columns if c not in self.by} return {c: agg_name for c in self.obj.columns if c != self.by} - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def count(self, split_every=None, split_out=1): return _make_groupby_agg_call( @@ -102,7 +102,7 @@ def count(self, split_every=None, split_out=1): split_out, ) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def mean(self, split_every=None, split_out=1): return _make_groupby_agg_call( @@ -112,7 +112,7 @@ def mean(self, split_every=None, split_out=1): split_out, ) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def std(self, split_every=None, split_out=1): return _make_groupby_agg_call( @@ -122,7 +122,7 @@ def std(self, split_every=None, split_out=1): split_out, ) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def var(self, split_every=None, split_out=1): return _make_groupby_agg_call( @@ -132,7 +132,7 @@ def var(self, split_every=None, split_out=1): split_out, ) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def sum(self, split_every=None, split_out=1): return _make_groupby_agg_call( @@ -142,7 +142,7 @@ def sum(self, split_every=None, split_out=1): split_out, ) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def min(self, split_every=None, split_out=1): return _make_groupby_agg_call( @@ -152,7 +152,7 @@ def min(self, split_every=None, 
split_out=1): split_out, ) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def max(self, split_every=None, split_out=1): return _make_groupby_agg_call( @@ -162,7 +162,7 @@ def max(self, split_every=None, split_out=1): split_out, ) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def collect(self, split_every=None, split_out=1): _deprecate_collect() @@ -173,7 +173,7 @@ def collect(self, split_every=None, split_out=1): split_out, ) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def first(self, split_every=None, split_out=1): return _make_groupby_agg_call( @@ -183,7 +183,7 @@ def first(self, split_every=None, split_out=1): split_out, ) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def last(self, split_every=None, split_out=1): return _make_groupby_agg_call( @@ -194,7 +194,7 @@ def last(self, split_every=None, split_out=1): ) @_deprecate_shuffle_kwarg - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def aggregate( self, arg, split_every=None, split_out=1, shuffle_method=None ): @@ -231,13 +231,13 @@ def aggregate( class CudfSeriesGroupBy(SeriesGroupBy): - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def __init__(self, *args, sort=None, **kwargs): self.sep = kwargs.pop("sep", "___") self.as_index = kwargs.pop("as_index", True) super().__init__(*args, sort=sort, **kwargs) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def count(self, split_every=None, split_out=1): return _make_groupby_agg_call( @@ -247,7 +247,7 @@ def count(self, split_every=None, split_out=1): split_out, )[self._slice] - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def mean(self, split_every=None, split_out=1): return _make_groupby_agg_call( @@ -257,7 +257,7 @@ def mean(self, split_every=None, split_out=1): split_out, )[self._slice] - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def std(self, split_every=None, split_out=1): return _make_groupby_agg_call( @@ -267,7 +267,7 @@ def std(self, split_every=None, split_out=1): split_out, )[self._slice] - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def var(self, split_every=None, split_out=1): return _make_groupby_agg_call( @@ -277,7 +277,7 @@ def var(self, split_every=None, split_out=1): split_out, )[self._slice] - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def sum(self, split_every=None, split_out=1): return _make_groupby_agg_call( @@ -287,7 +287,7 @@ def sum(self, split_every=None, split_out=1): split_out, )[self._slice] - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def min(self, split_every=None, split_out=1): return _make_groupby_agg_call( @@ -297,7 +297,7 @@ def min(self, split_every=None, split_out=1): split_out, )[self._slice] - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def max(self, split_every=None, split_out=1): return _make_groupby_agg_call( @@ -307,7 +307,7 @@ def max(self, split_every=None, split_out=1): split_out, )[self._slice] - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def collect(self, split_every=None, split_out=1): _deprecate_collect() @@ -318,7 +318,7 @@ def collect(self, split_every=None, split_out=1): split_out, )[self._slice] - 
@_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def first(self, split_every=None, split_out=1): return _make_groupby_agg_call( @@ -328,7 +328,7 @@ def first(self, split_every=None, split_out=1): split_out, )[self._slice] - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def last(self, split_every=None, split_out=1): return _make_groupby_agg_call( @@ -339,7 +339,7 @@ def last(self, split_every=None, split_out=1): )[self._slice] @_deprecate_shuffle_kwarg - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def aggregate( self, arg, split_every=None, split_out=1, shuffle_method=None ): @@ -429,7 +429,7 @@ def _shuffle_aggregate( return result -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def groupby_agg( ddf, gb_cols, @@ -641,7 +641,7 @@ def groupby_agg( ) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def _make_groupby_agg_call( gb, aggs, split_every, split_out, shuffle_method=None ): @@ -663,7 +663,7 @@ def _make_groupby_agg_call( ) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def _redirect_aggs(arg): """Redirect aggregations to their corresponding name in cuDF""" redirects = { @@ -690,7 +690,7 @@ def _redirect_aggs(arg): return redirects.get(arg, arg) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def _aggs_optimized(arg, supported: set): """Check that aggregations in `arg` are a subset of `supported`""" if isinstance(arg, (list, dict)): @@ -712,7 +712,7 @@ def _aggs_optimized(arg, supported: set): return False -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def _groupby_optimized(gb): """Check that groupby input can use dask-cudf optimized codepath""" return isinstance(gb.obj, DaskDataFrame) and ( @@ -730,7 +730,7 @@ def _make_name(col_name, sep="_"): return sep.join(name for name in col_name if name != "") -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def _groupby_partition_agg(df, gb_cols, aggs, columns, dropna, sort, sep): """Initial partition-level aggregation task. @@ -768,7 +768,7 @@ def _groupby_partition_agg(df, gb_cols, aggs, columns, dropna, sort, sep): return gb[sorted(output_columns)] -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def _tree_node_agg(df, gb_cols, dropna, sort, sep): """Node in groupby-aggregation reduction tree. 
@@ -807,7 +807,7 @@ def _tree_node_agg(df, gb_cols, dropna, sort, sep): return gb[sorted(output_columns)] -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1): """Calculate variance (given count, sum, and sum-squared columns).""" @@ -829,7 +829,7 @@ def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1): return var -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def _finalize_gb_agg( gb_in, gb_cols, diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py index f3774e20d32..a2ba4d1878e 100644 --- a/python/dask_cudf/dask_cudf/sorting.py +++ b/python/dask_cudf/dask_cudf/sorting.py @@ -18,7 +18,7 @@ import cudf from cudf.api.types import _is_categorical_dtype -from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate +from cudf.utils.performance_tracking import _dask_cudf_performance_tracking _SHUFFLE_SUPPORT = ("tasks", "p2p") # "disk" not supported @@ -48,14 +48,14 @@ def wrapper(*args, **kwargs): return wrapper -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def set_index_post(df, index_name, drop, column_dtype): df2 = df.set_index(index_name, drop=drop) df2.columns = df2.columns.astype(column_dtype) return df2 -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def _set_partitions_pre(s, divisions, ascending=True, na_position="last"): if ascending: partitions = divisions.searchsorted(s, side="right") - 1 @@ -72,7 +72,7 @@ def _set_partitions_pre(s, divisions, ascending=True, na_position="last"): return partitions -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def _quantile(a, q): n = len(a) if not len(a): @@ -83,7 +83,7 @@ def _quantile(a, q): ) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def merge_quantiles(finalq, qs, vals): """Combine several quantile calculations of different data. [NOTE: Same logic as dask.array merge_percentiles] @@ -146,7 +146,7 @@ def _append_counts(val, count): return rv.reset_index(drop=True) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def _approximate_quantile(df, q): """Approximate quantiles of DataFrame or Series. [NOTE: Same logic as dask.dataframe Series quantile] @@ -220,7 +220,7 @@ def set_quantile_index(df): return df -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def quantile_divisions(df, by, npartitions): qn = np.linspace(0.0, 1.0, npartitions + 1).tolist() divisions = _approximate_quantile(df[by], qn).compute() @@ -257,7 +257,7 @@ def quantile_divisions(df, by, npartitions): @_deprecate_shuffle_kwarg -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def sort_values( df, by, From 57862a3ab1324bc8dbea4133485bb99044bc2742 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Fri, 28 Jun 2024 08:43:12 -0400 Subject: [PATCH 06/27] stable_distinct public api now has a stream parameter (#16068) As part of https://github.com/rapidsai/cudf/pull/15982 we determined that the cudf `stable_distinct` public API needs to be updated so that a user provided stream can be provided. 
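As a usage sketch (not part of this change), a caller could now forward its own stream through the public API. The helper name `distinct_on_stream` below is hypothetical, and the argument values simply mirror the defaults and the test usage shown in the diff:

```cpp
// Hypothetical caller-side sketch: deduplicate on key column 0 using a
// user-provided stream instead of cudf::get_default_stream().
#include <cudf/stream_compaction.hpp>
#include <rmm/cuda_stream_view.hpp>

std::unique_ptr<cudf::table> distinct_on_stream(cudf::table_view const& input,
                                                rmm::cuda_stream_view stream)
{
  return cudf::stable_distinct(input,
                               {0},  // key column indices
                               cudf::duplicate_keep_option::KEEP_ANY,
                               cudf::null_equality::EQUAL,
                               cudf::nan_equality::ALL_EQUAL,
                               stream);  // new user-provided stream parameter
}
```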
Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Nghia Truong (https://github.com/ttnghia) - Srinivas Yadav (https://github.com/srinivasyadav18) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16068 --- cpp/include/cudf/detail/stream_compaction.hpp | 2 - cpp/include/cudf/stream_compaction.hpp | 2 + cpp/src/stream_compaction/stable_distinct.cu | 4 +- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/stream_compaction_test.cpp | 235 ++++++++++++++++++ 5 files changed, 240 insertions(+), 4 deletions(-) create mode 100644 cpp/tests/streams/stream_compaction_test.cpp diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index e2974789ea1..e3ef4190fd2 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -88,8 +88,6 @@ std::unique_ptr distinct(table_view const& input, /** * @copydoc cudf::stable_distinct - * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr
stable_distinct(table_view const& input, std::vector const& keys, diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index c386b3a22b4..181af11adb8 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -320,6 +320,7 @@ std::unique_ptr distinct_indices( * @param keep Copy any, first, last, or none of the found duplicates * @param nulls_equal Flag to specify whether null elements should be considered as equal * @param nans_equal Flag to specify whether NaN elements should be considered as equal + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned table * @return Table with distinct rows, preserving input order */ @@ -329,6 +330,7 @@ std::unique_ptr
stable_distinct( duplicate_keep_option keep = duplicate_keep_option::KEEP_ANY, null_equality nulls_equal = null_equality::EQUAL, nan_equality nans_equal = nan_equality::ALL_EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/src/stream_compaction/stable_distinct.cu b/cpp/src/stream_compaction/stable_distinct.cu index 27b5a92ab69..074d4fd7d1a 100644 --- a/cpp/src/stream_compaction/stable_distinct.cu +++ b/cpp/src/stream_compaction/stable_distinct.cu @@ -79,11 +79,11 @@ std::unique_ptr
stable_distinct(table_view const& input, duplicate_keep_option keep, null_equality nulls_equal, nan_equality nans_equal, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::stable_distinct( - input, keys, keep, nulls_equal, nans_equal, cudf::get_default_stream(), mr); + return detail::stable_distinct(input, keys, keep, nulls_equal, nans_equal, stream, mr); } } // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 9f14455f42d..eef09954647 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -700,6 +700,7 @@ ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_ROLLING_TEST streams/rolling_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_STREAM_COMPACTION_TEST streams/stream_compaction_test.cpp STREAM_MODE testing) ConfigureTest( STREAM_STRINGS_TEST streams/strings/case_test.cpp diff --git a/cpp/tests/streams/stream_compaction_test.cpp b/cpp/tests/streams/stream_compaction_test.cpp new file mode 100644 index 00000000000..56443870602 --- /dev/null +++ b/cpp/tests/streams/stream_compaction_test.cpp @@ -0,0 +1,235 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +auto constexpr null{0}; // null at current level +auto constexpr XXX{0}; // null pushed down from parent level +auto constexpr NaN = std::numeric_limits::quiet_NaN(); +auto constexpr KEEP_ANY = cudf::duplicate_keep_option::KEEP_ANY; +auto constexpr KEEP_FIRST = cudf::duplicate_keep_option::KEEP_FIRST; +auto constexpr KEEP_LAST = cudf::duplicate_keep_option::KEEP_LAST; +auto constexpr KEEP_NONE = cudf::duplicate_keep_option::KEEP_NONE; +auto constexpr NULL_EQUAL = cudf::null_equality::EQUAL; +auto constexpr NULL_UNEQUAL = cudf::null_equality::UNEQUAL; +auto constexpr NAN_EQUAL = cudf::nan_equality::ALL_EQUAL; +auto constexpr NAN_UNEQUAL = cudf::nan_equality::UNEQUAL; + +using int32s_col = cudf::test::fixed_width_column_wrapper; +using floats_col = cudf::test::fixed_width_column_wrapper; + +using cudf::nan_policy; +using cudf::null_equality; +using cudf::null_policy; +using cudf::test::iterators::no_nulls; +using cudf::test::iterators::null_at; +using cudf::test::iterators::nulls_at; + +struct StableDistinctKeepAny : public cudf::test::BaseFixture {}; + +struct StableDistinctKeepFirstLastNone : public cudf::test::BaseFixture {}; + +TEST_F(StableDistinctKeepAny, NoNullsTableWithNaNs) +{ + // Column(s) used to test KEEP_ANY needs to have same rows in contiguous + // groups for equivalent keys because KEEP_ANY is nondeterministic. 
+ auto const col1 = int32s_col{6, 6, 6, 1, 1, 1, 3, 5, 8, 5}; + auto const col2 = floats_col{6, 6, 6, 1, 1, 1, 3, 4, 9, 4}; + auto const keys1 = int32s_col{20, 20, 20, 15, 15, 15, 20, 19, 21, 9}; + auto const keys2 = floats_col{19., 19., 19., NaN, NaN, NaN, 20., 20., 9., 21.}; + + auto const input = cudf::table_view{{col1, col2, keys1, keys2}}; + auto const key_idx = std::vector{2, 3}; + + // NaNs are unequal. + { + auto const exp_col1 = int32s_col{6, 1, 1, 1, 3, 5, 8, 5}; + auto const exp_col2 = floats_col{6, 1, 1, 1, 3, 4, 9, 4}; + auto const exp_keys1 = int32s_col{20, 15, 15, 15, 20, 19, 21, 9}; + auto const exp_keys2 = floats_col{19., NaN, NaN, NaN, 20., 20., 9., 21.}; + auto const expected = cudf::table_view{{exp_col1, exp_col2, exp_keys1, exp_keys2}}; + + auto const result = cudf::stable_distinct( + input, key_idx, KEEP_ANY, NULL_EQUAL, NAN_UNEQUAL, cudf::test::get_default_stream()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // NaNs are equal. + { + auto const exp_col1 = int32s_col{6, 1, 3, 5, 8, 5}; + auto const exp_col2 = floats_col{6, 1, 3, 4, 9, 4}; + auto const exp_keys1 = int32s_col{20, 15, 20, 19, 21, 9}; + auto const exp_keys2 = floats_col{19., NaN, 20., 20., 9., 21.}; + auto const expected = cudf::table_view{{exp_col1, exp_col2, exp_keys1, exp_keys2}}; + + auto const result = cudf::stable_distinct( + input, key_idx, KEEP_ANY, NULL_EQUAL, NAN_EQUAL, cudf::test::get_default_stream()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } +} + +TEST_F(StableDistinctKeepAny, InputWithNullsAndNaNs) +{ + auto constexpr null{0.0}; // shadow the global `null` variable of type int + + // Column(s) used to test KEEP_ANY needs to have same rows in contiguous + // groups for equivalent keys because KEEP_ANY is nondeterministic. + auto const col = int32s_col{5, 4, 4, 1, 1, 1, 8, 8, 1}; + auto const keys = floats_col{{20., null, null, NaN, NaN, NaN, 19., 19., 21.}, nulls_at({1, 2})}; + auto const input = cudf::table_view{{col, keys}}; + auto const key_idx = std::vector{1}; + + // Nulls are equal, NaNs are unequal. + { + auto const exp_col = int32s_col{5, 4, 1, 1, 1, 8, 1}; + auto const exp_keys = floats_col{{20., null, NaN, NaN, NaN, 19., 21.}, null_at(1)}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct( + input, key_idx, KEEP_ANY, NULL_EQUAL, NAN_UNEQUAL, cudf::test::get_default_stream()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // Nulls are equal, NaNs are equal. + { + auto const exp_col = int32s_col{5, 4, 1, 8, 1}; + auto const exp_keys = floats_col{{20., null, NaN, 19., 21.}, null_at(1)}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct( + input, key_idx, KEEP_ANY, NULL_EQUAL, NAN_EQUAL, cudf::test::get_default_stream()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // Nulls are unequal, NaNs are unequal. + { + auto const exp_col = int32s_col{5, 4, 4, 1, 1, 1, 8, 1}; + auto const exp_keys = floats_col{{20., null, null, NaN, NaN, NaN, 19., 21.}, nulls_at({1, 2})}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct( + input, key_idx, KEEP_ANY, NULL_UNEQUAL, NAN_UNEQUAL, cudf::test::get_default_stream()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // Nulls are unequal, NaNs are equal. 
+ { + auto const exp_col = int32s_col{5, 4, 4, 1, 8, 1}; + auto const exp_keys = floats_col{{20., null, null, NaN, 19., 21.}, nulls_at({1, 2})}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct( + input, key_idx, KEEP_ANY, NULL_UNEQUAL, NAN_EQUAL, cudf::test::get_default_stream()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } +} + +TEST_F(StableDistinctKeepFirstLastNone, InputWithNaNsEqual) +{ + // Column(s) used to test needs to have different rows for the same keys. + auto const col = int32s_col{0, 1, 2, 3, 4, 5, 6}; + auto const keys = floats_col{20., NaN, NaN, 19., 21., 19., 22.}; + auto const input = cudf::table_view{{col, keys}}; + auto const key_idx = std::vector{1}; + + // KEEP_FIRST + { + auto const exp_col = int32s_col{0, 1, 3, 4, 6}; + auto const exp_keys = floats_col{20., NaN, 19., 21., 22.}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct( + input, key_idx, KEEP_FIRST, NULL_EQUAL, NAN_EQUAL, cudf::test::get_default_stream()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_LAST + { + auto const exp_col = int32s_col{0, 2, 4, 5, 6}; + auto const exp_keys = floats_col{20., NaN, 21., 19., 22.}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct( + input, key_idx, KEEP_LAST, NULL_EQUAL, NAN_EQUAL, cudf::test::get_default_stream()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_NONE + { + auto const exp_col = int32s_col{0, 4, 6}; + auto const exp_keys = floats_col{20., 21., 22.}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct( + input, key_idx, KEEP_NONE, NULL_EQUAL, NAN_EQUAL, cudf::test::get_default_stream()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } +} + +TEST_F(StableDistinctKeepFirstLastNone, InputWithNaNsUnequal) +{ + // Column(s) used to test needs to have different rows for the same keys. 
+ auto const col = int32s_col{0, 1, 2, 3, 4, 5, 6, 7}; + auto const keys = floats_col{20., NaN, NaN, 19., 21., 19., 22., 20.}; + auto const input = cudf::table_view{{col, keys}}; + auto const key_idx = std::vector{1}; + + // KEEP_FIRST + { + auto const exp_col = int32s_col{0, 1, 2, 3, 4, 6}; + auto const exp_keys = floats_col{20., NaN, NaN, 19., 21., 22.}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct( + input, key_idx, KEEP_FIRST, NULL_UNEQUAL, NAN_UNEQUAL, cudf::test::get_default_stream()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_LAST + { + auto const exp_col = int32s_col{1, 2, 4, 5, 6, 7}; + auto const exp_keys = floats_col{NaN, NaN, 21., 19., 22., 20.}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct( + input, key_idx, KEEP_LAST, NULL_UNEQUAL, NAN_UNEQUAL, cudf::test::get_default_stream()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_NONE + { + auto const exp_col = int32s_col{1, 2, 4, 6}; + auto const exp_keys = floats_col{NaN, NaN, 21., 22.}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct( + input, key_idx, KEEP_NONE, NULL_UNEQUAL, NAN_UNEQUAL, cudf::test::get_default_stream()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } +} From 2b547dc70c7f42b671cdc3e75946b123301779f0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 28 Jun 2024 03:11:01 -1000 Subject: [PATCH 07/27] Add ensure_index to not unnecessarily shallow copy cudf.Index (#16117) The `cudf.Index` constructor will shallow copy a `cudf.Index` input. Sometimes, we just need to make sure an input is a `cudf.Index`, so created `ensure_index` (pandas has something similar) so we don't shallow copy these inputs unnecessarily Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16117 --- python/cudf/cudf/core/_base_index.py | 6 ++++- python/cudf/cudf/core/algorithms.py | 4 ++-- python/cudf/cudf/core/cut.py | 2 +- python/cudf/cudf/core/dataframe.py | 29 ++++++++++++++---------- python/cudf/cudf/core/index.py | 13 ++++++++++- python/cudf/cudf/core/indexed_frame.py | 11 ++++----- python/cudf/cudf/core/multiindex.py | 3 ++- python/cudf/cudf/core/series.py | 12 ++++------ python/cudf/cudf/tests/test_dataframe.py | 24 ++++++++++++++++++++ 9 files changed, 73 insertions(+), 31 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index caf07b286cd..e160fa697ee 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1104,7 +1104,11 @@ def difference(self, other, sort=None): f"of [None, False, True]; {sort} was passed." 
) - other = cudf.Index(other, name=getattr(other, "name", self.name)) + if not isinstance(other, BaseIndex): + other = cudf.Index( + other, + name=getattr(other, "name", self.name), + ) if not len(other): res = self._get_reconciled_name_object(other).unique() diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 51a32e29886..e8b82ff60c2 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -6,7 +6,7 @@ from cudf.core.column import as_column from cudf.core.copy_types import BooleanMask -from cudf.core.index import Index, RangeIndex +from cudf.core.index import RangeIndex, ensure_index from cudf.core.indexed_frame import IndexedFrame from cudf.core.scalar import Scalar from cudf.options import get_option @@ -107,7 +107,7 @@ def factorize(values, sort=False, use_na_sentinel=True, size_hint=None): dtype="int64" if get_option("mode.pandas_compatible") else None, ).values - return labels, cats.values if return_cupy_array else Index(cats) + return labels, cats.values if return_cupy_array else ensure_index(cats) def _linear_interpolation(column, index=None): diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index 54c5e829e8a..d9f62f51f92 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -292,7 +292,7 @@ def cut( ) # we return a categorical index, as we don't have a Categorical method - categorical_index = cudf.Index(col) + categorical_index = cudf.CategoricalIndex._from_data({None: col}) if isinstance(orig_x, (pd.Series, cudf.Series)): # if we have a series input we return a series output diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 3fc29582c4c..4dfeb68b7ba 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -58,7 +58,12 @@ from cudf.core.column_accessor import ColumnAccessor from cudf.core.copy_types import BooleanMask from cudf.core.groupby.groupby import DataFrameGroupBy, groupby_doc_template -from cudf.core.index import BaseIndex, RangeIndex, _index_from_data, as_index +from cudf.core.index import ( + BaseIndex, + RangeIndex, + _index_from_data, + ensure_index, +) from cudf.core.indexed_frame import ( IndexedFrame, _FrameIndexer, @@ -338,7 +343,7 @@ def _getitem_tuple_arg(self, arg): range(len(tmp_arg[0])) ) }, - index=as_index(tmp_arg[0]), + index=cudf.Index(tmp_arg[0]), ) columns_df[cantor_name] = column.as_column( range(len(columns_df)) @@ -702,7 +707,7 @@ def __init__( data = data.reindex(index) index = data.index else: - index = cudf.Index(index) + index = ensure_index(index) else: index = data.index @@ -751,7 +756,7 @@ def __init__( if index is None: self._index = RangeIndex(0) else: - self._index = cudf.Index(index) + self._index = ensure_index(index) if columns is not None: rangeindex = isinstance( columns, (range, pd.RangeIndex, cudf.RangeIndex) @@ -909,7 +914,7 @@ def _init_from_series_list(self, data, columns, index): f"not match length of index ({index_length})" ) - final_index = cudf.Index(index) + final_index = ensure_index(index) series_lengths = list(map(len, data)) data = numeric_normalize_types(*data) @@ -977,9 +982,9 @@ def _init_from_list_like(self, data, index=None, columns=None): if index is None: index = RangeIndex(start=0, stop=len(data)) else: - index = cudf.Index(index) + index = ensure_index(index) - self._index = cudf.Index(index) + self._index = index # list-of-dicts case if len(data) > 0 and isinstance(data[0], dict): data = 
DataFrame.from_pandas(pd.DataFrame(data)) @@ -1085,7 +1090,7 @@ def _init_from_dict_like( self._index = RangeIndex(0, num_rows) else: - self._index = cudf.Index(index) + self._index = ensure_index(index) if len(data): self._data.multiindex = True @@ -1491,7 +1496,7 @@ def memory_usage(self, index=True, deep=False): names.append("Index") return Series._from_data( data={None: as_column(mem_usage)}, - index=as_index(names), + index=cudf.Index(names), ) @_performance_tracking @@ -4033,7 +4038,7 @@ def transpose(self): # Set the old column names as the new index result = self.__class__._from_data( ColumnAccessor(dict(enumerate(result_columns)), verify=False), - index=as_index(index), + index=cudf.Index(index), ) # Set the old index as the new column names result.columns = columns @@ -5657,7 +5662,7 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): } if not is_scalar(index): - new_index = cudf.Index(index) + new_index = ensure_index(index) else: new_index = None @@ -5741,7 +5746,7 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): } if index is not None: - index = cudf.Index(index) + index = ensure_index(index) if isinstance(columns, (pd.Index, cudf.Index)): level_names = tuple(columns.names) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index e069f8d0ea6..b398ee2343e 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -65,6 +65,17 @@ from collections.abc import Generator, Iterable +def ensure_index(index_like: Any) -> BaseIndex: + """ + Ensure an Index is returned. + + Avoids a shallow copy compared to calling cudf.Index(...) + """ + if not isinstance(index_like, BaseIndex): + return cudf.Index(index_like) + return index_like + + class IndexMeta(type): """Custom metaclass for Index that overrides instance/subclass tests.""" @@ -1569,7 +1580,7 @@ def append(self, other): to_concat.append(obj) else: this = self - other = cudf.Index(other) + other = ensure_index(other) if len(this) == 0 or len(other) == 0: # we'll filter out empties later in ._concat diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 72bd3c45fa6..ff10051c52d 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -33,7 +33,6 @@ is_list_like, is_scalar, ) -from cudf.core._base_index import BaseIndex from cudf.core._compat import PANDAS_LT_300 from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ColumnBase, as_column @@ -42,7 +41,7 @@ from cudf.core.dtypes import ListDtype from cudf.core.frame import Frame from cudf.core.groupby.groupby import GroupBy -from cudf.core.index import Index, RangeIndex, _index_from_data +from cudf.core.index import RangeIndex, _index_from_data, ensure_index from cudf.core.missing import NA from cudf.core.multiindex import MultiIndex from cudf.core.resample import _Resampler @@ -66,6 +65,8 @@ Dtype, NotImplementedType, ) + from cudf.core._base_index import BaseIndex + doc_reset_index_template = """ Reset the index of the {klass}, or a level of it. 
@@ -627,9 +628,7 @@ def index(self, value): f"new values have {len(value)} elements" ) # avoid unnecessary cast to Index - if not isinstance(value, BaseIndex): - value = Index(value) - + value = ensure_index(value) self._index = value @_performance_tracking @@ -3595,7 +3594,7 @@ def _align_to_index( sort: bool = True, allow_non_unique: bool = False, ) -> Self: - index = cudf.Index(index) + index = ensure_index(index) if self.index.equals(index): return self diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 7657fa9e234..9cbe863142b 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -29,6 +29,7 @@ BaseIndex, _get_indexer_basic, _lexsorted_equal_range, + ensure_index, ) from cudf.core.join._join_helpers import _match_join_keys from cudf.utils.dtypes import is_column_like @@ -173,7 +174,7 @@ def __init__( "codes and is inconsistent!" ) - levels = [cudf.Index(level) for level in levels] + levels = [ensure_index(level) for level in levels] if len(levels) != len(codes._data): raise ValueError( diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 9acf5294b72..97b6bbec2d4 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -48,7 +48,7 @@ from cudf.core.column.struct import StructMethods from cudf.core.column_accessor import ColumnAccessor from cudf.core.groupby.groupby import SeriesGroupBy, groupby_doc_template -from cudf.core.index import BaseIndex, DatetimeIndex, RangeIndex, as_index +from cudf.core.index import BaseIndex, DatetimeIndex, RangeIndex, ensure_index from cudf.core.indexed_frame import ( IndexedFrame, _FrameIndexer, @@ -588,10 +588,8 @@ def __init__( data = data.copy(deep=True) name_from_data = data.name column = as_column(data, nan_as_null=nan_as_null, dtype=dtype) - if isinstance(data, pd.Series): - index_from_data = cudf.Index(data.index) - elif isinstance(data, Series): - index_from_data = data.index + if isinstance(data, (pd.Series, Series)): + index_from_data = ensure_index(data.index) elif isinstance(data, ColumnAccessor): raise TypeError( "Use cudf.Series._from_data for constructing a Series from " @@ -642,7 +640,7 @@ def __init__( name = name_from_data if index is not None: - index = cudf.Index(index) + index = ensure_index(index) if index_from_data is not None: first_index = index_from_data @@ -3191,7 +3189,7 @@ def quantile( return Series._from_data( data={self.name: result}, - index=as_index(np_array_q) if quant_index else None, + index=cudf.Index(np_array_q) if quant_index else None, ) @docutils.doc_describe() diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index fc7fd87d4c5..f40106a30f4 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -11078,3 +11078,27 @@ def test_dataframe_loc_int_float(dtype1, dtype2): expected = pdf.loc[pidx] assert_eq(actual, expected, check_index_type=True, check_dtype=True) + + +@pytest.mark.parametrize( + "data", + [ + cudf.DataFrame(range(2)), + None, + [cudf.Series(range(2))], + [[0], [1]], + {1: range(2)}, + cupy.arange(2), + ], +) +def test_init_with_index_no_shallow_copy(data): + idx = cudf.RangeIndex(2) + df = cudf.DataFrame(data, index=idx) + assert df.index is idx + + +def test_from_records_with_index_no_shallow_copy(): + idx = cudf.RangeIndex(2) + data = np.array([(1.0, 2), (3.0, 4)], dtype=[("x", " Date: Fri, 28 Jun 2024 09:26:37 -0400 Subject: [PATCH 08/27] Add libcudf public/detail 
API pattern to developer guide (#16086) Adds specific description for the public API to detail API function pattern to the libcudf developer guide. Also fixes some formatting issues and broken link. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Shruti Shivakumar (https://github.com/shrshi) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/16086 --- .../developer_guide/DEVELOPER_GUIDE.md | 60 +++++++++++-------- 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md index ff80c2daab8..0d097541692 100644 --- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md +++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md @@ -1,4 +1,4 @@ -# libcudf C++ Developer Guide +# libcudf C++ Developer Guide {#DEVELOPER_GUIDE} This document serves as a guide for contributors to libcudf C++ code. Developers should also refer to these additional files for further documentation of libcudf best practices. @@ -469,7 +469,7 @@ libcudf throws under different circumstances, see the [section on error handling # libcudf API and Implementation -## Streams +## Streams {#streams} libcudf is in the process of adding support for asynchronous execution using CUDA streams. In order to facilitate the usage of streams, all new libcudf APIs @@ -486,33 +486,37 @@ use only asynchronous versions of CUDA APIs with the stream parameter. In order to make the `detail` API callable from other libcudf functions, it should be exposed in a header placed in the `cudf/cpp/include/detail/` directory. +The declaration is not necessary if no other libcudf functions call the `detail` function. For example: ```c++ // cpp/include/cudf/header.hpp -void external_function(...); +void external_function(..., + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); // cpp/include/cudf/detail/header.hpp namespace detail{ -void external_function(..., rmm::cuda_stream_view stream) +void external_function(..., rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) } // namespace detail // cudf/src/implementation.cpp namespace detail{ - // Use the stream parameter in the detail implementation. - void external_function(..., rmm::cuda_stream_view stream){ - // Implementation uses the stream with async APIs. - rmm::device_buffer buff(...,stream); - CUDF_CUDA_TRY(cudaMemcpyAsync(...,stream.value())); - kernel<<<..., stream>>>(...); - thrust::algorithm(rmm::exec_policy(stream), ...); - } +// Use the stream parameter in the detail implementation. +void external_function(..., rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr){ + // Implementation uses the stream with async APIs. + rmm::device_buffer buff(..., stream, mr); + CUDF_CUDA_TRY(cudaMemcpyAsync(...,stream.value())); + kernel<<<..., stream>>>(...); + thrust::algorithm(rmm::exec_policy(stream), ...); +} } // namespace detail -void external_function(...){ - CUDF_FUNC_RANGE(); // Generates an NVTX range for the lifetime of this function. - detail::external_function(..., cudf::get_default_stream()); +void external_function(..., rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); // Generates an NVTX range for the lifetime of this function. 
+ detail::external_function(..., stream, mr); } ``` @@ -703,28 +707,28 @@ The preferred style for how inputs are passed in and outputs are returned is the - `column_view const&` - Tables: - `table_view const&` - - Scalar: - - `scalar const&` - - Everything else: - - Trivial or inexpensively copied types - - Pass by value - - Non-trivial or expensive to copy types - - Pass by `const&` + - Scalar: + - `scalar const&` + - Everything else: + - Trivial or inexpensively copied types + - Pass by value + - Non-trivial or expensive to copy types + - Pass by `const&` - In/Outs - Columns: - `mutable_column_view&` - Tables: - `mutable_table_view&` - - Everything else: - - Pass by via raw pointer + - Everything else: + - Pass by via raw pointer - Outputs - Outputs should be *returned*, i.e., no output parameters - Columns: - `std::unique_ptr` - Tables: - `std::unique_ptr
` - - Scalars: - - `std::unique_ptr` + - Scalars: + - `std::unique_ptr` ### Multiple Return Values @@ -908,6 +912,10 @@ functions that are specific to columns of Strings. These functions reside in the namespace. Similarly, functionality used exclusively for unit testing is in the `cudf::test::` namespace. +The public function is expected to contain a call to `CUDF_FUNC_RANGE()` followed by a call to +a `detail` function with same name and parameters as the public function. +See the [Streams](#streams) section for an example of this pattern. + ### Internal Many functions are not meant for public use, so place them in either the `detail` or an *anonymous* From 673d766836b7e6e8c80afe32cd9a4b4da2cecf58 Mon Sep 17 00:00:00 2001 From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com> Date: Fri, 28 Jun 2024 09:38:57 -0400 Subject: [PATCH 09/27] Make binary operators work between fixed-point and floating args (#16116) Some of the binary operators in cuDF don't work between fixed_point and floating-point numbers after [this earlier PR](https://github.com/rapidsai/cudf/pull/15438) removed the ability to construct and implicitly cast fixed_point numbers from floating point numbers. This PR restores that functionality by detecting and performing the necessary explicit casts, and adds tests for the supported operators. Note that the `binary_op_has_common_type` code is modeled after `has_common_type` found in traits.hpp. This closes [issue 16090](https://github.com/rapidsai/cudf/issues/16090) Authors: - Paul Mattione (https://github.com/pmattione-nvidia) Approvers: - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/16116 --- cpp/include/cudf/binaryop.hpp | 50 ++++++++++++++++ cpp/src/binaryop/compiled/binary_ops.cuh | 14 ++++- cpp/src/binaryop/compiled/util.cpp | 12 ++-- .../binop-compiled-fixed_point-test.cpp | 58 +++++++++++++++++++ 4 files changed, 125 insertions(+), 9 deletions(-) diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp index 5e41a871f32..22dad11e109 100644 --- a/cpp/include/cudf/binaryop.hpp +++ b/cpp/include/cudf/binaryop.hpp @@ -91,6 +91,56 @@ enum class binary_operator : int32_t { ///< (null, false) is null, and (valid, valid) == LOGICAL_OR(valid, valid) INVALID_BINARY ///< invalid operation }; + +/// Binary operation common type default +template +struct binary_op_common_type {}; + +/// Binary operation common type specialization +template +struct binary_op_common_type>> { + /// The common type of the template parameters + using type = std::common_type_t; +}; + +/// Binary operation common type specialization +template +struct binary_op_common_type< + L, + R, + std::enable_if_t() && cuda::std::is_floating_point_v>> { + /// The common type of the template parameters + using type = L; +}; + +/// Binary operation common type specialization +template +struct binary_op_common_type< + L, + R, + std::enable_if_t() && cuda::std::is_floating_point_v>> { + /// The common type of the template parameters + using type = R; +}; + +/// Binary operation common type helper +template +using binary_op_common_type_t = typename binary_op_common_type::type; + +namespace detail { +template +struct binary_op_has_common_type_impl : std::false_type {}; + +template +struct binary_op_has_common_type_impl>, L, R> + : std::true_type {}; +} // namespace detail + +/// Checks if binary operation types have a common type +template 
+constexpr inline bool binary_op_has_common_type_v = + detail::binary_op_has_common_type_impl::value; + /** * @brief Performs a binary operation between a scalar and a column. * diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh index 5177e7d4bda..c6af0c3c58a 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cuh +++ b/cpp/src/binaryop/compiled/binary_ops.cuh @@ -49,9 +49,16 @@ struct type_casted_accessor { column_device_view const& col, bool is_scalar) const { - if constexpr (column_device_view::has_element_accessor() and - std::is_convertible_v) - return static_cast(col.element(is_scalar ? 0 : i)); + if constexpr (column_device_view::has_element_accessor()) { + auto const element = col.element(is_scalar ? 0 : i); + if constexpr (std::is_convertible_v) { + return static_cast(element); + } else if constexpr (is_fixed_point() && cuda::std::is_floating_point_v) { + return convert_fixed_to_floating(element); + } else if constexpr (is_fixed_point() && cuda::std::is_floating_point_v) { + return convert_floating_to_fixed(element, numeric::scale_type{0}); + } + } return {}; } }; @@ -159,6 +166,7 @@ struct ops2_wrapper { TypeRhs y = rhs.element(is_rhs_scalar ? 0 : i); auto result = [&]() { if constexpr (std::is_same_v or + std::is_same_v or std::is_same_v or std::is_same_v or std::is_same_v or diff --git a/cpp/src/binaryop/compiled/util.cpp b/cpp/src/binaryop/compiled/util.cpp index 2b6a4f58895..b62c5f1f4e1 100644 --- a/cpp/src/binaryop/compiled/util.cpp +++ b/cpp/src/binaryop/compiled/util.cpp @@ -31,8 +31,8 @@ struct common_type_functor { template std::optional operator()() const { - if constexpr (cudf::has_common_type_v) { - using TypeCommon = std::common_type_t; + if constexpr (binary_op_has_common_type_v) { + using TypeCommon = binary_op_common_type_t; return data_type{type_to_id()}; } @@ -85,8 +85,8 @@ struct is_binary_operation_supported { { if constexpr (column_device_view::has_element_accessor() and column_device_view::has_element_accessor()) { - if constexpr (has_common_type_v) { - using common_t = std::common_type_t; + if constexpr (binary_op_has_common_type_v) { + using common_t = binary_op_common_type_t; return std::is_invocable_v; } else { return std::is_invocable_v; @@ -102,8 +102,8 @@ struct is_binary_operation_supported { if constexpr (column_device_view::has_element_accessor() and column_device_view::has_element_accessor()) { if (has_mutable_element_accessor(out_type) or is_fixed_point(out_type)) { - if constexpr (has_common_type_v) { - using common_t = std::common_type_t; + if constexpr (binary_op_has_common_type_v) { + using common_t = binary_op_common_type_t; if constexpr (std::is_invocable_v) { using ReturnType = std::invoke_result_t; return is_constructible(out_type) or diff --git a/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp b/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp index 6d097b2ff12..89824eb6511 100644 --- a/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp +++ b/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp @@ -843,3 +843,61 @@ TYPED_TEST(FixedPointTest_64_128_Reps, FixedPoint_64_128_ComparisonTests) CUDF_TEST_EXPECT_COLUMNS_EQUAL(h->view(), falses); } } + +template +void test_fixed_floating(cudf::binary_operator op, + double floating_value, + int decimal_value, + int decimal_scale, + ResultType expected) +{ + auto const scale = numeric::scale_type{decimal_scale}; + auto const result_type = cudf::data_type(cudf::type_to_id()); + auto const nullable = + (op == 
cudf::binary_operator::NULL_EQUALS || op == cudf::binary_operator::NULL_NOT_EQUALS || + op == cudf::binary_operator::NULL_MIN || op == cudf::binary_operator::NULL_MAX); + + cudf::test::fixed_width_column_wrapper floating_col({floating_value}); + cudf::test::fixed_point_column_wrapper decimal_col({decimal_value}, scale); + + auto result = binary_operation(floating_col, decimal_col, op, result_type); + + if constexpr (cudf::is_fixed_point()) { + using wrapper_type = cudf::test::fixed_point_column_wrapper; + auto const expected_col = nullable ? wrapper_type({expected.value()}, {true}, expected.scale()) + : wrapper_type({expected.value()}, expected.scale()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col, *result.get()); + } else { + using wrapper_type = cudf::test::fixed_width_column_wrapper; + auto const expected_col = + nullable ? wrapper_type({expected}, {true}) : wrapper_type({expected}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col, *result.get()); + } +} + +TYPED_TEST(FixedPointCompiledTest, FixedPointWithFloating) +{ + using namespace numeric; + + // BOOLEAN + test_fixed_floating(cudf::binary_operator::EQUAL, 1.0, 10, -1, true); + test_fixed_floating(cudf::binary_operator::NOT_EQUAL, 1.0, 10, -1, false); + test_fixed_floating(cudf::binary_operator::LESS, 2.0, 10, -1, false); + test_fixed_floating(cudf::binary_operator::GREATER, 2.0, 10, -1, true); + test_fixed_floating(cudf::binary_operator::LESS_EQUAL, 2.0, 20, -1, true); + test_fixed_floating(cudf::binary_operator::GREATER_EQUAL, 2.0, 30, -1, false); + test_fixed_floating(cudf::binary_operator::NULL_EQUALS, 1.0, 10, -1, true); + test_fixed_floating(cudf::binary_operator::NULL_NOT_EQUALS, 1.0, 10, -1, false); + + // PRIMARY ARITHMETIC + auto const decimal_result = numeric::decimal32(4, numeric::scale_type{0}); + test_fixed_floating(cudf::binary_operator::ADD, 1.0, 30, -1, decimal_result); + test_fixed_floating(cudf::binary_operator::SUB, 6.0, 20, -1, decimal_result); + test_fixed_floating(cudf::binary_operator::MUL, 2.0, 20, -1, decimal_result); + test_fixed_floating(cudf::binary_operator::DIV, 8.0, 2, 0, decimal_result); + test_fixed_floating(cudf::binary_operator::MOD, 9.0, 50, -1, decimal_result); + + // OTHER ARITHMETIC + test_fixed_floating(cudf::binary_operator::NULL_MAX, 4.0, 20, -1, decimal_result); + test_fixed_floating(cudf::binary_operator::NULL_MIN, 4.0, 200, -1, decimal_result); +} From c40e0cc8dae8922c2633f5359609a1d063ae7f26 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Fri, 28 Jun 2024 10:10:31 -0400 Subject: [PATCH 10/27] Add support for proxy `np.flatiter` objects (#16107) Closes #15388 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/16107 --- python/cudf/cudf/pandas/_wrappers/numpy.py | 13 +++++++++++++ python/cudf/cudf_pandas_tests/test_cudf_pandas.py | 10 ++++++++++ 2 files changed, 23 insertions(+) diff --git a/python/cudf/cudf/pandas/_wrappers/numpy.py b/python/cudf/cudf/pandas/_wrappers/numpy.py index c445be46f58..3b012169676 100644 --- a/python/cudf/cudf/pandas/_wrappers/numpy.py +++ b/python/cudf/cudf/pandas/_wrappers/numpy.py @@ -129,6 +129,19 @@ def wrap_ndarray(cls, arr: cupy.ndarray | numpy.ndarray, constructor): }, ) + +flatiter = make_final_proxy_type( + "flatiter", + cupy.flatiter, + numpy.flatiter, + fast_to_slow=lambda fast: cupy.asnumpy(fast.base).flat, + slow_to_fast=lambda slow: cupy.asarray(slow).flat, + 
additional_attributes={ + "__array__": array_method, + }, +) + + # Mapping flags between slow and fast types _ndarray_flags = make_intermediate_proxy_type( "_ndarray_flags", diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 0d46e2e9311..f51ce103677 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -1535,6 +1535,16 @@ def test_is_proxy_object(): assert not is_proxy_object(s2) +def test_numpy_cupy_flatiter(series): + cp = pytest.importorskip("cupy") + + _, s = series + arr = s.values + + assert type(arr.flat._fsproxy_fast) == cp.flatiter + assert type(arr.flat._fsproxy_slow) == np.flatiter + + def test_arrow_string_arrays(): cu_s = xpd.Series(["a", "b", "c"]) pd_s = pd.Series(["a", "b", "c"]) From 565c0d1c3a08c9bd7eafa70278a8744097f8ef04 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Fri, 28 Jun 2024 10:16:55 -0400 Subject: [PATCH 11/27] Migrate lists/contains to pylibcudf (#15981) Part of #15162. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15981 --- cpp/include/cudf/lists/lists_column_view.hpp | 3 +- python/cudf/cudf/_lib/lists.pyx | 72 +++------- python/cudf/cudf/_lib/pylibcudf/column.pxd | 4 + python/cudf/cudf/_lib/pylibcudf/column.pyx | 9 ++ .../_lib/pylibcudf/libcudf/lists/contains.pxd | 29 +++- .../libcudf/lists/lists_column_view.pxd | 1 + python/cudf/cudf/_lib/pylibcudf/lists.pxd | 10 ++ python/cudf/cudf/_lib/pylibcudf/lists.pyx | 124 +++++++++++++++++- .../cudf/cudf/pylibcudf_tests/test_lists.py | 98 +++++++++++++- 9 files changed, 281 insertions(+), 69 deletions(-) diff --git a/cpp/include/cudf/lists/lists_column_view.hpp b/cpp/include/cudf/lists/lists_column_view.hpp index 57a4f724c2d..3397cb0ca1d 100644 --- a/cpp/include/cudf/lists/lists_column_view.hpp +++ b/cpp/include/cudf/lists/lists_column_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,6 +38,7 @@ namespace cudf { */ class lists_column_view : private column_view { public: + lists_column_view() = default; /** * @brief Construct a new lists column view object from a column view. 
* diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 5d406f5c85f..0ad09dba717 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -9,10 +9,6 @@ from libcpp.utility cimport move from cudf._lib.column cimport Column from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.lists.contains cimport ( - contains, - index_of as cpp_index_of, -) from cudf._lib.pylibcudf.libcudf.lists.count_elements cimport ( count_elements as cpp_count_elements, ) @@ -26,7 +22,6 @@ from cudf._lib.pylibcudf.libcudf.lists.sorting cimport ( from cudf._lib.pylibcudf.libcudf.lists.stream_compaction cimport ( distinct as cpp_distinct, ) -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar from cudf._lib.pylibcudf.libcudf.types cimport ( nan_equality, null_equality, @@ -34,11 +29,12 @@ from cudf._lib.pylibcudf.libcudf.types cimport ( order, size_type, ) -from cudf._lib.scalar cimport DeviceScalar from cudf._lib.utils cimport columns_from_pylibcudf_table from cudf._lib import pylibcudf +from cudf._lib.pylibcudf cimport Scalar + @acquire_spill_lock() def count_elements(Column col): @@ -153,64 +149,36 @@ def extract_element_column(Column col, Column index): @acquire_spill_lock() -def contains_scalar(Column col, object py_search_key): - - cdef DeviceScalar search_key = py_search_key.device_value - - cdef shared_ptr[lists_column_view] list_view = ( - make_shared[lists_column_view](col.view()) +def contains_scalar(Column col, py_search_key): + return Column.from_pylibcudf( + pylibcudf.lists.contains( + col.to_pylibcudf(mode="read"), + py_search_key.device_value.c_value, + ) ) - cdef const scalar* search_key_value = search_key.get_raw_ptr() - - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(contains( - list_view.get()[0], - search_key_value[0], - )) - result = Column.from_unique_ptr(move(c_result)) - return result @acquire_spill_lock() def index_of_scalar(Column col, object py_search_key): - - cdef DeviceScalar search_key = py_search_key.device_value - - cdef shared_ptr[lists_column_view] list_view = ( - make_shared[lists_column_view](col.view()) + return Column.from_pylibcudf( + pylibcudf.lists.index_of( + col.to_pylibcudf(mode="read"), + py_search_key.device_value.c_value, + True, + ) ) - cdef const scalar* search_key_value = search_key.get_raw_ptr() - - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_index_of( - list_view.get()[0], - search_key_value[0], - )) - return Column.from_unique_ptr(move(c_result)) @acquire_spill_lock() def index_of_column(Column col, Column search_keys): - - cdef column_view keys_view = search_keys.view() - - cdef shared_ptr[lists_column_view] list_view = ( - make_shared[lists_column_view](col.view()) + return Column.from_pylibcudf( + pylibcudf.lists.index_of( + col.to_pylibcudf(mode="read"), + search_keys.to_pylibcudf(mode="read"), + True, + ) ) - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_index_of( - list_view.get()[0], - keys_view, - )) - return Column.from_unique_ptr(move(c_result)) - @acquire_spill_lock() def concatenate_rows(list source_columns): diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pxd b/python/cudf/cudf/_lib/pylibcudf/column.pxd index e121e856865..d13791d95cf 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/column.pxd @@ -8,6 +8,9 @@ from 
cudf._lib.pylibcudf.libcudf.column.column_view cimport ( column_view, mutable_column_view, ) +from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( + lists_column_view, +) from cudf._lib.pylibcudf.libcudf.types cimport bitmask_type, size_type from .gpumemoryview cimport gpumemoryview @@ -56,3 +59,4 @@ cdef class ListColumnView: cdef Column _column cpdef child(self) cpdef offsets(self) + cdef lists_column_view view(self) nogil diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx index e726eca154f..e0cf8b7ee32 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/column.pyx @@ -348,6 +348,15 @@ cdef class ListColumnView: """The offsets column of the underlying list column.""" return self._column.child(1) + cdef lists_column_view view(self) nogil: + """Generate a libcudf lists_column_view to pass to libcudf algorithms. + + This method is for pylibcudf's functions to use to generate inputs when + calling libcudf algorithms, and should generally not be needed by users + (even direct pylibcudf Cython users). + """ + return lists_column_view(self._column.view()) + @functools.cache def _datatype_from_dtype_desc(desc): diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd index 721679f35c7..82aed7d70a0 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. +from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr from cudf._lib.exception_handler cimport cudf_exception_handler @@ -12,17 +13,33 @@ from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar cdef extern from "cudf/lists/contains.hpp" namespace "cudf::lists" nogil: + + cpdef enum class duplicate_find_option(int32_t): + FIND_FIRST + FIND_LAST + cdef unique_ptr[column] contains( - lists_column_view lists, - scalar search_key, + const lists_column_view& lists, + const scalar& search_key, + ) except +cudf_exception_handler + + cdef unique_ptr[column] contains( + const lists_column_view& lists, + const column_view& search_keys, + ) except +cudf_exception_handler + + cdef unique_ptr[column] contains_nulls( + const lists_column_view& lists, ) except +cudf_exception_handler cdef unique_ptr[column] index_of( - lists_column_view lists, - scalar search_key, + const lists_column_view& lists, + const scalar& search_key, + duplicate_find_option find_option, ) except +cudf_exception_handler cdef unique_ptr[column] index_of( - lists_column_view lists, - column_view search_keys, + const lists_column_view& lists, + const column_view& search_keys, + duplicate_find_option find_option, ) except +cudf_exception_handler diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd index dbafc415e45..fd21e7b334b 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd @@ -9,6 +9,7 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/lists/lists_column_view.hpp" namespace "cudf" nogil: cdef cppclass lists_column_view(column_view): + lists_column_view() except + lists_column_view(const column_view& lists_column) except + column_view parent() except + column_view offsets() except + diff --git 
a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd index 2d2a5b2a9ea..2ccf0139e90 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd @@ -5,11 +5,21 @@ from libcpp cimport bool from cudf._lib.pylibcudf.libcudf.types cimport size_type from .column cimport Column +from .scalar cimport Scalar from .table cimport Table +ctypedef fused ColumnOrScalar: + Column + Scalar cpdef Table explode_outer(Table, size_type explode_column_idx) cpdef Column concatenate_rows(Table) cpdef Column concatenate_list_elements(Column, bool dropna) + +cpdef Column contains(Column, ColumnOrScalar) + +cpdef Column contains_nulls(Column) + +cpdef Column index_of(Column, ColumnOrScalar, bool) diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx index 069c9da31c2..a94d940accd 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx @@ -1,11 +1,15 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from cython.operator cimport dereference from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.lists cimport explode as cpp_explode +from cudf._lib.pylibcudf.libcudf.lists cimport ( + contains as cpp_contains, + explode as cpp_explode, +) from cudf._lib.pylibcudf.libcudf.lists.combine cimport ( concatenate_list_elements as cpp_concatenate_list_elements, concatenate_null_policy, @@ -13,8 +17,10 @@ from cudf._lib.pylibcudf.libcudf.lists.combine cimport ( ) from cudf._lib.pylibcudf.libcudf.table.table cimport table from cudf._lib.pylibcudf.libcudf.types cimport size_type +from cudf._lib.pylibcudf.lists cimport ColumnOrScalar -from .column cimport Column +from .column cimport Column, ListColumnView +from .scalar cimport Scalar from .table cimport Table @@ -71,15 +77,15 @@ cpdef Column concatenate_list_elements(Column input, bool dropna): ---------- input : Column The input column + dropna : bool + If true, null list elements will be ignored + from concatenation. Otherwise any input null values will result in + the corresponding output row being set to null. Returns ------- Column A new Column of concatenated list elements - dropna : bool - If true, null list elements will be ignored - from concatenation. Otherwise any input null values will result in - the corresponding output row being set to null. """ cdef concatenate_null_policy null_policy = ( concatenate_null_policy.IGNORE if dropna @@ -94,3 +100,109 @@ cpdef Column concatenate_list_elements(Column input, bool dropna): )) return Column.from_libcudf(move(c_result)) + + +cpdef Column contains(Column input, ColumnOrScalar search_key): + """Create a column of bool values indicating whether + the search_key is contained in the input. + + ``search_key`` may be a + :py:class:`~cudf._lib.pylibcudf.column.Column` or a + :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`. + + For details, see :cpp:func:`contains`. + + Parameters + ---------- + input : Column + The input column. + search_key : Union[Column, Scalar] + The search key. + + Returns + ------- + Column + A new Column of bools indicating if the search_key was + found in the list column. 
+ """ + cdef unique_ptr[column] c_result + cdef ListColumnView list_view = input.list_view() + + if not isinstance(search_key, (Column, Scalar)): + raise TypeError("Must pass a Column or Scalar") + + with nogil: + c_result = move(cpp_contains.contains( + list_view.view(), + search_key.view() if ColumnOrScalar is Column else dereference( + search_key.get() + ), + )) + return Column.from_libcudf(move(c_result)) + + +cpdef Column contains_nulls(Column input): + """Create a column of bool values indicating whether + each row in the lists column contains a null value. + + Parameters + ---------- + input : Column + The input column. + + Returns + ------- + Column + A new Column of bools indicating if the list column + contains a null value. + """ + cdef unique_ptr[column] c_result + cdef ListColumnView list_view = input.list_view() + with nogil: + c_result = move(cpp_contains.contains_nulls(list_view.view())) + return Column.from_libcudf(move(c_result)) + + +cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_option): + """Create a column of index values indicating the position of a search + key row within the corresponding list row in the lists column. + + ``search_key`` may be a + :py:class:`~cudf._lib.pylibcudf.column.Column` or a + :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`. + + For details, see :cpp:func:`index_of`. + + Parameters + ---------- + input : Column + The input column. + search_key : Union[Column, Scalar] + The search key. + find_first_option : bool + If true, index_of returns the first match. + Otherwise the last match is returned. + + Returns + ------- + Column + A new Column of index values that indicate where in the + list column the search_key was found. An index value + of -1 indicates that the search_key was not found.
+ """ + cdef unique_ptr[column] c_result + cdef ListColumnView list_view = input.list_view() + cdef cpp_contains.duplicate_find_option find_option = ( + cpp_contains.duplicate_find_option.FIND_FIRST if find_first_option + else cpp_contains.duplicate_find_option.FIND_LAST + ) + + with nogil: + c_result = move(cpp_contains.index_of( + list_view.view(), + search_key.view() if ColumnOrScalar is Column else dereference( + search_key.get() + ), + find_option, + )) + return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py index b21af8ea11c..c781126e388 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_lists.py +++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py @@ -7,15 +7,28 @@ from cudf._lib import pylibcudf as plc -def test_concatenate_rows(): - test_data = [[[0, 1], [2], [5], [6, 7]], [[8], [9], [], [13, 14, 15]]] +@pytest.fixture +def test_data(): + return [[[[0, 1], [2], [5], [6, 7]], [[8], [9], [], [13, 14, 15]]]] - arrow_tbl = pa.Table.from_arrays(test_data, names=["a", "b"]) + +@pytest.fixture +def scalar(): + return pa.scalar(1) + + +@pytest.fixture +def column(): + return pa.array([3, 2, 5, 6]), pa.array([-1, 0, 0, 0], type=pa.int32()) + + +def test_concatenate_rows(test_data): + arrow_tbl = pa.Table.from_arrays(test_data[0], names=["a", "b"]) plc_tbl = plc.interop.from_arrow(arrow_tbl) res = plc.lists.concatenate_rows(plc_tbl) - expect = pa.array([pair[0] + pair[1] for pair in zip(*test_data)]) + expect = pa.array([pair[0] + pair[1] for pair in zip(*test_data[0])]) assert_column_eq(expect, res) @@ -44,3 +57,80 @@ def test_concatenate_list_elements(test_data, dropna, expected): expect = pa.array(expected) assert_column_eq(expect, res) + + +def test_contains_scalar(test_data, scalar): + list_column = test_data[0][0] + arr = pa.array(list_column) + + plc_column = plc.interop.from_arrow(arr) + plc_scalar = plc.interop.from_arrow(scalar) + res = plc.lists.contains(plc_column, plc_scalar) + + expect = pa.array([True, False, False, False]) + + assert_column_eq(expect, res) + + +def test_contains_list_column(test_data): + list_column1 = test_data[0][0] + list_column2 = [1, 3, 5, 1] + arr1 = pa.array(list_column1) + arr2 = pa.array(list_column2) + + plc_column1 = plc.interop.from_arrow(arr1) + plc_column2 = plc.interop.from_arrow(arr2) + res = plc.lists.contains(plc_column1, plc_column2) + + expect = pa.array([True, False, True, False]) + + assert_column_eq(expect, res) + + +@pytest.mark.parametrize( + "list_column, expected", + [ + ( + [[1, None], [1, 3, 4], [5, None]], + [True, False, True], + ), + ( + [[1, None], None, [5]], + [True, None, False], + ), + ], +) +def test_contains_nulls(list_column, expected): + arr = pa.array(list_column) + plc_column = plc.interop.from_arrow(arr) + res = plc.lists.contains_nulls(plc_column) + + expect = pa.array(expected) + + assert_column_eq(expect, res) + + +def test_index_of_scalar(test_data, scalar): + list_column = test_data[0][0] + arr = pa.array(list_column) + + plc_column = plc.interop.from_arrow(arr) + plc_scalar = plc.interop.from_arrow(scalar) + res = plc.lists.index_of(plc_column, plc_scalar, True) + + expect = pa.array([1, -1, -1, -1], type=pa.int32()) + + assert_column_eq(expect, res) + + +def test_index_of_list_column(test_data, column): + list_column = test_data[0][0] + arr1 = pa.array(list_column) + arr2, expect = column + plc_column1 = plc.interop.from_arrow(arr1) + plc_column2 = plc.interop.from_arrow(arr2) + res = 
plc.lists.index_of(plc_column1, plc_column2, True) + + expect = pa.array(column[1], type=pa.int32()) + + assert_column_eq(expect, res) From e434fdbc546dd1810c750abdd086f07b694782b2 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 28 Jun 2024 10:57:01 -0400 Subject: [PATCH 12/27] Update libcudf compiler requirements in contributing doc (#16103) Updates the compiler requirements in the contributing document. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/16103 --- CONTRIBUTING.md | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 98c2ec0a22e..4fbc28fa6e1 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -71,15 +71,14 @@ for a minimal build of libcudf without using conda are also listed below. Compilers: -* `gcc` version 9.3+ -* `nvcc` version 11.5+ -* `cmake` version 3.26.4+ +* `gcc` version 11.4+ +* `nvcc` version 11.8+ +* `cmake` version 3.29.6+ -CUDA/GPU: +CUDA/GPU Runtime: -* CUDA 11.5+ -* NVIDIA driver 450.80.02+ -* Volta architecture or better (Compute Capability >=7.0) +* CUDA 11.4+ +* Volta architecture or better ([Compute Capability](https://docs.nvidia.com/deploy/cuda-compatibility/) >=7.0) You can obtain CUDA from [https://developer.nvidia.com/cuda-downloads](https://developer.nvidia.com/cuda-downloads). From a4b951a6c140c05178edb61d8e28f51a4b430e15 Mon Sep 17 00:00:00 2001 From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> Date: Fri, 28 Jun 2024 10:20:42 -0500 Subject: [PATCH 13/27] Templatization of fixed-width parquet decoding kernels. (#15911) This PR merges all of the fixed-width parquet decoding kernels into a single templatized kernel that can be selectively instantiated with desired features (dictionary/no-dictionary, nested/non-nested, etc). It also adds support for (non-list) nested columns in this path. So structs do not have to use the much slower general decode kernel any more. A new benchmark was added specific to structs containing only fixed width columns. I added this because the performance improvement is fairly high (+20%) but we don't see it in the normal struct benchmarks because they include (and are dominated by) string decode times. 
The new benchmark shows:

Before this PR:
```
| data_type | io_type | cardinality | run_length | bytes_per_second | peak_memory_usage | encoded_file_size |
|-----------|---------------|-------------|------------|------------------|-------------------|-------------------|
| STRUCT | DEVICE_BUFFER | 0 | 1 | 21071216823 | 1.047 GiB | 511.675 MiB |
| STRUCT | DEVICE_BUFFER | 1000 | 1 | 18974392387 | 821.312 MiB | 128.884 MiB |
| STRUCT | DEVICE_BUFFER | 0 | 32 | 20429356824 | 621.787 MiB | 28.141 MiB |
| STRUCT | DEVICE_BUFFER | 1000 | 32 | 20572327813 | 598.421 MiB | 16.475 MiB |
```
After this PR:
```
| data_type | io_type | cardinality | run_length | bytes_per_second | peak_memory_usage | encoded_file_size |
|-----------|---------------|-------------|------------|------------------|-------------------|-------------------|
| STRUCT | DEVICE_BUFFER | 0 | 1 | 25805996399 | 1.047 GiB | 511.675 MiB |
| STRUCT | DEVICE_BUFFER | 1000 | 1 | 22422306660 | 821.312 MiB | 128.884 MiB |
| STRUCT | DEVICE_BUFFER | 0 | 32 | 24460694014 | 621.787 MiB | 28.141 MiB |
| STRUCT | DEVICE_BUFFER | 1000 | 32 | 24674861214 | 598.421 MiB | 16.475 MiB |
```
Split-page decoding for fixed-width types + structs is also going through this new path. New test added. This brings us closer to eliminating the "general" kernel. The only things left that run through it are lists and booleans. This is PR 1 of 2, with the follow-up moving a lot of code around. At this point, I think it makes sense to start consolidating our files a bit. I also left some breadcrumbs (a few small commented out code blocks) in the core kernel `gpuDecodePageDataGeneric` for the next step of adding list support. They can be removed if people don't like them. Authors: - https://github.com/nvdbaranec Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - Vukasin Milovanovic (https://github.com/vuule) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/15911 --- .../io/parquet/parquet_reader_input.cpp | 50 +- cpp/src/io/parquet/decode_fixed.cu | 896 ++++++++++-------- cpp/src/io/parquet/page_hdr.cu | 16 +- cpp/src/io/parquet/parquet_gpu.hpp | 46 +- cpp/src/io/parquet/reader_impl.cpp | 57 +- cpp/tests/io/parquet_writer_test.cpp | 97 +- 6 files changed, 703 insertions(+), 459 deletions(-) diff --git a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp index 019e0f30fe9..7563c823454 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
@@ -59,20 +59,18 @@ void parquet_read_common(cudf::size_type num_rows_to_read, } template -void BM_parquet_read_data(nvbench::state& state, nvbench::type_list>) +void BM_parquet_read_data_common(nvbench::state& state, + data_profile const& profile, + nvbench::type_list>) { auto const d_type = get_type_or_group(static_cast(DataType)); - auto const cardinality = static_cast(state.get_int64("cardinality")); - auto const run_length = static_cast(state.get_int64("run_length")); auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); auto const compression = cudf::io::compression_type::SNAPPY; cuio_source_sink_pair source_sink(source_type); auto const num_rows_written = [&]() { - auto const tbl = create_random_table( - cycle_dtypes(d_type, num_cols), - table_size_bytes{data_size}, - data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + auto const tbl = + create_random_table(cycle_dtypes(d_type, num_cols), table_size_bytes{data_size}, profile); auto const view = tbl->view(); cudf::io::parquet_writer_options write_opts = @@ -85,6 +83,32 @@ void BM_parquet_read_data(nvbench::state& state, nvbench::type_list +void BM_parquet_read_data(nvbench::state& state, + nvbench::type_list> type_list) +{ + auto const cardinality = static_cast(state.get_int64("cardinality")); + auto const run_length = static_cast(state.get_int64("run_length")); + BM_parquet_read_data_common( + state, data_profile_builder().cardinality(cardinality).avg_run_length(run_length), type_list); +} + +template +void BM_parquet_read_fixed_width_struct(nvbench::state& state, + nvbench::type_list> type_list) +{ + auto const cardinality = static_cast(state.get_int64("cardinality")); + auto const run_length = static_cast(state.get_int64("run_length")); + std::vector s_types{ + cudf::type_id::INT32, cudf::type_id::FLOAT32, cudf::type_id::INT64}; + BM_parquet_read_data_common(state, + data_profile_builder() + .cardinality(cardinality) + .avg_run_length(run_length) + .struct_types(s_types), + type_list); +} + void BM_parquet_read_io_compression(nvbench::state& state) { auto const d_type = get_type_or_group({static_cast(data_type::INTEGRAL), @@ -247,3 +271,13 @@ NVBENCH_BENCH(BM_parquet_read_io_small_mixed) .add_int64_axis("cardinality", {0, 1000}) .add_int64_axis("run_length", {1, 32}) .add_int64_axis("num_string_cols", {1, 2, 3}); + +// a benchmark for structs that only contain fixed-width types +using d_type_list_struct_only = nvbench::enum_type_list; +NVBENCH_BENCH_TYPES(BM_parquet_read_fixed_width_struct, NVBENCH_TYPE_AXES(d_type_list_struct_only)) + .set_name("parquet_read_fixed_width_struct") + .set_type_axes_names({"data_type"}) + .add_string_axis("io_type", {"DEVICE_BUFFER"}) + .set_min_samples(4) + .add_int64_axis("cardinality", {0, 1000}) + .add_int64_axis("run_length", {1, 32}); diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index bfd89200786..ea80ae73c2f 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -24,136 +24,11 @@ namespace cudf::io::parquet::detail { namespace { -constexpr int decode_block_size = 128; -constexpr int rolling_buf_size = decode_block_size * 2; -// the required number of runs in shared memory we will need to provide the -// rle_stream object -constexpr int rle_run_buffer_size = rle_stream_required_run_buffer_size(); - -template -static __device__ int gpuUpdateValidityOffsetsAndRowIndicesFlat( - int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, int t) -{ 
- constexpr int num_warps = decode_block_size / cudf::detail::warp_size; - constexpr int max_batch_size = num_warps * cudf::detail::warp_size; - - auto& ni = s->nesting_info[0]; - - // how many (input) values we've processed in the page so far - int value_count = s->input_value_count; - int valid_count = ni.valid_count; - - // cap by last row so that we don't process any rows past what we want to output. - int const first_row = s->first_row; - int const last_row = first_row + s->num_rows; - int const capped_target_value_count = min(target_value_count, last_row); - - int const valid_map_offset = ni.valid_map_offset; - int const row_index_lower_bound = s->row_index_lower_bound; - - __syncthreads(); - - while (value_count < capped_target_value_count) { - int const batch_size = min(max_batch_size, capped_target_value_count - value_count); - - // definition level. only need to process for nullable columns - int d = 0; - if constexpr (nullable) { - d = t < batch_size - ? static_cast(def[rolling_index(value_count + t)]) - : -1; - } - - int const thread_value_count = t + 1; - int const block_value_count = batch_size; - - // compute our row index, whether we're in row bounds, and validity - int const row_index = (thread_value_count + value_count) - 1; - int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); - int is_valid; - if constexpr (nullable) { - is_valid = ((d > 0) && in_row_bounds) ? 1 : 0; - } else { - is_valid = in_row_bounds; - } - - // thread and block validity count - int thread_valid_count, block_valid_count; - if constexpr (nullable) { - using block_scan = cub::BlockScan; - __shared__ typename block_scan::TempStorage scan_storage; - block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count); - __syncthreads(); - - // validity is processed per-warp - // - // nested schemas always read and write to the same bounds (that is, read and write - // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading - // at the first value, even if that is before first_row, because we cannot trivially jump to - // the correct position to start reading. since we are about to write the validity vector - // here we need to adjust our computed mask to take into account the write row bounds. - int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); - int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store - int warp_null_count = 0; - if (write_start >= 0) { - uint32_t const warp_validity_mask = ballot(is_valid); - // lane 0 from each warp writes out validity - if ((t % cudf::detail::warp_size) == 0) { - int const vindex = (value_count + thread_value_count) - 1; // absolute input value index - int const bit_offset = (valid_map_offset + vindex + write_start) - - first_row; // absolute bit offset into the output validity map - int const write_end = - cudf::detail::warp_size - __clz(in_write_row_bounds); // last bit in the warp to store - int const bit_count = write_end - write_start; - warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); - - store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); - } - } - - // sum null counts. we have to do it this way instead of just incrementing by (value_count - - // valid_count) because valid_count also includes rows that potentially start before our row - // bounds. 
if we could come up with a way to clean that up, we could remove this and just - // compute it directly at the end of the kernel. - size_type const block_null_count = - cudf::detail::single_lane_block_sum_reduce(warp_null_count); - if (t == 0) { ni.null_count += block_null_count; } - } - // trivial for non-nullable columns - else { - thread_valid_count = thread_value_count; - block_valid_count = block_value_count; - } - - // output offset - if (is_valid) { - int const dst_pos = (value_count + thread_value_count) - 1; - int const src_pos = (valid_count + thread_valid_count) - 1; - sb->nz_idx[rolling_index(src_pos)] = dst_pos; - } - - // update stuff - value_count += block_value_count; - valid_count += block_valid_count; - } - - if (t == 0) { - // update valid value count for decoding and total # of values we've processed - ni.valid_count = valid_count; - ni.value_count = value_count; - s->nz_count = valid_count; - s->input_value_count = value_count; - s->input_row_count = value_count; - } - - return valid_count; -} - -template -__device__ inline void gpuDecodeValues( +template +__device__ inline void gpuDecodeFixedWidthValues( page_state_s* s, state_buf* const sb, int start, int end, int t) { - constexpr int num_warps = decode_block_size / cudf::detail::warp_size; + constexpr int num_warps = block_size / cudf::detail::warp_size; constexpr int max_batch_size = num_warps * cudf::detail::warp_size; PageNestingDecodeInfo* nesting_info_base = s->nesting_info; @@ -217,18 +92,22 @@ __device__ inline void gpuDecodeValues( } } -template -__device__ inline void gpuDecodeSplitValues(page_state_s* s, - state_buf* const sb, - int start, - int end) +template +struct decode_fixed_width_values_func { + __device__ inline void operator()(page_state_s* s, state_buf* const sb, int start, int end, int t) + { + gpuDecodeFixedWidthValues(s, sb, start, end, t); + } +}; + +template +__device__ inline void gpuDecodeFixedWidthSplitValues( + page_state_s* s, state_buf* const sb, int start, int end, int t) { using cudf::detail::warp_size; - constexpr int num_warps = decode_block_size / warp_size; + constexpr int num_warps = block_size / warp_size; constexpr int max_batch_size = num_warps * warp_size; - auto const t = threadIdx.x; - PageNestingDecodeInfo* nesting_info_base = s->nesting_info; int const dtype = s->col.physical_type; auto const data_len = thrust::distance(s->data_start, s->data_end); @@ -307,266 +186,293 @@ __device__ inline void gpuDecodeSplitValues(page_state_s* s, } } -// is the page marked nullable or not -__device__ inline bool is_nullable(page_state_s* s) -{ - auto const lvl = level_type::DEFINITION; - auto const max_def_level = s->col.max_level[lvl]; - return max_def_level > 0; -} +template +struct decode_fixed_width_split_values_func { + __device__ inline void operator()(page_state_s* s, state_buf* const sb, int start, int end, int t) + { + gpuDecodeFixedWidthSplitValues(s, sb, start, end, t); + } +}; -// for a nullable page, check to see if it could have nulls -__device__ inline bool has_nulls(page_state_s* s) +template +static __device__ int gpuUpdateValidityAndRowIndicesNested( + int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, int t) { - auto const lvl = level_type::DEFINITION; - auto const init_run = s->initial_rle_run[lvl]; - // literal runs, lets assume they could hold nulls - if (is_literal_run(init_run)) { return true; } - - // repeated run with number of items in the run not equal - // to the rows in the page, assume that means we could have nulls - if 
(s->page.num_input_values != (init_run >> 1)) { return true; } - - auto const lvl_bits = s->col.level_bits[lvl]; - auto const run_val = lvl_bits == 0 ? 0 : s->initial_rle_value[lvl]; - - // the encoded repeated value isn't valid, we have (all) nulls - return run_val != s->col.max_level[lvl]; -} + constexpr int num_warps = decode_block_size / cudf::detail::warp_size; + constexpr int max_batch_size = num_warps * cudf::detail::warp_size; -/** - * @brief Kernel for computing fixed width non dictionary column data stored in the pages - * - * This function will write the page data and the page data's validity to the - * output specified in the page's column chunk. If necessary, additional - * conversion will be performed to translate from the Parquet datatype to - * desired output datatype. - * - * @param pages List of pages - * @param chunks List of column chunks - * @param min_row Row index to start reading at - * @param num_rows Maximum number of rows to read - * @param error_code Error code to set if an error is encountered - */ -template -CUDF_KERNEL void __launch_bounds__(decode_block_size) - gpuDecodePageDataFixed(PageInfo* pages, - device_span chunks, - size_t min_row, - size_t num_rows, - kernel_error::pointer error_code) -{ - __shared__ __align__(16) page_state_s state_g; - __shared__ __align__(16) page_state_buffers_s // unused in this kernel - state_buffers; + // how many (input) values we've processed in the page so far + int value_count = s->input_value_count; - page_state_s* const s = &state_g; - auto* const sb = &state_buffers; - int const page_idx = blockIdx.x; - int const t = threadIdx.x; - PageInfo* pp = &pages[page_idx]; + // cap by last row so that we don't process any rows past what we want to output. + int const first_row = s->first_row; + int const last_row = first_row + s->num_rows; + int const capped_target_value_count = min(target_value_count, last_row); - if (!(BitAnd(pages[page_idx].kernel_mask, decode_kernel_mask::FIXED_WIDTH_NO_DICT))) { return; } + int const row_index_lower_bound = s->row_index_lower_bound; - // must come after the kernel mask check - [[maybe_unused]] null_count_back_copier _{s, t}; + int const max_depth = s->col.max_nesting_depth - 1; + __syncthreads(); - if (!setupLocalPageInfo(s, - pp, - chunks, - min_row, - num_rows, - mask_filter{decode_kernel_mask::FIXED_WIDTH_NO_DICT}, - page_processing_stage::DECODE)) { - return; - } + while (value_count < capped_target_value_count) { + int const batch_size = min(max_batch_size, capped_target_value_count - value_count); - // the level stream decoders - __shared__ rle_run def_runs[rle_run_buffer_size]; - rle_stream def_decoder{def_runs}; + // definition level. only need to process for nullable columns + int d = 0; + if constexpr (nullable) { + if (def) { + d = t < batch_size + ? static_cast(def[rolling_index(value_count + t)]) + : -1; + } else { + d = t < batch_size ? 1 : -1; + } + } - // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. 
- if (s->num_rows == 0) { return; } + int const thread_value_count = t + 1; + int const block_value_count = batch_size; - bool const nullable = is_nullable(s); - bool const nullable_with_nulls = nullable && has_nulls(s); + // compute our row index, whether we're in row bounds, and validity + int const row_index = (thread_value_count + value_count) - 1; + int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); + int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); + int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store + + // iterate by depth + for (int d_idx = 0; d_idx <= max_depth; d_idx++) { + auto& ni = s->nesting_info[d_idx]; + + int is_valid; + if constexpr (nullable) { + is_valid = ((d >= ni.max_def_level) && in_row_bounds) ? 1 : 0; + } else { + is_valid = in_row_bounds; + } - // initialize the stream decoders (requires values computed in setupLocalPageInfo) - level_t* const def = reinterpret_cast(pp->lvl_decode_buf[level_type::DEFINITION]); - if (nullable_with_nulls) { - def_decoder.init(s->col.level_bits[level_type::DEFINITION], - s->abs_lvl_start[level_type::DEFINITION], - s->abs_lvl_end[level_type::DEFINITION], - def, - s->page.num_input_values); - } - __syncthreads(); + // thread and block validity count + int thread_valid_count, block_valid_count; + if constexpr (nullable) { + using block_scan = cub::BlockScan; + __shared__ typename block_scan::TempStorage scan_storage; + block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count); + __syncthreads(); + + // validity is processed per-warp + // + // nested schemas always read and write to the same bounds (that is, read and write + // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading + // at the first value, even if that is before first_row, because we cannot trivially jump to + // the correct position to start reading. since we are about to write the validity vector + // here we need to adjust our computed mask to take into account the write row bounds. + int warp_null_count = 0; + if (write_start >= 0 && ni.valid_map != nullptr) { + int const valid_map_offset = ni.valid_map_offset; + uint32_t const warp_validity_mask = ballot(is_valid); + // lane 0 from each warp writes out validity + if ((t % cudf::detail::warp_size) == 0) { + int const vindex = + (value_count + thread_value_count) - 1; // absolute input value index + int const bit_offset = (valid_map_offset + vindex + write_start) - + first_row; // absolute bit offset into the output validity map + int const write_end = cudf::detail::warp_size - + __clz(in_write_row_bounds); // last bit in the warp to store + int const bit_count = write_end - write_start; + warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); + + store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); + } + } - // We use two counters in the loop below: processed_count and valid_count. - // - processed_count: number of rows out of num_input_values that we have decoded so far. - // the definition stream returns the number of total rows it has processed in each call - // to decode_next and we accumulate in process_count. - // - valid_count: number of non-null rows we have decoded so far. In each iteration of the - // loop below, we look at the number of valid items (which could be all for non-nullable), - // and valid_count is that running count. 
- int processed_count = 0; - int valid_count = 0; - // the core loop. decode batches of level stream data using rle_stream objects - // and pass the results to gpuDecodeValues - while (s->error == 0 && processed_count < s->page.num_input_values) { - int next_valid_count; + // sum null counts. we have to do it this way instead of just incrementing by (value_count - + // valid_count) because valid_count also includes rows that potentially start before our row + // bounds. if we could come up with a way to clean that up, we could remove this and just + // compute it directly at the end of the kernel. + size_type const block_null_count = + cudf::detail::single_lane_block_sum_reduce(warp_null_count); + if (t == 0) { ni.null_count += block_null_count; } + } + // trivial for non-nullable columns + else { + thread_valid_count = thread_value_count; + block_valid_count = block_value_count; + } - // only need to process definition levels if the column has nulls - if (nullable_with_nulls) { - processed_count += def_decoder.decode_next(t); - __syncthreads(); + // if this is valid and we're at the leaf, output dst_pos + __syncthreads(); // handle modification of ni.value_count from below + if (is_valid && d_idx == max_depth) { + // for non-list types, the value count is always the same across + int const dst_pos = (value_count + thread_value_count) - 1; + int const src_pos = (ni.valid_count + thread_valid_count) - 1; + sb->nz_idx[rolling_index(src_pos)] = dst_pos; + } + __syncthreads(); // handle modification of ni.value_count from below - next_valid_count = - gpuUpdateValidityOffsetsAndRowIndicesFlat(processed_count, s, sb, def, t); + // update stuff + if (t == 0) { ni.valid_count += block_valid_count; } } - // if we wanted to split off the skip_rows/num_rows case into a separate kernel, we could skip - // this function call entirely since all it will ever generate is a mapping of (i -> i) for - // nz_idx. gpuDecodeValues would be the only work that happens. - else { - processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count); - next_valid_count = gpuUpdateValidityOffsetsAndRowIndicesFlat( - processed_count, s, sb, nullptr, t); - } - __syncthreads(); - // decode the values themselves - gpuDecodeValues(s, sb, valid_count, next_valid_count, t); - __syncthreads(); + value_count += block_value_count; + } - valid_count = next_valid_count; + if (t == 0) { + // update valid value count for decoding and total # of values we've processed + s->nz_count = s->nesting_info[max_depth].valid_count; + s->input_value_count = value_count; + s->input_row_count = value_count; } - if (t == 0 and s->error != 0) { set_error(s->error, error_code); } + + __syncthreads(); + return s->nesting_info[max_depth].valid_count; } -/** - * @brief Kernel for computing fixed width dictionary column data stored in the pages - * - * This function will write the page data and the page data's validity to the - * output specified in the page's column chunk. If necessary, additional - * conversion will be performed to translate from the Parquet datatype to - * desired output datatype. 
- * - * @param pages List of pages - * @param chunks List of column chunks - * @param min_row Row index to start reading at - * @param num_rows Maximum number of rows to read - * @param error_code Error code to set if an error is encountered - */ -template -CUDF_KERNEL void __launch_bounds__(decode_block_size) - gpuDecodePageDataFixedDict(PageInfo* pages, - device_span chunks, - size_t min_row, - size_t num_rows, - kernel_error::pointer error_code) +template +static __device__ int gpuUpdateValidityAndRowIndicesFlat( + int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, int t) { - __shared__ __align__(16) page_state_s state_g; - __shared__ __align__(16) page_state_buffers_s // unused in this kernel - state_buffers; - - page_state_s* const s = &state_g; - auto* const sb = &state_buffers; - int const page_idx = blockIdx.x; - int const t = threadIdx.x; - PageInfo* pp = &pages[page_idx]; + constexpr int num_warps = decode_block_size / cudf::detail::warp_size; + constexpr int max_batch_size = num_warps * cudf::detail::warp_size; - if (!(BitAnd(pages[page_idx].kernel_mask, decode_kernel_mask::FIXED_WIDTH_DICT))) { return; } + auto& ni = s->nesting_info[0]; - // must come after the kernel mask check - [[maybe_unused]] null_count_back_copier _{s, t}; + // how many (input) values we've processed in the page so far + int value_count = s->input_value_count; + int valid_count = ni.valid_count; - if (!setupLocalPageInfo(s, - pp, - chunks, - min_row, - num_rows, - mask_filter{decode_kernel_mask::FIXED_WIDTH_DICT}, - page_processing_stage::DECODE)) { - return; - } + // cap by last row so that we don't process any rows past what we want to output. + int const first_row = s->first_row; + int const last_row = first_row + s->num_rows; + int const capped_target_value_count = min(target_value_count, last_row); - __shared__ rle_run def_runs[rle_run_buffer_size]; - rle_stream def_decoder{def_runs}; + int const valid_map_offset = ni.valid_map_offset; + int const row_index_lower_bound = s->row_index_lower_bound; - __shared__ rle_run dict_runs[rle_run_buffer_size]; - rle_stream dict_stream{dict_runs}; + __syncthreads(); - // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. - if (s->num_rows == 0) { return; } + while (value_count < capped_target_value_count) { + int const batch_size = min(max_batch_size, capped_target_value_count - value_count); - bool const nullable = is_nullable(s); - bool const nullable_with_nulls = nullable && has_nulls(s); + // definition level. only need to process for nullable columns + int d = 0; + if constexpr (nullable) { + if (def) { + d = t < batch_size + ? static_cast(def[rolling_index(value_count + t)]) + : -1; + } else { + d = t < batch_size ? 
1 : -1; + } + } - // initialize the stream decoders (requires values computed in setupLocalPageInfo) - level_t* const def = reinterpret_cast(pp->lvl_decode_buf[level_type::DEFINITION]); - if (nullable_with_nulls) { - def_decoder.init(s->col.level_bits[level_type::DEFINITION], - s->abs_lvl_start[level_type::DEFINITION], - s->abs_lvl_end[level_type::DEFINITION], - def, - s->page.num_input_values); - } + int const thread_value_count = t + 1; + int const block_value_count = batch_size; - dict_stream.init( - s->dict_bits, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values); - __syncthreads(); + // compute our row index, whether we're in row bounds, and validity + int const row_index = (thread_value_count + value_count) - 1; + int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); + int is_valid; + if constexpr (nullable) { + is_valid = ((d > 0) && in_row_bounds) ? 1 : 0; + } else { + is_valid = in_row_bounds; + } - // We use two counters in the loop below: processed_count and valid_count. - // - processed_count: number of rows out of num_input_values that we have decoded so far. - // the definition stream returns the number of total rows it has processed in each call - // to decode_next and we accumulate in process_count. - // - valid_count: number of non-null rows we have decoded so far. In each iteration of the - // loop below, we look at the number of valid items (which could be all for non-nullable), - // and valid_count is that running count. - int processed_count = 0; - int valid_count = 0; + // thread and block validity count + int thread_valid_count, block_valid_count; + if constexpr (nullable) { + using block_scan = cub::BlockScan; + __shared__ typename block_scan::TempStorage scan_storage; + block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count); + __syncthreads(); - // the core loop. decode batches of level stream data using rle_stream objects - // and pass the results to gpuDecodeValues - while (s->error == 0 && processed_count < s->page.num_input_values) { - int next_valid_count; + // validity is processed per-warp + // + // nested schemas always read and write to the same bounds (that is, read and write + // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading + // at the first value, even if that is before first_row, because we cannot trivially jump to + // the correct position to start reading. since we are about to write the validity vector + // here we need to adjust our computed mask to take into account the write row bounds. 
+ int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); + int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store + int warp_null_count = 0; + if (write_start >= 0) { + uint32_t const warp_validity_mask = ballot(is_valid); + // lane 0 from each warp writes out validity + if ((t % cudf::detail::warp_size) == 0) { + int const vindex = (value_count + thread_value_count) - 1; // absolute input value index + int const bit_offset = (valid_map_offset + vindex + write_start) - + first_row; // absolute bit offset into the output validity map + int const write_end = + cudf::detail::warp_size - __clz(in_write_row_bounds); // last bit in the warp to store + int const bit_count = write_end - write_start; + warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); - // only need to process definition levels if the column has nulls - if (nullable_with_nulls) { - processed_count += def_decoder.decode_next(t); - __syncthreads(); + store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); + } + } - // count of valid items in this batch - next_valid_count = - gpuUpdateValidityOffsetsAndRowIndicesFlat(processed_count, s, sb, def, t); + // sum null counts. we have to do it this way instead of just incrementing by (value_count - + // valid_count) because valid_count also includes rows that potentially start before our row + // bounds. if we could come up with a way to clean that up, we could remove this and just + // compute it directly at the end of the kernel. + size_type const block_null_count = + cudf::detail::single_lane_block_sum_reduce(warp_null_count); + if (t == 0) { ni.null_count += block_null_count; } } - // if we wanted to split off the skip_rows/num_rows case into a separate kernel, we could skip - // this function call entirely since all it will ever generate is a mapping of (i -> i) for - // nz_idx. gpuDecodeValues would be the only work that happens. + // trivial for non-nullable columns else { - processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count); - next_valid_count = gpuUpdateValidityOffsetsAndRowIndicesFlat( - processed_count, s, sb, nullptr, t); + thread_valid_count = thread_value_count; + block_valid_count = block_value_count; } - __syncthreads(); - // We want to limit the number of dictionary items we decode, that correspond to - // the rows we have processed in this iteration that are valid. - // We know the number of valid rows to process with: next_valid_count - valid_count. - dict_stream.decode_next(t, next_valid_count - valid_count); - __syncthreads(); + // output offset + if (is_valid) { + int const dst_pos = (value_count + thread_value_count) - 1; + int const src_pos = (valid_count + thread_valid_count) - 1; + sb->nz_idx[rolling_index(src_pos)] = dst_pos; + } - // decode the values themselves - gpuDecodeValues(s, sb, valid_count, next_valid_count, t); - __syncthreads(); + // update stuff + value_count += block_value_count; + valid_count += block_valid_count; + } - valid_count = next_valid_count; + if (t == 0) { + // update valid value count for decoding and total # of values we've processed + ni.valid_count = valid_count; + ni.value_count = value_count; // TODO: remove? 
this is unused in the non-list path + s->nz_count = valid_count; + s->input_value_count = value_count; + s->input_row_count = value_count; } - if (t == 0 and s->error != 0) { set_error(s->error, error_code); } + + return valid_count; +} + +// is the page marked nullable or not +__device__ inline bool is_nullable(page_state_s* s) +{ + auto const lvl = level_type::DEFINITION; + auto const max_def_level = s->col.max_level[lvl]; + return max_def_level > 0; +} + +// for a nullable page, check to see if it could have nulls +__device__ inline bool maybe_has_nulls(page_state_s* s) +{ + auto const lvl = level_type::DEFINITION; + auto const init_run = s->initial_rle_run[lvl]; + // literal runs, lets assume they could hold nulls + if (is_literal_run(init_run)) { return true; } + + // repeated run with number of items in the run not equal + // to the rows in the page, assume that means we could have nulls + if (s->page.num_input_values != (init_run >> 1)) { return true; } + + auto const lvl_bits = s->col.level_bits[lvl]; + auto const run_val = lvl_bits == 0 ? 0 : s->initial_rle_value[lvl]; + + // the encoded repeated value isn't valid, we have (all) nulls + return run_val != s->col.max_level[lvl]; } /** @@ -583,19 +489,28 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) * @param num_rows Maximum number of rows to read * @param error_code Error code to set if an error is encountered */ -template -CUDF_KERNEL void __launch_bounds__(decode_block_size) - gpuDecodeSplitPageDataFlat(PageInfo* pages, - device_span chunks, - size_t min_row, - size_t num_rows, - kernel_error::pointer error_code) +template + typename DecodeValuesFunc> +CUDF_KERNEL void __launch_bounds__(decode_block_size_t) + gpuDecodePageDataGeneric(PageInfo* pages, + device_span chunks, + size_t min_row, + size_t num_rows, + kernel_error::pointer error_code) { + constexpr int rolling_buf_size = decode_block_size_t * 2; + constexpr int rle_run_buffer_size = rle_stream_required_run_buffer_size(); + __shared__ __align__(16) page_state_s state_g; - __shared__ __align__(16) page_state_buffers_s // unused in this kernel - state_buffers; + using state_buf_t = page_state_buffers_s; + __shared__ __align__(16) state_buf_t state_buffers; page_state_s* const s = &state_g; auto* const sb = &state_buffers; @@ -603,9 +518,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) int const t = threadIdx.x; PageInfo* pp = &pages[page_idx]; - if (!(BitAnd(pages[page_idx].kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FLAT))) { - return; - } + if (!(BitAnd(pages[page_idx].kernel_mask, kernel_mask_t))) { return; } // must come after the kernel mask check [[maybe_unused]] null_count_back_copier _{s, t}; @@ -615,30 +528,70 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) chunks, min_row, num_rows, - mask_filter{decode_kernel_mask::BYTE_STREAM_SPLIT_FLAT}, + mask_filter{kernel_mask_t}, page_processing_stage::DECODE)) { return; } - // the level stream decoders - __shared__ rle_run def_runs[rle_run_buffer_size]; - rle_stream def_decoder{def_runs}; - // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. if (s->num_rows == 0) { return; } - bool const nullable = is_nullable(s); - bool const nullable_with_nulls = nullable && has_nulls(s); + DecodeValuesFunc decode_values; + + bool const nullable = is_nullable(s); + bool const should_process_nulls = nullable && maybe_has_nulls(s); + + // shared buffer. all shared memory is suballocated out of here + // constexpr int shared_rep_size = has_lists_t ? 
cudf::util::round_up_unsafe(rle_run_buffer_size * + // sizeof(rle_run), size_t{16}) : 0; + constexpr int shared_dict_size = + has_dict_t + ? cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}) + : 0; + constexpr int shared_def_size = + cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}); + constexpr int shared_buf_size = /*shared_rep_size +*/ shared_dict_size + shared_def_size; + __shared__ __align__(16) uint8_t shared_buf[shared_buf_size]; + + // setup all shared memory buffers + int shared_offset = 0; + /* + rle_run *rep_runs = reinterpret_cast*>(shared_buf + shared_offset); + if constexpr (has_lists_t){ + shared_offset += shared_rep_size; + } + */ + rle_run* dict_runs = reinterpret_cast*>(shared_buf + shared_offset); + if constexpr (has_dict_t) { shared_offset += shared_dict_size; } + rle_run* def_runs = reinterpret_cast*>(shared_buf + shared_offset); // initialize the stream decoders (requires values computed in setupLocalPageInfo) + rle_stream def_decoder{def_runs}; level_t* const def = reinterpret_cast(pp->lvl_decode_buf[level_type::DEFINITION]); - if (nullable_with_nulls) { + if (should_process_nulls) { def_decoder.init(s->col.level_bits[level_type::DEFINITION], s->abs_lvl_start[level_type::DEFINITION], s->abs_lvl_end[level_type::DEFINITION], def, s->page.num_input_values); } + /* + rle_stream rep_decoder{rep_runs}; + level_t* const rep = reinterpret_cast(pp->lvl_decode_buf[level_type::REPETITION]); + if constexpr(has_lists_t){ + rep_decoder.init(s->col.level_bits[level_type::REPETITION], + s->abs_lvl_start[level_type::REPETITION], + s->abs_lvl_end[level_type::REPETITION], + rep, + s->page.num_input_values); + } + */ + + rle_stream dict_stream{dict_runs}; + if constexpr (has_dict_t) { + dict_stream.init( + s->dict_bits, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values); + } __syncthreads(); // We use two counters in the loop below: processed_count and valid_count. @@ -655,26 +608,47 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) while (s->error == 0 && processed_count < s->page.num_input_values) { int next_valid_count; - // only need to process definition levels if the column has nulls - if (nullable_with_nulls) { + // only need to process definition levels if this is a nullable column + if (should_process_nulls) { processed_count += def_decoder.decode_next(t); __syncthreads(); - next_valid_count = - gpuUpdateValidityOffsetsAndRowIndicesFlat(processed_count, s, sb, def, t); + if constexpr (has_nesting_t) { + next_valid_count = gpuUpdateValidityAndRowIndicesNested( + processed_count, s, sb, def, t); + } else { + next_valid_count = gpuUpdateValidityAndRowIndicesFlat( + processed_count, s, sb, def, t); + } } // if we wanted to split off the skip_rows/num_rows case into a separate kernel, we could skip // this function call entirely since all it will ever generate is a mapping of (i -> i) for - // nz_idx. gpuDecodeValues would be the only work that happens. + // nz_idx. gpuDecodeFixedWidthValues would be the only work that happens. 
else { processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count); - next_valid_count = gpuUpdateValidityOffsetsAndRowIndicesFlat( - processed_count, s, sb, nullptr, t); + + if constexpr (has_nesting_t) { + next_valid_count = + gpuUpdateValidityAndRowIndicesNested( + processed_count, s, sb, nullptr, t); + } else { + next_valid_count = gpuUpdateValidityAndRowIndicesFlat( + processed_count, s, sb, nullptr, t); + } } __syncthreads(); + // if we have dictionary data + if constexpr (has_dict_t) { + // We want to limit the number of dictionary items we decode, that correspond to + // the rows we have processed in this iteration that are valid. + // We know the number of valid rows to process with: next_valid_count - valid_count. + dict_stream.decode_next(t, next_valid_count - valid_count); + __syncthreads(); + } + // decode the values themselves - gpuDecodeSplitValues(s, sb, valid_count, next_valid_count); + decode_values(s, sb, valid_count, next_valid_count, t); __syncthreads(); valid_count = next_valid_count; @@ -689,18 +663,55 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span pages, size_t num_rows, size_t min_row, int level_type_size, + bool has_nesting, kernel_error::pointer error_code, rmm::cuda_stream_view stream) { + constexpr int decode_block_size = 128; + dim3 dim_block(decode_block_size, 1); dim3 dim_grid(pages.size(), 1); // 1 threadblock per page if (level_type_size == 1) { - gpuDecodePageDataFixed<<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); + if (has_nesting) { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } else { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } } else { - gpuDecodePageDataFixed<<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); + if (has_nesting) { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } else { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } } } @@ -709,40 +720,113 @@ void __host__ DecodePageDataFixedDict(cudf::detail::hostdevice_span pa size_t num_rows, size_t min_row, int level_type_size, + bool has_nesting, kernel_error::pointer error_code, rmm::cuda_stream_view stream) { - // dim3 dim_block(decode_block_size, 1); // decode_block_size = 128 threads per block - // 1 full warp, and 1 warp of 1 thread + constexpr int decode_block_size = 128; + dim3 dim_block(decode_block_size, 1); // decode_block_size = 128 threads per block dim3 dim_grid(pages.size(), 1); // 1 thread block per page => # blocks if (level_type_size == 1) { - gpuDecodePageDataFixedDict<<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); + if (has_nesting) { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } else { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } } else { - gpuDecodePageDataFixedDict<<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); + if (has_nesting) { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } else { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } } } -void __host__ DecodeSplitPageDataFlat(cudf::detail::hostdevice_span pages, - cudf::detail::hostdevice_span chunks, - size_t num_rows, - size_t min_row, - int level_type_size, - 
kernel_error::pointer error_code, - rmm::cuda_stream_view stream) +void __host__ +DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, + cudf::detail::hostdevice_span chunks, + size_t num_rows, + size_t min_row, + int level_type_size, + bool has_nesting, + kernel_error::pointer error_code, + rmm::cuda_stream_view stream) { + constexpr int decode_block_size = 128; + dim3 dim_block(decode_block_size, 1); // decode_block_size = 128 threads per block dim3 dim_grid(pages.size(), 1); // 1 thread block per page => # blocks if (level_type_size == 1) { - gpuDecodeSplitPageDataFlat<<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); + if (has_nesting) { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } else { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } } else { - gpuDecodeSplitPageDataFlat<<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); + if (has_nesting) { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } else { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } } } diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index cf0dd85e490..d604642be54 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -145,6 +145,11 @@ __device__ inline bool is_nested(ColumnChunkDesc const& chunk) return chunk.max_nesting_depth > 1; } +__device__ inline bool is_list(ColumnChunkDesc const& chunk) +{ + return chunk.max_level[level_type::REPETITION] > 0; +} + __device__ inline bool is_byte_array(ColumnChunkDesc const& chunk) { return chunk.physical_type == BYTE_ARRAY; @@ -178,14 +183,17 @@ __device__ decode_kernel_mask kernel_mask_for_page(PageInfo const& page, return decode_kernel_mask::STRING; } - if (!is_nested(chunk) && !is_byte_array(chunk) && !is_boolean(chunk)) { + if (!is_list(chunk) && !is_byte_array(chunk) && !is_boolean(chunk)) { if (page.encoding == Encoding::PLAIN) { - return decode_kernel_mask::FIXED_WIDTH_NO_DICT; + return is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED + : decode_kernel_mask::FIXED_WIDTH_NO_DICT; } else if (page.encoding == Encoding::PLAIN_DICTIONARY || page.encoding == Encoding::RLE_DICTIONARY) { - return decode_kernel_mask::FIXED_WIDTH_DICT; + return is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_DICT_NESTED + : decode_kernel_mask::FIXED_WIDTH_DICT; } else if (page.encoding == Encoding::BYTE_STREAM_SPLIT) { - return decode_kernel_mask::BYTE_STREAM_SPLIT_FLAT; + return is_nested(chunk) ? decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED + : decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT; } } diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index d82c6f0de59..efc1f5ebab1 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -207,16 +207,20 @@ enum level_type { * Used to control which decode kernels to run. 
*/ enum class decode_kernel_mask { - NONE = 0, - GENERAL = (1 << 0), // Run catch-all decode kernel - STRING = (1 << 1), // Run decode kernel for string data - DELTA_BINARY = (1 << 2), // Run decode kernel for DELTA_BINARY_PACKED data - DELTA_BYTE_ARRAY = (1 << 3), // Run decode kernel for DELTA_BYTE_ARRAY encoded data - DELTA_LENGTH_BA = (1 << 4), // Run decode kernel for DELTA_LENGTH_BYTE_ARRAY encoded data - FIXED_WIDTH_NO_DICT = (1 << 5), // Run decode kernel for fixed width non-dictionary pages - FIXED_WIDTH_DICT = (1 << 6), // Run decode kernel for fixed width dictionary pages - BYTE_STREAM_SPLIT = (1 << 7), // Run decode kernel for BYTE_STREAM_SPLIT encoded data - BYTE_STREAM_SPLIT_FLAT = (1 << 8), // Same as above but with a flat schema + NONE = 0, + GENERAL = (1 << 0), // Run catch-all decode kernel + STRING = (1 << 1), // Run decode kernel for string data + DELTA_BINARY = (1 << 2), // Run decode kernel for DELTA_BINARY_PACKED data + DELTA_BYTE_ARRAY = (1 << 3), // Run decode kernel for DELTA_BYTE_ARRAY encoded data + DELTA_LENGTH_BA = (1 << 4), // Run decode kernel for DELTA_LENGTH_BYTE_ARRAY encoded data + FIXED_WIDTH_NO_DICT = (1 << 5), // Run decode kernel for fixed width non-dictionary pages + FIXED_WIDTH_DICT = (1 << 6), // Run decode kernel for fixed width dictionary pages + BYTE_STREAM_SPLIT = (1 << 7), // Run decode kernel for BYTE_STREAM_SPLIT encoded data + BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT = (1 << 8), // Same as above but for flat, fixed-width data + BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED = + (1 << 9), // Same as above but for nested, fixed-width data + FIXED_WIDTH_NO_DICT_NESTED = (1 << 10), // Run decode kernel for fixed width non-dictionary pages + FIXED_WIDTH_DICT_NESTED = (1 << 11), // Run decode kernel for fixed width dictionary pages }; // mask representing all the ways in which a string can be encoded @@ -888,6 +892,7 @@ void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span pages, * @param[in] num_rows Total number of rows to read * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding + * @param[in] has_nesting Whether or not the data contains nested (but not list) data. * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ @@ -896,6 +901,7 @@ void DecodePageDataFixed(cudf::detail::hostdevice_span pages, std::size_t num_rows, size_t min_row, int level_type_size, + bool has_nesting, kernel_error::pointer error_code, rmm::cuda_stream_view stream); @@ -910,6 +916,7 @@ void DecodePageDataFixed(cudf::detail::hostdevice_span pages, * @param[in] num_rows Total number of rows to read * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding + * @param[in] has_nesting Whether or not the data contains nested (but not list) data. * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ @@ -918,11 +925,12 @@ void DecodePageDataFixedDict(cudf::detail::hostdevice_span pages, std::size_t num_rows, size_t min_row, int level_type_size, + bool has_nesting, kernel_error::pointer error_code, rmm::cuda_stream_view stream); /** - * @brief Launches kernel for reading dictionary fixed width column data stored in the pages + * @brief Launches kernel for reading fixed width column data stored in the pages * * The page data will be written to the output pointed to in the page's * associated column chunk. 
@@ -932,16 +940,18 @@ void DecodePageDataFixedDict(cudf::detail::hostdevice_span pages, * @param[in] num_rows Total number of rows to read * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding + * @param[in] has_nesting Whether or not the data contains nested (but not list) data. * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ -void DecodeSplitPageDataFlat(cudf::detail::hostdevice_span pages, - cudf::detail::hostdevice_span chunks, - std::size_t num_rows, - size_t min_row, - int level_type_size, - kernel_error::pointer error_code, - rmm::cuda_stream_view stream); +void DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, + cudf::detail::hostdevice_span chunks, + std::size_t num_rows, + size_t min_row, + int level_type_size, + bool has_nesting, + kernel_error::pointer error_code, + rmm::cuda_stream_view stream); /** * @brief Launches kernel for initializing encoder row group fragments diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 1bd2fae281c..f705f6626e7 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -267,14 +267,27 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num } // launch byte stream split decoder - if (BitAnd(kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FLAT) != 0) { - DecodeSplitPageDataFlat(subpass.pages, - pass.chunks, - num_rows, - skip_rows, - level_type_size, - error_code.data(), - streams[s_idx++]); + if (BitAnd(kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT) != 0) { + DecodeSplitPageFixedWidthData(subpass.pages, + pass.chunks, + num_rows, + skip_rows, + level_type_size, + false, + error_code.data(), + streams[s_idx++]); + } + + // launch byte stream split decoder, for nested columns + if (BitAnd(kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED) != 0) { + DecodeSplitPageFixedWidthData(subpass.pages, + pass.chunks, + num_rows, + skip_rows, + level_type_size, + true, + error_code.data(), + streams[s_idx++]); } // launch byte stream split decoder @@ -288,22 +301,50 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num streams[s_idx++]); } + // launch fixed width type decoder if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_NO_DICT) != 0) { DecodePageDataFixed(subpass.pages, pass.chunks, num_rows, skip_rows, level_type_size, + false, + error_code.data(), + streams[s_idx++]); + } + + // launch fixed width type decoder, for nested columns + if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED) != 0) { + DecodePageDataFixed(subpass.pages, + pass.chunks, + num_rows, + skip_rows, + level_type_size, + true, error_code.data(), streams[s_idx++]); } + // launch fixed width type decoder with dictionaries if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_DICT) != 0) { DecodePageDataFixedDict(subpass.pages, pass.chunks, num_rows, skip_rows, level_type_size, + false, + error_code.data(), + streams[s_idx++]); + } + + // launch fixed width type decoder with dictionaries, for nested columns + if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_DICT_NESTED) != 0) { + DecodePageDataFixedDict(subpass.pages, + pass.chunks, + num_rows, + skip_rows, + level_type_size, + true, error_code.data(), streams[s_idx++]); } diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index 84ab83e33d0..a1f4c7b81d8 
100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -1785,7 +1785,8 @@ TEST_F(ParquetWriterTest, DeltaBinaryStartsWithNulls) CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } -TEST_F(ParquetWriterTest, ByteStreamSplit) +std::pair, cudf::io::table_input_metadata> +make_byte_stream_split_table(bool as_struct) { constexpr auto num_rows = 100; std::mt19937 engine{31337}; @@ -1802,24 +1803,73 @@ TEST_F(ParquetWriterTest, ByteStreamSplit) // throw in a list to make sure both decoders are working auto col4 = make_parquet_list_col(engine, num_rows, 5, true); - auto expected = table_view{{col0, col1, col2, col3, *col4}}; + std::vector> columns; + columns.reserve(5); + columns.push_back(col0.release()); + columns.push_back(col1.release()); + columns.push_back(col2.release()); + columns.push_back(col3.release()); + columns.push_back(std::move(col4)); + + return [&]() -> std::pair, cudf::io::table_input_metadata> { + auto const encoding = cudf::io::column_encoding::BYTE_STREAM_SPLIT; + + // make as a nested struct + if (as_struct) { + auto valids = + cudf::detail::make_counting_transform_iterator(0, [](int i) { return i % 2 == 0; }); + auto [null_mask, null_count] = cudf::test::detail::make_null_mask(valids, valids + num_rows); + + std::vector> table_cols; + table_cols.push_back( + cudf::make_structs_column(num_rows, std::move(columns), null_count, std::move(null_mask))); + + auto tbl = std::make_unique(std::move(table_cols)); + auto expected = table_view{*tbl}; + + cudf::io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("struct"); + expected_metadata.column_metadata[0].set_encoding(encoding); + + expected_metadata.column_metadata[0].child(0).set_name("int32s"); + expected_metadata.column_metadata[0].child(1).set_name("int64s"); + expected_metadata.column_metadata[0].child(2).set_name("floats"); + expected_metadata.column_metadata[0].child(3).set_name("doubles"); + expected_metadata.column_metadata[0].child(4).set_name("int32list"); + for (int idx = 0; idx <= 3; idx++) { + expected_metadata.column_metadata[0].child(idx).set_encoding(encoding); + } + expected_metadata.column_metadata[0].child(4).child(1).set_encoding(encoding); - cudf::io::table_input_metadata expected_metadata(expected); - expected_metadata.column_metadata[0].set_name("int32s"); - expected_metadata.column_metadata[1].set_name("int64s"); - expected_metadata.column_metadata[2].set_name("floats"); - expected_metadata.column_metadata[3].set_name("doubles"); - expected_metadata.column_metadata[4].set_name("int32list"); - auto const encoding = cudf::io::column_encoding::BYTE_STREAM_SPLIT; - for (int i = 0; i <= 3; i++) { - expected_metadata.column_metadata[i].set_encoding(encoding); - } + return {std::move(tbl), expected_metadata}; + } + + // make flat + auto tbl = std::make_unique(std::move(columns)); + auto expected = table_view{*tbl}; - expected_metadata.column_metadata[4].child(1).set_encoding(encoding); + cudf::io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("int32s"); + expected_metadata.column_metadata[1].set_name("int64s"); + expected_metadata.column_metadata[2].set_name("floats"); + expected_metadata.column_metadata[3].set_name("doubles"); + expected_metadata.column_metadata[4].set_name("int32list"); + for (int idx = 0; idx <= 3; idx++) { + expected_metadata.column_metadata[idx].set_encoding(encoding); + } + + 
expected_metadata.column_metadata[4].child(1).set_encoding(encoding);
+    return {std::move(tbl), expected_metadata};
+  }();
+}
+
+TEST_F(ParquetWriterTest, ByteStreamSplit)
+{
+  auto [expected, expected_metadata] = make_byte_stream_split_table(false);
   auto const filepath = temp_env->get_temp_filepath("ByteStreamSplit.parquet");
   cudf::io::parquet_writer_options out_opts =
-    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected)
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, *expected)
       .metadata(expected_metadata);
   cudf::io::write_parquet(out_opts);
@@ -1827,7 +1877,24 @@ TEST_F(ParquetWriterTest, ByteStreamSplit)
     cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath});
   auto result = cudf::io::read_parquet(in_opts);
-  CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
+  CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, result.tbl->view());
+}
+
+TEST_F(ParquetWriterTest, ByteStreamSplitStruct)
+{
+  auto [expected, expected_metadata] = make_byte_stream_split_table(true);
+
+  auto const filepath = temp_env->get_temp_filepath("ByteStreamSplitStruct.parquet");
+  cudf::io::parquet_writer_options out_opts =
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, *expected)
+      .metadata(expected_metadata);
+  cudf::io::write_parquet(out_opts);
+
+  cudf::io::parquet_reader_options in_opts =
+    cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath});
+  auto result = cudf::io::read_parquet(in_opts);
+
+  CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, result.tbl->view());
 }

 TEST_F(ParquetWriterTest, DecimalByteStreamSplit)

From 78f4a8a3f639677358bce83a699f92c90476ae75 Mon Sep 17 00:00:00 2001
From: Robert Maynard
Date: Fri, 28 Jun 2024 11:26:27 -0400
Subject: [PATCH 14/27] Move common string utilities to public api (#16070)

As part of https://github.com/rapidsai/cudf/pull/15982 a subset of the
strings utility functions has been identified as being worth exposing as
part of the cudf public API.

The `create_string_vector_from_column`, `get_offset64_threshold`, and
`is_large_strings_enabled` are now made part of the public
`cudf::strings` API.
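Code that previously reached into `cudf::strings::detail` for these helpers
can now call the public entry points declared in `cudf/strings/utilities.hpp`.
A minimal sketch of a caller (the wrapper function and its name are
illustrative, not part of this change):

```cpp
#include <cudf/column/column_view.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/strings/utilities.hpp>

// Hypothetical helper: materialize device-side string_views for an existing
// strings column using only the public API.
auto to_string_views(cudf::column_view const& input)
{
  if (cudf::strings::is_large_strings_enabled()) {
    // Outputs whose character data reaches this size switch to int64 offsets.
    [[maybe_unused]] auto const threshold = cudf::strings::get_offset64_threshold();
  }
  // Uses the default stream and memory resource from the public declaration.
  return cudf::strings::create_string_vector_from_column(cudf::strings_column_view{input});
}
```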
Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - MithunR (https://github.com/mythrocks) - David Wendt (https://github.com/davidwendt) - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16070 --- .../cudf/strings/detail/strings_children.cuh | 7 ++- cpp/include/cudf/strings/utilities.hpp | 62 +++++++++++++++++++ cpp/src/strings/utilities.cu | 22 +++++-- cpp/tests/column/factories_test.cpp | 4 +- cpp/tests/copying/concatenate_tests.cpp | 8 +-- cpp/tests/strings/array_tests.cpp | 4 +- cpp/tests/strings/repeat_strings_tests.cpp | 4 +- .../strings/src/strings/udf/udf_apis.cu | 4 +- 8 files changed, 95 insertions(+), 20 deletions(-) create mode 100644 cpp/include/cudf/strings/utilities.hpp diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh index f105a6dc546..f5f3982a5d6 100644 --- a/cpp/include/cudf/strings/detail/strings_children.cuh +++ b/cpp/include/cudf/strings/detail/strings_children.cuh @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -81,11 +82,11 @@ std::pair, int64_t> make_offsets_child_column( auto const total_bytes = cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets, stream); - auto const threshold = get_offset64_threshold(); - CUDF_EXPECTS(is_large_strings_enabled() || (total_bytes < threshold), + auto const threshold = cudf::strings::get_offset64_threshold(); + CUDF_EXPECTS(cudf::strings::is_large_strings_enabled() || (total_bytes < threshold), "Size of output exceeds the column size limit", std::overflow_error); - if (total_bytes >= get_offset64_threshold()) { + if (total_bytes >= cudf::strings::get_offset64_threshold()) { // recompute as int64 offsets when above the threshold offsets_column = make_numeric_column( data_type{type_id::INT64}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); diff --git a/cpp/include/cudf/strings/utilities.hpp b/cpp/include/cudf/strings/utilities.hpp new file mode 100644 index 00000000000..ae445282382 --- /dev/null +++ b/cpp/include/cudf/strings/utilities.hpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include +#include + +namespace CUDF_EXPORT cudf { +namespace strings { + +/** + * @brief Creates a string_view vector from a strings column. + * + * @param strings Strings column instance. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned vector's device memory. 
+ * @return Device vector of string_views + */ +rmm::device_uvector create_string_vector_from_column( + cudf::strings_column_view const strings, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Return the threshold size for a strings column to use int64 offsets + * + * A computed size above this threshold should using int64 offsets, otherwise + * int32 offsets. By default this function will return std::numeric_limits::max(). + * This value can be overridden at runtime using the environment variable + * LIBCUDF_LARGE_STRINGS_THRESHOLD. + * + * @return size in bytes + */ +int64_t get_offset64_threshold(); + +/** + * @brief Checks if large strings is enabled + * + * This checks the setting in the environment variable LIBCUDF_LARGE_STRINGS_ENABLED. + * + * @return true if large strings are supported + */ +bool is_large_strings_enabled(); + +} // namespace strings +} // namespace CUDF_EXPORT cudf diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index 18e726a6d7d..101004a5d06 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -13,16 +13,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #include "strings/char_types/char_cases.h" #include "strings/char_types/char_flags.h" #include #include #include +#include #include #include #include +#include #include #include @@ -36,8 +37,7 @@ #include #include -namespace cudf { -namespace strings { +namespace cudf::strings { namespace detail { /** @@ -175,5 +175,17 @@ int64_t get_offset_value(cudf::column_view const& offsets, } } // namespace detail -} // namespace strings -} // namespace cudf + +rmm::device_uvector create_string_vector_from_column( + cudf::strings_column_view const strings, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::create_string_vector_from_column(strings, stream, mr); +} + +int64_t get_offset64_threshold() { return detail::get_offset64_threshold(); } +bool is_large_strings_enabled() { return detail::is_large_strings_enabled(); } + +} // namespace cudf::strings diff --git a/cpp/tests/column/factories_test.cpp b/cpp/tests/column/factories_test.cpp index dca36eaa4e7..603187f0330 100644 --- a/cpp/tests/column/factories_test.cpp +++ b/cpp/tests/column/factories_test.cpp @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include @@ -762,7 +762,7 @@ TEST_F(ColumnFactoryTest, FromStructScalarNull) { struct_from_scalar(false); } TEST_F(ColumnFactoryTest, FromScalarErrors) { - if (cudf::strings::detail::is_large_strings_enabled()) { return; } + if (cudf::strings::is_large_strings_enabled()) { return; } cudf::string_scalar ss("hello world"); EXPECT_THROW(cudf::make_column_from_scalar(ss, 214748365), std::overflow_error); diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp index 078e0ef9bae..054441788d0 100644 --- a/cpp/tests/copying/concatenate_tests.cpp +++ b/cpp/tests/copying/concatenate_tests.cpp @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include #include @@ -189,7 +189,7 @@ TEST_F(StringColumnTest, ConcatenateManyColumns) TEST_F(StringColumnTest, ConcatenateTooLarge) { - if (cudf::strings::detail::is_large_strings_enabled()) { return; } + if (cudf::strings::is_large_strings_enabled()) { return; } std::string big_str(1000000, 'a'); // 1 million bytes x 5 
= 5 million bytes
   cudf::test::strings_column_wrapper input{big_str, big_str, big_str, big_str, big_str};
@@ -379,7 +379,7 @@ TEST_F(OverflowTest, OverflowTest)
   }
   // string column, overflow on chars
-  if (!cudf::strings::detail::is_large_strings_enabled()) {
+  if (!cudf::strings::is_large_strings_enabled()) {
     constexpr auto size = static_cast(static_cast(1024) * 1024 * 1024);
     // try and concatenate 6 string columns of with 1 billion chars in each
@@ -502,7 +502,7 @@ TEST_F(OverflowTest, Presliced)
   }
   // strings, overflow on chars
-  if (!cudf::strings::detail::is_large_strings_enabled()) {
+  if (!cudf::strings::is_large_strings_enabled()) {
     constexpr cudf::size_type total_chars_size = 1024 * 1024 * 1024;
     constexpr cudf::size_type string_size = 64;
     constexpr cudf::size_type num_rows = total_chars_size / string_size;
diff --git a/cpp/tests/strings/array_tests.cpp b/cpp/tests/strings/array_tests.cpp
index a1bb87a43fb..9c0ecaa52c0 100644
--- a/cpp/tests/strings/array_tests.cpp
+++ b/cpp/tests/strings/array_tests.cpp
@@ -23,8 +23,8 @@
 #include
 #include
 #include
-#include
 #include
+#include
 #include
 #include
@@ -153,7 +153,7 @@ TEST_F(StringsColumnTest, GatherZeroSizeStringsColumn)
 TEST_F(StringsColumnTest, GatherTooBig)
 {
-  if (cudf::strings::detail::is_large_strings_enabled()) { return; }
+  if (cudf::strings::is_large_strings_enabled()) { return; }
   std::vector h_chars(3000000);
   cudf::test::fixed_width_column_wrapper chars(h_chars.begin(), h_chars.end());
diff --git a/cpp/tests/strings/repeat_strings_tests.cpp b/cpp/tests/strings/repeat_strings_tests.cpp
index 0539895c5f4..aa4d9320d7c 100644
--- a/cpp/tests/strings/repeat_strings_tests.cpp
+++ b/cpp/tests/strings/repeat_strings_tests.cpp
@@ -20,9 +20,9 @@
 #include
 #include
-#include
 #include
 #include
+#include

 using namespace cudf::test::iterators;
@@ -221,7 +221,7 @@ TEST_F(RepeatStringsTest, StringsColumnWithColumnRepeatTimesInvalidInput)
 TEST_F(RepeatStringsTest, StringsColumnWithColumnRepeatTimesOverflowOutput)
 {
-  if (cudf::strings::detail::is_large_strings_enabled()) { return; }
+  if (cudf::strings::is_large_strings_enabled()) { return; }
   auto const strs = strs_col{"1", "12", "123", "1234", "12345", "123456", "1234567"};
   auto const strs_cv = cudf::strings_column_view(strs);
diff --git a/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu b/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu
index 941e61e6787..b924995cf4b 100644
--- a/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu
+++ b/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu
@@ -15,10 +15,10 @@
  */
 #include
-#include
 #include
 #include
 #include
+#include
 #include
 #include
@@ -57,7 +57,7 @@ std::unique_ptr to_string_view_array(cudf::column_view const
                                      rmm::cuda_stream_view stream)
 {
   return std::make_unique(
-    std::move(cudf::strings::detail::create_string_vector_from_column(
+    std::move(cudf::strings::create_string_vector_from_column(
       cudf::strings_column_view(input), stream, rmm::mr::get_current_device_resource())
       .release()));
 }

From fb12d980342833a9d7092a19717eedad22328e6a Mon Sep 17 00:00:00 2001
From: Robert Maynard
Date: Fri, 28 Jun 2024 12:14:58 -0400
Subject: [PATCH 15/27] Installed cudf header use cudf::allocate_like (#16087)

Remove usage of the non-public `cudf::detail::allocate_like` from
implementations in headers we install.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/16087
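The public overload accepts the same arguments the installed headers were
already passing to the detail version. A hedged sketch of a caller (the
function name and signature here are illustrative):

```cpp
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/copying.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/resource_ref.hpp>

#include <memory>

// Hypothetical caller: allocate an output column shaped like `input` without
// touching cudf::detail. RETAIN allocates a null mask only if `input` has one.
std::unique_ptr<cudf::column> make_output(cudf::column_view const& input,
                                          cudf::size_type output_size,
                                          rmm::cuda_stream_view stream,
                                          rmm::device_async_resource_ref mr)
{
  return cudf::allocate_like(
    input, output_size, cudf::mask_allocation_policy::RETAIN, stream, mr);
}
```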
--- cpp/include/cudf/detail/copy_if.cuh | 6 +++--- cpp/include/cudf/detail/gather.cuh | 13 ++++++------- cpp/src/copying/sample.cu | 1 + cpp/src/lists/copying/segmented_gather.cu | 1 + 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index c98057d077a..b6310e6cd2f 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include #include @@ -242,8 +242,8 @@ struct scatter_gather_functor { rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto output_column = cudf::detail::allocate_like( - input, output_size, cudf::mask_allocation_policy::RETAIN, stream, mr); + auto output_column = + cudf::allocate_like(input, output_size, cudf::mask_allocation_policy::RETAIN, stream, mr); auto output = output_column->mutable_view(); bool has_valid = input.nullable(); diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index c9d350ce983..5977c7341c1 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -15,7 +15,7 @@ */ #pragma once -#include +#include #include #include #include @@ -217,10 +217,9 @@ struct column_gatherer_impl(), source_column.size(), @@ -413,8 +412,8 @@ struct column_gatherer_impl { auto keys_copy = std::make_unique(dictionary.keys(), stream, mr); // Perform gather on just the indices column_view indices = dictionary.get_indices_annotated(); - auto new_indices = cudf::detail::allocate_like( - indices, output_count, cudf::mask_allocation_policy::NEVER, stream, mr); + auto new_indices = + cudf::allocate_like(indices, output_count, cudf::mask_allocation_policy::NEVER, stream, mr); gather_helper( cudf::detail::indexalator_factory::make_input_iterator(indices), indices.size(), diff --git a/cpp/src/copying/sample.cu b/cpp/src/copying/sample.cu index f8e3a9a83e3..ba00527f6b6 100644 --- a/cpp/src/copying/sample.cu +++ b/cpp/src/copying/sample.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include diff --git a/cpp/src/lists/copying/segmented_gather.cu b/cpp/src/lists/copying/segmented_gather.cu index 89b1a126fc5..779eca438db 100644 --- a/cpp/src/lists/copying/segmented_gather.cu +++ b/cpp/src/lists/copying/segmented_gather.cu @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include #include #include #include From df88cf5ffccd8a454f17ba686dcb5ec0d7a045b3 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 28 Jun 2024 15:40:52 -0500 Subject: [PATCH 16/27] Use size_t to allow large conditional joins (#16127) The conditional join kernels were using `cudf::size_type` where `std::size_t` was needed. This PR fixes that bug, which caused `cudaErrorIllegalAddress` as shown in #16115. This closes #16115. I did not add tests because we typically do not test very large workloads. However, I committed the test and reverted it in this PR, so there is a record of my validation code. 
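A back-of-the-envelope check makes the overflow concrete (the numbers below
are illustrative, not taken from the linked issue):

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <limits>

int main()
{
  // A permissive predicate over two ~50k-row tables can emit up to
  // left_rows * right_rows candidate pairs.
  std::size_t const left_rows  = 50'000;
  std::size_t const right_rows = 50'000;
  std::size_t const worst_case = left_rows * right_rows;  // 2.5e9 pairs

  // cudf::size_type is a 32-bit signed integer, so a global write index of
  // that type overflows long before device memory does.
  bool const overflows =
    worst_case > static_cast<std::size_t>(std::numeric_limits<std::int32_t>::max());
  std::cout << worst_case << (overflows ? " pairs exceeds int32 range\n" : " pairs\n");
  return 0;
}
```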
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - https://github.com/nvdbaranec - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/16127 --- cpp/src/join/conditional_join.cu | 5 +- cpp/src/join/conditional_join_kernels.cuh | 124 ++++++++++++++++++++-- cpp/src/join/join_common_utils.cuh | 95 ----------------- 3 files changed, 117 insertions(+), 107 deletions(-) diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu index 97a06d5a923..d4ef2747c9d 100644 --- a/cpp/src/join/conditional_join.cu +++ b/cpp/src/join/conditional_join.cu @@ -95,7 +95,7 @@ std::unique_ptr> conditional_join_anti_semi( join_size = size.value(stream); } - rmm::device_scalar write_index(0, stream); + rmm::device_scalar write_index(0, stream); auto left_indices = std::make_unique>(join_size, stream, mr); @@ -232,13 +232,14 @@ conditional_join(table_view const& left, std::make_unique>(0, stream, mr)); } - rmm::device_scalar write_index(0, stream); + rmm::device_scalar write_index(0, stream); auto left_indices = std::make_unique>(join_size, stream, mr); auto right_indices = std::make_unique>(join_size, stream, mr); auto const& join_output_l = left_indices->data(); auto const& join_output_r = right_indices->data(); + if (has_nulls) { conditional_join <<>>( diff --git a/cpp/src/join/conditional_join_kernels.cuh b/cpp/src/join/conditional_join_kernels.cuh index 1e16c451f5a..62769862f54 100644 --- a/cpp/src/join/conditional_join_kernels.cuh +++ b/cpp/src/join/conditional_join_kernels.cuh @@ -29,6 +29,110 @@ namespace cudf { namespace detail { +/** + * @brief Adds a pair of indices to the shared memory cache + * + * @param[in] first The first index in the pair + * @param[in] second The second index in the pair + * @param[in,out] current_idx_shared Pointer to shared index that determines + * where in the shared memory cache the pair will be written + * @param[in] warp_id The ID of the warp of the calling the thread + * @param[out] joined_shared_l Pointer to the shared memory cache for left indices + * @param[out] joined_shared_r Pointer to the shared memory cache for right indices + */ +__inline__ __device__ void add_pair_to_cache(size_type const first, + size_type const second, + std::size_t* current_idx_shared, + int const warp_id, + size_type* joined_shared_l, + size_type* joined_shared_r) +{ + cuda::atomic_ref ref{*(current_idx_shared + warp_id)}; + std::size_t my_current_idx = ref.fetch_add(1, cuda::memory_order_relaxed); + // It's guaranteed to fit into the shared cache + joined_shared_l[my_current_idx] = first; + joined_shared_r[my_current_idx] = second; +} + +__inline__ __device__ void add_left_to_cache(size_type const first, + std::size_t* current_idx_shared, + int const warp_id, + size_type* joined_shared_l) +{ + cuda::atomic_ref ref{*(current_idx_shared + warp_id)}; + std::size_t my_current_idx = ref.fetch_add(1, cuda::memory_order_relaxed); + joined_shared_l[my_current_idx] = first; +} + +template +__device__ void flush_output_cache(unsigned int const activemask, + std::size_t const max_size, + int const warp_id, + int const lane_id, + std::size_t* current_idx, + std::size_t current_idx_shared[num_warps], + size_type join_shared_l[num_warps][output_cache_size], + size_type join_shared_r[num_warps][output_cache_size], + size_type* join_output_l, + size_type* join_output_r) +{ + // count how many active threads participating here which could be less than 
warp_size + int const num_threads = __popc(activemask); + std::size_t output_offset = 0; + + if (0 == lane_id) { + cuda::atomic_ref ref{*current_idx}; + output_offset = ref.fetch_add(current_idx_shared[warp_id], cuda::memory_order_relaxed); + } + + // No warp sync is necessary here because we are assuming that ShuffleIndex + // is internally using post-CUDA 9.0 synchronization-safe primitives + // (__shfl_sync instead of __shfl). __shfl is technically not guaranteed to + // be safe by the compiler because it is not required by the standard to + // converge divergent branches before executing. + output_offset = cub::ShuffleIndex(output_offset, 0, activemask); + + for (std::size_t shared_out_idx = static_cast(lane_id); + shared_out_idx < current_idx_shared[warp_id]; + shared_out_idx += num_threads) { + std::size_t thread_offset = output_offset + shared_out_idx; + if (thread_offset < max_size) { + join_output_l[thread_offset] = join_shared_l[warp_id][shared_out_idx]; + join_output_r[thread_offset] = join_shared_r[warp_id][shared_out_idx]; + } + } +} + +template +__device__ void flush_output_cache(unsigned int const activemask, + std::size_t const max_size, + int const warp_id, + int const lane_id, + std::size_t* current_idx, + std::size_t current_idx_shared[num_warps], + size_type join_shared_l[num_warps][output_cache_size], + size_type* join_output_l) +{ + int const num_threads = __popc(activemask); + std::size_t output_offset = 0; + + if (0 == lane_id) { + cuda::atomic_ref ref{*current_idx}; + output_offset = ref.fetch_add(current_idx_shared[warp_id], cuda::memory_order_relaxed); + } + + output_offset = cub::ShuffleIndex(output_offset, 0, activemask); + + for (std::size_t shared_out_idx = static_cast(lane_id); + shared_out_idx < current_idx_shared[warp_id]; + shared_out_idx += num_threads) { + std::size_t thread_offset = output_offset + shared_out_idx; + if (thread_offset < max_size) { + join_output_l[thread_offset] = join_shared_l[warp_id][shared_out_idx]; + } + } +} + /** * @brief Computes the output size of joining the left table to the right table. 
* @@ -103,14 +207,14 @@ CUDF_KERNEL void compute_conditional_join_output_size( } } - using BlockReduce = cub::BlockReduce; + using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t block_counter = BlockReduce(temp_storage).Sum(thread_counter); // Add block counter to global counter if (threadIdx.x == 0) { cuda::atomic_ref ref{*output_size}; - ref.fetch_add(block_counter, cuda::std::memory_order_relaxed); + ref.fetch_add(block_counter, cuda::memory_order_relaxed); } } @@ -143,13 +247,13 @@ CUDF_KERNEL void conditional_join(table_device_view left_table, join_kind join_type, cudf::size_type* join_output_l, cudf::size_type* join_output_r, - cudf::size_type* current_idx, + std::size_t* current_idx, cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const max_size, + std::size_t const max_size, bool const swap_tables) { constexpr int num_warps = block_size / detail::warp_size; - __shared__ cudf::size_type current_idx_shared[num_warps]; + __shared__ std::size_t current_idx_shared[num_warps]; __shared__ cudf::size_type join_shared_l[num_warps][output_cache_size]; __shared__ cudf::size_type join_shared_r[num_warps][output_cache_size]; @@ -183,7 +287,7 @@ CUDF_KERNEL void conditional_join(table_device_view left_table, if (outer_row_index < outer_num_rows) { bool found_match = false; - for (thread_index_type inner_row_index(0); inner_row_index < inner_num_rows; + for (cudf::thread_index_type inner_row_index(0); inner_row_index < inner_num_rows; ++inner_row_index) { auto output_dest = cudf::ast::detail::value_expression_result(); auto const left_row_index = swap_tables ? inner_row_index : outer_row_index; @@ -277,12 +381,12 @@ CUDF_KERNEL void conditional_join_anti_semi( table_device_view right_table, join_kind join_type, cudf::size_type* join_output_l, - cudf::size_type* current_idx, + std::size_t* current_idx, cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const max_size) + std::size_t const max_size) { constexpr int num_warps = block_size / detail::warp_size; - __shared__ cudf::size_type current_idx_shared[num_warps]; + __shared__ std::size_t current_idx_shared[num_warps]; __shared__ cudf::size_type join_shared_l[num_warps][output_cache_size]; extern __shared__ char raw_intermediate_storage[]; @@ -310,7 +414,7 @@ CUDF_KERNEL void conditional_join_anti_semi( for (cudf::thread_index_type outer_row_index = start_idx; outer_row_index < outer_num_rows; outer_row_index += stride) { bool found_match = false; - for (thread_index_type inner_row_index(0); inner_row_index < inner_num_rows; + for (cudf::thread_index_type inner_row_index(0); inner_row_index < inner_num_rows; ++inner_row_index) { auto output_dest = cudf::ast::detail::value_expression_result(); diff --git a/cpp/src/join/join_common_utils.cuh b/cpp/src/join/join_common_utils.cuh index 31f267d5cfb..3d0f3e4340d 100644 --- a/cpp/src/join/join_common_utils.cuh +++ b/cpp/src/join/join_common_utils.cuh @@ -262,101 +262,6 @@ struct valid_range { } }; -/** - * @brief Adds a pair of indices to the shared memory cache - * - * @param[in] first The first index in the pair - * @param[in] second The second index in the pair - * @param[in,out] current_idx_shared Pointer to shared index that determines - * where in the shared memory cache the pair will be written - * @param[in] warp_id The ID of the warp of the calling the thread - * @param[out] joined_shared_l Pointer to the shared memory cache for left indices - * @param[out] 
joined_shared_r Pointer to the shared memory cache for right indices - */ -__inline__ __device__ void add_pair_to_cache(size_type const first, - size_type const second, - size_type* current_idx_shared, - int const warp_id, - size_type* joined_shared_l, - size_type* joined_shared_r) -{ - size_type my_current_idx{atomicAdd(current_idx_shared + warp_id, size_type(1))}; - // its guaranteed to fit into the shared cache - joined_shared_l[my_current_idx] = first; - joined_shared_r[my_current_idx] = second; -} - -__inline__ __device__ void add_left_to_cache(size_type const first, - size_type* current_idx_shared, - int const warp_id, - size_type* joined_shared_l) -{ - size_type my_current_idx{atomicAdd(current_idx_shared + warp_id, size_type(1))}; - - joined_shared_l[my_current_idx] = first; -} - -template -__device__ void flush_output_cache(unsigned int const activemask, - cudf::size_type const max_size, - int const warp_id, - int const lane_id, - cudf::size_type* current_idx, - cudf::size_type current_idx_shared[num_warps], - size_type join_shared_l[num_warps][output_cache_size], - size_type join_shared_r[num_warps][output_cache_size], - size_type* join_output_l, - size_type* join_output_r) -{ - // count how many active threads participating here which could be less than warp_size - int const num_threads = __popc(activemask); - cudf::size_type output_offset = 0; - - if (0 == lane_id) { output_offset = atomicAdd(current_idx, current_idx_shared[warp_id]); } - - // No warp sync is necessary here because we are assuming that ShuffleIndex - // is internally using post-CUDA 9.0 synchronization-safe primitives - // (__shfl_sync instead of __shfl). __shfl is technically not guaranteed to - // be safe by the compiler because it is not required by the standard to - // converge divergent branches before executing. - output_offset = cub::ShuffleIndex(output_offset, 0, activemask); - - for (int shared_out_idx = lane_id; shared_out_idx < current_idx_shared[warp_id]; - shared_out_idx += num_threads) { - cudf::size_type thread_offset = output_offset + shared_out_idx; - if (thread_offset < max_size) { - join_output_l[thread_offset] = join_shared_l[warp_id][shared_out_idx]; - join_output_r[thread_offset] = join_shared_r[warp_id][shared_out_idx]; - } - } -} - -template -__device__ void flush_output_cache(unsigned int const activemask, - cudf::size_type const max_size, - int const warp_id, - int const lane_id, - cudf::size_type* current_idx, - cudf::size_type current_idx_shared[num_warps], - size_type join_shared_l[num_warps][output_cache_size], - size_type* join_output_l) -{ - int const num_threads = __popc(activemask); - cudf::size_type output_offset = 0; - - if (0 == lane_id) { output_offset = atomicAdd(current_idx, current_idx_shared[warp_id]); } - - output_offset = cub::ShuffleIndex(output_offset, 0, activemask); - - for (int shared_out_idx = lane_id; shared_out_idx < current_idx_shared[warp_id]; - shared_out_idx += num_threads) { - cudf::size_type thread_offset = output_offset + shared_out_idx; - if (thread_offset < max_size) { - join_output_l[thread_offset] = join_shared_l[warp_id][shared_out_idx]; - } - } -} - } // namespace detail } // namespace cudf From 3c3edfef406288e164cc80ab82f9c64c0b88d0bd Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 28 Jun 2024 13:58:22 -0700 Subject: [PATCH 17/27] Update implementations to build with the latest cuco (#15938) This PR updates existing libcudf to accommodate a cuco breaking change introduced in https://github.com/NVIDIA/cuCollections/pull/479. 
It helps avoid breaking cudf when bumping the cuco version in `rapids-cmake`. Redundant equal/hash overloads will be removed once the version bump is done on the `rapids-cmake` end. Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - David Wendt (https://github.com/davidwendt) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/15938 --- .../cudf/detail/distinct_hash_join.cuh | 22 +++++++++++- cpp/src/join/distinct_hash_join.cu | 10 +++--- cpp/src/search/contains_table.cu | 35 ++++++++++++++----- cpp/src/text/bpe/byte_pair_encoding.cuh | 13 +++++++ cpp/src/text/vocabulary_tokenize.cu | 8 +++++ 5 files changed, 74 insertions(+), 14 deletions(-) diff --git a/cpp/include/cudf/detail/distinct_hash_join.cuh b/cpp/include/cudf/detail/distinct_hash_join.cuh index de3d23e9470..1ef8b3b120a 100644 --- a/cpp/include/cudf/detail/distinct_hash_join.cuh +++ b/cpp/include/cudf/detail/distinct_hash_join.cuh @@ -42,6 +42,9 @@ template struct comparator_adapter { comparator_adapter(Equal const& d_equal) : _d_equal{d_equal} {} + // suppress "function was declared but never referenced warning" +#pragma nv_diagnostic push +#pragma nv_diag_suppress 177 __device__ constexpr auto operator()( cuco::pair const&, cuco::pair const&) const noexcept @@ -50,6 +53,14 @@ struct comparator_adapter { return false; } + __device__ constexpr auto operator()( + cuco::pair const&, + cuco::pair const&) const noexcept + { + // All build table keys are distinct thus `false` no matter what + return false; + } + __device__ constexpr auto operator()( cuco::pair const& lhs, cuco::pair const& rhs) const noexcept @@ -58,6 +69,15 @@ struct comparator_adapter { return _d_equal(lhs.second, rhs.second); } + __device__ constexpr auto operator()( + cuco::pair const& lhs, + cuco::pair const& rhs) const noexcept + { + if (lhs.first != rhs.first) { return false; } + return _d_equal(lhs.second, rhs.second); + } +#pragma nv_diagnostic pop + private: Equal _d_equal; }; @@ -94,7 +114,7 @@ struct distinct_hash_join { using cuco_storage_type = cuco::storage<1>; /// Hash table type - using hash_table_type = cuco::static_set, + using hash_table_type = cuco::static_set, cuco::extent, cuda::thread_scope_device, comparator_adapter, diff --git a/cpp/src/join/distinct_hash_join.cu b/cpp/src/join/distinct_hash_join.cu index 5048da25e86..daa1bf17c0d 100644 --- a/cpp/src/join/distinct_hash_join.cu +++ b/cpp/src/join/distinct_hash_join.cu @@ -54,7 +54,7 @@ auto prepare_device_equal( cudf::null_equality compare_nulls) { auto const two_table_equal = - cudf::experimental::row::equality::two_table_comparator(build, probe); + cudf::experimental::row::equality::two_table_comparator(probe, build); return comparator_adapter{two_table_equal.equal_to( nullate::DYNAMIC{has_nulls}, compare_nulls)}; } @@ -113,7 +113,7 @@ distinct_hash_join::distinct_hash_join(cudf::table_view const& build, _hash_table{build.num_rows(), CUCO_DESIRED_LOAD_FACTOR, cuco::empty_key{cuco::pair{std::numeric_limits::max(), - lhs_index_type{JoinNoneValue}}}, + rhs_index_type{JoinNoneValue}}}, prepare_device_equal( _preprocessed_build, _preprocessed_probe, has_nulls, compare_nulls), {}, @@ -131,7 +131,7 @@ distinct_hash_join::distinct_hash_join(cudf::table_view const& build, auto const d_hasher = row_hasher.device_hasher(nullate::DYNAMIC{this->_has_nulls}); auto const iter = cudf::detail::make_counting_transform_iterator( - 0, build_keys_fn{d_hasher}); + 0, build_keys_fn{d_hasher}); size_type const 
build_table_num_rows{build.num_rows()}; if (this->_nulls_equal == cudf::null_equality::EQUAL or (not cudf::nullable(this->_build))) { @@ -174,7 +174,7 @@ distinct_hash_join::inner_join(rmm::cuda_stream_view stream, cudf::experimental::row::hash::row_hasher{this->_preprocessed_probe}; auto const d_probe_hasher = probe_row_hasher.device_hasher(nullate::DYNAMIC{this->_has_nulls}); auto const iter = cudf::detail::make_counting_transform_iterator( - 0, build_keys_fn{d_probe_hasher}); + 0, build_keys_fn{d_probe_hasher}); auto const build_indices_begin = thrust::make_transform_output_iterator(build_indices->begin(), output_fn{}); @@ -216,7 +216,7 @@ std::unique_ptr> distinct_hash_join::l cudf::experimental::row::hash::row_hasher{this->_preprocessed_probe}; auto const d_probe_hasher = probe_row_hasher.device_hasher(nullate::DYNAMIC{this->_has_nulls}); auto const iter = cudf::detail::make_counting_transform_iterator( - 0, build_keys_fn{d_probe_hasher}); + 0, build_keys_fn{d_probe_hasher}); auto const output_begin = thrust::make_transform_output_iterator(build_indices->begin(), output_fn{}); diff --git a/cpp/src/search/contains_table.cu b/cpp/src/search/contains_table.cu index 466f9093194..fbb0f6cb0f5 100644 --- a/cpp/src/search/contains_table.cu +++ b/cpp/src/search/contains_table.cu @@ -53,12 +53,12 @@ struct hasher_adapter { __device__ constexpr auto operator()(lhs_index_type idx) const noexcept { - return _haystack_hasher(static_cast(idx)); + return _needle_hasher(static_cast(idx)); } __device__ constexpr auto operator()(rhs_index_type idx) const noexcept { - return _needle_hasher(static_cast(idx)); + return _haystack_hasher(static_cast(idx)); } private: @@ -76,6 +76,9 @@ struct comparator_adapter { { } + // suppress "function was declared but never referenced warning" +#pragma nv_diagnostic push +#pragma nv_diag_suppress 177 __device__ constexpr auto operator()(lhs_index_type lhs_index, lhs_index_type rhs_index) const noexcept { @@ -85,12 +88,28 @@ struct comparator_adapter { return _self_equal(lhs, rhs); } + __device__ constexpr auto operator()(rhs_index_type lhs_index, + rhs_index_type rhs_index) const noexcept + { + auto const lhs = static_cast(lhs_index); + auto const rhs = static_cast(rhs_index); + + return _self_equal(lhs, rhs); + } + __device__ constexpr auto operator()(lhs_index_type lhs_index, rhs_index_type rhs_index) const noexcept { return _two_table_equal(lhs_index, rhs_index); } + __device__ constexpr auto operator()(rhs_index_type lhs_index, + lhs_index_type rhs_index) const noexcept + { + return _two_table_equal(lhs_index, rhs_index); + } +#pragma nv_diagnostic pop + private: SelfEqual const _self_equal; TwoTableEqual const _two_table_equal; @@ -210,26 +229,26 @@ rmm::device_uvector contains(table_view const& haystack, auto const self_equal = cudf::experimental::row::equality::self_comparator(preprocessed_haystack); auto const two_table_equal = cudf::experimental::row::equality::two_table_comparator( - preprocessed_haystack, preprocessed_needles); + preprocessed_needles, preprocessed_haystack); // The output vector. 
auto contained = rmm::device_uvector(needles.num_rows(), stream, mr);
   auto const haystack_iter = cudf::detail::make_counting_transform_iterator(
-    size_type{0}, cuda::proclaim_return_type([] __device__(auto idx) {
-      return lhs_index_type{idx};
-    }));
-  auto const needles_iter = cudf::detail::make_counting_transform_iterator(
     size_type{0}, cuda::proclaim_return_type([] __device__(auto idx) {
       return rhs_index_type{idx};
     }));
+  auto const needles_iter = cudf::detail::make_counting_transform_iterator(
+    size_type{0}, cuda::proclaim_return_type([] __device__(auto idx) {
+      return lhs_index_type{idx};
+    }));

   auto const helper_func =
     [&](auto const& d_self_equal, auto const& d_two_table_equal, auto const& probing_scheme) {
       auto const d_equal = comparator_adapter{d_self_equal, d_two_table_equal};

       auto set = cuco::static_set{cuco::extent{compute_hash_table_size(haystack.num_rows())},
-                                  cuco::empty_key{lhs_index_type{-1}},
+                                  cuco::empty_key{rhs_index_type{-1}},
                                   d_equal,
                                   probing_scheme,
                                   {},
diff --git a/cpp/src/text/bpe/byte_pair_encoding.cuh b/cpp/src/text/bpe/byte_pair_encoding.cuh
index 2ad22fd4e46..3bb574748b6 100644
--- a/cpp/src/text/bpe/byte_pair_encoding.cuh
+++ b/cpp/src/text/bpe/byte_pair_encoding.cuh
@@ -96,6 +96,14 @@ struct bpe_equal {
     auto const right = d_strings.element(lhs + 1);
     return (left == rhs.first) && (right == rhs.second);
   }
+  // used by find
+  __device__ bool operator()(merge_pair_type const& lhs, cudf::size_type rhs) const noexcept
+  {
+    rhs *= 2;
+    auto const left = d_strings.element(rhs);
+    auto const right = d_strings.element(rhs + 1);
+    return (left == lhs.first) && (right == lhs.second);
+  }
 };

 using bpe_probe_scheme = cuco::linear_probing<1, bpe_hasher>;
@@ -154,6 +162,11 @@ struct mp_equal {
     auto const left = d_strings.element(lhs);
     return left == rhs;
   }
+  __device__ bool operator()(cudf::string_view const& lhs, cudf::size_type rhs) const noexcept
+  {
+    auto const right = d_strings.element(rhs);
+    return lhs == right;
+  }
 };

 using mp_probe_scheme = cuco::linear_probing<1, mp_hasher>;
diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu
index f012f7ce09a..ea09f5d17af 100644
--- a/cpp/src/text/vocabulary_tokenize.cu
+++ b/cpp/src/text/vocabulary_tokenize.cu
@@ -86,10 +86,18 @@ struct vocab_equal {
     return lhs == rhs;  // all rows are expected to be unique
   }
   // used by find
+  // suppress "function was declared but never referenced warning"
+#pragma nv_diagnostic push
+#pragma nv_diag_suppress 177
   __device__ bool operator()(cudf::size_type lhs, cudf::string_view const& rhs) const noexcept
   {
     return d_strings.element(lhs) == rhs;
   }
+  __device__ bool operator()(cudf::string_view const& lhs, cudf::size_type rhs) const noexcept
+  {
+    return d_strings.element(rhs) == lhs;
+  }
+#pragma nv_diagnostic pop
 };

 using probe_scheme = cuco::linear_probing<1, vocab_hasher>;

From 599ce95aa6c49ae1560b9617e18ed328f9f6a508 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Mon, 1 Jul 2024 09:35:35 +0100
Subject: [PATCH 18/27] Implement handlers for series literal in cudf-polars (#16113)

A query plan can contain a "literal" polars Series; this often arises when
calling a contains-like function. To translate these, introduce a new
`LiteralColumn` node to capture the concept and add an evaluation rule
(converting from arrow). Since list-dtype Series need the same casting
treatment as in the dataframe scan case, factor the casting out into a
utility, and take the opportunity to handle casting of nested lists
correctly.
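A sketch of the kind of plan this enables, patterned on the new
`test_select_literal_series` test added below and executed through the
cudf-polars testing helper:

```python
import polars as pl

from cudf_polars.testing.asserts import assert_gpu_result_equal

# Each keyword argument below becomes a literal Series in the IR and is now
# translated via the new LiteralColumn node.
q = pl.LazyFrame({}).select(
    strings=pl.Series(["a", "b", "c"], dtype=pl.String()),
    nested=pl.Series([[1, 2], [3], None], dtype=pl.List(pl.UInt16())),
)
assert_gpu_result_equal(q)
```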
Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Thomas Li (https://github.com/lithomas1) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16113 --- python/cudf_polars/cudf_polars/dsl/expr.py | 32 ++++++- python/cudf_polars/cudf_polars/dsl/ir.py | 20 ++-- .../cudf_polars/cudf_polars/dsl/translate.py | 3 + .../cudf_polars/cudf_polars/utils/dtypes.py | 81 +++++++++++++++- .../tests/expressions/test_literal.py | 96 +++++++++++++++++++ .../cudf_polars/tests/test_dataframescan.py | 19 ++++ 6 files changed, 239 insertions(+), 12 deletions(-) create mode 100644 python/cudf_polars/tests/expressions/test_literal.py diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 17d7d15e4e5..16cfd9b9749 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -27,11 +27,12 @@ import cudf._lib.pylibcudf as plc from cudf_polars.containers import Column, NamedColumn -from cudf_polars.utils import sorting +from cudf_polars.utils import dtypes, sorting if TYPE_CHECKING: from collections.abc import Mapping, Sequence + import polars.polars as plrs import polars.type_aliases as pl_types from cudf_polars.containers import DataFrame @@ -369,6 +370,29 @@ def do_evaluate( return Column(plc.Column.from_scalar(plc.interop.from_arrow(self.value), 1)) +class LiteralColumn(Expr): + __slots__ = ("value",) + _non_child = ("dtype", "value") + value: pa.Array[Any, Any] + children: tuple[()] + + def __init__(self, dtype: plc.DataType, value: plrs.PySeries) -> None: + super().__init__(dtype) + data = value.to_arrow() + self.value = data.cast(dtypes.downcast_arrow_lists(data.type)) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + # datatype of pyarrow array is correct by construction. 
+ return Column(plc.interop.from_arrow(self.value)) + + class Col(Expr): __slots__ = ("name",) _non_child = ("dtype", "name") @@ -1156,6 +1180,12 @@ def __init__( super().__init__(dtype) self.op = op self.children = (left, right) + if ( + op in (plc.binaryop.BinaryOperator.ADD, plc.binaryop.BinaryOperator.SUB) + and ({left.dtype.id(), right.dtype.id()}.issubset(dtypes.TIMELIKE_TYPES)) + and not dtypes.have_compatible_resolution(left.dtype.id(), right.dtype.id()) + ): + raise NotImplementedError("Casting rules for timelike types") _MAPPING: ClassVar[dict[pl_expr.Operator, plc.binaryop.BinaryOperator]] = { pl_expr.Operator.Eq: plc.binaryop.BinaryOperator.EQUAL, diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 3f5f3c74050..abe26b14a90 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -29,7 +29,7 @@ import cudf_polars.dsl.expr as expr from cudf_polars.containers import DataFrame, NamedColumn -from cudf_polars.utils import sorting +from cudf_polars.utils import dtypes, sorting if TYPE_CHECKING: from collections.abc import MutableMapping @@ -130,6 +130,11 @@ class IR: schema: Schema """Mapping from column names to their data types.""" + def __post_init__(self): + """Validate preconditions.""" + if any(dtype.id() == plc.TypeId.EMPTY for dtype in self.schema.values()): + raise NotImplementedError("Cannot make empty columns.") + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """ Evaluate the node and return a dataframe. @@ -292,15 +297,10 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: table = pdf.to_arrow() schema = table.schema for i, field in enumerate(schema): - # TODO: Nested types - if field.type == pa.large_string(): - # TODO: goes away when libcudf supports large strings - schema = schema.set(i, pa.field(field.name, pa.string())) - elif isinstance(field.type, pa.LargeListType): - # TODO: goes away when libcudf supports large lists - schema = schema.set( - i, pa.field(field.name, pa.list_(field.type.field(0))) - ) + schema = schema.set( + i, pa.field(field.name, dtypes.downcast_arrow_lists(field.type)) + ) + # No-op if the schema is unchanged. 
table = table.cast(schema) df = DataFrame.from_table( plc.interop.from_arrow(table), list(self.schema.keys()) diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 953ff636cce..f4bf07ae1e0 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -12,6 +12,7 @@ import pyarrow as pa from typing_extensions import assert_never +import polars.polars as plrs from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir import cudf._lib.pylibcudf as plc @@ -383,6 +384,8 @@ def _(node: pl_expr.Window, visitor: NodeTraverser, dtype: plc.DataType) -> expr @_translate_expr.register def _(node: pl_expr.Literal, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: + if isinstance(node.value, plrs.PySeries): + return expr.LiteralColumn(dtype, node.value) value = pa.scalar(node.value, type=plc.interop.to_arrow(dtype)) return expr.Literal(dtype, value) diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 3d4a643e1fc..507acb5d33a 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -7,13 +7,92 @@ from functools import cache +import pyarrow as pa from typing_extensions import assert_never import polars as pl import cudf._lib.pylibcudf as plc -__all__ = ["from_polars"] +__all__ = ["from_polars", "downcast_arrow_lists", "have_compatible_resolution"] + + +TIMELIKE_TYPES: frozenset[plc.TypeId] = frozenset( + [ + plc.TypeId.TIMESTAMP_MILLISECONDS, + plc.TypeId.TIMESTAMP_MICROSECONDS, + plc.TypeId.TIMESTAMP_NANOSECONDS, + plc.TypeId.TIMESTAMP_DAYS, + plc.TypeId.DURATION_MILLISECONDS, + plc.TypeId.DURATION_MICROSECONDS, + plc.TypeId.DURATION_NANOSECONDS, + ] +) + + +def have_compatible_resolution(lid: plc.TypeId, rid: plc.TypeId): + """ + Do two datetime typeids have matching resolution for a binop. + + Parameters + ---------- + lid + Left type id + rid + Right type id + + Returns + ------- + True if resolutions are compatible, False otherwise. + + Notes + ----- + Polars has different casting rules for combining + datetimes/durations than libcudf, and while we don't encode the + casting rules fully, just reject things we can't handle. + + Precondition for correctness: both lid and rid are timelike. + """ + if lid == rid: + return True + # Timestamps are smaller than durations in the libcudf enum. + lid, rid = sorted([lid, rid]) + if lid == plc.TypeId.TIMESTAMP_MILLISECONDS: + return rid == plc.TypeId.DURATION_MILLISECONDS + elif lid == plc.TypeId.TIMESTAMP_MICROSECONDS: + return rid == plc.TypeId.DURATION_MICROSECONDS + elif lid == plc.TypeId.TIMESTAMP_NANOSECONDS: + return rid == plc.TypeId.DURATION_NANOSECONDS + return False + + +def downcast_arrow_lists(typ: pa.DataType) -> pa.DataType: + """ + Sanitize an arrow datatype from polars. + + Parameters + ---------- + typ + Arrow type to sanitize + + Returns + ------- + Sanitized arrow type + + Notes + ----- + As well as arrow ``ListType``s, polars can produce + ``LargeListType``s and ``FixedSizeListType``s, these are not + currently handled by libcudf, so we attempt to cast them all into + normal ``ListType``s on the arrow side before consuming the arrow + data. + """ + if isinstance(typ, pa.LargeListType): + return pa.list_(downcast_arrow_lists(typ.value_type)) + # We don't have to worry about diving into struct types for now + # since those are always NotImplemented before we get here. 
+ assert not isinstance(typ, pa.StructType) + return typ @cache diff --git a/python/cudf_polars/tests/expressions/test_literal.py b/python/cudf_polars/tests/expressions/test_literal.py new file mode 100644 index 00000000000..55e688428bd --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_literal.py @@ -0,0 +1,96 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) +from cudf_polars.utils import dtypes + + +@pytest.fixture( + params=[ + None, + pl.Int8(), + pl.Int16(), + pl.Int32(), + pl.Int64(), + pl.UInt8(), + pl.UInt16(), + pl.UInt32(), + pl.UInt64(), + ] +) +def integer(request): + return pl.lit(10, dtype=request.param) + + +@pytest.fixture(params=[None, pl.Float32(), pl.Float64()]) +def float(request): + return pl.lit(1.0, dtype=request.param) + + +def test_numeric_literal(integer, float): + df = pl.LazyFrame({}) + + q = df.select(integer=integer, float_=float, sum_=integer + float) + + assert_gpu_result_equal(q) + + +@pytest.fixture( + params=[pl.Date(), pl.Datetime("ms"), pl.Datetime("us"), pl.Datetime("ns")] +) +def timestamp(request): + return pl.lit(10_000, dtype=request.param) + + +@pytest.fixture(params=[pl.Duration("ms"), pl.Duration("us"), pl.Duration("ns")]) +def timedelta(request): + return pl.lit(9_000, dtype=request.param) + + +def test_timelike_literal(timestamp, timedelta): + df = pl.LazyFrame({}) + + q = df.select( + time=timestamp, + delta=timedelta, + adjusted=timestamp + timedelta, + two_delta=timedelta + timedelta, + ) + schema = q.collect_schema() + time_type = schema["time"] + delta_type = schema["delta"] + if dtypes.have_compatible_resolution( + dtypes.from_polars(time_type).id(), dtypes.from_polars(delta_type).id() + ): + assert_gpu_result_equal(q) + else: + assert_ir_translation_raises(q, NotImplementedError) + + +def test_select_literal_series(): + df = pl.LazyFrame({}) + + q = df.select( + a=pl.Series(["a", "b", "c"], dtype=pl.String()), + b=pl.Series([[1, 2], [3], None], dtype=pl.List(pl.UInt16())), + c=pl.Series([[[1]], [], [[1, 2, 3, 4]]], dtype=pl.List(pl.List(pl.Float32()))), + ) + + assert_gpu_result_equal(q) + + +@pytest.mark.parametrize("expr", [pl.lit(None), pl.lit(10, dtype=pl.Decimal())]) +def test_unsupported_literal_raises(expr): + df = pl.LazyFrame({}) + + q = df.select(expr) + + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_dataframescan.py b/python/cudf_polars/tests/test_dataframescan.py index 1ffe06ac562..b5c0fb7be9f 100644 --- a/python/cudf_polars/tests/test_dataframescan.py +++ b/python/cudf_polars/tests/test_dataframescan.py @@ -41,3 +41,22 @@ def test_scan_drop_nulls(subset, predicate_pushdown): assert_gpu_result_equal( q, collect_kwargs={"predicate_pushdown": predicate_pushdown} ) + + +def test_can_convert_lists(): + df = pl.LazyFrame( + { + "a": pl.Series([[1, 2], [3]], dtype=pl.List(pl.Int8())), + "b": pl.Series([[1], [2]], dtype=pl.List(pl.UInt16())), + "c": pl.Series( + [ + [["1", "2", "3"], ["4", "567"]], + [["8", "9"], []], + ], + dtype=pl.List(pl.List(pl.String())), + ), + "d": pl.Series([[[1, 2]], []], dtype=pl.List(pl.List(pl.UInt16()))), + } + ) + + assert_gpu_result_equal(df) From e932fbd9dd59aafd17b41b80a8b94424e8d367a2 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 1 Jul 2024 09:17:32 -0700 Subject: [PATCH 
19/27] Add patch for incorrect cuco noexcept clauses (#16077) [cuco previously marked a number of methods as noexcept that can in fact throw exceptions](https://github.com/nvidia/cuCollections/issues/510). This causes problems for cudf functions that call these methods. The issue [was fixed in cuco upstream](https://github.com/NVIDIA/cuCollections/pull/511), but we cannot easily update to the latest commit of cuco, especially in a patch fix for 24.06. This PR instead adds a rapids-cmake patch for the cuco clone to address this issue. The patch may be removed once we update to a commit of cuco that contains the necessary fix. Resolves #16059 --- cpp/cmake/thirdparty/get_cucollections.cmake | 7 +- .../thirdparty/patches/cuco_noexcept.diff | 227 ++++++++++++++++++ .../thirdparty/patches/cuco_override.json | 14 ++ 3 files changed, 247 insertions(+), 1 deletion(-) create mode 100644 cpp/cmake/thirdparty/patches/cuco_noexcept.diff create mode 100644 cpp/cmake/thirdparty/patches/cuco_override.json diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake index 9758958b44f..6ec35ddcaf1 100644 --- a/cpp/cmake/thirdparty/get_cucollections.cmake +++ b/cpp/cmake/thirdparty/get_cucollections.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -15,6 +15,11 @@ # This function finds cuCollections and performs any additional configuration. function(find_and_configure_cucollections) include(${rapids-cmake-dir}/cpm/cuco.cmake) + include(${rapids-cmake-dir}/cpm/package_override.cmake) + + set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches") + rapids_cpm_package_override("${cudf_patch_dir}/cuco_override.json") + if(BUILD_SHARED_LIBS) rapids_cpm_cuco(BUILD_EXPORT_SET cudf-exports) else() diff --git a/cpp/cmake/thirdparty/patches/cuco_noexcept.diff b/cpp/cmake/thirdparty/patches/cuco_noexcept.diff new file mode 100644 index 00000000000..0f334c0e81f --- /dev/null +++ b/cpp/cmake/thirdparty/patches/cuco_noexcept.diff @@ -0,0 +1,227 @@ +diff --git a/include/cuco/aow_storage.cuh b/include/cuco/aow_storage.cuh +index 7f9de01..5228193 100644 +--- a/include/cuco/aow_storage.cuh ++++ b/include/cuco/aow_storage.cuh +@@ -81,7 +81,7 @@ class aow_storage : public detail::aow_storage_base { + * @param size Number of windows to (de)allocate + * @param allocator Allocator used for (de)allocating device storage + */ +- explicit constexpr aow_storage(Extent size, Allocator const& allocator = {}) noexcept; ++ explicit constexpr aow_storage(Extent size, Allocator const& allocator = {}); + + aow_storage(aow_storage&&) = default; ///< Move constructor + /** +@@ -122,7 +122,7 @@ class aow_storage : public detail::aow_storage_base { + * @param key Key to which all keys in `slots` are initialized + * @param stream Stream used for executing the kernel + */ +- void initialize(value_type key, cuda_stream_ref stream = {}) noexcept; ++ void initialize(value_type key, cuda_stream_ref stream = {}); + + /** + * @brief Asynchronously initializes each slot in the AoW storage to contain `key`. 
+diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh +index c2c9c14..8ac4236 100644 +--- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh ++++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh +@@ -125,7 +125,7 @@ class open_addressing_impl { + KeyEqual const& pred, + ProbingScheme const& probing_scheme, + Allocator const& alloc, +- cuda_stream_ref stream) noexcept ++ cuda_stream_ref stream) + : empty_slot_sentinel_{empty_slot_sentinel}, + erased_key_sentinel_{this->extract_key(empty_slot_sentinel)}, + predicate_{pred}, +@@ -233,7 +233,7 @@ class open_addressing_impl { + * + * @param stream CUDA stream this operation is executed in + */ +- void clear(cuda_stream_ref stream) noexcept { storage_.initialize(empty_slot_sentinel_, stream); } ++ void clear(cuda_stream_ref stream) { storage_.initialize(empty_slot_sentinel_, stream); } + + /** + * @brief Asynchronously erases all elements from the container. After this call, `size()` returns +@@ -599,7 +599,7 @@ class open_addressing_impl { + * + * @return The number of elements in the container + */ +- [[nodiscard]] size_type size(cuda_stream_ref stream) const noexcept ++ [[nodiscard]] size_type size(cuda_stream_ref stream) const + { + auto counter = + detail::counter_storage{this->allocator()}; +diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl +index e17a145..3fa1d02 100644 +--- a/include/cuco/detail/static_map/static_map.inl ++++ b/include/cuco/detail/static_map/static_map.inl +@@ -123,7 +123,7 @@ template + void static_map::clear( +- cuda_stream_ref stream) noexcept ++ cuda_stream_ref stream) + { + impl_->clear(stream); + } +@@ -215,7 +215,7 @@ template + template + void static_map:: +- insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream) noexcept ++ insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream) + { + return this->insert_or_assign_async(first, last, stream); + stream.synchronize(); +@@ -465,7 +465,7 @@ template + static_map::size_type + static_map::size( +- cuda_stream_ref stream) const noexcept ++ cuda_stream_ref stream) const + { + return impl_->size(stream); + } +diff --git a/include/cuco/detail/static_multiset/static_multiset.inl b/include/cuco/detail/static_multiset/static_multiset.inl +index 174f9bc..582926b 100644 +--- a/include/cuco/detail/static_multiset/static_multiset.inl ++++ b/include/cuco/detail/static_multiset/static_multiset.inl +@@ -97,7 +97,7 @@ template + void static_multiset::clear( +- cuda_stream_ref stream) noexcept ++ cuda_stream_ref stream) + { + impl_->clear(stream); + } +@@ -183,7 +183,7 @@ template + static_multiset::size_type + static_multiset::size( +- cuda_stream_ref stream) const noexcept ++ cuda_stream_ref stream) const + { + return impl_->size(stream); + } +diff --git a/include/cuco/detail/static_set/static_set.inl b/include/cuco/detail/static_set/static_set.inl +index 645013f..d3cece0 100644 +--- a/include/cuco/detail/static_set/static_set.inl ++++ b/include/cuco/detail/static_set/static_set.inl +@@ -98,7 +98,7 @@ template + void static_set::clear( +- cuda_stream_ref stream) noexcept ++ cuda_stream_ref stream) + { + impl_->clear(stream); + } +@@ -429,7 +429,7 @@ template + static_set::size_type + static_set::size( +- cuda_stream_ref stream) const noexcept ++ cuda_stream_ref stream) const + { + return impl_->size(stream); + } +diff --git a/include/cuco/detail/storage/aow_storage.inl 
b/include/cuco/detail/storage/aow_storage.inl +index 3547f4c..94b7f98 100644 +--- a/include/cuco/detail/storage/aow_storage.inl ++++ b/include/cuco/detail/storage/aow_storage.inl +@@ -32,8 +32,8 @@ + namespace cuco { + + template +-constexpr aow_storage::aow_storage( +- Extent size, Allocator const& allocator) noexcept ++constexpr aow_storage::aow_storage(Extent size, ++ Allocator const& allocator) + : detail::aow_storage_base{size}, + allocator_{allocator}, + window_deleter_{capacity(), allocator_}, +@@ -64,7 +64,7 @@ aow_storage::ref() const noexcept + + template + void aow_storage::initialize(value_type key, +- cuda_stream_ref stream) noexcept ++ cuda_stream_ref stream) + { + this->initialize_async(key, stream); + stream.synchronize(); +diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh +index c86e90c..95da423 100644 +--- a/include/cuco/static_map.cuh ++++ b/include/cuco/static_map.cuh +@@ -269,7 +269,7 @@ class static_map { + * + * @param stream CUDA stream this operation is executed in + */ +- void clear(cuda_stream_ref stream = {}) noexcept; ++ void clear(cuda_stream_ref stream = {}); + + /** + * @brief Asynchronously erases all elements from the container. After this call, `size()` returns +@@ -387,7 +387,7 @@ class static_map { + * @param stream CUDA stream used for insert + */ + template +- void insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept; ++ void insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream = {}); + + /** + * @brief For any key-value pair `{k, v}` in the range `[first, last)`, if a key equivalent to `k` +@@ -690,7 +690,7 @@ class static_map { + * @param stream CUDA stream used to get the number of inserted elements + * @return The number of elements in the container + */ +- [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const noexcept; ++ [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const; + + /** + * @brief Gets the maximum number of elements the hash map can hold. +diff --git a/include/cuco/static_multiset.cuh b/include/cuco/static_multiset.cuh +index 0daf103..fbcbc9c 100644 +--- a/include/cuco/static_multiset.cuh ++++ b/include/cuco/static_multiset.cuh +@@ -235,7 +235,7 @@ class static_multiset { + * + * @param stream CUDA stream this operation is executed in + */ +- void clear(cuda_stream_ref stream = {}) noexcept; ++ void clear(cuda_stream_ref stream = {}); + + /** + * @brief Asynchronously erases all elements from the container. After this call, `size()` returns +@@ -339,7 +339,7 @@ class static_multiset { + * @param stream CUDA stream used to get the number of inserted elements + * @return The number of elements in the container + */ +- [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const noexcept; ++ [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const; + + /** + * @brief Gets the maximum number of elements the multiset can hold. +diff --git a/include/cuco/static_set.cuh b/include/cuco/static_set.cuh +index a069939..3517f84 100644 +--- a/include/cuco/static_set.cuh ++++ b/include/cuco/static_set.cuh +@@ -240,7 +240,7 @@ class static_set { + * + * @param stream CUDA stream this operation is executed in + */ +- void clear(cuda_stream_ref stream = {}) noexcept; ++ void clear(cuda_stream_ref stream = {}); + + /** + * @brief Asynchronously erases all elements from the container. 
After this call, `size()` returns +@@ -687,7 +687,7 @@ class static_set { + * @param stream CUDA stream used to get the number of inserted elements + * @return The number of elements in the container + */ +- [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const noexcept; ++ [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const; + + /** + * @brief Gets the maximum number of elements the hash set can hold. diff --git a/cpp/cmake/thirdparty/patches/cuco_override.json b/cpp/cmake/thirdparty/patches/cuco_override.json new file mode 100644 index 00000000000..ae0a9a4b4f0 --- /dev/null +++ b/cpp/cmake/thirdparty/patches/cuco_override.json @@ -0,0 +1,14 @@ + +{ + "packages" : { + "cuco" : { + "patches" : [ + { + "file" : "${current_json_dir}/cuco_noexcept.diff", + "issue" : "Remove erroneous noexcept clauses on cuco functions that may throw [https://github.com/rapidsai/cudf/issues/16059]", + "fixed_in" : "" + } + ] + } + } +} From 5efd72f64e3b1e25337c30ba0ab246051d3fe396 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 1 Jul 2024 07:37:12 -1000 Subject: [PATCH 20/27] Ensure cudf objects can astype to any type when empty (#16106) pandas allows objects to `astype` to any other type if the object is empty. The PR mirrors that behavior for cudf. This PR also more consistently uses `astype` instead of `as_*_column` and fixes a bug in `IntervalDtype.__eq__` discovered when writing a unit test for this bug. Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16106 --- python/cudf/cudf/core/column/column.py | 9 ++++++ python/cudf/cudf/core/column/datetime.py | 36 +++++++++++---------- python/cudf/cudf/core/column/decimal.py | 2 +- python/cudf/cudf/core/column/interval.py | 26 +++++++-------- python/cudf/cudf/core/column/timedelta.py | 34 +++++++++++--------- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/dtypes.py | 2 +- python/cudf/cudf/core/frame.py | 4 +-- python/cudf/cudf/core/indexing_utils.py | 2 +- python/cudf/cudf/core/series.py | 8 +++-- python/cudf/cudf/core/tools/numeric.py | 14 ++++---- python/cudf/cudf/tests/test_interval.py | 6 ++++ python/cudf/cudf/tests/test_series.py | 39 +++++++++++++++++++++++ 13 files changed, 121 insertions(+), 63 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 5db6fd904a9..e7a2863da8c 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -959,6 +959,15 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool: raise NotImplementedError() def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase: + if len(self) == 0: + dtype = cudf.dtype(dtype) + if self.dtype == dtype: + if copy: + return self.copy() + else: + return self + else: + return column_empty(0, dtype=dtype, masked=self.nullable) if copy: col = self.copy() else: diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 121076b69ce..c10aceba9f4 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -280,8 +280,8 @@ def __contains__(self, item: ScalarLike) -> bool: return False elif ts.tzinfo is not None: ts = ts.tz_convert(None) - return ts.to_numpy().astype("int64") in self.as_numerical_column( - "int64" + 
return ts.to_numpy().astype("int64") in cast( + "cudf.core.column.NumericalColumn", self.astype("int64") ) @functools.cached_property @@ -503,9 +503,9 @@ def mean( self, skipna=None, min_count: int = 0, dtype=np.float64 ) -> ScalarLike: return pd.Timestamp( - self.as_numerical_column("int64").mean( - skipna=skipna, min_count=min_count, dtype=dtype - ), + cast( + "cudf.core.column.NumericalColumn", self.astype("int64") + ).mean(skipna=skipna, min_count=min_count, dtype=dtype), unit=self.time_unit, ).as_unit(self.time_unit) @@ -517,7 +517,7 @@ def std( ddof: int = 1, ) -> pd.Timedelta: return pd.Timedelta( - self.as_numerical_column("int64").std( + cast("cudf.core.column.NumericalColumn", self.astype("int64")).std( skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof ) * _unit_to_nanoseconds_conversion[self.time_unit], @@ -525,7 +525,9 @@ def std( def median(self, skipna: bool | None = None) -> pd.Timestamp: return pd.Timestamp( - self.as_numerical_column("int64").median(skipna=skipna), + cast( + "cudf.core.column.NumericalColumn", self.astype("int64") + ).median(skipna=skipna), unit=self.time_unit, ).as_unit(self.time_unit) @@ -534,18 +536,18 @@ def cov(self, other: DatetimeColumn) -> float: raise TypeError( f"cannot perform cov with types {self.dtype}, {other.dtype}" ) - return self.as_numerical_column("int64").cov( - other.as_numerical_column("int64") - ) + return cast( + "cudf.core.column.NumericalColumn", self.astype("int64") + ).cov(cast("cudf.core.column.NumericalColumn", other.astype("int64"))) def corr(self, other: DatetimeColumn) -> float: if not isinstance(other, DatetimeColumn): raise TypeError( f"cannot perform corr with types {self.dtype}, {other.dtype}" ) - return self.as_numerical_column("int64").corr( - other.as_numerical_column("int64") - ) + return cast( + "cudf.core.column.NumericalColumn", self.astype("int64") + ).corr(cast("cudf.core.column.NumericalColumn", other.astype("int64"))) def quantile( self, @@ -554,7 +556,7 @@ def quantile( exact: bool, return_scalar: bool, ) -> ColumnBase: - result = self.as_numerical_column("int64").quantile( + result = self.astype("int64").quantile( q=q, interpolation=interpolation, exact=exact, @@ -645,12 +647,12 @@ def indices_of( ) -> cudf.core.column.NumericalColumn: value = column.as_column( pd.to_datetime(value), dtype=self.dtype - ).as_numerical_column("int64") - return self.as_numerical_column("int64").indices_of(value) + ).astype("int64") + return self.astype("int64").indices_of(value) @property def is_unique(self) -> bool: - return self.as_numerical_column("int64").is_unique + return self.astype("int64").is_unique def isin(self, values: Sequence) -> ColumnBase: return cudf.core.tools.datetimes._isin_datetimelike(self, values) diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index d66908b5f94..3e238d65cff 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -157,7 +157,7 @@ def normalize_binop_value(self, other): "Decimal columns only support binary operations with " "integer numerical columns." 
) - other = other.as_decimal_column( + other = other.astype( self.dtype.__class__(self.dtype.__class__.MAX_PRECISION, 0) ) elif not isinstance(other, DecimalBaseColumn): diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index f24ca3fdad1..d09a1f66539 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -4,7 +4,7 @@ import cudf from cudf.core.column import StructColumn -from cudf.core.dtypes import CategoricalDtype, IntervalDtype +from cudf.core.dtypes import IntervalDtype class IntervalColumn(StructColumn): @@ -87,20 +87,16 @@ def copy(self, deep=True): def as_interval_column(self, dtype): if isinstance(dtype, IntervalDtype): - if isinstance(self.dtype, CategoricalDtype): - new_struct = self._get_decategorized_column() - return IntervalColumn.from_struct_column(new_struct) - else: - return IntervalColumn( - size=self.size, - dtype=dtype, - mask=self.mask, - offset=self.offset, - null_count=self.null_count, - children=tuple( - child.astype(dtype.subtype) for child in self.children - ), - ) + return IntervalColumn( + size=self.size, + dtype=dtype, + mask=self.mask, + offset=self.offset, + null_count=self.null_count, + children=tuple( + child.astype(dtype.subtype) for child in self.children + ), + ) else: raise ValueError("dtype must be IntervalDtype") diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 8f41bcb6422..5a0171bbbdc 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -107,7 +107,9 @@ def __contains__(self, item: DatetimeLikeScalar) -> bool: # np.timedelta64 raises ValueError, hence `item` # cannot exist in `self`. return False - return item.view("int64") in self.as_numerical_column("int64") + return item.view("int64") in cast( + "cudf.core.column.NumericalColumn", self.astype("int64") + ) @property def values(self): @@ -132,9 +134,7 @@ def to_arrow(self) -> pa.Array: self.mask_array_view(mode="read").copy_to_host() ) data = pa.py_buffer( - self.as_numerical_column("int64") - .data_array_view(mode="read") - .copy_to_host() + self.astype("int64").data_array_view(mode="read").copy_to_host() ) pa_dtype = np_to_pa_dtype(self.dtype) return pa.Array.from_buffers( @@ -295,13 +295,17 @@ def as_timedelta_column( def mean(self, skipna=None, dtype: Dtype = np.float64) -> pd.Timedelta: return pd.Timedelta( - self.as_numerical_column("int64").mean(skipna=skipna, dtype=dtype), + cast( + "cudf.core.column.NumericalColumn", self.astype("int64") + ).mean(skipna=skipna, dtype=dtype), unit=self.time_unit, ).as_unit(self.time_unit) def median(self, skipna: bool | None = None) -> pd.Timedelta: return pd.Timedelta( - self.as_numerical_column("int64").median(skipna=skipna), + cast( + "cudf.core.column.NumericalColumn", self.astype("int64") + ).median(skipna=skipna), unit=self.time_unit, ).as_unit(self.time_unit) @@ -315,7 +319,7 @@ def quantile( exact: bool, return_scalar: bool, ) -> ColumnBase: - result = self.as_numerical_column("int64").quantile( + result = self.astype("int64").quantile( q=q, interpolation=interpolation, exact=exact, @@ -337,7 +341,7 @@ def sum( # Since sum isn't overridden in Numerical[Base]Column, mypy only # sees the signature from Reducible (which doesn't have the extra # parameters from ColumnBase._reduce) so we have to ignore this. 
- self.as_numerical_column("int64").sum( # type: ignore + self.astype("int64").sum( # type: ignore skipna=skipna, min_count=min_count, dtype=dtype ), unit=self.time_unit, @@ -351,7 +355,7 @@ def std( ddof: int = 1, ) -> pd.Timedelta: return pd.Timedelta( - self.as_numerical_column("int64").std( + cast("cudf.core.column.NumericalColumn", self.astype("int64")).std( skipna=skipna, min_count=min_count, ddof=ddof, dtype=dtype ), unit=self.time_unit, @@ -362,18 +366,18 @@ def cov(self, other: TimeDeltaColumn) -> float: raise TypeError( f"cannot perform cov with types {self.dtype}, {other.dtype}" ) - return self.as_numerical_column("int64").cov( - other.as_numerical_column("int64") - ) + return cast( + "cudf.core.column.NumericalColumn", self.astype("int64") + ).cov(cast("cudf.core.column.NumericalColumn", other.astype("int64"))) def corr(self, other: TimeDeltaColumn) -> float: if not isinstance(other, TimeDeltaColumn): raise TypeError( f"cannot perform corr with types {self.dtype}, {other.dtype}" ) - return self.as_numerical_column("int64").corr( - other.as_numerical_column("int64") - ) + return cast( + "cudf.core.column.NumericalColumn", self.astype("int64") + ).corr(cast("cudf.core.column.NumericalColumn", other.astype("int64"))) def components(self) -> dict[str, ColumnBase]: """ diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 4dfeb68b7ba..b249410c2e4 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2404,7 +2404,7 @@ def scatter_by_map( if isinstance(map_index, cudf.core.column.StringColumn): cat_index = cast( cudf.core.column.CategoricalColumn, - map_index.as_categorical_column("category"), + map_index.astype("category"), ) map_index = cat_index.codes warnings.warn( diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 034849d0e71..de715191c08 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -937,7 +937,7 @@ def to_pandas(self) -> pd.IntervalDtype: def __eq__(self, other): if isinstance(other, str): # This means equality isn't transitive but mimics pandas - return other == self.name + return other in (self.name, str(self)) return ( type(self) == type(other) and self.subtype == other.subtype diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 9bac75dc6ac..253d200f7d4 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -927,7 +927,7 @@ def from_arrow(cls, data: pa.Table) -> Self: # of column is 0 (i.e., empty) then we will have an # int8 column in result._data[name] returned by libcudf, # which needs to be type-casted to 'category' dtype. - result[name] = result[name].as_categorical_column("category") + result[name] = result[name].astype("category") elif ( pandas_dtypes.get(name) == "empty" and np_dtypes.get(name) == "object" @@ -936,7 +936,7 @@ def from_arrow(cls, data: pa.Table) -> Self: # is specified as 'empty' and np_dtypes as 'object', # hence handling this special case to type-cast the empty # float column to str column. 
- result[name] = result[name].as_string_column(cudf.dtype("str")) + result[name] = result[name].astype(cudf.dtype("str")) elif name in data.column_names and isinstance( data[name].type, ( diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py index 73a1cd26367..a5fed02cbed 100644 --- a/python/cudf/cudf/core/indexing_utils.py +++ b/python/cudf/cudf/core/indexing_utils.py @@ -229,7 +229,7 @@ def parse_row_iloc_indexer(key: Any, n: int) -> IndexingSpec: else: key = cudf.core.column.as_column(key) if isinstance(key, cudf.core.column.CategoricalColumn): - key = key.as_numerical_column(key.codes.dtype) + key = key.astype(key.codes.dtype) if is_bool_dtype(key.dtype): return MaskIndexer(BooleanMask(key, n)) elif len(key) == 0: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 97b6bbec2d4..4a60470fafa 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3107,10 +3107,12 @@ def value_counts( # Pandas returns an IntervalIndex as the index of res # this condition makes sure we do too if bins is given if bins is not None and len(res) == len(res.index.categories): - int_index = IntervalColumn.as_interval_column( - res.index._column, res.index.categories.dtype + interval_col = IntervalColumn.from_struct_column( + res.index._column._get_decategorized_column() + ) + res.index = cudf.IntervalIndex._from_data( + {res.index.name: interval_col} ) - res.index = int_index res.name = result_name return res diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 68b23f1e059..ef6b86a04a7 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -115,11 +115,11 @@ def to_numeric(arg, errors="raise", downcast=None): dtype = col.dtype if is_datetime_dtype(dtype) or is_timedelta_dtype(dtype): - col = col.as_numerical_column(cudf.dtype("int64")) + col = col.astype(cudf.dtype("int64")) elif isinstance(dtype, CategoricalDtype): cat_dtype = col.dtype.type if _is_non_decimal_numeric_dtype(cat_dtype): - col = col.as_numerical_column(cat_dtype) + col = col.astype(cat_dtype) else: try: col = _convert_str_col( @@ -146,8 +146,8 @@ def to_numeric(arg, errors="raise", downcast=None): raise ValueError("Unrecognized datatype") # str->float conversion may require lower precision - if col.dtype == cudf.dtype("f"): - col = col.as_numerical_column("d") + if col.dtype == cudf.dtype("float32"): + col = col.astype("float64") if downcast: if downcast == "float": @@ -205,7 +205,7 @@ def _convert_str_col(col, errors, _downcast=None): is_integer = libstrings.is_integer(col) if is_integer.all(): - return col.as_numerical_column(dtype=cudf.dtype("i8")) + return col.astype(dtype=cudf.dtype("i8")) col = _proc_inf_empty_strings(col) @@ -218,9 +218,9 @@ def _convert_str_col(col, errors, _downcast=None): "limited by float32 precision." 
) ) - return col.as_numerical_column(dtype=cudf.dtype("f")) + return col.astype(dtype=cudf.dtype("float32")) else: - return col.as_numerical_column(dtype=cudf.dtype("d")) + return col.astype(dtype=cudf.dtype("float64")) else: if errors == "coerce": col = libcudf.string_casting.stod(col) diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py index 1b395c09ba8..5eeea87d8e0 100644 --- a/python/cudf/cudf/tests/test_interval.py +++ b/python/cudf/cudf/tests/test_interval.py @@ -188,3 +188,9 @@ def test_from_pandas_intervaldtype(): result = cudf.from_pandas(dtype) expected = cudf.IntervalDtype("int64", closed="left") assert_eq(result, expected) + + +def test_intervaldtype_eq_string_with_attributes(): + dtype = cudf.IntervalDtype("int64", closed="left") + assert dtype == "interval" + assert dtype == "interval[int64, left]" diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 467d0c46ae7..f2501041f25 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2873,3 +2873,42 @@ def test_nunique_all_null(dropna): result = pd_ser.nunique(dropna=dropna) expected = cudf_ser.nunique(dropna=dropna) assert result == expected + + +@pytest.mark.parametrize( + "type1", + [ + "category", + "interval[int64, right]", + "int64", + "float64", + "str", + "datetime64[ns]", + "timedelta64[ns]", + ], +) +@pytest.mark.parametrize( + "type2", + [ + "category", + "interval[int64, right]", + "int64", + "float64", + "str", + "datetime64[ns]", + "timedelta64[ns]", + ], +) +@pytest.mark.parametrize( + "as_dtype", [lambda x: x, cudf.dtype], ids=["string", "object"] +) +@pytest.mark.parametrize("copy", [True, False]) +def test_empty_astype_always_castable(type1, type2, as_dtype, copy): + ser = cudf.Series([], dtype=as_dtype(type1)) + result = ser.astype(as_dtype(type2), copy=copy) + expected = cudf.Series([], dtype=as_dtype(type2)) + assert_eq(result, expected) + if not copy and cudf.dtype(type1) == cudf.dtype(type2): + assert ser._column is result._column + else: + assert ser._column is not result._column From b691b1c1cd99a5721230ac8db2afa8ad99835b9c Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 1 Jul 2024 14:25:11 -0400 Subject: [PATCH 21/27] Add stream parameter to cudf::io::text::multibyte_split (#16034) Adds stream support the `cudf::io::text::multibyte_split` API. Also adds a stream test and deprecates an overloaded API. 
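For reference, a minimal sketch of the stream-aware call, mirroring the new stream test added in this PR (here `stream` stands for any user-provided rmm::cuda_stream_view; if omitted it defaults to cudf::get_default_stream()):

    auto delimiter  = std::string(":");
    auto host_input = std::string("abc:def");
    auto source     = cudf::io::text::make_source(host_input);
    auto options    = cudf::io::text::parse_options{};  // default byte range
    // returns a strings column produced by splitting the source on the delimiter
    auto result     = cudf::io::text::multibyte_split(*source, delimiter, options, stream);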
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Mark Harris (https://github.com/harrism) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/16034 --- cpp/include/cudf/io/text/byte_range_info.hpp | 15 +++- .../cudf/io/text/data_chunk_source.hpp | 10 ++- cpp/include/cudf/io/text/multibyte_split.hpp | 27 ++++++- cpp/src/io/text/multibyte_split.cu | 19 ++--- cpp/tests/CMakeLists.txt | 1 + cpp/tests/io/text/multibyte_split_test.cpp | 81 ++++++++++++------- cpp/tests/streams/io/multibyte_split_test.cpp | 36 +++++++++ docs/cudf/source/conf.py | 2 +- 8 files changed, 141 insertions(+), 50 deletions(-) create mode 100644 cpp/tests/streams/io/multibyte_split_test.cpp diff --git a/cpp/include/cudf/io/text/byte_range_info.hpp b/cpp/include/cudf/io/text/byte_range_info.hpp index 0086432d003..60ee867f058 100644 --- a/cpp/include/cudf/io/text/byte_range_info.hpp +++ b/cpp/include/cudf/io/text/byte_range_info.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,17 +24,22 @@ namespace cudf { namespace io { namespace text { +/** + * @addtogroup io_readers + * @{ + * @file + */ /** * @brief stores offset and size used to indicate a byte range */ class byte_range_info { private: - int64_t _offset; ///< offset in bytes - int64_t _size; ///< size in bytes + int64_t _offset{}; ///< offset in bytes + int64_t _size{}; ///< size in bytes public: - constexpr byte_range_info() noexcept : _offset(0), _size(0) {} + constexpr byte_range_info() = default; /** * @brief Constructs a byte_range_info object * @@ -104,6 +109,8 @@ std::vector create_byte_range_infos_consecutive(int64_t total_b */ byte_range_info create_byte_range_info_max(); +/** @} */ // end of group + } // namespace text } // namespace io } // namespace cudf diff --git a/cpp/include/cudf/io/text/data_chunk_source.hpp b/cpp/include/cudf/io/text/data_chunk_source.hpp index 28204c82780..13aff4b3b8f 100644 --- a/cpp/include/cudf/io/text/data_chunk_source.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,12 @@ namespace cudf { namespace io { namespace text { +/** + * @addtogroup io_readers + * @{ + * @file + */ + /** * @brief A contract guaranteeing stream-ordered memory access to the underlying device data. * @@ -110,6 +116,8 @@ class data_chunk_source { [[nodiscard]] virtual std::unique_ptr create_reader() const = 0; }; +/** @} */ // end of group + } // namespace text } // namespace io } // namespace cudf diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp index 7abae7c754b..e29ab78ae46 100644 --- a/cpp/include/cudf/io/text/multibyte_split.hpp +++ b/cpp/include/cudf/io/text/multibyte_split.hpp @@ -30,6 +30,11 @@ namespace cudf { namespace io { namespace text { +/** + * @addtogroup io_readers + * @{ + * @file + */ /** * @brief Parsing options for multibyte_split. 
@@ -79,6 +84,7 @@ struct parse_options { * @param source The source string * @param delimiter UTF-8 encoded string for which to find offsets in the source * @param options the parsing options to use (including byte range) + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Memory resource to use for the device memory allocation * @return The strings found by splitting the source by the delimiter within the relevant byte * range. @@ -87,17 +93,30 @@ std::unique_ptr multibyte_split( data_chunk_source const& source, std::string const& delimiter, parse_options options = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); -std::unique_ptr multibyte_split( +/** + * @brief Splits the source text into a strings column using a multiple byte delimiter. + * + * @deprecated Since 24.08 + * + * @param source The source input data encoded in UTF-8 + * @param delimiter UTF-8 encoded string for which to find offsets in the source + * @param byte_range The position and size within `source` to produce the column from + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Memory resource to use for the device memory allocation + * @return The strings found by splitting the source by the delimiter within the relevant byte + * range. + */ +[[deprecated]] std::unique_ptr multibyte_split( data_chunk_source const& source, std::string const& delimiter, std::optional byte_range, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); -std::unique_ptr multibyte_split(data_chunk_source const& source, - std::string const& delimiter, - rmm::device_async_resource_ref mr); +/** @} */ // end of group } // namespace text } // namespace io diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 9c406369068..51dc0ca90af 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -565,35 +565,32 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source } // namespace detail +// deprecated in 24.08 std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, std::string const& delimiter, std::optional byte_range, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return multibyte_split( - source, delimiter, parse_options{byte_range.value_or(create_byte_range_info_max())}, mr); + return multibyte_split(source, + delimiter, + parse_options{byte_range.value_or(create_byte_range_info_max())}, + stream, + mr); } std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, std::string const& delimiter, parse_options options, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto stream = cudf::get_default_stream(); - auto result = detail::multibyte_split( source, delimiter, options.byte_range, options.strip_delimiters, stream, mr); return result; } -std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, - std::string const& delimiter, - rmm::device_async_resource_ref mr) -{ - return multibyte_split(source, delimiter, parse_options{}, mr); -} - } // namespace text } // namespace io } // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index eef09954647..244bcb7d897 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -691,6 +691,7 @@ 
ConfigureTest(STREAM_INTEROP_TEST streams/interop_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_JSONIO_TEST streams/io/json_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_LABELING_BINS_TEST streams/labeling_bins_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_MULTIBYTE_SPLIT_TEST streams/io/multibyte_split_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_NULL_MASK_TEST streams/null_mask_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_ORCIO_TEST streams/io/orc_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_PARQUETIO_TEST streams/io/parquet_test.cpp STREAM_MODE testing) diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 36338253c9b..408d54bd5ff 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -97,10 +97,9 @@ TEST_F(MultibyteSplitTest, DelimiterAtEndByteRange) auto expected = strings_column_wrapper{"abcdefg:"}; auto source = cudf::io::text::make_source(host_input); - auto out = cudf::io::text::multibyte_split( - *source, - delimiter, - cudf::io::text::byte_range_info{0, static_cast(host_input.size())}); + cudf::io::text::parse_options options{ + cudf::io::text::byte_range_info{0, static_cast(host_input.size())}}; + auto out = cudf::io::text::multibyte_split(*source, delimiter, options); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); } @@ -113,10 +112,9 @@ TEST_F(MultibyteSplitTest, DelimiterAtEndByteRange2) auto expected = strings_column_wrapper{"abcdefg:"}; auto source = cudf::io::text::make_source(host_input); - auto out = cudf::io::text::multibyte_split( - *source, - delimiter, - cudf::io::text::byte_range_info{0, static_cast(host_input.size() - 1)}); + cudf::io::text::parse_options options{ + cudf::io::text::byte_range_info{0, static_cast(host_input.size() - 1)}}; + auto out = cudf::io::text::multibyte_split(*source, delimiter, options); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out); } @@ -277,9 +275,12 @@ TEST_F(MultibyteSplitTest, LargeInputMultipleRange) auto source = cudf::io::text::make_source(host_input); auto byte_ranges = cudf::io::text::create_byte_range_infos_consecutive(host_input.size(), 3); - auto out0 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[0]); - auto out1 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[1]); - auto out2 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[2]); + auto out0 = cudf::io::text::multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_ranges[0]}); + auto out1 = cudf::io::text::multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_ranges[1]}); + auto out2 = cudf::io::text::multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_ranges[2]}); auto out_views = std::vector({out0->view(), out1->view(), out2->view()}); auto out = cudf::concatenate(out_views); @@ -303,9 +304,12 @@ TEST_F(MultibyteSplitTest, LargeInputSparseMultipleRange) auto source = cudf::io::text::make_source(host_input); auto byte_ranges = cudf::io::text::create_byte_range_infos_consecutive(host_input.size(), 3); - auto out0 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[0]); - auto out1 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[1]); - auto out2 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[2]); + auto out0 = cudf::io::text::multibyte_split( + *source, delimiter, 
cudf::io::text::parse_options{byte_ranges[0]}); + auto out1 = cudf::io::text::multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_ranges[1]}); + auto out2 = cudf::io::text::multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_ranges[2]}); auto out_views = std::vector({out0->view(), out1->view(), out2->view()}); auto out = cudf::concatenate(out_views); @@ -327,9 +331,12 @@ TEST_F(MultibyteSplitTest, LargeInputMultipleRangeSingleByte) auto source = cudf::io::text::make_source(host_input); auto byte_ranges = cudf::io::text::create_byte_range_infos_consecutive(host_input.size(), 3); - auto out0 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[0]); - auto out1 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[1]); - auto out2 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[2]); + auto out0 = cudf::io::text::multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_ranges[0]}); + auto out1 = cudf::io::text::multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_ranges[1]}); + auto out2 = cudf::io::text::multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_ranges[2]}); auto out_views = std::vector({out0->view(), out1->view(), out2->view()}); auto out = cudf::concatenate(out_views); @@ -352,9 +359,12 @@ TEST_F(MultibyteSplitTest, LargeInputSparseMultipleRangeSingleByte) auto source = cudf::io::text::make_source(host_input); auto byte_ranges = cudf::io::text::create_byte_range_infos_consecutive(host_input.size(), 3); - auto out0 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[0]); - auto out1 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[1]); - auto out2 = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[2]); + auto out0 = cudf::io::text::multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_ranges[0]}); + auto out1 = cudf::io::text::multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_ranges[1]}); + auto out2 = cudf::io::text::multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_ranges[2]}); auto out_views = std::vector({out0->view(), out1->view(), out2->view()}); auto out = cudf::concatenate(out_views); @@ -383,9 +393,14 @@ TEST_F(MultibyteSplitTest, SmallInputAllPossibleRanges) SCOPED_TRACE(split1); for (int split2 = split1 + 1; split2 < size; split2++) { SCOPED_TRACE(split2); - auto out1 = multibyte_split(*source, delimiter, byte_range_info{0, split1}); - auto out2 = multibyte_split(*source, delimiter, byte_range_info{split1, split2 - split1}); - auto out3 = multibyte_split(*source, delimiter, byte_range_info{split2, size - split2}); + auto out1 = multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_range_info{0, split1}}); + auto out2 = + multibyte_split(*source, + delimiter, + cudf::io::text::parse_options{byte_range_info{split1, split2 - split1}}); + auto out3 = multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_range_info{split2, size - split2}}); auto out_views = std::vector({out1->view(), out2->view(), out3->view()}); auto out = cudf::concatenate(out_views); @@ -416,9 +431,14 @@ TEST_F(MultibyteSplitTest, SmallInputAllPossibleRangesSingleByte) SCOPED_TRACE(split1); for (int split2 = split1 + 1; split2 < size; split2++) { SCOPED_TRACE(split2); - auto out1 = multibyte_split(*source, delimiter, byte_range_info{0, split1}); - auto out2 = multibyte_split(*source, delimiter, 
byte_range_info{split1, split2 - split1}); - auto out3 = multibyte_split(*source, delimiter, byte_range_info{split2, size - split2}); + auto out1 = multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_range_info{0, split1}}); + auto out2 = + multibyte_split(*source, + delimiter, + cudf::io::text::parse_options{byte_range_info{split1, split2 - split1}}); + auto out3 = multibyte_split( + *source, delimiter, cudf::io::text::parse_options{byte_range_info{split2, size - split2}}); auto out_views = std::vector({out1->view(), out2->view(), out3->view()}); auto out = cudf::concatenate(out_views); @@ -441,7 +461,8 @@ TEST_F(MultibyteSplitTest, SingletonRangeAtEnd) auto source = make_source(host_input); auto expected = strings_column_wrapper{}; - auto out = multibyte_split(*source, delimiter, byte_range_info{5, 1}); + auto out = + multibyte_split(*source, delimiter, cudf::io::text::parse_options{byte_range_info{5, 1}}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, cudf::test::debug_output_level::ALL_ERRORS); } @@ -480,7 +501,8 @@ TEST_F(MultibyteSplitTest, EmptyRange) auto source = make_source(host_input); auto expected = strings_column_wrapper{}; - auto out = multibyte_split(*source, delimiter, byte_range_info{4, 0}); + auto out = + multibyte_split(*source, delimiter, cudf::io::text::parse_options{byte_range_info{4, 0}}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, cudf::test::debug_output_level::ALL_ERRORS); } @@ -493,7 +515,8 @@ TEST_F(MultibyteSplitTest, EmptyRangeSingleByte) auto source = make_source(host_input); auto expected = strings_column_wrapper{}; - auto out = multibyte_split(*source, delimiter, byte_range_info{3, 0}); + auto out = + multibyte_split(*source, delimiter, cudf::io::text::parse_options{byte_range_info{3, 0}}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, cudf::test::debug_output_level::ALL_ERRORS); } diff --git a/cpp/tests/streams/io/multibyte_split_test.cpp b/cpp/tests/streams/io/multibyte_split_test.cpp new file mode 100644 index 00000000000..b0eff1d3340 --- /dev/null +++ b/cpp/tests/streams/io/multibyte_split_test.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include + +#include + +class MultibyteSplitTest : public cudf::test::BaseFixture {}; + +TEST_F(MultibyteSplitTest, Reader) +{ + auto delimiter = std::string(":"); + auto host_input = std::string("abc:def"); + auto source = cudf::io::text::make_source(host_input); + cudf::io::text::parse_options options{}; + auto result = + cudf::io::text::multibyte_split(*source, delimiter, options, cudf::test::get_default_stream()); +} diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 108f12bc099..c3c14ac8cad 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -372,7 +372,7 @@ def _generate_namespaces(namespaces): _all_namespaces = _generate_namespaces( { # Note that io::datasource is actually a nested class - "cudf": {"io", "io::datasource", "strings", "ast", "ast::expression"}, + "cudf": {"io", "io::datasource", "strings", "ast", "ast::expression", "io::text"}, "numeric": {}, "nvtext": {}, } From 760c15cbd4231e4987149b3a5d68fdcd22654dce Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Mon, 1 Jul 2024 14:27:30 -0400 Subject: [PATCH 22/27] Use verify-alpha-spec hook (#16144) With the deployment of rapids-build-backend, we need to make sure our dependencies have alpha specs. Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16144 --- .pre-commit-config.yaml | 3 ++- conda/environments/all_cuda-118_arch-x86_64.yaml | 5 ++--- conda/environments/all_cuda-122_arch-x86_64.yaml | 7 +++---- dependencies.yaml | 10 +++++----- 4 files changed, 12 insertions(+), 13 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f8c4f4b9143..d0457d2c641 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -149,7 +149,7 @@ repos: - id: ruff-format files: python/.*$ - repo: https://github.com/rapidsai/pre-commit-hooks - rev: v0.0.3 + rev: v0.2.0 hooks: - id: verify-copyright exclude: | @@ -158,6 +158,7 @@ repos: cpp/src/io/parquet/ipc/Message_generated[.]h$| cpp/src/io/parquet/ipc/Schema_generated[.]h$ ) + - id: verify-alpha-spec default_language_version: python: python3 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 946e2d1cd32..cc9238ab80a 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -26,7 +26,6 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-cuda==24.8.* - dask-cuda==24.8.*,>=0.0.0a0 - dlpack>=0.8,<1.0 - doxygen=1.9.1 @@ -44,10 +43,10 @@ dependencies: - libcufile=1.4.0.31 - libcurand-dev=10.3.0.86 - libcurand=10.3.0.86 -- libkvikio==24.8.* +- libkvikio==24.8.*,>=0.0.0a0 - libparquet==16.1.0.* - librdkafka>=1.9.0,<1.10.0a0 -- librmm==24.8.* +- librmm==24.8.*,>=0.0.0a0 - make - moto>=4.0.8 - msgpack-python diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml index f069616ddbe..9fecd452248 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -27,7 +27,6 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-cuda==24.8.* - dask-cuda==24.8.*,>=0.0.0a0 - dlpack>=0.8,<1.0 - doxygen=1.9.1 @@ -43,10 +42,10 @@ dependencies: - libarrow==16.1.0.* - libcufile-dev - libcurand-dev -- libkvikio==24.8.* +- libkvikio==24.8.*,>=0.0.0a0 - libparquet==16.1.0.* - 
librdkafka>=1.9.0,<1.10.0a0 -- librmm==24.8.* +- librmm==24.8.*,>=0.0.0a0 - make - moto>=4.0.8 - msgpack-python @@ -66,7 +65,7 @@ dependencies: - pre-commit - pyarrow==16.1.0.* - pydata-sphinx-theme!=0.14.2 -- pynvjitlink +- pynvjitlink>=0.0.0a0 - pytest-benchmark - pytest-cases>=3.8.2 - pytest-cov diff --git a/dependencies.yaml b/dependencies.yaml index 38ec30a8033..9efbc47896c 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -287,8 +287,8 @@ dependencies: - output_types: conda packages: - fmt>=10.1.1,<11 - - librmm==24.8.* - - libkvikio==24.8.* + - librmm==24.8.*,>=0.0.0a0 + - libkvikio==24.8.*,>=0.0.0a0 - librdkafka>=1.9.0,<1.10.0a0 # Align nvcomp version with rapids-cmake - nvcomp==3.0.6 @@ -500,7 +500,7 @@ dependencies: - output_types: [conda] packages: - breathe>=4.35.0 - - dask-cuda==24.8.* + - dask-cuda==24.8.*,>=0.0.0a0 - *doxygen - make - myst-nb @@ -582,7 +582,7 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - pynvjitlink + - pynvjitlink>=0.0.0a0 - matrix: {cuda: "11.*"} packages: - cubinlinker @@ -592,7 +592,7 @@ dependencies: - matrix: {cuda: "12.*"} packages: - rmm-cu12==24.8.*,>=0.0.0a0 - - pynvjitlink-cu12 + - pynvjitlink-cu12>=0.0.0a0 - matrix: {cuda: "11.*"} packages: - rmm-cu11==24.8.*,>=0.0.0a0 From 08552f816ddf21288448997e4998c3e1e0e58f5f Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 2 Jul 2024 03:12:50 +0100 Subject: [PATCH 23/27] Update cudf-polars for v1 release of polars (#16149) Minor changes to the IR, which we adapt to, and request `polars>=1.0` in dependencies. Authors: - Lawrence Mitchell (https://github.com/wence-) - Thomas Li (https://github.com/lithomas1) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16149 --- ci/test_cudf_polars.sh | 4 +--- dependencies.yaml | 2 +- python/cudf_polars/cudf_polars/dsl/expr.py | 6 +++--- python/cudf_polars/cudf_polars/dsl/ir.py | 11 +++++++++-- python/cudf_polars/cudf_polars/dsl/translate.py | 6 ++++-- python/cudf_polars/pyproject.toml | 2 +- 6 files changed, 19 insertions(+), 12 deletions(-) diff --git a/ci/test_cudf_polars.sh b/ci/test_cudf_polars.sh index 669e049ab26..95fb4b431bf 100755 --- a/ci/test_cudf_polars.sh +++ b/ci/test_cudf_polars.sh @@ -28,10 +28,8 @@ rapids-logger "Install cudf wheel" # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install $(echo ./dist/cudf*.whl)[test] -rapids-logger "Install polars (allow pre-release versions)" -python -m pip install 'polars>=1.0.0a0' - rapids-logger "Install cudf_polars" +python -m pip install 'polars>=1.0' python -m pip install --no-deps python/cudf_polars rapids-logger "Run cudf_polars tests" diff --git a/dependencies.yaml b/dependencies.yaml index 9efbc47896c..e3f8a72e76c 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -603,7 +603,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=0.20.30 + - polars>=1.0 run_dask_cudf: common: - output_types: [conda, requirements, pyproject] diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 16cfd9b9749..fe859c8d958 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -978,15 +978,15 @@ def collect_agg(self, *, depth: int) -> AggInfo: class Agg(Expr): __slots__ = ("name", "options", "op", "request", "children") _non_child = ("dtype", "name", "options") - 
children: tuple[Expr] + children: tuple[Expr, ...] def __init__( - self, dtype: plc.DataType, name: str, options: Any, value: Expr + self, dtype: plc.DataType, name: str, options: Any, *children: Expr ) -> None: super().__init__(dtype) self.name = name self.options = options - self.children = (value,) + self.children = children if name not in Agg._SUPPORTED: raise NotImplementedError( f"Unsupported aggregation {name=}" diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index abe26b14a90..9b3096becd4 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -15,6 +15,7 @@ import dataclasses import itertools +import json import types from functools import cache from typing import TYPE_CHECKING, Any, Callable, ClassVar @@ -180,8 +181,10 @@ def __post_init__(self): class Scan(IR): """Input from files.""" - typ: Any + typ: str """What type of file are we reading? Parquet, CSV, etc...""" + options: tuple[Any, ...] + """Type specific options, as json-encoded strings.""" paths: list[str] """List of paths to read from.""" file_options: Any @@ -211,17 +214,21 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: with_columns = options.with_columns row_index = options.row_index if self.typ == "csv": + opts, cloud_opts = map(json.loads, self.options) df = DataFrame.from_cudf( cudf.concat( [cudf.read_csv(p, usecols=with_columns) for p in self.paths] ) ) elif self.typ == "parquet": + opts, cloud_opts = map(json.loads, self.options) cdf = cudf.read_parquet(self.paths, columns=with_columns) assert isinstance(cdf, cudf.DataFrame) df = DataFrame.from_cudf(cdf) else: - assert_never(self.typ) + raise NotImplementedError( + f"Unhandled scan type: {self.typ}" + ) # pragma: no cover; post init trips first if row_index is not None: name, offset = row_index dtype = self.schema[name] diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index f4bf07ae1e0..a2fdb3c3d79 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -87,9 +87,11 @@ def _( def _( node: pl_ir.Scan, visitor: NodeTraverser, schema: dict[str, plc.DataType] ) -> ir.IR: + typ, *options = node.scan_type return ir.Scan( schema, - node.scan_type, + typ, + tuple(options), node.paths, node.file_options, translate_named_expr(visitor, n=node.predicate) @@ -445,7 +447,7 @@ def _(node: pl_expr.Agg, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Ex dtype, node.name, node.options, - translate_expr(visitor, n=node.arguments), + *(translate_expr(visitor, n=n) for n in node.arguments), ) diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index effa4861e0c..bf4673fcc50 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -20,7 +20,7 @@ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ "cudf==24.8.*,>=0.0.0a0", - "polars>=0.20.30", + "polars>=1.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [ "Intended Audience :: Developers", From a4be7bd1365ec7ede5191a4b5d74e7c514a2b5fe Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 2 Jul 2024 00:50:42 -0700 Subject: [PATCH 24/27] Use Arrow C Data Interface functions for Python interop (#15904) This PR replaces the internals of `from_arrow` in pylibcudf with an implementation that uses the [Arrow C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html) using the [Python Capsule interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). This allows us to decouple our Python builds from using pyarrow Cython (partially, we haven't replaced the `to_arrow` conversion yet) and it will also allow us to support any other Python package that is a producer of the data interface. To support the above functionality, the following additional changes were needed in this PR: - Added the ability to produce cudf tables from `ArrowArrayStream` objects since that is what `pyarrow.Table` produces. This function is a simple wrapper around the existing `from_arrrow(ArrowArray)` API. - Added support for the large strings type, for which support has improved throughout cudf since the `from_arrow_host` API was added and for which we now require a basic overload for tests to pass. I did not add corresponding support for `from_arrow_device` to avoid ballooning the scope of this PR, so that work can be done in a follow-up. - Proper handling of `type_id::EMPTY` in concatenate because the most natural implementation of the ArrowArrayStream processing is to run `from_arrow` on each chunk and then concatenate the outputs, and from the Python side we can produce chunks of all null arrays from arrow. Contributes to #14926 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Robert Maynard (https://github.com/robertmaynard) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/15904 --- cpp/CMakeLists.txt | 1 + cpp/include/cudf/interop.hpp | 38 ++++- cpp/src/copying/concatenate.cu | 28 +++- cpp/src/interop/arrow_utilities.cpp | 3 +- cpp/src/interop/from_arrow_device.cu | 3 + cpp/src/interop/from_arrow_host.cu | 32 +++- cpp/src/interop/from_arrow_stream.cu | 143 ++++++++++++++++++ cpp/tests/CMakeLists.txt | 1 + cpp/tests/copying/concatenate_tests.cpp | 60 ++++++++ cpp/tests/interop/from_arrow_stream_test.cpp | 121 +++++++++++++++ cpp/tests/interop/nanoarrow_utils.hpp | 3 + python/cudf/cudf/_lib/pylibcudf/interop.pyx | 36 ++++- .../cudf/_lib/pylibcudf/libcudf/interop.pxd | 20 +++ python/cudf/cudf/tests/test_series.py | 2 - 14 files changed, 466 insertions(+), 25 deletions(-) create mode 100644 cpp/src/interop/from_arrow_stream.cu create mode 100644 cpp/tests/interop/from_arrow_stream_test.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 35cf90411f2..54070ab6f5a 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -365,6 +365,7 @@ add_library( src/interop/to_arrow_device.cu src/interop/from_arrow_device.cu src/interop/from_arrow_host.cu + src/interop/from_arrow_stream.cu src/interop/to_arrow_schema.cpp src/interop/detail/arrow_allocator.cpp src/io/avro/avro.cpp diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index 56ec62fa6e1..502ffb9ba4f 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -50,6 +50,8 @@ struct ArrowSchema; struct ArrowArray; +struct ArrowArrayStream; + namespace cudf { /** 
* @addtogroup interop_dlpack @@ -367,10 +369,11 @@ std::unique_ptr from_arrow( * @param mr Device memory resource used to allocate `cudf::table` * @return cudf table generated from given arrow data */ -std::unique_ptr from_arrow(ArrowSchema const* schema, - ArrowArray const* input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); +std::unique_ptr from_arrow( + ArrowSchema const* schema, + ArrowArray const* input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Create `cudf::column` from a given ArrowArray and ArrowSchema input @@ -385,10 +388,11 @@ std::unique_ptr from_arrow(ArrowSchema const* schema, * @param mr Device memory resource used to allocate `cudf::column` * @return cudf column generated from given arrow data */ -std::unique_ptr from_arrow_column(ArrowSchema const* schema, - ArrowArray const* input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); +std::unique_ptr from_arrow_column( + ArrowSchema const* schema, + ArrowArray const* input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Create `cudf::table` from given ArrowDeviceArray input @@ -414,6 +418,24 @@ std::unique_ptr
from_arrow_host( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Create `cudf::table` from given ArrowArrayStream input + * + * @throws std::invalid_argument if input is NULL + * + * The conversion WILL release the input ArrayArrayStream and its constituent + * arrays or schema since Arrow streams are not suitable for multiple reads. + * + * @param input `ArrowArrayStream` pointer to object that will produce ArrowArray data + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to perform cuda allocation + * @return cudf table generated from the given Arrow data + */ +std::unique_ptr
from_arrow_stream( + ArrowArrayStream* input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Create `cudf::column` from given ArrowDeviceArray input * diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index 47e74a5cb48..6acbafd24fb 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -463,10 +463,6 @@ void traverse_children::operator()(host_span */ void bounds_and_type_check(host_span cols, rmm::cuda_stream_view stream) { - CUDF_EXPECTS(cudf::all_have_same_types(cols.begin(), cols.end()), - "Type mismatch in columns to concatenate.", - cudf::data_type_error); - // total size of all concatenated rows size_t const total_row_count = std::accumulate(cols.begin(), cols.end(), std::size_t{}, [](size_t a, auto const& b) { @@ -476,6 +472,21 @@ void bounds_and_type_check(host_span cols, rmm::cuda_stream_v "Total number of concatenated rows exceeds the column size limit", std::overflow_error); + if (std::any_of(cols.begin(), cols.end(), [](column_view const& c) { + return c.type().id() == cudf::type_id::EMPTY; + })) { + CUDF_EXPECTS( + std::all_of(cols.begin(), + cols.end(), + [](column_view const& c) { return c.type().id() == cudf::type_id::EMPTY; }), + "Mismatch in columns to concatenate.", + cudf::data_type_error); + return; + } + CUDF_EXPECTS(cudf::all_have_same_types(cols.begin(), cols.end()), + "Type mismatch in columns to concatenate.", + cudf::data_type_error); + // traverse children cudf::type_dispatcher(cols.front().type(), traverse_children{}, cols, stream); } @@ -498,6 +509,15 @@ std::unique_ptr concatenate(host_span columns_to_conc return empty_like(columns_to_concat.front()); } + // For empty columns, we can just create an EMPTY column of the appropriate length. 
+ if (columns_to_concat.front().type().id() == cudf::type_id::EMPTY) { + auto length = std::accumulate( + columns_to_concat.begin(), columns_to_concat.end(), 0, [](auto a, auto const& b) { + return a + b.size(); + }); + return std::make_unique( + data_type(type_id::EMPTY), length, rmm::device_buffer{}, rmm::device_buffer{}, length); + } return type_dispatcher( columns_to_concat.front().type(), concatenate_dispatch{columns_to_concat, stream, mr}); } diff --git a/cpp/src/interop/arrow_utilities.cpp b/cpp/src/interop/arrow_utilities.cpp index dd9e9600a87..605d813ed1e 100644 --- a/cpp/src/interop/arrow_utilities.cpp +++ b/cpp/src/interop/arrow_utilities.cpp @@ -39,7 +39,8 @@ data_type arrow_to_cudf_type(ArrowSchemaView const* arrow_view) case NANOARROW_TYPE_FLOAT: return data_type(type_id::FLOAT32); case NANOARROW_TYPE_DOUBLE: return data_type(type_id::FLOAT64); case NANOARROW_TYPE_DATE32: return data_type(type_id::TIMESTAMP_DAYS); - case NANOARROW_TYPE_STRING: return data_type(type_id::STRING); + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: return data_type(type_id::STRING); case NANOARROW_TYPE_LIST: return data_type(type_id::LIST); case NANOARROW_TYPE_DICTIONARY: return data_type(type_id::DICTIONARY32); case NANOARROW_TYPE_STRUCT: return data_type(type_id::STRUCT); diff --git a/cpp/src/interop/from_arrow_device.cu b/cpp/src/interop/from_arrow_device.cu index 002a8ec1f14..73c1a474310 100644 --- a/cpp/src/interop/from_arrow_device.cu +++ b/cpp/src/interop/from_arrow_device.cu @@ -143,6 +143,9 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + CUDF_EXPECTS(schema->type != NANOARROW_TYPE_LARGE_STRING, + "Large strings are not yet supported in from_arrow_device", + cudf::data_type_error); if (input->length == 0) { return std::make_tuple( {type, diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu index 854a1d68fdc..b7e07056686 100644 --- a/cpp/src/interop/from_arrow_host.cu +++ b/cpp/src/interop/from_arrow_host.cu @@ -188,8 +188,16 @@ std::unique_ptr dispatch_copy_from_arrow_host::operator()(offset_buffers[1])[input->length + input->offset]; + int64_t const char_data_length = [&]() { + if (schema->type == NANOARROW_TYPE_LARGE_STRING) { + return reinterpret_cast(offset_buffers[1])[input->length + input->offset]; + } else if (schema->type == NANOARROW_TYPE_STRING) { + return static_cast( + reinterpret_cast(offset_buffers[1])[input->length + input->offset]); + } else { + CUDF_FAIL("Unsupported string type", cudf::data_type_error); + } + }(); void const* char_buffers[2] = {nullptr, input->buffers[2]}; ArrowArray char_array = { .length = char_data_length, @@ -210,15 +218,27 @@ std::unique_ptr dispatch_copy_from_arrow_host::operator()operator()(&view, &offsets_array, data_type(type_id::INT32), true); + auto offsets_column = [&]() { + if (schema->type == NANOARROW_TYPE_LARGE_STRING) { + return this->operator()(&view, &offsets_array, data_type(type_id::INT64), true); + } else if (schema->type == NANOARROW_TYPE_STRING) { + return this->operator()(&view, &offsets_array, data_type(type_id::INT32), true); + } else { + CUDF_FAIL("Unsupported string type", cudf::data_type_error); + } + }(); NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, char_data_schema.get(), nullptr)); - auto chars_column = this->operator()(&view, &char_array, data_type(type_id::INT8), true); + rmm::device_buffer chars(char_data_length, stream, mr); + CUDF_CUDA_TRY(cudaMemcpyAsync(chars.data(), + 
reinterpret_cast(char_array.buffers[1]), + chars.size(), + cudaMemcpyDefault, + stream.value())); auto const num_rows = offsets_column->size() - 1; auto out_col = make_strings_column(num_rows, std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), + std::move(chars), input->null_count, std::move(*get_mask_buffer(input))); diff --git a/cpp/src/interop/from_arrow_stream.cu b/cpp/src/interop/from_arrow_stream.cu new file mode 100644 index 00000000000..0c85b561944 --- /dev/null +++ b/cpp/src/interop/from_arrow_stream.cu @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arrow_utilities.hpp" + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +namespace cudf { +namespace detail { + +namespace { + +std::unique_ptr make_empty_column_from_schema(ArrowSchema const* schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + ArrowSchemaView schema_view; + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&schema_view, schema, nullptr)); + + auto const type{arrow_to_cudf_type(&schema_view)}; + switch (type.id()) { + case type_id::EMPTY: { + return std::make_unique( + data_type(type_id::EMPTY), 0, rmm::device_buffer{}, rmm::device_buffer{}, 0); + } + case type_id::LIST: { + return cudf::make_lists_column(0, + cudf::make_empty_column(data_type{type_id::INT32}), + make_empty_column_from_schema(schema->children[0], stream, mr), + 0, + {}, + stream, + mr); + } + case type_id::STRUCT: { + std::vector> child_columns; + child_columns.reserve(schema->n_children); + std::transform( + schema->children, + schema->children + schema->n_children, + std::back_inserter(child_columns), + [&](auto const& child) { return make_empty_column_from_schema(child, stream, mr); }); + return cudf::make_structs_column(0, std::move(child_columns), 0, {}, stream, mr); + } + default: { + return cudf::make_empty_column(type); + } + } +} + +} // namespace + +std::unique_ptr
from_arrow_stream(ArrowArrayStream* input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(input != nullptr, "input ArrowArrayStream must not be NULL", std::invalid_argument); + + // Potential future optimization: Since the from_arrow API accepts an + // ArrowSchema we're allocating one here instead of using a view, which we + // could avoid with a different underlying implementation. + ArrowSchema schema; + NANOARROW_THROW_NOT_OK(ArrowArrayStreamGetSchema(input, &schema, nullptr)); + + std::vector> chunks; + ArrowArray chunk; + while (true) { + NANOARROW_THROW_NOT_OK(ArrowArrayStreamGetNext(input, &chunk, nullptr)); + if (chunk.release == nullptr) { break; } + chunks.push_back(from_arrow(&schema, &chunk, stream, mr)); + chunk.release(&chunk); + } + input->release(input); + + if (chunks.empty()) { + if (schema.n_children == 0) { + schema.release(&schema); + return std::make_unique(); + } + + // If there are no chunks but the schema has children, we need to construct a suitable empty + // table. + std::vector> columns; + columns.reserve(chunks.size()); + std::transform( + schema.children, + schema.children + schema.n_children, + std::back_inserter(columns), + [&](auto const& child) { return make_empty_column_from_schema(child, stream, mr); }); + schema.release(&schema); + return std::make_unique(std::move(columns)); + } + + schema.release(&schema); + + auto chunk_views = std::vector{}; + chunk_views.reserve(chunks.size()); + std::transform( + chunks.begin(), chunks.end(), std::back_inserter(chunk_views), [](auto const& chunk) { + return chunk->view(); + }); + return cudf::detail::concatenate(chunk_views, stream, mr); +} + +} // namespace detail + +std::unique_ptr
from_arrow_stream(ArrowArrayStream* input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::from_arrow_stream(input, stream, mr); +} +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 244bcb7d897..0eab9ba61d8 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -273,6 +273,7 @@ ConfigureTest( interop/from_arrow_test.cpp interop/from_arrow_device_test.cpp interop/from_arrow_host_test.cpp + interop/from_arrow_stream_test.cpp interop/dlpack_test.cpp EXTRA_LIB nanoarrow diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp index 054441788d0..18140c34abd 100644 --- a/cpp/tests/copying/concatenate_tests.cpp +++ b/cpp/tests/copying/concatenate_tests.cpp @@ -1667,3 +1667,63 @@ TEST_F(DictionaryConcatTest, ErrorsTest) std::vector empty; EXPECT_THROW(cudf::concatenate(empty), cudf::logic_error); } + +struct EmptyColumnTest : public cudf::test::BaseFixture {}; + +TEST_F(EmptyColumnTest, SimpleTest) +{ + std::vector columns; + constexpr auto num_copies = 10; + constexpr auto num_rows = 10; + for (auto i = 0; i < num_copies; ++i) { + columns.emplace_back(cudf::data_type(cudf::type_id::EMPTY), + num_rows, + rmm::device_buffer{}, + rmm::device_buffer{}, + 0); + } + + // Create views from columns + std::vector views; + for (auto& col : columns) { + views.push_back(col.view()); + } + auto result = cudf::concatenate(views); + + ASSERT_EQ(result->size(), num_copies * num_rows); + ASSERT_EQ(result->type().id(), cudf::type_id::EMPTY); +} + +struct TableOfEmptyColumnsTest : public cudf::test::BaseFixture {}; + +TEST_F(TableOfEmptyColumnsTest, SimpleTest) +{ + std::vector tables; + constexpr auto num_copies = 10; + constexpr auto num_rows = 10; + constexpr auto num_columns = 10; + for (auto i = 0; i < num_copies; ++i) { + std::vector> columns; + for (auto j = 0; j < num_columns; ++j) { + columns.push_back(std::make_unique(cudf::data_type(cudf::type_id::EMPTY), + num_rows, + rmm::device_buffer{}, + rmm::device_buffer{}, + 0)); + } + tables.emplace_back(std::move(columns)); + } + + // Create views from columns + std::vector views; + for (auto& tbl : tables) { + views.push_back(tbl.view()); + } + auto result = cudf::concatenate(views); + + ASSERT_EQ(result->num_rows(), num_copies * num_rows); + ASSERT_EQ(result->num_columns(), num_columns); + for (auto i = 0; i < num_columns; ++i) { + ASSERT_EQ(result->get_column(i).type().id(), cudf::type_id::EMPTY); + } +} diff --git a/cpp/tests/interop/from_arrow_stream_test.cpp b/cpp/tests/interop/from_arrow_stream_test.cpp new file mode 100644 index 00000000000..418ec057303 --- /dev/null +++ b/cpp/tests/interop/from_arrow_stream_test.cpp @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "nanoarrow_utils.hpp" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +struct VectorOfArrays { + std::vector arrays; + nanoarrow::UniqueSchema schema; + size_t index{0}; + + static int get_schema(ArrowArrayStream* stream, ArrowSchema* out_schema) + { + auto private_data = static_cast(stream->private_data); + ArrowSchemaDeepCopy(private_data->schema.get(), out_schema); + return 0; + } + + static int get_next(ArrowArrayStream* stream, ArrowArray* out_array) + { + auto private_data = static_cast(stream->private_data); + if (private_data->index >= private_data->arrays.size()) { + out_array->release = nullptr; + return 0; + } + ArrowArrayMove(private_data->arrays[private_data->index++].get(), out_array); + return 0; + } + + static const char* get_last_error(ArrowArrayStream* stream) { return nullptr; } + + static void release(ArrowArrayStream* stream) + { + delete static_cast(stream->private_data); + } +}; + +struct FromArrowStreamTest : public cudf::test::BaseFixture {}; + +void makeStreamFromArrays(std::vector arrays, + nanoarrow::UniqueSchema schema, + ArrowArrayStream* out) +{ + auto* private_data = new VectorOfArrays{std::move(arrays), std::move(schema)}; + out->get_schema = VectorOfArrays::get_schema; + out->get_next = VectorOfArrays::get_next; + out->get_last_error = VectorOfArrays::get_last_error; + out->release = VectorOfArrays::release; + out->private_data = private_data; +} + +TEST_F(FromArrowStreamTest, BasicTest) +{ + constexpr auto num_copies = 3; + std::vector> tables; + // The schema is unique across all tables. + nanoarrow::UniqueSchema schema; + std::vector arrays; + for (auto i = 0; i < num_copies; ++i) { + auto [tbl, sch, arr] = get_nanoarrow_host_tables(0); + tables.push_back(std::move(tbl)); + arrays.push_back(std::move(arr)); + if (i == 0) { sch.move(schema.get()); } + } + std::vector table_views; + for (auto const& table : tables) { + table_views.push_back(table->view()); + } + auto expected = cudf::concatenate(table_views); + + ArrowArrayStream stream; + makeStreamFromArrays(std::move(arrays), std::move(schema), &stream); + auto result = cudf::from_arrow_stream(&stream); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result->view()); +} + +TEST_F(FromArrowStreamTest, EmptyTest) +{ + auto [tbl, sch, arr] = get_nanoarrow_host_tables(0); + std::vector table_views{tbl->view()}; + auto expected = cudf::concatenate(table_views); + + ArrowArrayStream stream; + makeStreamFromArrays({}, std::move(sch), &stream); + auto result = cudf::from_arrow_stream(&stream); + cudf::have_same_types(expected->view(), result->view()); +} diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp index 94c4372e74a..4147728b2a6 100644 --- a/cpp/tests/interop/nanoarrow_utils.hpp +++ b/cpp/tests/interop/nanoarrow_utils.hpp @@ -375,3 +375,6 @@ nanoarrow::UniqueArray get_nanoarrow_list_array(std::initializer_list data, std::tuple, nanoarrow::UniqueSchema, generated_test_data> get_nanoarrow_cudf_table(cudf::size_type length); + +std::tuple, nanoarrow::UniqueSchema, nanoarrow::UniqueArray> +get_nanoarrow_host_tables(cudf::size_type length); diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx index 07e9d1ead11..adf7e1fd7e8 100644 --- a/python/cudf/cudf/_lib/pylibcudf/interop.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx @@ -1,5 +1,6 @@ # Copyright 
(c) 2023-2024, NVIDIA CORPORATION. +from cpython cimport pycapsule from cython.operator cimport dereference from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.utility cimport move @@ -11,9 +12,15 @@ from functools import singledispatch from pyarrow import lib as pa +from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.interop cimport ( + ArrowArray, + ArrowArrayStream, + ArrowSchema, column_metadata, from_arrow as cpp_from_arrow, + from_arrow_column as cpp_from_arrow_column, + from_arrow_stream as cpp_from_arrow_stream, to_arrow as cpp_to_arrow, ) from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport ( @@ -124,11 +131,15 @@ def _from_arrow_datatype(pyarrow_object): def _from_arrow_table(pyarrow_object, *, DataType data_type=None): if data_type is not None: raise ValueError("data_type may not be passed for tables") - cdef shared_ptr[pa.CTable] arrow_table = pa.pyarrow_unwrap_table(pyarrow_object) + stream = pyarrow_object.__arrow_c_stream__() + cdef ArrowArrayStream* c_stream = ( + pycapsule.PyCapsule_GetPointer(stream, "arrow_array_stream") + ) cdef unique_ptr[table] c_result with nogil: - c_result = move(cpp_from_arrow(dereference(arrow_table))) + # The libcudf function here will release the stream. + c_result = move(cpp_from_arrow_stream(c_stream)) return Table.from_libcudf(move(c_result)) @@ -190,8 +201,25 @@ def _from_arrow_scalar(pyarrow_object, *, DataType data_type=None): def _from_arrow_column(pyarrow_object, *, DataType data_type=None): if data_type is not None: raise ValueError("data_type may not be passed for arrays") - pa_table = pa.table([pyarrow_object], [""]) - return from_arrow(pa_table).columns()[0] + + schema, array = pyarrow_object.__arrow_c_array__() + cdef ArrowSchema* c_schema = ( + pycapsule.PyCapsule_GetPointer(schema, "arrow_schema") + ) + cdef ArrowArray* c_array = ( + pycapsule.PyCapsule_GetPointer(array, "arrow_array") + ) + + cdef unique_ptr[column] c_result + with nogil: + c_result = move(cpp_from_arrow_column(c_schema, c_array)) + + # The capsule destructors should release automatically for us, but we + # choose to do it explicitly here for clarity. + c_schema.release(c_schema) + c_array.release(c_array) + + return Column.from_libcudf(move(c_result)) @singledispatch diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd index 471b78505fb..2151da28d4b 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd @@ -7,6 +7,7 @@ from pyarrow.lib cimport CScalar, CTable from cudf._lib.types import cudf_to_np_types, np_to_cudf_types +from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar from cudf._lib.pylibcudf.libcudf.table.table cimport table from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view @@ -16,6 +17,19 @@ cdef extern from "dlpack/dlpack.h" nogil: ctypedef struct DLManagedTensor: void(*deleter)(DLManagedTensor*) except + + +# The Arrow structs are not namespaced. 
+cdef extern from "cudf/interop.hpp" nogil: + cdef struct ArrowSchema: + void (*release)(ArrowSchema*) noexcept nogil + + cdef struct ArrowArray: + void (*release)(ArrowArray*) noexcept nogil + + cdef struct ArrowArrayStream: + void (*release)(ArrowArrayStream*) noexcept nogil + + cdef extern from "cudf/interop.hpp" namespace "cudf" \ nogil: cdef unique_ptr[table] from_dlpack(const DLManagedTensor* tensor @@ -42,3 +56,9 @@ cdef extern from "cudf/interop.hpp" namespace "cudf" \ const scalar& input, column_metadata metadata, ) except + + + cdef unique_ptr[table] from_arrow_stream(ArrowArrayStream* input) except + + cdef unique_ptr[column] from_arrow_column( + const ArrowSchema* schema, + const ArrowArray* input + ) except + diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index f2501041f25..8ed78d804bf 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2757,8 +2757,6 @@ def test_series_from_large_string(pa_type): assert_eq(expected, got) - assert pa_string_array.equals(got.to_arrow()) - @pytest.mark.parametrize( "scalar", From a1447c78b8290277b7dbc680479de0c9f4ce0b19 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 2 Jul 2024 09:34:29 -0400 Subject: [PATCH 25/27] Promote has_nested_columns to cudf public API (#16131) The `has_nested_columns` functionality is used in numerous tests. It looks like it should be part of our stable public API. Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/16131 --- .../cudf/table/experimental/row_operators.cuh | 12 +++---- cpp/include/cudf/table/table_view.hpp | 19 ++++++++-- cpp/src/table/table_view.cpp | 9 ++--- .../table/experimental_row_operator_tests.cu | 36 +++++++++---------- .../table/row_operator_tests_utilities.cu | 4 +-- .../table/row_operator_tests_utilities2.cu | 2 +- 6 files changed, 44 insertions(+), 38 deletions(-) diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index e9b81a525fc..c181ac7d402 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -252,7 +252,7 @@ using optional_dremel_view = thrust::optional; * * @tparam has_nested_columns compile-time optimization for primitive types. * This template parameter is to be used by the developer by querying - * `cudf::detail::has_nested_columns(input)`. `true` compiles operator + * `cudf::has_nested_columns(input)`. `true` compiles operator * overloads for nested types, while `false` only compiles operator * overloads for primitive types. * @tparam Nullate A cudf::nullate type describing whether to check for nulls. @@ -1014,7 +1014,7 @@ class self_comparator { * * @tparam has_nested_columns compile-time optimization for primitive types. * This template parameter is to be used by the developer by querying - * `cudf::detail::has_nested_columns(input)`. `true` compiles operator + * `cudf::has_nested_columns(input)`. `true` compiles operator * overloads for nested types, while `false` only compiles operator * overloads for primitive types. * @tparam Nullate A cudf::nullate type describing whether to check for nulls. @@ -1186,7 +1186,7 @@ class two_table_comparator { * * @tparam has_nested_columns compile-time optimization for primitive types. 
* This template parameter is to be used by the developer by querying - * `cudf::detail::has_nested_columns(input)`. `true` compiles operator + * `cudf::has_nested_columns(input)`. `true` compiles operator * overloads for nested types, while `false` only compiles operator * overloads for primitive types. * @tparam Nullate A cudf::nullate type describing whether to check for nulls. @@ -1326,7 +1326,7 @@ struct nan_equal_physical_equality_comparator { * * @tparam has_nested_columns compile-time optimization for primitive types. * This template parameter is to be used by the developer by querying - * `cudf::detail::has_nested_columns(input)`. `true` compiles operator + * `cudf::has_nested_columns(input)`. `true` compiles operator * overloads for nested types, while `false` only compiles operator * overloads for primitive types. * @tparam Nullate A cudf::nullate type describing whether to check for nulls. @@ -1643,7 +1643,7 @@ class self_comparator { * * @tparam has_nested_columns compile-time optimization for primitive types. * This template parameter is to be used by the developer by querying - * `cudf::detail::has_nested_columns(input)`. `true` compiles operator + * `cudf::has_nested_columns(input)`. `true` compiles operator * overloads for nested types, while `false` only compiles operator * overloads for primitive types. * @tparam Nullate A cudf::nullate type describing whether to check for nulls. @@ -1757,7 +1757,7 @@ class two_table_comparator { * * @tparam has_nested_columns compile-time optimization for primitive types. * This template parameter is to be used by the developer by querying - * `cudf::detail::has_nested_columns(input)`. `true` compiles operator + * `cudf::has_nested_columns(input)`. `true` compiles operator * overloads for nested types, while `false` only compiles operator * overloads for primitive types. * @tparam Nullate A cudf::nullate type describing whether to check for nulls. diff --git a/cpp/include/cudf/table/table_view.hpp b/cpp/include/cudf/table/table_view.hpp index a71e0558dec..4a990f67ce4 100644 --- a/cpp/include/cudf/table/table_view.hpp +++ b/cpp/include/cudf/table/table_view.hpp @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -32,7 +33,7 @@ * passed by value. */ -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { /** * @brief Base class for a table of `ColumnView`s @@ -123,7 +124,10 @@ class table_view_base { * @param column_index The index of the desired column * @return A reference to the desired column */ - [[nodiscard]] ColumnView const& column(size_type column_index) const; + [[nodiscard]] ColumnView const& column(size_type column_index) const + { + return _columns.at(column_index); + } /** * @brief Returns the number of columns @@ -174,8 +178,17 @@ class table_view_base { * @return Whether nested columns exist in the input table */ bool has_nested_columns(table_view const& table); + } // namespace detail +/** + * @brief Determine if any nested columns exist in a given table. + * + * @param table The input table + * @return Whether nested columns exist in the input table + */ +bool has_nested_columns(table_view const& table); + /** * @brief A set of cudf::column_view's of the same size. 
* @@ -374,4 +387,4 @@ extern template bool is_relationally_comparable(mutable_tabl mutable_table_view const& rhs); // @endcond } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/src/table/table_view.cpp b/cpp/src/table/table_view.cpp index 13832b0d9dc..8a5340dc20d 100644 --- a/cpp/src/table/table_view.cpp +++ b/cpp/src/table/table_view.cpp @@ -52,12 +52,6 @@ auto concatenate_column_views(std::vector const& views) return concat_cols; } -template -ColumnView const& table_view_base::column(size_type column_index) const -{ - return _columns.at(column_index); -} - // Explicit instantiation for a table of `column_view`s template class table_view_base; @@ -172,6 +166,7 @@ bool has_nested_columns(table_view const& table) return std::any_of( table.begin(), table.end(), [](column_view const& col) { return is_nested(col.type()); }); } - } // namespace detail + +bool has_nested_columns(table_view const& table) { return detail::has_nested_columns(table); } } // namespace cudf diff --git a/cpp/tests/table/experimental_row_operator_tests.cu b/cpp/tests/table/experimental_row_operator_tests.cu index 896cc7a82d4..0d9e4e27f2c 100644 --- a/cpp/tests/table/experimental_row_operator_tests.cu +++ b/cpp/tests/table/experimental_row_operator_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -109,15 +109,14 @@ TYPED_TEST(TypedTableViewTest, TestSortSameTableFromTwoTables) auto const lhs = cudf::table_view{{col1}}; auto const empty_rhs = cudf::table_view{{col2}}; - auto const stream = cudf::get_default_stream(); - auto const test_sort = [stream](auto const& preprocessed, - auto const& input, - auto const& comparator, - auto const& expected) { - auto const order = sorted_order( - preprocessed, input.num_rows(), cudf::detail::has_nested_columns(input), comparator, stream); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, order->view()); - }; + auto const stream = cudf::get_default_stream(); + auto const test_sort = + [stream]( + auto const& preprocessed, auto const& input, auto const& comparator, auto const& expected) { + auto const order = sorted_order( + preprocessed, input.num_rows(), cudf::has_nested_columns(input), comparator, stream); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, order->view()); + }; auto const test_sort_two_tables = [&](auto const& preprocessed_lhs, auto const& preprocessed_empty_rhs) { @@ -188,15 +187,14 @@ TYPED_TEST(TypedTableViewTest, TestSortSameTableFromTwoTablesWithListsOfStructs) auto const lhs = cudf::table_view{{*col1}}; auto const empty_rhs = cudf::table_view{{*col2}}; - auto const stream = cudf::get_default_stream(); - auto const test_sort = [stream](auto const& preprocessed, - auto const& input, - auto const& comparator, - auto const& expected) { - auto const order = sorted_order( - preprocessed, input.num_rows(), cudf::detail::has_nested_columns(input), comparator, stream); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, order->view()); - }; + auto const stream = cudf::get_default_stream(); + auto const test_sort = + [stream]( + auto const& preprocessed, auto const& input, auto const& comparator, auto const& expected) { + auto const order = sorted_order( + preprocessed, input.num_rows(), cudf::has_nested_columns(input), comparator, stream); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, order->view()); + }; auto const test_sort_two_tables = 
[&](auto const& preprocessed_lhs, auto const& preprocessed_empty_rhs) { diff --git a/cpp/tests/table/row_operator_tests_utilities.cu b/cpp/tests/table/row_operator_tests_utilities.cu index cfffa1cdd54..6127864987d 100644 --- a/cpp/tests/table/row_operator_tests_utilities.cu +++ b/cpp/tests/table/row_operator_tests_utilities.cu @@ -42,7 +42,7 @@ std::unique_ptr two_table_comparison(cudf::table_view lhs, auto output = cudf::make_numeric_column( cudf::data_type(cudf::type_id::BOOL8), lhs.num_rows(), cudf::mask_state::UNALLOCATED); - if (cudf::detail::has_nested_columns(lhs) || cudf::detail::has_nested_columns(rhs)) { + if (cudf::has_nested_columns(lhs) || cudf::has_nested_columns(rhs)) { thrust::transform(rmm::exec_policy(stream), lhs_it, lhs_it + lhs.num_rows(), @@ -129,7 +129,7 @@ std::unique_ptr two_table_equality(cudf::table_view lhs, auto output = cudf::make_numeric_column( cudf::data_type(cudf::type_id::BOOL8), lhs.num_rows(), cudf::mask_state::UNALLOCATED); - if (cudf::detail::has_nested_columns(lhs) or cudf::detail::has_nested_columns(rhs)) { + if (cudf::has_nested_columns(lhs) or cudf::has_nested_columns(rhs)) { auto const equal_comparator = table_comparator.equal_to(cudf::nullate::NO{}, cudf::null_equality::EQUAL, comparator); diff --git a/cpp/tests/table/row_operator_tests_utilities2.cu b/cpp/tests/table/row_operator_tests_utilities2.cu index 057d9ee1004..17d274eba13 100644 --- a/cpp/tests/table/row_operator_tests_utilities2.cu +++ b/cpp/tests/table/row_operator_tests_utilities2.cu @@ -41,7 +41,7 @@ std::unique_ptr self_comparison(cudf::table_view input, auto output = cudf::make_numeric_column( cudf::data_type(cudf::type_id::BOOL8), input.num_rows(), cudf::mask_state::UNALLOCATED); - if (cudf::detail::has_nested_columns(input)) { + if (cudf::has_nested_columns(input)) { thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(input.num_rows()), From 1a4c2aa38c6e7de8c6937b787a1263a4ccddadea Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 2 Jul 2024 07:38:18 -0700 Subject: [PATCH 26/27] Start migrating I/O writers to pylibcudf (starting with JSON) (#15952) Switches the JSON writer to use pylibcudf. 
xref #15162 Authors: - Thomas Li (https://github.com/lithomas1) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15952 --- .../api_docs/pylibcudf/io/index.rst | 1 + .../user_guide/api_docs/pylibcudf/io/json.rst | 6 + python/cudf/cudf/_lib/json.pyx | 98 +++----- .../cudf/_lib/pylibcudf/io/CMakeLists.txt | 6 +- .../cudf/cudf/_lib/pylibcudf/io/__init__.pxd | 2 +- .../cudf/cudf/_lib/pylibcudf/io/__init__.py | 4 +- python/cudf/cudf/_lib/pylibcudf/io/avro.pyx | 4 +- python/cudf/cudf/_lib/pylibcudf/io/json.pxd | 18 ++ python/cudf/cudf/_lib/pylibcudf/io/json.pyx | 68 ++++++ python/cudf/cudf/_lib/pylibcudf/io/types.pxd | 11 + python/cudf/cudf/_lib/pylibcudf/io/types.pyx | 125 +++++++++- .../cudf/cudf/pylibcudf_tests/common/utils.py | 122 ++++++++-- python/cudf/cudf/pylibcudf_tests/conftest.py | 104 ++++++-- .../pylibcudf_tests/{ => io}/test_avro.py | 0 .../cudf/cudf/pylibcudf_tests/io/test_json.py | 116 +++++++++ .../test_source_sink_info.py} | 34 ++- .../cudf/cudf/pylibcudf_tests/test_copying.py | 226 +++++++++++++----- 17 files changed, 768 insertions(+), 177 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/io/json.rst create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/json.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/json.pyx rename python/cudf/cudf/pylibcudf_tests/{ => io}/test_avro.py (100%) create mode 100644 python/cudf/cudf/pylibcudf_tests/io/test_json.py rename python/cudf/cudf/pylibcudf_tests/{test_source_info.py => io/test_source_sink_info.py} (72%) diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst index 0d53ac92db9..bde6d8094ce 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst @@ -16,3 +16,4 @@ I/O Functions :maxdepth: 1 avro + json diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/json.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/json.rst new file mode 100644 index 00000000000..6aeae1f322a --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/json.rst @@ -0,0 +1,6 @@ +==== +JSON +==== + +.. 
automodule:: cudf._lib.pylibcudf.io.json + :members: diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index a8fef907bad..22e34feb547 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -9,38 +9,27 @@ from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool from libcpp.map cimport map -from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types -from cudf._lib.column cimport Column -from cudf._lib.io.utils cimport ( - make_sink_info, - make_source_info, - update_struct_field_names, -) -from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink +from cudf._lib.io.utils cimport make_source_info, update_struct_field_names from cudf._lib.pylibcudf.libcudf.io.json cimport ( json_reader_options, json_recovery_mode_t, - json_writer_options, read_json as libcudf_read_json, schema_element, - write_json as libcudf_write_json, ) from cudf._lib.pylibcudf.libcudf.io.types cimport ( - column_name_info, compression_type, - sink_info, - table_metadata, table_with_metadata, ) -from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type from cudf._lib.types cimport dtype_to_data_type -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport data_from_unique_ptr + +import cudf._lib.pylibcudf as plc cdef json_recovery_mode_t _get_json_recovery_mode(object on_bad_lines): @@ -175,45 +164,27 @@ def write_json( -------- cudf.to_json """ - cdef table_view input_table_view = table_view_from_table( - table, ignore_index=True - ) - - cdef unique_ptr[data_sink] data_sink_c - cdef sink_info sink_info_c = make_sink_info(path_or_buf, data_sink_c) - cdef string na_c = na_rep.encode() - cdef bool include_nulls_c = include_nulls - cdef bool lines_c = lines - cdef int rows_per_chunk_c = rows_per_chunk - cdef string true_value_c = 'true'.encode() - cdef string false_value_c = 'false'.encode() - cdef table_metadata tbl_meta - - num_index_cols_meta = 0 - cdef column_name_info child_info - for i, name in enumerate(table._column_names, num_index_cols_meta): - child_info.name = name.encode() - tbl_meta.schema_info.push_back(child_info) - _set_col_children_metadata( - table[name]._column, - tbl_meta.schema_info[i] - ) + cdef list colnames = [] - cdef json_writer_options options = move( - json_writer_options.builder(sink_info_c, input_table_view) - .metadata(tbl_meta) - .na_rep(na_c) - .include_nulls(include_nulls_c) - .lines(lines_c) - .rows_per_chunk(rows_per_chunk_c) - .true_value(true_value_c) - .false_value(false_value_c) - .build() - ) + for name in table._column_names: + colnames.append((name, _dtype_to_names_list(table[name]._column))) try: - with nogil: - libcudf_write_json(options) + plc.io.json.write_json( + plc.io.SinkInfo([path_or_buf]), + plc.io.TableWithMetadata( + plc.Table([ + c.to_pylibcudf(mode="read") for c in table._columns + ]), + colnames + ), + na_rep, + include_nulls, + lines, + rows_per_chunk, + true_value="true", + false_value="false" + ) except OverflowError: raise OverflowError( f"Writing JSON file with rows_per_chunk={rows_per_chunk} failed. 
" @@ -254,23 +225,12 @@ cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *: ) return dtype_to_data_type(dtype) -cdef _set_col_children_metadata(Column col, - column_name_info& col_meta): - cdef column_name_info child_info + +def _dtype_to_names_list(col): if isinstance(col.dtype, cudf.StructDtype): - for i, (child_col, name) in enumerate( - zip(col.children, list(col.dtype.fields)) - ): - child_info.name = name.encode() - col_meta.children.push_back(child_info) - _set_col_children_metadata( - child_col, col_meta.children[i] - ) + return [(name, _dtype_to_names_list(child)) + for name, child in zip(col.dtype.fields, col.children)] elif isinstance(col.dtype, cudf.ListDtype): - for i, child_col in enumerate(col.children): - col_meta.children.push_back(child_info) - _set_col_children_metadata( - child_col, col_meta.children[i] - ) - else: - return + return [("", _dtype_to_names_list(child)) + for child in col.children] + return [] diff --git a/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt index 32f0f5543e4..084b341ec48 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources avro.pyx datasource.pyx types.pyx) +set(cython_sources avro.pyx datasource.pyx json.pyx types.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( @@ -21,5 +21,7 @@ rapids_cython_create_modules( LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_io_ ASSOCIATED_TARGETS cudf ) -set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_datasource pylibcudf_io_types) +set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_datasource pylibcudf_io_json + pylibcudf_io_types +) link_to_pyarrow_headers("${targets_using_arrow_headers}") diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd index cfd6d2cd281..ef4c65b277e 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd @@ -1,4 +1,4 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport avro, datasource, types +from . cimport avro, datasource, json, types from .types cimport SourceInfo, TableWithMetadata diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py index a54ba1834dc..fb4e4c7e4bb 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py @@ -1,4 +1,4 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import avro, datasource, types -from .types import SourceInfo, TableWithMetadata +from . import avro, datasource, json, types +from .types import SinkInfo, SourceInfo, TableWithMetadata diff --git a/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx index 946e0896fc8..538bd8aa322 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx @@ -19,7 +19,7 @@ cpdef TableWithMetadata read_avro( size_type num_rows = -1 ): """ - Reads an Avro dataset into a set of columns. + Reads an Avro dataset into a :py:class:`~.types.TableWithMetadata`. Parameters ---------- @@ -36,7 +36,7 @@ cpdef TableWithMetadata read_avro( Returns ------- TableWithMetadata - The Table and its corresponding metadata that was read in. 
+ The Table and its corresponding metadata (column names) that were read in. """ cdef vector[string] c_columns if columns is not None and len(columns) > 0: diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pxd b/python/cudf/cudf/_lib/pylibcudf/io/json.pxd new file mode 100644 index 00000000000..a91d574131f --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pxd @@ -0,0 +1,18 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool + +from cudf._lib.pylibcudf.io.types cimport SinkInfo, TableWithMetadata +from cudf._lib.pylibcudf.libcudf.types cimport size_type + + +cpdef void write_json( + SinkInfo sink_info, + TableWithMetadata tbl, + str na_rep = *, + bool include_nulls = *, + bool lines = *, + size_type rows_per_chunk = *, + str true_value = *, + str false_value = * +) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx new file mode 100644 index 00000000000..7530eba3803 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx @@ -0,0 +1,68 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.limits cimport numeric_limits +from libcpp.string cimport string + +from cudf._lib.pylibcudf.io.types cimport SinkInfo, TableWithMetadata +from cudf._lib.pylibcudf.libcudf.io.json cimport ( + json_writer_options, + write_json as cpp_write_json, +) +from cudf._lib.pylibcudf.libcudf.io.types cimport table_metadata +from cudf._lib.pylibcudf.types cimport size_type + + +cpdef void write_json( + SinkInfo sink_info, + TableWithMetadata table_w_meta, + str na_rep = "", + bool include_nulls = False, + bool lines = False, + size_type rows_per_chunk = numeric_limits[size_type].max(), + str true_value = "true", + str false_value = "false" +): + """ + Writes a :py:class:`~cudf._lib.pylibcudf.table.Table` to JSON format. + + Parameters + ---------- + sink_info: SinkInfo + The SinkInfo object to write the JSON to. + table_w_meta: TableWithMetadata + The TableWithMetadata object containing the Table to write + na_rep: str, default "" + The string representation for null values. + include_nulls: bool, default False + Enables/Disables output of nulls as 'null'. + lines: bool, default False + If `True`, write output in the JSON lines format. + rows_per_chunk: size_type, defaults to length of the input table + The maximum number of rows to write at a time. + true_value: str, default "true" + The string representation for values != 0 in INT8 types. + false_value: str, default "false" + The string representation for values == 0 in INT8 types. + """ + cdef table_metadata tbl_meta = table_w_meta.metadata + cdef string na_rep_c = na_rep.encode() + + cdef json_writer_options options = ( + json_writer_options.builder(sink_info.c_obj, table_w_meta.tbl.view()) + .metadata(tbl_meta) + .na_rep(na_rep_c) + .include_nulls(include_nulls) + .lines(lines) + .build() + ) + + if rows_per_chunk != numeric_limits[size_type].max(): + options.set_rows_per_chunk(rows_per_chunk) + if true_value != "true": + options.set_true_value(true_value.encode()) + if false_value != "false": + options.set_false_value(false_value.encode()) + + with nogil: + cpp_write_json(options) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd index aa846a47343..88daf54f33b 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd @@ -1,4 +1,8 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
+from libcpp.memory cimport unique_ptr +from libcpp.vector cimport vector + +from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink from cudf._lib.pylibcudf.libcudf.io.types cimport ( column_encoding, column_in_metadata, @@ -22,8 +26,15 @@ cdef class TableWithMetadata: cdef public Table tbl cdef table_metadata metadata + cdef vector[column_name_info] _make_column_info(self, list column_names) + @staticmethod cdef TableWithMetadata from_libcudf(table_with_metadata& tbl) cdef class SourceInfo: cdef source_info c_obj + +cdef class SinkInfo: + # This vector just exists to keep the unique_ptrs to the sinks alive + cdef vector[unique_ptr[data_sink]] sink_storage + cdef sink_info c_obj diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx index ab3375da662..f94e20970a4 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx @@ -1,17 +1,23 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from cpython.buffer cimport PyBUF_READ +from cpython.memoryview cimport PyMemoryView_FromMemory +from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector from cudf._lib.pylibcudf.io.datasource cimport Datasource +from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource from cudf._lib.pylibcudf.libcudf.io.types cimport ( + column_name_info, host_buffer, source_info, table_with_metadata, ) +import codecs import errno import io import os @@ -22,7 +28,39 @@ cdef class TableWithMetadata: (e.g. column names) For details, see :cpp:class:`cudf::io::table_with_metadata`. + + Parameters + ---------- + tbl : Table + The input table. + column_names : list + A list of tuples each containing the name of each column + and the names of its child columns (in the same format). + e.g. + [("id", []), ("name", [("first", []), ("last", [])])] + """ + def __init__(self, Table tbl, list column_names): + self.tbl = tbl + + self.metadata.schema_info = self._make_column_info(column_names) + + cdef vector[column_name_info] _make_column_info(self, list column_names): + cdef vector[column_name_info] col_name_infos + cdef column_name_info info + + col_name_infos.reserve(len(column_names)) + + for name, child_names in column_names: + if not isinstance(name, str): + raise ValueError("Column name must be a string!") + + info.name = name.encode() + info.children = self._make_column_info(child_names) + + col_name_infos.push_back(info) + + return col_name_infos @property def columns(self): @@ -51,6 +89,7 @@ cdef class TableWithMetadata: out.metadata = tbl_with_meta.metadata return out + cdef class SourceInfo: """A class containing details on a source to read from. @@ -119,7 +158,87 @@ cdef class SourceInfo: raise ValueError("Sources must be a list of str/paths, " "bytes, io.BytesIO, or a Datasource") - if empty_buffer is True: - c_host_buffers.push_back(host_buffer(NULL, 0)) + self.c_obj = source_info(c_host_buffers) + + +# Adapts a python io.IOBase object as a libcudf IO data_sink. 
This lets you +# write from cudf to any python file-like object (File/BytesIO/SocketIO etc) +cdef cppclass iobase_data_sink(data_sink): + object buf + + iobase_data_sink(object buf_): + this.buf = buf_ + + void host_write(const void * data, size_t size) with gil: + if isinstance(buf, io.TextIOBase): + buf.write(PyMemoryView_FromMemory(data, size, PyBUF_READ) + .tobytes().decode()) + else: + buf.write(PyMemoryView_FromMemory(data, size, PyBUF_READ)) + + void flush() with gil: + buf.flush() + + size_t bytes_written() with gil: + return buf.tell() + + +cdef class SinkInfo: + """A class containing details on a source to read from. + + For details, see :cpp:class:`cudf::io::sink_info`. + + Parameters + ---------- + sinks : list of str, PathLike, BytesIO, StringIO + + A homogeneous list of sinks (this can be a string filename, + bytes, or one of the Python I/O classes) to read from. + + Mixing different types of sinks will raise a `ValueError`. + """ + + def __init__(self, list sinks): + cdef vector[data_sink *] data_sinks + cdef vector[string] paths + + if not sinks: + raise ValueError("Need to pass at least one sink") + + if isinstance(sinks[0], os.PathLike): + sinks = [os.path.expanduser(s) for s in sinks] + + cdef object initial_sink_cls = type(sinks[0]) + + if not all(isinstance(s, initial_sink_cls) for s in sinks): + raise ValueError("All sinks must be of the same type!") + + if initial_sink_cls in {io.StringIO, io.BytesIO, io.TextIOBase}: + data_sinks.reserve(len(sinks)) + if isinstance(sinks[0], (io.StringIO, io.BytesIO)): + for s in sinks: + self.sink_storage.push_back( + unique_ptr[data_sink](new iobase_data_sink(s)) + ) + elif isinstance(sinks[0], io.TextIOBase): + for s in sinks: + if codecs.lookup(s).name not in ('utf-8', 'ascii'): + raise NotImplementedError(f"Unsupported encoding {s.encoding}") + self.sink_storage.push_back( + unique_ptr[data_sink](new iobase_data_sink(s.buffer)) + ) + data_sinks.push_back(self.sink_storage.back().get()) + elif initial_sink_cls is str: + paths.reserve(len(sinks)) + for s in sinks: + paths.push_back( s.encode()) + else: + raise TypeError( + "Unrecognized input type: {}".format(type(sinks[0])) + ) - self.c_obj = move(source_info(c_host_buffers)) + if data_sinks.size() > 0: + self.c_obj = sink_info(data_sinks) + else: + # we don't have sinks so we must have paths to sinks + self.c_obj = sink_info(paths) diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index bf927e661fe..f8bfe340ae5 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -1,24 +1,39 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
from __future__ import annotations +import io +import os + import pyarrow as pa import pytest from cudf._lib import pylibcudf as plc -def metadata_from_arrow_array( - pa_array: pa.Array, +def metadata_from_arrow_type( + pa_type: pa.Array, + name: str = "", ) -> plc.interop.ColumnMetadata | None: - metadata = None - if pa.types.is_list(dtype := pa_array.type) or pa.types.is_struct(dtype): + metadata = plc.interop.ColumnMetadata(name) # None + if pa.types.is_list(pa_type): + child_meta = [plc.interop.ColumnMetadata("offsets")] + for i in range(pa_type.num_fields): + field_meta = metadata_from_arrow_type( + pa_type.field(i).type, pa_type.field(i).name + ) + child_meta.append(field_meta) + metadata = plc.interop.ColumnMetadata(name, child_meta) + elif pa.types.is_struct(pa_type): + child_meta = [] + for i in range(pa_type.num_fields): + field_meta = metadata_from_arrow_type( + pa_type.field(i).type, pa_type.field(i).name + ) + child_meta.append(field_meta) metadata = plc.interop.ColumnMetadata( - "", + name, # libcudf does not store field names, so just match pyarrow's. - [ - plc.interop.ColumnMetadata(pa_array.type.field(i).name) - for i in range(pa_array.type.num_fields) - ], + child_meta, ) return metadata @@ -32,13 +47,13 @@ def assert_column_eq( rhs, plc.Column ): rhs = plc.interop.to_arrow( - rhs, metadata=metadata_from_arrow_array(lhs) + rhs, metadata=metadata_from_arrow_type(lhs.type) ) elif isinstance(lhs, plc.Column) and isinstance( rhs, (pa.Array, pa.ChunkedArray) ): lhs = plc.interop.to_arrow( - lhs, metadata=metadata_from_arrow_array(rhs) + lhs, metadata=metadata_from_arrow_type(rhs.type) ) else: raise ValueError( @@ -94,21 +109,16 @@ def is_signed_integer(plc_dtype: plc.DataType): ) -def is_unsigned_integer(plc_dtype: plc.DataType): - return plc_dtype.id() in ( - plc.TypeId.UINT8, - plc.TypeId.UINT16, - plc.TypeId.UINT32, - plc.TypeId.UINT64, - ) - - def is_integer(plc_dtype: plc.DataType): return plc_dtype.id() in ( plc.TypeId.INT8, plc.TypeId.INT16, plc.TypeId.INT32, plc.TypeId.INT64, + plc.TypeId.UINT8, + plc.TypeId.UINT16, + plc.TypeId.UINT32, + plc.TypeId.UINT64, ) @@ -135,8 +145,80 @@ def is_fixed_width(plc_dtype: plc.DataType): ) +def nesting_level(typ) -> tuple[int, int]: + """Return list and struct nesting of a pyarrow type.""" + if isinstance(typ, pa.ListType): + list_, struct = nesting_level(typ.value_type) + return list_ + 1, struct + elif isinstance(typ, pa.StructType): + lists, structs = map(max, zip(*(nesting_level(t.type) for t in typ))) + return lists, structs + 1 + else: + return 0, 0 + + +def is_nested_struct(typ): + return nesting_level(typ)[1] > 1 + + +def is_nested_list(typ): + return nesting_level(typ)[0] > 1 + + +def sink_to_str(sink): + """ + Takes a sink (e.g. StringIO/BytesIO, filepath, etc.) + and reads in the contents into a string (str not bytes) + for comparison + """ + if isinstance(sink, (str, os.PathLike)): + with open(sink, "r") as f: + str_result = f.read() + elif isinstance(sink, io.BytesIO): + sink.seek(0) + str_result = sink.read().decode() + else: + sink.seek(0) + str_result = sink.read() + return str_result + + +NUMERIC_PA_TYPES = [pa.int64(), pa.float64(), pa.uint64()] +STRING_PA_TYPES = [pa.string()] +BOOL_PA_TYPES = [pa.bool_()] +LIST_PA_TYPES = [ + pa.list_(pa.int64()), + # Nested case + pa.list_(pa.list_(pa.int64())), +] + # We must explicitly specify this type via a field to ensure we don't include # nullability accidentally. 
DEFAULT_STRUCT_TESTING_TYPE = pa.struct( [pa.field("v", pa.int64(), nullable=False)] ) +NESTED_STRUCT_TESTING_TYPE = pa.struct( + [ + pa.field("a", pa.int64(), nullable=False), + pa.field( + "b_struct", + pa.struct([pa.field("b", pa.float64(), nullable=False)]), + nullable=False, + ), + ] +) + +DEFAULT_PA_STRUCT_TESTING_TYPES = [ + DEFAULT_STRUCT_TESTING_TYPE, + NESTED_STRUCT_TESTING_TYPE, +] + +DEFAULT_PA_TYPES = ( + NUMERIC_PA_TYPES + + STRING_PA_TYPES + + BOOL_PA_TYPES + + LIST_PA_TYPES + + DEFAULT_PA_STRUCT_TESTING_TYPES +) + +ALL_PA_TYPES = DEFAULT_PA_TYPES diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py index b169bbdee5b..e4760ea7ac8 100644 --- a/python/cudf/cudf/pylibcudf_tests/conftest.py +++ b/python/cudf/cudf/pylibcudf_tests/conftest.py @@ -1,9 +1,12 @@ # Copyright (c) 2024, NVIDIA CORPORATION. # Tell ruff it's OK that some imports occur after the sys.path.insert # ruff: noqa: E402 +import io import os +import pathlib import sys +import numpy as np import pyarrow as pa import pytest @@ -11,7 +14,7 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common")) -from utils import DEFAULT_STRUCT_TESTING_TYPE +from utils import ALL_PA_TYPES, DEFAULT_PA_TYPES, NUMERIC_PA_TYPES # This fixture defines the standard set of types that all tests should default to @@ -20,14 +23,7 @@ # across modules. Otherwise it may be defined on a per-module basis. @pytest.fixture( scope="session", - params=[ - pa.int64(), - pa.float64(), - pa.string(), - pa.bool_(), - pa.list_(pa.int64()), - DEFAULT_STRUCT_TESTING_TYPE, - ], + params=DEFAULT_PA_TYPES, ) def pa_type(request): return request.param @@ -35,16 +31,96 @@ def pa_type(request): @pytest.fixture( scope="session", - params=[ - pa.int64(), - pa.float64(), - pa.uint64(), - ], + params=NUMERIC_PA_TYPES, ) def numeric_pa_type(request): return request.param +# TODO: Consider adding another fixture/adapting this +# fixture to consider nullability +@pytest.fixture(scope="session", params=[0, 100]) +def table_data(request): + """ + Returns (TableWithMetadata, pa_table). + + This is the default fixture you should be using for testing + pylibcudf I/O writers. + + Contains one of each category (e.g. int, bool, list, struct) + of dtypes. 
+ """ + nrows = request.param + + table_dict = {} + # Colnames in the format expected by + # plc.io.TableWithMetadata + colnames = [] + + np.random.seed(42) + + for typ in ALL_PA_TYPES: + rand_vals = np.random.randint(0, nrows, nrows) + child_colnames = [] + + def _generate_nested_data(typ): + child_colnames = [] + + # recurse to get vals for children + rand_arrs = [] + for i in range(typ.num_fields): + rand_arr, grandchild_colnames = _generate_nested_data( + typ.field(i).type + ) + rand_arrs.append(rand_arr) + child_colnames.append((typ.field(i).name, grandchild_colnames)) + + if isinstance(typ, pa.StructType): + pa_array = pa.StructArray.from_arrays( + [rand_arr for rand_arr in rand_arrs], + names=[typ.field(i).name for i in range(typ.num_fields)], + ) + elif isinstance(typ, pa.ListType): + pa_array = pa.array( + [list(row_vals) for row_vals in zip(rand_arrs[0])], + type=typ, + ) + child_colnames.append(("", grandchild_colnames)) + else: + # typ is scalar type + pa_array = pa.array(rand_vals).cast(typ) + return pa_array, child_colnames + + if isinstance(typ, (pa.ListType, pa.StructType)): + rand_arr, child_colnames = _generate_nested_data(typ) + else: + rand_arr = pa.array(rand_vals).cast(typ) + + table_dict[f"col_{typ}"] = rand_arr + colnames.append((f"col_{typ}", child_colnames)) + + pa_table = pa.Table.from_pydict(table_dict) + + return plc.io.TableWithMetadata( + plc.interop.from_arrow(pa_table), column_names=colnames + ), pa_table + + +@pytest.fixture( + params=["a.txt", pathlib.Path("a.txt"), io.BytesIO, io.StringIO], +) +def source_or_sink(request, tmp_path): + fp_or_buf = request.param + if isinstance(fp_or_buf, str): + return f"{tmp_path}/{fp_or_buf}" + elif isinstance(fp_or_buf, os.PathLike): + return tmp_path.joinpath(fp_or_buf) + elif issubclass(fp_or_buf, io.IOBase): + # Must construct io.StringIO/io.BytesIO inside + # fixture, or we'll end up re-using it + return fp_or_buf() + + @pytest.fixture( scope="session", params=[opt for opt in plc.types.Interpolation] ) diff --git a/python/cudf/cudf/pylibcudf_tests/test_avro.py b/python/cudf/cudf/pylibcudf_tests/io/test_avro.py similarity index 100% rename from python/cudf/cudf/pylibcudf_tests/test_avro.py rename to python/cudf/cudf/pylibcudf_tests/io/test_avro.py diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_json.py b/python/cudf/cudf/pylibcudf_tests/io/test_json.py new file mode 100644 index 00000000000..d6b8bfa6976 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/io/test_json.py @@ -0,0 +1,116 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+import io + +import pyarrow as pa +import pytest +from utils import sink_to_str + +import cudf._lib.pylibcudf as plc + + +@pytest.mark.parametrize("rows_per_chunk", [8, 100]) +@pytest.mark.parametrize("lines", [True, False]) +def test_write_json_basic(table_data, source_or_sink, lines, rows_per_chunk): + plc_table_w_meta, pa_table = table_data + sink = source_or_sink + + plc.io.json.write_json( + plc.io.SinkInfo([sink]), + plc_table_w_meta, + lines=lines, + rows_per_chunk=rows_per_chunk, + ) + + exp = pa_table.to_pandas() + + # Convert everything to string to make + # comparisons easier + str_result = sink_to_str(sink) + + pd_result = exp.to_json(orient="records", lines=lines) + + assert str_result == pd_result + + +@pytest.mark.parametrize("include_nulls", [True, False]) +@pytest.mark.parametrize("na_rep", ["null", "awef", ""]) +def test_write_json_nulls(na_rep, include_nulls): + names = ["a", "b"] + pa_tbl = pa.Table.from_arrays( + [pa.array([1.0, 2.0, None]), pa.array([True, None, False])], + names=names, + ) + plc_tbl = plc.interop.from_arrow(pa_tbl) + plc_tbl_w_meta = plc.io.types.TableWithMetadata( + plc_tbl, column_names=[(name, []) for name in names] + ) + + sink = io.StringIO() + + plc.io.json.write_json( + plc.io.SinkInfo([sink]), + plc_tbl_w_meta, + na_rep=na_rep, + include_nulls=include_nulls, + ) + + exp = pa_tbl.to_pandas() + + # Convert everything to string to make + # comparisons easier + str_result = sink_to_str(sink) + pd_result = exp.to_json(orient="records") + + if not include_nulls: + # No equivalent in pandas, so we just + # sanity check by making sure na_rep + # doesn't appear in the output + + # don't quote null + for name in names: + assert f'{{"{name}":{na_rep}}}' not in str_result + return + + # pandas doesn't suppport na_rep + # let's just manually do str.replace + pd_result = pd_result.replace("null", na_rep) + + assert str_result == pd_result + + +@pytest.mark.parametrize("true_value", ["True", "correct"]) +@pytest.mark.parametrize("false_value", ["False", "wrong"]) +def test_write_json_bool_opts(true_value, false_value): + names = ["a"] + pa_tbl = pa.Table.from_arrays([pa.array([True, None, False])], names=names) + plc_tbl = plc.interop.from_arrow(pa_tbl) + plc_tbl_w_meta = plc.io.types.TableWithMetadata( + plc_tbl, column_names=[(name, []) for name in names] + ) + + sink = io.StringIO() + + plc.io.json.write_json( + plc.io.SinkInfo([sink]), + plc_tbl_w_meta, + include_nulls=True, + na_rep="null", + true_value=true_value, + false_value=false_value, + ) + + exp = pa_tbl.to_pandas() + + # Convert everything to string to make + # comparisons easier + str_result = sink_to_str(sink) + pd_result = exp.to_json(orient="records") + + # pandas doesn't suppport na_rep + # let's just manually do str.replace + if true_value != "true": + pd_result = pd_result.replace("true", true_value) + if false_value != "false": + pd_result = pd_result.replace("false", false_value) + + assert str_result == pd_result diff --git a/python/cudf/cudf/pylibcudf_tests/test_source_info.py b/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py similarity index 72% rename from python/cudf/cudf/pylibcudf_tests/test_source_info.py rename to python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py index 019321b7259..287dd8f21c8 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_source_info.py +++ b/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py @@ -9,6 +9,21 @@ from cudf._lib.pylibcudf.io.datasource import NativeFileDatasource +@pytest.fixture(params=[plc.io.SourceInfo, 
plc.io.SinkInfo]) +def io_class(request): + return request.param + + +def _skip_invalid_sinks(io_class, sink): + """ + Skip invalid sinks for SinkInfo + """ + if io_class is plc.io.SinkInfo and isinstance( + sink, (bytes, NativeFileDatasource) + ): + pytest.skip(f"{sink} is not a valid input for SinkInfo") + + @pytest.mark.parametrize( "source", [ @@ -18,16 +33,15 @@ NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")), ], ) -def test_source_info_ctor(source, tmp_path): +def test_source_info_ctor(io_class, source, tmp_path): if isinstance(source, str): file = tmp_path / source file.write_bytes("hello world".encode("utf-8")) source = str(file) - plc.io.SourceInfo([source]) + _skip_invalid_sinks(io_class, source) - # TODO: test contents of source_info buffer is correct - # once buffers are exposed on python side + io_class([source]) @pytest.mark.parametrize( @@ -42,7 +56,7 @@ def test_source_info_ctor(source, tmp_path): ], ], ) -def test_source_info_ctor_multiple(sources, tmp_path): +def test_source_info_ctor_multiple(io_class, sources, tmp_path): for i in range(len(sources)): source = sources[i] if isinstance(source, str): @@ -50,10 +64,9 @@ def test_source_info_ctor_multiple(sources, tmp_path): file.write_bytes("hello world".encode("utf-8")) sources[i] = str(file) - plc.io.SourceInfo(sources) + _skip_invalid_sinks(io_class, source) - # TODO: test contents of source_info buffer is correct - # once buffers are exposed on python side + io_class(sources) @pytest.mark.parametrize( @@ -73,7 +86,7 @@ def test_source_info_ctor_multiple(sources, tmp_path): ], ], ) -def test_source_info_ctor_mixing_invalid(sources, tmp_path): +def test_source_info_ctor_mixing_invalid(io_class, sources, tmp_path): # Unlike the previous test # don't create files so that they are missing for i in range(len(sources)): @@ -82,8 +95,9 @@ def test_source_info_ctor_mixing_invalid(sources, tmp_path): file = tmp_path / source file.write_bytes("hello world".encode("utf-8")) sources[i] = str(file) + _skip_invalid_sinks(io_class, source) with pytest.raises(ValueError): - plc.io.SourceInfo(sources) + io_class(sources) def test_source_info_invalid(): diff --git a/python/cudf/cudf/pylibcudf_tests/test_copying.py b/python/cudf/cudf/pylibcudf_tests/test_copying.py index da3ca3a6d1e..0a6df198d46 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_copying.py +++ b/python/cudf/cudf/pylibcudf_tests/test_copying.py @@ -5,19 +5,24 @@ import pytest from utils import ( DEFAULT_STRUCT_TESTING_TYPE, + NESTED_STRUCT_TESTING_TYPE, assert_column_eq, assert_table_eq, cudf_raises, is_fixed_width, is_floating, is_integer, + is_nested_list, + is_nested_struct, is_string, - metadata_from_arrow_array, + metadata_from_arrow_type, ) from cudf._lib import pylibcudf as plc +# TODO: consider moving this to conftest and "pairing" +# it with pa_type, so that they don't get out of sync # TODO: Test nullable data @pytest.fixture(scope="module") def input_column(pa_type): @@ -28,10 +33,27 @@ def input_column(pa_type): elif pa.types.is_boolean(pa_type): pa_array = pa.array([True, True, False], type=pa_type) elif pa.types.is_list(pa_type): - # TODO: Add heterogenous sizes - pa_array = pa.array([[1], [2], [3]], type=pa_type) + if pa_type.value_type == pa.int64(): + pa_array = pa.array([[1], [2, 3], [3]], type=pa_type) + elif ( + isinstance(pa_type.value_type, pa.ListType) + and pa_type.value_type.value_type == pa.int64() + ): + pa_array = pa.array([[[1]], [[2, 3]], [[3]]], type=pa_type) + else: + raise ValueError("Unsupported type " + pa_type.value_type) elif 
pa.types.is_struct(pa_type): - pa_array = pa.array([{"v": 1}, {"v": 2}, {"v": 3}], type=pa_type) + if not is_nested_struct(pa_type): + pa_array = pa.array([{"v": 1}, {"v": 2}, {"v": 3}], type=pa_type) + else: + pa_array = pa.array( + [ + {"a": 1, "b_struct": {"b": 1.0}}, + {"a": 2, "b_struct": {"b": 2.0}}, + {"a": 3, "b_struct": {"b": 3.0}}, + ], + type=pa_type, + ) else: raise ValueError("Unsupported type") return pa_array, plc.interop.from_arrow(pa_array) @@ -55,13 +77,37 @@ def target_column(pa_type): [False, True, True, False, True, False], type=pa_type ) elif pa.types.is_list(pa_type): - # TODO: Add heterogenous sizes - pa_array = pa.array([[4], [5], [6], [7], [8], [9]], type=pa_type) + if pa_type.value_type == pa.int64(): + pa_array = pa.array( + [[4], [5, 6], [7], [8], [9], [10]], type=pa_type + ) + elif ( + isinstance(pa_type.value_type, pa.ListType) + and pa_type.value_type.value_type == pa.int64() + ): + pa_array = pa.array( + [[[4]], [[5, 6]], [[7]], [[8]], [[9]], [[10]]], type=pa_type + ) + else: + raise ValueError("Unsupported type") elif pa.types.is_struct(pa_type): - pa_array = pa.array( - [{"v": 4}, {"v": 5}, {"v": 6}, {"v": 7}, {"v": 8}, {"v": 9}], - type=pa_type, - ) + if not is_nested_struct(pa_type): + pa_array = pa.array( + [{"v": 4}, {"v": 5}, {"v": 6}, {"v": 7}, {"v": 8}, {"v": 9}], + type=pa_type, + ) + else: + pa_array = pa.array( + [ + {"a": 4, "b_struct": {"b": 4.0}}, + {"a": 5, "b_struct": {"b": 5.0}}, + {"a": 6, "b_struct": {"b": 6.0}}, + {"a": 7, "b_struct": {"b": 7.0}}, + {"a": 8, "b_struct": {"b": 8.0}}, + {"a": 9, "b_struct": {"b": 9.0}}, + ], + type=pa_type, + ) else: raise ValueError("Unsupported type") return pa_array, plc.interop.from_arrow(pa_array) @@ -96,10 +142,22 @@ def source_scalar(pa_type): elif pa.types.is_boolean(pa_type): pa_scalar = pa.scalar(False, type=pa_type) elif pa.types.is_list(pa_type): - # TODO: Longer list? 
- pa_scalar = pa.scalar([1], type=pa_type) + if pa_type.value_type == pa.int64(): + pa_scalar = pa.scalar([1, 2, 3, 4], type=pa_type) + elif ( + isinstance(pa_type.value_type, pa.ListType) + and pa_type.value_type.value_type == pa.int64() + ): + pa_scalar = pa.scalar([[1, 2, 3, 4]], type=pa_type) + else: + raise ValueError("Unsupported type") elif pa.types.is_struct(pa_type): - pa_scalar = pa.scalar({"v": 1}, type=pa_type) + if not is_nested_struct(pa_type): + pa_scalar = pa.scalar({"v": 1}, type=pa_type) + else: + pa_scalar = pa.scalar( + {"a": 1, "b_struct": {"b": 1.0}}, type=pa_type + ) else: raise ValueError("Unsupported type") return pa_scalar, plc.interop.from_arrow(pa_scalar) @@ -196,27 +254,54 @@ def test_scatter_table( ) if pa.types.is_list(dtype := pa_target_table[0].type): - expected = pa.table( - [pa.array([[4], [1], [2], [3], [8], [9]])] * 3, [""] * 3 - ) + if is_nested_list(dtype): + expected = pa.table( + [pa.array([[[4]], [[1]], [[2, 3]], [[3]], [[9]], [[10]]])] + * 3, + [""] * 3, + ) + else: + expected = pa.table( + [pa.array([[4], [1], [2, 3], [3], [9], [10]])] * 3, + [""] * 3, + ) elif pa.types.is_struct(dtype): - expected = pa.table( - [ - pa.array( - [ - {"v": 4}, - {"v": 1}, - {"v": 2}, - {"v": 3}, - {"v": 8}, - {"v": 9}, - ], - type=DEFAULT_STRUCT_TESTING_TYPE, - ) - ] - * 3, - [""] * 3, - ) + if is_nested_struct(dtype): + expected = pa.table( + [ + pa.array( + [ + {"a": 4, "b_struct": {"b": 4.0}}, + {"a": 1, "b_struct": {"b": 1.0}}, + {"a": 2, "b_struct": {"b": 2.0}}, + {"a": 3, "b_struct": {"b": 3.0}}, + {"a": 8, "b_struct": {"b": 8.0}}, + {"a": 9, "b_struct": {"b": 9.0}}, + ], + type=NESTED_STRUCT_TESTING_TYPE, + ) + ] + * 3, + [""] * 3, + ) + else: + expected = pa.table( + [ + pa.array( + [ + {"v": 4}, + {"v": 1}, + {"v": 2}, + {"v": 3}, + {"v": 8}, + {"v": 9}, + ], + type=DEFAULT_STRUCT_TESTING_TYPE, + ) + ] + * 3, + [""] * 3, + ) else: expected = _pyarrow_boolean_mask_scatter_table( pa_source_table, @@ -627,6 +712,7 @@ def test_split_column_out_of_bounds(target_column): def test_split_table(target_table): pa_target_table, plc_target_table = target_table + upper_bounds = [1, 3, 5] lower_bounds = [0] + upper_bounds[:-1] result = plc.copying.split(plc_target_table, upper_bounds) @@ -718,6 +804,7 @@ def test_copy_if_else_column_scalar( pa_target_column, plc_target_column = target_column pa_source_scalar, plc_source_scalar = source_scalar pa_mask, plc_mask = mask + args = ( (plc_target_column, plc_source_scalar) if array_left @@ -766,27 +853,58 @@ def test_boolean_mask_scatter_from_table( ) if pa.types.is_list(dtype := pa_target_table[0].type): - expected = pa.table( - [pa.array([[1], [5], [2], [7], [3], [9]])] * 3, [""] * 3 - ) + if is_nested_list(dtype): + expected = pa.table( + [ + pa.array( + [[[1]], [[5, 6]], [[2, 3]], [[8]], [[3]], [[10]]] + ) + ] + * 3, + [""] * 3, + ) + else: + expected = pa.table( + [pa.array([[1], [5, 6], [2, 3], [8], [3], [10]])] * 3, + [""] * 3, + ) elif pa.types.is_struct(dtype): - expected = pa.table( - [ - pa.array( - [ - {"v": 1}, - {"v": 5}, - {"v": 2}, - {"v": 7}, - {"v": 3}, - {"v": 9}, - ], - type=DEFAULT_STRUCT_TESTING_TYPE, - ) - ] - * 3, - [""] * 3, - ) + if is_nested_struct(dtype): + expected = pa.table( + [ + pa.array( + [ + {"a": 1, "b_struct": {"b": 1.0}}, + {"a": 5, "b_struct": {"b": 5.0}}, + {"a": 2, "b_struct": {"b": 2.0}}, + {"a": 7, "b_struct": {"b": 7.0}}, + {"a": 3, "b_struct": {"b": 3.0}}, + {"a": 9, "b_struct": {"b": 9.0}}, + ], + type=NESTED_STRUCT_TESTING_TYPE, + ) + ] + * 3, + [""] * 3, + ) + else: + expected = 
pa.table( + [ + pa.array( + [ + {"v": 1}, + {"v": 5}, + {"v": 2}, + {"v": 7}, + {"v": 3}, + {"v": 9}, + ], + type=DEFAULT_STRUCT_TESTING_TYPE, + ) + ] + * 3, + [""] * 3, + ) else: expected = _pyarrow_boolean_mask_scatter_table( pa_source_table, pa_mask, pa_target_table @@ -887,7 +1005,7 @@ def test_get_element(input_column): assert ( plc.interop.to_arrow( - result, metadata_from_arrow_array(pa_input_column) + result, metadata_from_arrow_type(pa_input_column.type) ).as_py() == pa_input_column[index].as_py() ) From 64325a1bafeb97e8399e497cc9f4f6ffaee0fd14 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Tue, 2 Jul 2024 11:52:02 -0400 Subject: [PATCH 27/27] Run DFG after verify-alpha-spec (#16151) Because `verify-alpha-spec` potentially modifies `dependencies.yaml`, we want to run DFG after it. This should have been included in #16144 but was forgotten. Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cudf/pull/16151 --- .pre-commit-config.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d0457d2c641..bbcd78d051f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -136,11 +136,6 @@ repos: .*test.*| ^CHANGELOG.md$ ) - - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.13.11 - hooks: - - id: rapids-dependency-file-generator - args: ["--clean"] - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.4.8 hooks: @@ -159,6 +154,11 @@ repos: cpp/src/io/parquet/ipc/Schema_generated[.]h$ ) - id: verify-alpha-spec + - repo: https://github.com/rapidsai/dependency-file-generator + rev: v1.13.11 + hooks: + - id: rapids-dependency-file-generator + args: ["--clean"] default_language_version: python: python3
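
For context, below is a minimal illustrative sketch of the pylibcudf JSON write path that the new `io/test_json.py` exercises. It is not taken from the patch itself; it assumes only the calls that appear in the tests above (`plc.interop.from_arrow`, `plc.io.types.TableWithMetadata`, `plc.io.SinkInfo`, and `plc.io.json.write_json`) and the `(name, child_names)` column-name format used there. Names such as `pa_tbl` and `sink` are incidental.

import io

import pyarrow as pa

import cudf._lib.pylibcudf as plc

# Build a tiny Arrow table and convert it to a pylibcudf table.
pa_tbl = pa.Table.from_arrays(
    [pa.array([1.0, 2.0, None]), pa.array([True, None, False])],
    names=["a", "b"],
)
plc_tbl = plc.interop.from_arrow(pa_tbl)

# Column names use the (name, child_names) tuple format expected by
# TableWithMetadata, as in the tests above.
plc_tbl_w_meta = plc.io.types.TableWithMetadata(
    plc_tbl, column_names=[("a", []), ("b", [])]
)

# Write JSON records to an in-memory sink and read the result back.
sink = io.StringIO()
plc.io.json.write_json(
    plc.io.SinkInfo([sink]),
    plc_tbl_w_meta,
    lines=True,
    na_rep="null",
    include_nulls=True,
)
sink.seek(0)
print(sink.read())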