From 892e7d850b060adbf198d4a5f6bfefa4941f77fb Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Wed, 26 Jun 2024 16:12:00 +0100
Subject: [PATCH 01/29] Expose and then implement support for cross joins in
 cudf-polars (#16097)

libcudf supports cross joins, but until now this wasn't exposed to python. Do that in pylibcudf and implement the evaluation rule in cudf-polars.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - https://github.com/brandon-b-miller

URL: https://github.com/rapidsai/cudf/pull/16097
---
 python/cudf/cudf/_lib/pylibcudf/join.pxd      |  2 ++
 python/cudf/cudf/_lib/pylibcudf/join.pyx      | 30 +++++++++++++++----
 .../cudf/cudf/_lib/pylibcudf/libcudf/join.pxd |  5 ++++
 python/cudf/cudf/pylibcudf_tests/test_join.py | 29 ++++++++++++++++++
 python/cudf_polars/cudf_polars/dsl/ir.py      | 29 +++++++++++++-----
 python/cudf_polars/tests/test_join.py         | 24 ++++++++++++---
 6 files changed, 102 insertions(+), 17 deletions(-)
 create mode 100644 python/cudf/cudf/pylibcudf_tests/test_join.py

diff --git a/python/cudf/cudf/_lib/pylibcudf/join.pxd b/python/cudf/cudf/_lib/pylibcudf/join.pxd
index f560eeef06d..83b4776c16e 100644
--- a/python/cudf/cudf/_lib/pylibcudf/join.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/join.pxd
@@ -35,3 +35,5 @@ cpdef Column left_anti_join(
     Table right_keys,
     null_equality nulls_equal
 )
+
+cpdef Table cross_join(Table left, Table right)
diff --git a/python/cudf/cudf/_lib/pylibcudf/join.pyx b/python/cudf/cudf/_lib/pylibcudf/join.pyx
index cf2a6a8187f..308b1b39291 100644
--- a/python/cudf/cudf/_lib/pylibcudf/join.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/join.pyx
@@ -2,13 +2,14 @@
 
 from cython.operator import dereference
 
-from libcpp.memory cimport make_unique
+from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.utility cimport move
 
 from rmm._lib.device_buffer cimport device_buffer
 
 from cudf._lib.pylibcudf.libcudf cimport join as cpp_join
 from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
 from cudf._lib.pylibcudf.libcudf.types cimport (
     data_type,
     null_equality,
@@ -88,7 +89,6 @@ cpdef tuple left_join(
     nulls_equal : NullEquality
         Should nulls compare equal?
 
-
     Returns
     -------
     Tuple[Column, Column]
@@ -122,7 +122,6 @@ cpdef tuple full_join(
     nulls_equal : NullEquality
         Should nulls compare equal?
 
-
     Returns
     -------
     Tuple[Column, Column]
@@ -156,7 +155,6 @@ cpdef Column left_semi_join(
     nulls_equal : NullEquality
         Should nulls compare equal?
 
-
     Returns
     -------
     Column
@@ -190,7 +188,6 @@ cpdef Column left_anti_join(
     nulls_equal : NullEquality
         Should nulls compare equal?
 
-
     Returns
     -------
     Column
@@ -204,3 +201,26 @@ cpdef Column left_anti_join(
             nulls_equal
         )
     return _column_from_gather_map(move(c_result))
+
+
+cpdef Table cross_join(Table left, Table right):
+    """Perform a cross join on two tables.
+
+    For details see :cpp:func:`cross_join`.
+
+    Parameters
+    ----------
+    left : Table
+        The left table to join.
+    right: Table
+        The right table to join.
+
+    Returns
+    -------
+    Table
+        The result of cross joining the two inputs.
+    """
+    cdef unique_ptr[table] result
+    with nogil:
+        result = move(cpp_join.cross_join(left.view(), right.view()))
+    return Table.from_libcudf(move(result))
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/join.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/join.pxd
index 89a30f0f255..32cd17f7c11 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/join.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/join.pxd
@@ -70,3 +70,8 @@ cdef extern from "cudf/join.hpp" namespace "cudf" nogil:
         const table_view right_keys,
         null_equality nulls_equal,
     ) except +
+
+    cdef unique_ptr[table] cross_join(
+        const table_view left,
+        const table_view right,
+    ) except +
diff --git a/python/cudf/cudf/pylibcudf_tests/test_join.py b/python/cudf/cudf/pylibcudf_tests/test_join.py
new file mode 100644
index 00000000000..eb25ed915b1
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/test_join.py
@@ -0,0 +1,29 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import numpy as np
+import pyarrow as pa
+from utils import assert_table_eq
+
+from cudf._lib import pylibcudf as plc
+
+
+def test_cross_join():
+    left = pa.Table.from_arrays([[0, 1, 2], [3, 4, 5]], names=["a", "b"])
+    right = pa.Table.from_arrays(
+        [[6, 7, 8, 9], [10, 11, 12, 13]], names=["c", "d"]
+    )
+
+    pleft = plc.interop.from_arrow(left)
+    pright = plc.interop.from_arrow(right)
+
+    expect = pa.Table.from_arrays(
+        [
+            *(np.repeat(c.to_numpy(), len(right)) for c in left.columns),
+            *(np.tile(c.to_numpy(), len(left)) for c in right.columns),
+        ],
+        names=["a", "b", "c", "d"],
+    )
+
+    got = plc.join.cross_join(pleft, pright)
+
+    assert_table_eq(expect, got)
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index b3dd6ae7cc3..4ad6e75fb2e 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -503,7 +503,7 @@ class Join(IR):
     right_on: list[expr.NamedExpr]
     """List of expressions used as keys in the right frame."""
     options: tuple[
-        Literal["inner", "left", "full", "leftsemi", "leftanti"],
+        Literal["inner", "left", "full", "leftsemi", "leftanti", "cross"],
         bool,
         tuple[int, int] | None,
         str | None,
@@ -518,11 +518,6 @@ class Join(IR):
     - coalesce: should key columns be coalesced (only makes sense for outer joins)
     """
 
-    def __post_init__(self) -> None:
-        """Validate preconditions."""
-        if self.options[0] == "cross":
-            raise NotImplementedError("cross join not implemented")
-
     @cache
     @staticmethod
     def _joiners(
@@ -567,6 +562,26 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         left = self.left.evaluate(cache=cache)
         right = self.right.evaluate(cache=cache)
+        how, join_nulls, zlice, suffix, coalesce = self.options
+        suffix = "_right" if suffix is None else suffix
+        if how == "cross":
+            # Separate implementation, since cross_join returns the
+            # result, not the gather maps
+            columns = plc.join.cross_join(left.table, right.table).columns()
+            left_cols = [
+                NamedColumn(new, old.name).sorted_like(old)
+                for new, old in zip(columns[: left.num_columns], left.columns)
+            ]
+            right_cols = [
+                NamedColumn(
+                    new,
+                    old.name
+                    if old.name not in left.column_names_set
+                    else f"{old.name}{suffix}",
+                )
+                for new, old in zip(columns[left.num_columns :], right.columns)
+            ]
+            return DataFrame([*left_cols, *right_cols])
         left_on = DataFrame(
             broadcast(
                 *(e.evaluate(left) for e in self.left_on), target_length=left.num_rows
@@ -578,13 +593,11 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
                 target_length=right.num_rows,
             )
         )
-        how, join_nulls, zlice, suffix, coalesce = self.options
         null_equality = (
             plc.types.NullEquality.EQUAL
             if join_nulls
             else plc.types.NullEquality.UNEQUAL
         )
-        suffix = "_right" if suffix is None else suffix
         join_fn, left_policy, right_policy = Join._joiners(how)
         if right_policy is None:
             # Semi join
diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py
index f4a4704f3cc..81166b0b2f6 100644
--- a/python/cudf_polars/tests/test_join.py
+++ b/python/cudf_polars/tests/test_join.py
@@ -16,10 +16,6 @@
         "left",
         "semi",
         "anti",
-        pytest.param(
-            "cross",
-            marks=pytest.mark.xfail(reason="cross join not implemented"),
-        ),
         "full",
     ],
 )
@@ -55,3 +51,23 @@ def test_join(how, coalesce, join_nulls, join_expr):
         right, on=join_expr, how=how, join_nulls=join_nulls, coalesce=coalesce
     )
     assert_gpu_result_equal(query, check_row_order=False)
+
+
+def test_cross_join():
+    left = pl.DataFrame(
+        {
+            "a": [1, 2, 3, 1, None],
+            "b": [1, 2, 3, 4, 5],
+            "c": [2, 3, 4, 5, 6],
+        }
+    ).lazy()
+    right = pl.DataFrame(
+        {
+            "a": [1, 4, 3, 7, None, None],
+            "c": [2, 3, 4, 5, 6, 7],
+        }
+    ).lazy()
+
+    q = left.join(right, how="cross")
+
+    assert_gpu_result_equal(q)

From d53e409f06058d59671e0377b40a71a08789fccf Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Wed, 26 Jun 2024 10:22:44 -0500
Subject: [PATCH 02/29] Fix `is_monotonic_*` APIs to include `nan's` (#16085)

Fixes: #15776

This PR changes `is_monotonic_*` API's to factor in `np.nan` while performing the operations.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16085
---
 python/cudf/cudf/core/column/column.py   |  4 ++--
 python/cudf/cudf/tests/test_monotonic.py | 19 +++++++++++++++++++
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index dfcdfbb9d91..5db6fd904a9 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -927,13 +927,13 @@ def is_unique(self) -> bool:
 
     @property
     def is_monotonic_increasing(self) -> bool:
-        return not self.has_nulls() and libcudf.sort.is_sorted(
+        return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted(
             [self], [True], None
         )
 
     @property
     def is_monotonic_decreasing(self) -> bool:
-        return not self.has_nulls() and libcudf.sort.is_sorted(
+        return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted(
             [self], [False], None
         )
 
diff --git a/python/cudf/cudf/tests/test_monotonic.py b/python/cudf/cudf/tests/test_monotonic.py
index 0896d91570e..790e84559a9 100644
--- a/python/cudf/cudf/tests/test_monotonic.py
+++ b/python/cudf/cudf/tests/test_monotonic.py
@@ -33,11 +33,13 @@ def test_range_index(testrange):
     "testlist",
     [
         [1, 2, 3, 4],
+        [1, 2, 3, 4, None],
         [1, 2, 3, 3, 4],
         [10, 9, 8, 7],
         [10, 9, 8, 8, 7],
         ["c", "d", "e", "f"],
         ["c", "d", "e", "e", "f"],
+        ["c", "d", "e", "f", None],
         ["z", "y", "x", "r"],
         ["z", "y", "x", "x", "r"],
     ],
@@ -51,6 +53,23 @@ def test_generic_index(testlist):
     assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing
 
 
+@pytest.mark.parametrize(
+    "testlist",
+    [
+        [1, 2, 3, 4, np.nan],
+        [10, 9, 8, np.nan, 7],
+        [10, 9, 8, 8, 7, np.nan],
+    ],
+)
+def test_float_index(testlist):
+    index_pd = pd.Index(testlist)
+    index = cudf.from_pandas(index_pd, nan_as_null=False)
+
+    assert index.is_unique == index_pd.is_unique
+    assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing
+    assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing
+
+
 @pytest.mark.parametrize(
     "testlist",
     [

From bfaddd3bcccac3ef38bf7c5d0e6fd55267b2f3ab Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Wed, 26 Jun 2024 11:49:21 -0400
Subject: [PATCH 03/29] Add exception when trying to create large strings with
 cudf::test::strings_column_wrapper (#16049)

Throws an exception in the `cudf::test::strings_column_wrapper` if the column size (accumulated offset values) would exceed max size_type.
Large strings created by the wrapper are not supported and discouraged due to the size and time impact on testing and CI.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: https://github.com/rapidsai/cudf/pull/16049
---
 cpp/include/cudf_test/column_wrapper.hpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp
index 47d17988775..7363f965af8 100644
--- a/cpp/include/cudf_test/column_wrapper.hpp
+++ b/cpp/include/cudf_test/column_wrapper.hpp
@@ -314,7 +314,12 @@ auto make_chars_and_offsets(StringsIterator begin, StringsIterator end, Validity
   for (auto str = begin; str < end; ++str) {
     std::string tmp = (*v++) ? std::string(*str) : std::string{};
     chars.insert(chars.end(), std::cbegin(tmp), std::cend(tmp));
-    offsets.push_back(offsets.back() + tmp.length());
+    auto const last_offset = static_cast<std::size_t>(offsets.back());
+    auto const next_offset = last_offset + tmp.length();
+    CUDF_EXPECTS(
+      next_offset < static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
+      "Cannot use strings_column_wrapper to build a large strings column");
+    offsets.push_back(static_cast<cudf::size_type>(next_offset));
   }
   return std::pair(std::move(chars), std::move(offsets));
 };

From e0b8ab01deb66ea0726373ecb9cc77cbbf0666cd Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Wed, 26 Jun 2024 11:00:25 -0500
Subject: [PATCH 04/29] Migrate string `slice` APIs to `pylibcudf` (#15988)

This PR introduces pylibcudf string `slice` APIs and migrates the cuDF cython to use them. Part of https://github.com/rapidsai/cudf/issues/15162

Authors:
  - https://github.com/brandon-b-miller
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15988
---
 .../api_docs/pylibcudf/strings/index.rst      |   1 +
 .../api_docs/pylibcudf/strings/slice.rst      |   6 +
 .../libcudf/scalar/scalar_factories.pxd       |   1 +
 .../_lib/pylibcudf/strings/CMakeLists.txt     |   2 +-
 .../cudf/_lib/pylibcudf/strings/__init__.pxd  |   1 +
 .../cudf/_lib/pylibcudf/strings/__init__.py   |   1 +
 .../cudf/_lib/pylibcudf/strings/slice.pxd     |  15 +++
 .../cudf/_lib/pylibcudf/strings/slice.pyx     | 102 +++++++++++++++
 python/cudf/cudf/_lib/strings/substring.pyx   |  88 ++++---------
 .../cudf/pylibcudf_tests/test_string_slice.py | 116 ++++++++++++++++++
 10 files changed, 270 insertions(+), 63 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/slice.rst
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/slice.pxd
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx
 create mode 100644 python/cudf/cudf/pylibcudf_tests/test_string_slice.py

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
index bfaef732555..cecf1ccc9bb 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
@@ -6,3 +6,4 @@ strings
 
     contains
     replace
+    slice
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/slice.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/slice.rst
new file mode 100644
index 00000000000..0ee5af71c03
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/slice.rst
@@ -0,0 +1,6 @@
+=====
+slice
+=====
+
+.. automodule:: cudf._lib.pylibcudf.strings.slice
+   :members:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd
index 5c4e5bf346f..c8220df8938 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd
@@ -8,3 +8,4 @@ from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
 
 cdef extern from "cudf/scalar/scalar_factories.hpp" namespace "cudf" nogil:
     cdef unique_ptr[scalar] make_string_scalar(const string & _string) except +
+    cdef unique_ptr[scalar] make_fixed_width_scalar[T](T value) except +
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
index cb7f71b1912..b499a127541 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
@@ -13,7 +13,7 @@
 # =============================================================================
 
 set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx find.pyx regex_flags.pyx
-                   regex_program.pyx replace.pyx
+                   regex_program.pyx replace.pyx slice.pyx
 )
 
 set(linked_libraries cudf::cudf)
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
index 959aa94737d..d1f632d6d8e 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
@@ -9,4 +9,5 @@ from . cimport (
     regex_flags,
     regex_program,
     replace,
+    slice,
 )
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
index b7384913286..ef102aff2af 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
@@ -9,4 +9,5 @@
     regex_flags,
     regex_program,
     replace,
+    slice,
 )
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/slice.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/slice.pxd
new file mode 100644
index 00000000000..7d8d0006ef4
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/slice.pxd
@@ -0,0 +1,15 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from cudf._lib.pylibcudf.column cimport Column
+from cudf._lib.pylibcudf.scalar cimport Scalar
+
+ctypedef fused ColumnOrScalar:
+    Column
+    Scalar
+
+cpdef Column slice_strings(
+    Column input,
+    ColumnOrScalar start=*,
+    ColumnOrScalar stop=*,
+    Scalar step=*
+)
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx
new file mode 100644
index 00000000000..df75134fb71
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx
@@ -0,0 +1,102 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+
+from cudf._lib.pylibcudf.column cimport Column
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport numeric_scalar
+from cudf._lib.pylibcudf.libcudf.scalar.scalar_factories cimport (
+    make_fixed_width_scalar as cpp_make_fixed_width_scalar,
+)
+from cudf._lib.pylibcudf.libcudf.strings cimport substring as cpp_slice
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from cudf._lib.pylibcudf.scalar cimport Scalar
+
+from cython.operator import dereference
+
+
+cpdef Column slice_strings(
+    Column input,
+    ColumnOrScalar start=None,
+    ColumnOrScalar stop=None,
+    Scalar step=None
+):
+    """Perform a slice operation on a strings column.
+
+    ``start`` and ``stop`` may be a
+    :py:class:`~cudf._lib.pylibcudf.column.Column` or a
+    :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`. But ``step`` must be a
+    :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`.
+
+    For details, see :cpp:func:`cudf::strings::slice_strings`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings column for this operation
+    start : Union[Column, Scalar]
+        The start character position or positions.
+    stop : Union[Column, Scalar]
+        The end character position or positions
+    step : Scalar
+        Distance between input characters retrieved
+
+    Returns
+    -------
+    pylibcudf.Column
+        The result of the slice operation
+    """
+    cdef unique_ptr[column] c_result
+    cdef numeric_scalar[size_type]* cpp_start
+    cdef numeric_scalar[size_type]* cpp_stop
+    cdef numeric_scalar[size_type]* cpp_step
+
+    if input is None:
+        raise ValueError("input cannot be None")
+
+    if ColumnOrScalar is Column:
+        if step is not None:
+            raise ValueError("Column-wise slice does not support step")
+
+        if start is None or stop is None:
+            raise ValueError(
+                "start and stop must be provided for Column-wise slice"
+            )
+
+        with nogil:
+            c_result = cpp_slice.slice_strings(
+                input.view(),
+                start.view(),
+                stop.view()
+            )
+
+    elif ColumnOrScalar is Scalar:
+        if start is None:
+            start = Scalar.from_libcudf(
+                cpp_make_fixed_width_scalar(0)
+            )
+        if stop is None:
+            stop = Scalar.from_libcudf(
+                cpp_make_fixed_width_scalar(0)
+            )
+        if step is None:
+            step = Scalar.from_libcudf(
+                cpp_make_fixed_width_scalar(1)
+            )
+
+        cpp_start = <numeric_scalar[size_type]*>start.c_obj.get()
+        cpp_stop = <numeric_scalar[size_type]*>stop.c_obj.get()
+        cpp_step = <numeric_scalar[size_type]*>step.c_obj.get()
+
+        with nogil:
+            c_result = cpp_slice.slice_strings(
+                input.view(),
+                dereference(cpp_start),
+                dereference(cpp_stop),
+                dereference(cpp_step)
+            )
+    else:
+        raise ValueError("start, stop, and step must be either Column or Scalar")
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/cudf/cudf/_lib/strings/substring.pyx b/python/cudf/cudf/_lib/strings/substring.pyx
index 170c1016b89..706c21c0634 100644
--- a/python/cudf/cudf/_lib/strings/substring.pyx
+++ b/python/cudf/cudf/_lib/strings/substring.pyx
@@ -2,24 +2,16 @@
 
 import numpy as np
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
 from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.strings.substring cimport (
-    slice_strings as cpp_slice_strings,
-)
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 from cudf._lib.scalar import as_device_scalar
 
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport numeric_scalar
 from cudf._lib.scalar cimport DeviceScalar
 
+import cudf._lib.pylibcudf as plc
+
 
 @acquire_spill_lock()
 def slice_strings(Column source_strings,
@@ -32,30 +24,18 @@ def slice_strings(Column source_strings,
     performed in steps by skipping `step` number of
     characters in a string.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
     cdef DeviceScalar start_scalar = as_device_scalar(start, np.int32)
     cdef DeviceScalar end_scalar = as_device_scalar(end, np.int32)
     cdef DeviceScalar step_scalar = as_device_scalar(step, np.int32)
 
-    cdef numeric_scalar[size_type]* start_numeric_scalar = \
-        <numeric_scalar[size_type]*>(
-            start_scalar.get_raw_ptr())
-    cdef numeric_scalar[size_type]* end_numeric_scalar = \
-        <numeric_scalar[size_type]*>(end_scalar.get_raw_ptr())
-    cdef numeric_scalar[size_type]* step_numeric_scalar = \
-        <numeric_scalar[size_type]*>(step_scalar.get_raw_ptr())
-
-    with nogil:
-        c_result = move(cpp_slice_strings(
-            source_view,
-            start_numeric_scalar[0],
-            end_numeric_scalar[0],
-            step_numeric_scalar[0]
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(
+        plc.strings.slice.slice_strings(
+            source_strings.to_pylibcudf(mode="read"),
+            start_scalar.c_value,
+            end_scalar.c_value,
+            step_scalar.c_value
+        )
+    )
 
 
 @acquire_spill_lock()
@@ -67,19 +47,13 @@ def slice_from(Column source_strings,
     at given starts and stops positions. `starts` and `stops`
     here are positions per element in the string-column.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef column_view starts_view = starts.view()
-    cdef column_view stops_view = stops.view()
-
-    with nogil:
-        c_result = move(cpp_slice_strings(
-            source_view,
-            starts_view,
-            stops_view
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(
+        plc.strings.slice.slice_strings(
+            source_strings.to_pylibcudf(mode="read"),
+            starts.to_pylibcudf(mode="read"),
+            stops.to_pylibcudf(mode="read")
+        )
+    )
 
 
 @acquire_spill_lock()
@@ -90,8 +64,7 @@ def get(Column source_strings,
     character from each input string. The index of
     characters required can be controlled by passing `index`.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
+
     if index < 0:
         next_index = index - 1
         step = -1
@@ -102,20 +75,11 @@ def get(Column source_strings,
     cdef DeviceScalar end_scalar = as_device_scalar(next_index, np.int32)
     cdef DeviceScalar step_scalar = as_device_scalar(step, np.int32)
 
-    cdef numeric_scalar[size_type]* start_numeric_scalar = \
-        <numeric_scalar[size_type]*>(
-            start_scalar.get_raw_ptr())
-    cdef numeric_scalar[size_type]* end_numeric_scalar = \
-        <numeric_scalar[size_type]*>(end_scalar.get_raw_ptr())
-    cdef numeric_scalar[size_type]* step_numeric_scalar = \
-        <numeric_scalar[size_type]*>(step_scalar.get_raw_ptr())
-
-    with nogil:
-        c_result = move(cpp_slice_strings(
-            source_view,
-            start_numeric_scalar[0],
-            end_numeric_scalar[0],
-            step_numeric_scalar[0]
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(
+        plc.strings.slice.slice_strings(
+            source_strings.to_pylibcudf(mode="read"),
+            start_scalar.c_value,
+            end_scalar.c_value,
+            step_scalar.c_value
+        )
+    )
diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_slice.py b/python/cudf/cudf/pylibcudf_tests/test_string_slice.py
new file mode 100644
index 00000000000..bd63987b30f
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/test_string_slice.py
@@ -0,0 +1,116 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pytest
+from utils import assert_column_eq
+
+import cudf._lib.pylibcudf as plc
+
+
+@pytest.fixture(scope="module")
+def pa_col():
+    return pa.array(["AbC", "123abc", "", " ", None])
+
+
+@pytest.fixture(scope="module")
+def plc_col(pa_col):
+    return plc.interop.from_arrow(pa_col)
+
+
+@pytest.fixture(
+    scope="module",
+    params=[(1, 3, 1), (0, 3, -1), (3, 2, 1), (1, 5, 5), (1, 100, 2)],
+)
+def pa_start_stop_step(request):
+    return tuple(pa.scalar(x, type=pa.int32()) for x in request.param)
+
+
+@pytest.fixture(scope="module")
+def plc_start_stop_step(pa_start_stop_step):
+    return tuple(plc.interop.from_arrow(x) for x in pa_start_stop_step)
+
+
+@pytest.fixture(scope="module")
+def pa_starts_col():
+    return pa.array([0, 1, 3, -1, 100])
+
+
+@pytest.fixture(scope="module")
+def plc_starts_col(pa_starts_col):
+    return plc.interop.from_arrow(pa_starts_col)
+
+
+@pytest.fixture(scope="module")
+def pa_stops_col():
+    return pa.array([1, 3, 4, -1, 100])
+
+
+@pytest.fixture(scope="module")
+def plc_stops_col(pa_stops_col):
+    return plc.interop.from_arrow(pa_stops_col)
+
+
+def test_slice(pa_col, plc_col, pa_start_stop_step, plc_start_stop_step):
+    pa_start, pa_stop, pa_step = pa_start_stop_step
+    plc_start, plc_stop, plc_step = plc_start_stop_step
+
+    def slice_string(st, start, stop, step):
+        return st[start:stop:step] if st is not None else None
+
+    expected = pa.array(
+        [
+            slice_string(x, pa_start.as_py(), pa_stop.as_py(), pa_step.as_py())
+            for x in pa_col.to_pylist()
+        ],
+        type=pa.string(),
+    )
+
+    got = plc.strings.slice.slice_strings(
+        plc_col, start=plc_start, stop=plc_stop, step=plc_step
+    )
+
+    assert_column_eq(expected, got)
+
+
+def test_slice_column(
+    pa_col, plc_col, pa_starts_col, plc_starts_col, pa_stops_col, plc_stops_col
+):
+    def slice_string(st, start, stop):
+        if stop < 0:
+            stop = len(st)
+        return st[start:stop] if st is not None else None
+
+    expected = pa.array(
+        [
+            slice_string(x, start, stop)
+            for x, start, stop in zip(
+                pa_col.to_pylist(),
+                pa_starts_col.to_pylist(),
+                pa_stops_col.to_pylist(),
+            )
+        ],
+        type=pa.string(),
+    )
+
+    got = plc.strings.slice.slice_strings(
+        plc_col, plc_starts_col, plc_stops_col
+    )
+
+    assert_column_eq(expected, got)
+
+
+def test_slice_invalid(plc_col, plc_starts_col, plc_stops_col):
+    with pytest.raises(TypeError):
+        # no maching signature
+        plc.strings.slice.slice_strings(None, pa_starts_col, pa_stops_col)
+    with pytest.raises(ValueError):
+        # signature found but wrong value passed
+        plc.strings.slice.slice_strings(plc_col, plc_starts_col, None)
+    with pytest.raises(TypeError):
+        # no matching signature (2nd arg)
+        plc.strings.slice.slice_strings(plc_col, None, plc_stops_col)
+    with pytest.raises(TypeError):
+        # can't provide step for columnwise api
+        plc.strings.slice.slice_strings(
+            plc_col, plc_starts_col, plc_stops_col, plc_starts_col
+        )

From 65b64f675d5e87e45b7350782eab88293b633a49 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Wed, 26 Jun 2024 11:55:16 -0500
Subject: [PATCH 05/29] Fix segfault in conditional join (#16094)

Closes #16066.

I found a bug that would cause the reported segfault and have fixed it in this PR. When the right table has zero rows, conditional left anti-joins were returning a vector of indices containing garbage data.

Along the way, I refactored several parts of the conditional join tests and added coverage for more cases.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/16094
---
 cpp/src/join/conditional_join.cu         | 13 +---
 cpp/tests/join/conditional_join_tests.cu | 92 +++++++++++++++++-------
 2 files changed, 70 insertions(+), 35 deletions(-)

diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu
index f02dee5f7f5..97a06d5a923 100644
--- a/cpp/src/join/conditional_join.cu
+++ b/cpp/src/join/conditional_join.cu
@@ -48,8 +48,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_join_anti_semi(
 {
   if (right.num_rows() == 0) {
     switch (join_type) {
-      case join_kind::LEFT_ANTI_JOIN:
-        return std::make_unique<rmm::device_uvector<size_type>>(left.num_rows(), stream, mr);
+      case join_kind::LEFT_ANTI_JOIN: return get_trivial_left_join_indices(left, stream, mr).first;
       case join_kind::LEFT_SEMI_JOIN:
         return std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr);
       default: CUDF_FAIL("Invalid join kind."); break;
@@ -96,10 +95,6 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_join_anti_semi(
     join_size = size.value(stream);
   }
 
-  if (left.num_rows() == 0) {
-    return std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr);
-  }
-
   rmm::device_scalar<size_type> write_index(0, stream);
 
   auto left_indices = std::make_unique<rmm::device_uvector<size_type>>(join_size, stream, mr);
@@ -149,8 +144,7 @@ conditional_join(table_view const& left,
       // with a corresponding NULL from the right.
       case join_kind::LEFT_JOIN:
       case join_kind::LEFT_ANTI_JOIN:
-      case join_kind::FULL_JOIN:
-        return get_trivial_left_join_indices(left, stream, rmm::mr::get_current_device_resource());
+      case join_kind::FULL_JOIN: return get_trivial_left_join_indices(left, stream, mr);
       // Inner and left semi joins return empty output because no matches can exist.
       case join_kind::INNER_JOIN:
       case join_kind::LEFT_SEMI_JOIN:
@@ -169,8 +163,7 @@ conditional_join(table_view const& left,
                          std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr));
       // Full joins need to return the trivial complement.
       case join_kind::FULL_JOIN: {
-        auto ret_flipped =
-          get_trivial_left_join_indices(right, stream, rmm::mr::get_current_device_resource());
+        auto ret_flipped = get_trivial_left_join_indices(right, stream, mr);
         return std::pair(std::move(ret_flipped.second), std::move(ret_flipped.first));
       }
       default: CUDF_FAIL("Invalid join kind."); break;
diff --git a/cpp/tests/join/conditional_join_tests.cu b/cpp/tests/join/conditional_join_tests.cu
index 79968bcd7f4..7ab4a2ea465 100644
--- a/cpp/tests/join/conditional_join_tests.cu
+++ b/cpp/tests/join/conditional_join_tests.cu
@@ -20,6 +20,7 @@
 
 #include <cudf/ast/expressions.hpp>
 #include <cudf/column/column_view.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/join.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -222,21 +223,25 @@ struct ConditionalJoinPairReturnTest : public ConditionalJoinTest<T> {
              std::vector<std::pair<cudf::size_type, cudf::size_type>> expected_outputs)
   {
     auto result_size = this->join_size(left, right, predicate);
-    EXPECT_TRUE(result_size == expected_outputs.size());
-
-    auto result = this->join(left, right, predicate);
-    std::vector<std::pair<cudf::size_type, cudf::size_type>> result_pairs;
-    for (size_t i = 0; i < result.first->size(); ++i) {
-      // Note: Not trying to be terribly efficient here since these tests are
-      // small, otherwise a batch copy to host before constructing the tuples
-      // would be important.
-      result_pairs.push_back({result.first->element(i, cudf::get_default_stream()),
-                              result.second->element(i, cudf::get_default_stream())});
-    }
+    EXPECT_EQ(result_size, expected_outputs.size());
+
+    auto result     = this->join(left, right, predicate);
+    auto lhs_result = cudf::detail::make_std_vector_sync(*result.first, cudf::get_default_stream());
+    auto rhs_result =
+      cudf::detail::make_std_vector_sync(*result.second, cudf::get_default_stream());
+    std::vector<std::pair<cudf::size_type, cudf::size_type>> result_pairs(lhs_result.size());
+    std::transform(lhs_result.begin(),
+                   lhs_result.end(),
+                   rhs_result.begin(),
+                   result_pairs.begin(),
+                   [](cudf::size_type lhs, cudf::size_type rhs) {
+                     return std::pair{lhs, rhs};
+                   });
     std::sort(result_pairs.begin(), result_pairs.end());
     std::sort(expected_outputs.begin(), expected_outputs.end());
 
-    EXPECT_TRUE(std::equal(expected_outputs.begin(), expected_outputs.end(), result_pairs.begin()));
+    EXPECT_TRUE(std::equal(
+      expected_outputs.begin(), expected_outputs.end(), result_pairs.begin(), result_pairs.end()));
   }
 
   /*
@@ -411,6 +416,11 @@ TYPED_TEST(ConditionalInnerJoinTest, TestOneColumnLeftEmpty)
   this->test({{}}, {{3, 4, 5}}, left_zero_eq_right_zero, {});
 };
 
+TYPED_TEST(ConditionalInnerJoinTest, TestOneColumnRightEmpty)
+{
+  this->test({{3, 4, 5}}, {{}}, left_zero_eq_right_zero, {});
+};
+
 TYPED_TEST(ConditionalInnerJoinTest, TestOneColumnTwoRowAllEqual)
 {
   this->test({{0, 1}}, {{0, 0}}, left_zero_eq_right_zero, {{0, 0}, {0, 1}});
@@ -600,6 +610,14 @@ TYPED_TEST(ConditionalLeftJoinTest, TestOneColumnLeftEmpty)
   this->test({{}}, {{3, 4, 5}}, left_zero_eq_right_zero, {});
 };
 
+TYPED_TEST(ConditionalLeftJoinTest, TestOneColumnRightEmpty)
+{
+  this->test({{3, 4, 5}},
+             {{}},
+             left_zero_eq_right_zero,
+             {{0, JoinNoneValue}, {1, JoinNoneValue}, {2, JoinNoneValue}});
+};
+
 TYPED_TEST(ConditionalLeftJoinTest, TestCompareRandomToHash)
 {
   auto [left, right] = gen_random_repeated_columns<TypeParam>();
@@ -666,6 +684,14 @@ TYPED_TEST(ConditionalFullJoinTest, TestOneColumnLeftEmpty)
              {{JoinNoneValue, 0}, {JoinNoneValue, 1}, {JoinNoneValue, 2}});
 };
 
+TYPED_TEST(ConditionalFullJoinTest, TestOneColumnRightEmpty)
+{
+  this->test({{3, 4, 5}},
+             {{}},
+             left_zero_eq_right_zero,
+             {{0, JoinNoneValue}, {1, JoinNoneValue}, {2, JoinNoneValue}});
+};
+
 TYPED_TEST(ConditionalFullJoinTest, TestTwoColumnThreeRowSomeEqual)
 {
   this->test({{0, 1, 2}, {10, 20, 30}},
@@ -705,20 +731,16 @@ struct ConditionalJoinSingleReturnTest : public ConditionalJoinTest<T> {
     auto [left_wrappers, right_wrappers, left_columns, right_columns, left, right] =
       this->parse_input(left_data, right_data);
     auto result_size = this->join_size(left, right, predicate);
-    EXPECT_TRUE(result_size == expected_outputs.size());
-
-    auto result = this->join(left, right, predicate);
-    std::vector<cudf::size_type> resulting_indices;
-    for (size_t i = 0; i < result->size(); ++i) {
-      // Note: Not trying to be terribly efficient here since these tests are
-      // small, otherwise a batch copy to host before constructing the tuples
-      // would be important.
-      resulting_indices.push_back(result->element(i, cudf::get_default_stream()));
-    }
-    std::sort(resulting_indices.begin(), resulting_indices.end());
+    EXPECT_EQ(result_size, expected_outputs.size());
+
+    auto result         = this->join(left, right, predicate);
+    auto result_indices = cudf::detail::make_std_vector_sync(*result, cudf::get_default_stream());
+    std::sort(result_indices.begin(), result_indices.end());
     std::sort(expected_outputs.begin(), expected_outputs.end());
-    EXPECT_TRUE(
-      std::equal(resulting_indices.begin(), resulting_indices.end(), expected_outputs.begin()));
+    EXPECT_TRUE(std::equal(result_indices.begin(),
+                           result_indices.end(),
+                           expected_outputs.begin(),
+                           expected_outputs.end()));
   }
 
   void _compare_to_hash_join(std::unique_ptr<rmm::device_uvector<cudf::size_type>> const& result,
@@ -826,6 +848,16 @@ struct ConditionalLeftSemiJoinTest : public ConditionalJoinSingleReturnTest<T> {
 
 TYPED_TEST_SUITE(ConditionalLeftSemiJoinTest, cudf::test::IntegralTypesNotBool);
 
+TYPED_TEST(ConditionalLeftSemiJoinTest, TestOneColumnLeftEmpty)
+{
+  this->test({{}}, {{3, 4, 5}}, left_zero_eq_right_zero, {});
+};
+
+TYPED_TEST(ConditionalLeftSemiJoinTest, TestOneColumnRightEmpty)
+{
+  this->test({{3, 4, 5}}, {{}}, left_zero_eq_right_zero, {});
+};
+
 TYPED_TEST(ConditionalLeftSemiJoinTest, TestTwoColumnThreeRowSomeEqual)
 {
   this->test({{0, 1, 2}, {10, 20, 30}}, {{0, 1, 3}, {30, 40, 50}}, left_zero_eq_right_zero, {0, 1});
@@ -873,6 +905,16 @@ struct ConditionalLeftAntiJoinTest : public ConditionalJoinSingleReturnTest<T> {
 
 TYPED_TEST_SUITE(ConditionalLeftAntiJoinTest, cudf::test::IntegralTypesNotBool);
 
+TYPED_TEST(ConditionalLeftAntiJoinTest, TestOneColumnLeftEmpty)
+{
+  this->test({{}}, {{3, 4, 5}}, left_zero_eq_right_zero, {});
+};
+
+TYPED_TEST(ConditionalLeftAntiJoinTest, TestOneColumnRightEmpty)
+{
+  this->test({{3, 4, 5}}, {{}}, left_zero_eq_right_zero, {0, 1, 2});
+};
+
 TYPED_TEST(ConditionalLeftAntiJoinTest, TestTwoColumnThreeRowSomeEqual)
 {
   this->test({{0, 1, 2}, {10, 20, 30}}, {{0, 1, 3}, {30, 40, 50}}, left_zero_eq_right_zero, {2});

From f1efa40fb73fcd3f50813f5426064e5f2f1d48cc Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 26 Jun 2024 07:49:20 -1000
Subject: [PATCH 06/29] Reduce/clean copy usage in Series, reshaping (#16080)

* Clean up copy usages in `concat`
* Avoid always shallow copying in `unstack`
* Don't extra copy pandas objects in the Series constructor

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16080
---
 python/cudf/cudf/core/reshape.py | 62 +++++++++++---------------------
 python/cudf/cudf/core/series.py  |  4 ++-
 2 files changed, 24 insertions(+), 42 deletions(-)

diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
index 903c4fe7df5..1120642947b 100644
--- a/python/cudf/cudf/core/reshape.py
+++ b/python/cudf/cudf/core/reshape.py
@@ -300,51 +300,31 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
         obj = objs[0]
         if ignore_index:
             if axis == 1:
-                result = cudf.DataFrame._from_data(
-                    data=obj._data.copy(deep=True),
-                    index=obj.index.copy(deep=True),
-                )
-                # The DataFrame constructor for dict-like data (such as the
-                # ColumnAccessor given by obj._data here) will drop any columns
-                # in the data that are not in `columns`, so we have to rename
-                # after construction.
-                result.columns = pd.RangeIndex(len(obj._data.names))
-            else:
                 if isinstance(obj, cudf.Series):
-                    result = cudf.Series._from_data(
-                        data=obj._data.copy(deep=True),
-                        index=cudf.RangeIndex(len(obj)),
-                    )
-                elif isinstance(obj, pd.Series):
-                    result = cudf.Series(
-                        data=obj,
-                        index=cudf.RangeIndex(len(obj)),
-                    )
+                    result = obj.to_frame()
                 else:
-                    result = cudf.DataFrame._from_data(
-                        data=obj._data.copy(deep=True),
-                        index=cudf.RangeIndex(len(obj)),
-                    )
+                    result = obj.copy(deep=True)
+                result.columns = pd.RangeIndex(len(result._data))
+            else:
+                result = type(obj)._from_data(
+                    data=obj._data.copy(deep=True),
+                    index=cudf.RangeIndex(len(obj)),
+                )
+        elif axis == 0:
+            result = obj.copy(deep=True)
         else:
-            if axis == 0:
-                result = obj.copy()
+            if isinstance(obj, cudf.Series):
+                result = obj.to_frame()
             else:
-                data = obj._data.copy(deep=True)
-                if isinstance(obj, cudf.Series) and obj.name is None:
-                    # If the Series has no name, pandas renames it to 0.
-                    data[0] = data.pop(None)
-                result = cudf.DataFrame._from_data(
-                    data, index=obj.index.copy(deep=True)
+                result = obj.copy(deep=True)
+            if keys is not None and isinstance(result, cudf.DataFrame):
+                k = keys[0]
+                result.columns = cudf.MultiIndex.from_tuples(
+                    [
+                        (k, *c) if isinstance(c, tuple) else (k, c)
+                        for c in result._column_names
+                    ]
                 )
-                if keys is not None:
-                    if isinstance(result, cudf.DataFrame):
-                        k = keys[0]
-                        result.columns = cudf.MultiIndex.from_tuples(
-                            [
-                                (k, *c) if isinstance(c, tuple) else (k, c)
-                                for c in result._column_names
-                            ]
-                        )
 
         if isinstance(result, cudf.Series) and axis == 0:
             # sort has no effect for series concatted along axis 0
@@ -1179,7 +1159,6 @@ def unstack(df, level, fill_value=None):
     if pd.api.types.is_list_like(level):
         if not level:
             return df
-    df = df.copy(deep=False)
     if not isinstance(df.index, cudf.MultiIndex):
         dtype = df._columns[0].dtype
         for col in df._columns:
@@ -1195,6 +1174,7 @@ def unstack(df, level, fill_value=None):
         )
         return res
     else:
+        df = df.copy(deep=False)
         columns = df.index._poplevels(level)
         index = df.index
     result = _pivot(df, index, columns)
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 15ad0813601..ea25d482578 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -584,7 +584,7 @@ def __init__(
             data = {}
 
         if isinstance(data, (pd.Series, pd.Index, BaseIndex, Series)):
-            if copy:
+            if copy and not isinstance(data, (pd.Series, pd.Index)):
                 data = data.copy(deep=True)
             name_from_data = data.name
             column = as_column(data, nan_as_null=nan_as_null, dtype=dtype)
@@ -3434,6 +3434,7 @@ def rename(self, index=None, copy=True):
     @_cudf_nvtx_annotate
     def add_prefix(self, prefix):
         return Series._from_data(
+            # TODO: Change to deep=False when copy-on-write is default
             data=self._data.copy(deep=True),
             index=prefix + self.index.astype(str),
         )
@@ -3441,6 +3442,7 @@ def add_prefix(self, prefix):
     @_cudf_nvtx_annotate
     def add_suffix(self, suffix):
         return Series._from_data(
+            # TODO: Change to deep=False when copy-on-write is default
             data=self._data.copy(deep=True),
             index=self.index.astype(str) + suffix,
         )

From e7cf69dc932636e9dfd32ea1bebefddc3f31d2f2 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Wed, 26 Jun 2024 13:28:15 -0500
Subject: [PATCH 07/29] Implement chunked column wise concat in chunked parquet
 reader (#16052)

This PR implements column wise concat in chunked parquet reader which prevents over-utilizing memory for the regular concat.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/16052
---
 python/cudf/cudf/_lib/parquet.pyx      | 36 +++++++++++++++++++++-----
 python/cudf/cudf/io/parquet.py         | 20 +++++++++-----
 python/cudf/cudf/tests/test_parquet.py | 11 ++++++++
 3 files changed, 54 insertions(+), 13 deletions(-)

diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index 7914ed7e9d9..d1ec5be9e62 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -20,6 +20,7 @@ from cudf.api.types import is_list_like
 
 from cudf._lib.utils cimport data_from_unique_ptr
 
+from cudf._lib import pylibcudf
 from cudf._lib.utils import _index_level_name, generate_pandas_metadata
 
 from libc.stdint cimport uint8_t
@@ -70,8 +71,11 @@ from cudf._lib.utils cimport table_view_from_table
 
 from pyarrow.lib import NativeFile
 
+from cudf._lib.concat import concat_columns
 from cudf.utils.ioutils import _ROW_GROUP_SIZE_BYTES_DEFAULT
 
+from cudf._lib.utils cimport data_from_pylibcudf_table
+
 
 cdef class BufferArrayFromVector:
     cdef Py_ssize_t length
@@ -878,14 +882,32 @@ cdef class ParquetReader:
         return df
 
     def read(self):
-        dfs = []
+        dfs = self._read_chunk()
+        column_names = dfs._column_names
+        concatenated_columns = list(dfs._columns)
+        del dfs
         while self._has_next():
-            dfs.append(self._read_chunk())
-        df = cudf.concat(dfs)
-        df = _process_metadata(df, self.result_meta, self.names, self.row_groups,
-                               self.filepaths_or_buffers, self.pa_buffers,
-                               self.allow_range_index, self.cpp_use_pandas_metadata)
-        return df
+            new_chunk = list(self._read_chunk()._columns)
+            for i in range(len(column_names)):
+                concatenated_columns[i] = concat_columns(
+                    [concatenated_columns[i], new_chunk[i]]
+                )
+                # Must drop any residual GPU columns to save memory
+                new_chunk[i] = None
+
+        dfs = cudf.DataFrame._from_data(
+            *data_from_pylibcudf_table(
+                pylibcudf.Table(
+                    [col.to_pylibcudf(mode="read") for col in concatenated_columns]
+                ),
+                column_names=column_names,
+                index_names=None
+                )
+            )
+
+        return _process_metadata(dfs, self.result_meta, self.names, self.row_groups,
+                                 self.filepaths_or_buffers, self.pa_buffers,
+                                 self.allow_range_index, self.cpp_use_pandas_metadata)
 
 cpdef merge_filemetadata(object filemetadata_list):
     """
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 58b104b84e9..2a838ca7417 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -908,12 +908,20 @@ def _read_parquet(
                 "cudf engine doesn't support the "
                 f"following positional arguments: {list(args)}"
             )
-        return libparquet.read_parquet(
-            filepaths_or_buffers,
-            columns=columns,
-            row_groups=row_groups,
-            use_pandas_metadata=use_pandas_metadata,
-        )
+        if cudf.get_option("mode.pandas_compatible"):
+            return libparquet.ParquetReader(
+                filepaths_or_buffers,
+                columns=columns,
+                row_groups=row_groups,
+                use_pandas_metadata=use_pandas_metadata,
+            ).read()
+        else:
+            return libparquet.read_parquet(
+                filepaths_or_buffers,
+                columns=columns,
+                row_groups=row_groups,
+                use_pandas_metadata=use_pandas_metadata,
+            )
     else:
         if (
             isinstance(filepaths_or_buffers, list)
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index e1e7952605b..588bc87d268 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -3485,3 +3485,14 @@ def test_parquet_chunked_reader(
     )
     actual = reader.read()
     assert_eq(expected, actual)
+
+
+def test_parquet_reader_pandas_compatibility():
+    df = pd.DataFrame(
+        {"a": [1, 2, 3, 4] * 10000, "b": ["av", "qw", "hi", "xyz"] * 10000}
+    )
+    buffer = BytesIO()
+    df.to_parquet(buffer)
+    with cudf.option_context("mode.pandas_compatible", True):
+        expected = cudf.read_parquet(buffer)
+    assert_eq(expected, df)

From 7ca4f480cc93d333398d2c5dd32802a60ebb5366 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Wed, 26 Jun 2024 15:54:15 -0500
Subject: [PATCH 08/29] Explain line profiler and how to know which functions
 are GPU-accelerated. (#16079)

The `cudf.pandas` docs could be more explicit about which functions are accelerated. The answer is "most of the cudf API is accelerated, and you can check the results with the profiler." I've answered this question a few times so I wanted to document it.

This closes #16074.

Authors:
  - Bradley Dice (https://github.com/bdice)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16079
---
 .../_static/cudf-pandas-line-profile.png      | Bin 0 -> 15125 bytes
 docs/cudf/source/cudf_pandas/faq.md           |  16 +++++++++
 docs/cudf/source/cudf_pandas/usage.md         |  34 ++++++++++++++++--
 .../cudf/source/user_guide/api_docs/index.rst |   2 ++
 4 files changed, 49 insertions(+), 3 deletions(-)
 create mode 100644 docs/cudf/source/_static/cudf-pandas-line-profile.png

diff --git a/docs/cudf/source/_static/cudf-pandas-line-profile.png b/docs/cudf/source/_static/cudf-pandas-line-profile.png
new file mode 100644
index 0000000000000000000000000000000000000000..1d5a07c72eb06ab9270ab13e06aa31d55d4981ca
GIT binary patch
literal 15125
zcmeIZcT|&Y_vecR5l~TSBJCka7m?mU)KH`e2uN2zy7XREdXN&3-b9)ZYC>-jX`xFe
zRDsX~NC`+nnFOBaecy3@bIw|4%{lALTJs0FNyyE8wY~RefA@8TKUY(@PIi}!h=}OA
zlHyZMBBD!G_}|f2FXCVO-xRmtFBe=i6&@3n^f9dB-(0qoQ<WnkDvu^VdQFUfPwJ?s
z=R!nu+m-NhVa9^llZZ&7Q0b}M3s2*X^waKp9e&GOCSKo5ZZ~+c4C=qMLf*L*#9Hd0
zS{68Wf3A)54rBGjtV%DH+nN1o->#V5dj0(w?cS3wj@Z}1U!)T_?*J}pB;4dnyBA0)
zseF;1-Y(eX@~!R@=oqFzU0qjuEU_ecHN!i@tI?CS*Hu?rSG!*jlak?|ajbrTZyXWP
zw+p~4gzK+I;0uJS)N^yfg@}TRj&S*Q>5o5u0r_I)<-T27-jn+{l#h}7IP0y#+=>{L
zI(R1&<|Kd0ypToCJf^_!EKJAVBJ@xt#QIgzHCW(lcl@tNr=n1=?ZUfd<RJ9-hje-G
zUkj*QYB5h@Bz!@jC(2A(g-Y*dTk}K7JO93S=-a~s&m;_pq4)xEk~9&~IIpa4pZ>Sn
zW&f`Rua-ZPY=EUXICn;0?-LQds{V|){(-521k`Z9hY3GPG*q>6KLYn3ffIv~mx$g?
ze*lPH_5Q)cxdVOZLqt?&@EmnTj$+~x(3r-8f{4hS95nUUik#19ME7^}1)^{Ea$x$)
z(pD5yUV7&b=5!9kKTo3+5sIZv>}OcJ^7Y->zjfaDuA~>PW?Yl^KWmXfBVgC)8~gcF
zYX-g$TJ;q^@Bvms`afx?`ut|UvEL4*dJ`5_k8NlTmHoYB7A<`cNP)3tTWO}U=~sH|
zm1#kB7&_~RX63lCi5m~_SxBr~d@0b6-snW(+fVZd%<=1FBqG0FTw)th&7n3Zd$R2?
zN!H&8i;BrYyeaJO3~0cNgsQm;tC=)lo9qi;-vW+iwxcOnL20Jn(OC#0qKEOq)Q6k>
zCu@g%CrhOs-MwViSq^HoTiFx=*koOQbH2Q_FUuuf?poX9$Ijvaca=7gi7njl!HKp$
zcCrPxSj!|Z>NuWSza@2`DBAOEdPIMn6nWSC)$d;lrBb`Ap|akY!;M>Jnf^;70Rqx}
z>6FvUQF@~BQH>|!tB{PkJwNN!M7{u>jCGc5?ZBBL549`B#tU)E`)T(J5Iz@SBt%5@
zxwNq<Zi~n8ntNjlzJoTd^Cy-~vs~d1yf;%UMX95w`4U+NJBH<b*kqix3&CFkj`mr1
zDrC)z=01y_9yO{LvzT2m*!{W3Cit6s^hcoDX+}eUt7gvoDe>@^!S>vYKF2OlO;hPL
zdUYH42EG-?GerSvoBpS<iaYsD2mN(#xREZ<FaQbVg0OA0+jcy2<IbWwVtEuNvoAU8
zV$tEBQ<<5uiCb~-uRRTW@%cy5<XU*NukUYEC2^IMPi-epUxx4gQ4M-E<WREzcqB!1
z9dC6SUrZ$CA<iF7j2k_>hQ+pj#Fb7RRliaiUP6<(9?Uv4dU1|vwmo+naM9(cx5kz?
zE<G`g?5xttS3@c%$oHv9{eETHOA)Y}KC(Hry>-&2Wl=`Kbn>=_^NROPBhk~717@Lm
zX1}Jv4%a;Hy`PBzSTDi;bP=W|ul3pbsHEC}6Zp}AXq{iJ%NN`f9iq=EXw3yc-6LK;
zw%?ndM8YgYiTMq&xDwGZi%wfnN%vmkldp%G_bai2eCLB2u6i>a9mA`AFl)8l74$El
zz4H1-TNnL4@%9_r@9eao;S|o|$M-~Sh=>?`u(_<+;WDS#M)l+!4jG&?`pU((_AlSN
zwy@5gboiflZ>PMYx4!jm+*?`-N%DHQU?GH7J<WUV&>=;{jxu*4_3pq$xbH$InCYoE
zroUg-W`D2RO9g1j&EIzw85b+bf&Ibw$~}tpMJ8o|<L0klFcOkXzr(~OWOl!-Ll@FC
zcfrA1ZMY0KDU}y1%@HwUbu0Dr$;)YloK3UlX%Cd}OtqP8LAuX=Zva2DmF!WXXr{1F
zzRIIKz4Z1bWYfqAE+sPAdjCsl$rKtH{UTomr?54lp0RZ@VA)$^8<2B>=qDmeS^l-Q
z`;W!e`gu$K?x^aEZtI=OnY6ax#h12qkhp1~!|`q65{|*7W3|)yku4L65nh`B=hYY0
z@8GO&$Jw!!18r}oh9w`m7x(L~^j^rsnVeeW&9wy7A7Zyo_jP;OCW#th*;;bb55t=b
zww>gEkZrf1`ocF|ICv#HZrSvQHy$odxy)`=oDAp6uAWS8Bsf<kRV~knlx>R-d;rrp
zpU>Uewl0P>R&uMHR2UTvFi)w>mzyQI-1t6kAExjj*@92R3yECE-O?Q3BO(%u-CJJ{
z!VG><4l!bB7#XtzJ?Oyl*cz)w)YGW4)P2J|y$f69GmhtprjR`<11tKpjXc;W$Mjnp
z()uY}^4^1PG~>-+KBE5^Ig4n#u3@9Vxi<}p7W|+Sme@~l1N(gEO`jfpD?r^V&R@d)
z{mqttxZ9ZZEAr(W_n^o7X+mERPoxjLW}{<{G=fxMb0s&VpHDaBt@t!9A|o<zJr667
zk~TixCL)^1HMs~~2PvB~x+X9N0Kq16UaMPsAwLv*9p7@_GWzKah*a`2GcHu$!F>G$
zYa{FEXV~2AFOfD2sO)f_Kbdt8XtoS~Yl^)}oL<l>#%dKDChC4RzPLm>*H3nb<CnWd
zhO~WH25VDz-%>?oBBe2DR;BFG&k-4MdDO_0E*nhmScnR2U$eo-AlTotpg|(jG!Gfk
z<SWoJET?@cfe*b5Uap;pQl1C;%|=e?Z#e2Ma&7tUzJ~R0i#1+<jrH@ZT~Z)73T4o&
zQdIT%i_0<sSXi@yO=?n|--OR%Q)PWNBP=&&^&%?GQ~_Q=VpRsz-k$?%7KYm(;EW~q
zjZH{&svmYT;={)FUTL>&f4FoG*MqVBjrVq}gYs34#c7m&$lR!#96o9(I_8aDlSkgg
zQ|bnCd3w?mO9xLj8WE-ACNq-9JqG$U3&#+XlX1W4d}v3L(Xq0xV6XAsNPMbvu_IPi
z*3x0|3eh`1Io)?hb@N&95AKfG-Tu?a`*%G8ev!;KSs=qp9z8pid=u@v8*;SN-G9;_
z5HI_?4BLhiIPahxStvVwF_nCwXU>+-@ft^_SHbCn(-wM@I8w1~)JmT&qs!5b{q4!(
zyV5`4=`1|8dtJ5EqLS9$%B!mprQ@@U*a45e2z9>`_52xWBOf)p!_PI-oI{`9mYOZN
zeBrHOCTEY4<}lDf5luew&Fp(C)_=0g?C-NDN|)!xGiHb8lfE2q#dl@L&!nkjG7mb9
z>{c`O-#3L5uYDpWdbjWl?*5`-dDd@vGI~PURmOsMaA|&B-?h8|TP^&hw<A%OE~dzB
z?g#oJ=i9yYvIql1zHgCiGCKS_BrzDxNp!#ffaQMGd$f4?n$}3>W}!5PyY3JA`T->K
zPpa9yv^`O#7biM;_?2CPn9eyKg0#T7&5{R;AH$eWR^s^%PW^xQF(3Ex$uOUQ8`BMe
zH)5_p<8;#+4^AbpTlerGzdX-Ga&2aJgZc9YZ6qc0AtqGzxJOxb<%Mpla<T2qGe)Bq
zyH8W2`2s8+?MogzT2(bv43xG!Fy21l3knu>p1~Vq<M65NMT?_t@HqJ<wXS)*@ZRSZ
zNv=K=O<y<%k#oNDv@DT^m)f3xYMH2jIg+_}jRqeC_5!rMu)4GfXmMl$poQ#=rj+^a
zWhs34Lg~7hIx^Ng6M%67mv75DX<U>>z)r*m;9jY}Av5C7QE&c1H#V%nox;tlbT%WW
zyF)8-U`(IV=~i>`cE5TyLb>H{BIEc08Oc7~Jv|Av@7f(YJti(FTaxXFVER-b!*RNO
zaB9X_yjh@&Y7wkzPku5@M1782B}QUgJOiENW3J>+YjEm(0dth<Y95e-Q|zyDVSfcB
zIf}gJK591IIP-o|P=nhaxaUh-_BW>4S6JQL`z*cYQ1SD={W*RR1MLYP{YO9k7a(X+
zDdG~vcxI(oelI*IcD|Zvh=d<1+%Nr^!gPfh8nN>uIC?$%PHoDyxYu|8W@Y3#HlxKx
zTb_c6jTz3E`V6nHgmgZ<e$6F1G#o(QAa=)s1drV3pAqg}Vxn-V0~jbRStPri&qPu^
zxR$b(ACIVrG)gM3EB`qf?*V+|XVmroO1Fo3cIJ`;0|i5|m$}c|Vp#9E^Ai<pi@laY
zByAPlBQ6KF{A{uBj~Vk1zWMlS@*N{75CQrPEB{99;U42(GYVH(g3P}}DBL2<7pm}|
z`2wk2WiiiKnb)|CpELZ&_&E~?fqV}!lO8fPUTjnTi%TRPn|rl~kIOtPYn>PR(hbJ=
zqO0OedxocLZrgn_Z1PuD+p{udKJD?5_217Im2v5u7UPO;7|}l3UYNaJ?GQ_AUt%0n
zu8ZDQ40v#_+5|4eUFJV;Ih0m#U|ctg6u4pO(reylu@l<IzCBckZy-KFiGdIjl*JlO
z=x?Vu_PS4iRJ!q!&u9_n1P7A<iK!Vj!Q2flXWy$G{F4F1Rr!cC)mw`z>a#;*4hn_;
z?n{D}xmKxJzX?&tQ}E(NbxGZ*X%%hrt-B+?Z>#aIlY)FJ##d@~?|D2{&5T>tpIV7j
zke`gGJsqK6uXSS=FTkQ`{ZA%DH?1?g`N!*(n)ZgPCZ%vo-H;Ohg=G^-N$$UpO@NO6
zB`Totj-3pyvb2c310}f}cri6Epr8SlJ5Yjc_|2FPu}xPMqzI2#0W%xmXQn70kv9q|
ztTY@8V3D$~)2`MD)g6G0+**$^V0RiGzxO+oQ8=x@dH}n6k<@07D1pUA4d20?54E1!
zr}^{;aVbcb+28KTK>$(j_xbYRbQT0#a!PmqI*&7@8X8#gqRpZFN3!@l=swNlWH|MY
z;yW9M9>9U8fmOMPJfA>CFX;nVDG1%Rh8E%V`t1v)2JS;gkv)BZUiG^(X>Qw`rb4Bf
zbLJPe)?ccoRV5t0X|%A>zs~Lbl-mNNyz0_#RC&6qywc=v_QSEnBAQ`5hC8NpI87wC
zj4Ig$$_5lE<uWif7WX;1_%*q4q<-ds=}h8uM7rb)b5=)-?K^c;O#1iyM@A~C^r~wh
zpY$qp-j*=%OX)Csc*VIvqVqGvi&qzC#7jzxw=E*SyI(4=><Oz#k|f=~^sd*_dB;+)
z6w&E>5NR1;)8v~tEK!k;S_@>q;ykJtZK)yuHg0S;uU*KN*9HP&cq(zaLz{8En(x3#
znfh>0SslYsu@eH?$mo4&v6H<EGyd4-zu3!1w>#4WP%sAh4O+T<kIvW9pp6Dt7?o&>
zfuA^R2t*)1vCA)}<rV5P(z;(=dy}&2j+7oWe$F|WWv2?HM=;1n7JRzPj1K7$M`9Zh
zc}q=jA{&c-FJ?m#XwG$quALWDNMT-4YFd*q#G0u7i=jYr4siZD7t~|Y<iOJk?~geW
zD@uMnAM6`{RVQAoM^~nEKnpjAEp2a1R@apyEW-_|d)+-;m$nOcH}41s)5Vz`n(^fg
z6;wCP$X0sPjJ|XZXg^lJy4d>W@dKf{y6o9uhkk<TI7p3{+$(8%)i4bc@6w^JJw*$7
z_u6qei&PgP=F}@5FabYyj&M#XGQMe{x-Mj!q5JIVHxZq&wXYQs9!Ykc)SBF4!&U|r
z1us=!YkLc=aSg9rxc_7+?K48~xu6e3v^kivNsj{5(A7H9-FtYzbUfPV)R)d2f6XlM
zW&Wj-ipb0MTEH9fh5Rn_rQ2G>ZKA&odAU~Im9TL)BeIZkiRmY)14wH+;oE&Lt^Zo*
zbiP^p#n5Gye?Qs$@jRchkI}C<?C{-@i1yfO_DUCS=u9{Ngv>E@jWueMd@LM(@>@ON
z2>t8U*QDZ%-oKpJzgHQ$mu+}a>B*#C=L7Ps!mRy?Ty}rLW!L95T5Aj=zbD7rayn#o
zZd}P-?9o;68?DwEve8t&o>!>VX<cCH90Aqtp6pmC3~xeM7)&rDpQ_|4%DxUY?;{0C
zT@r?gHjq9@8R)8D1RBelX)z?kIn9bZM?JEfMqy*3XP;<w8*0!C?6LUmDTY{(1jnmQ
zl&21Kwuk-1O7qb<qR_euP1&=XnUZ1bCr}0|uNwGc^Tn&2(26w2%7W5ws)BX2dQCK(
z)9DRf&JF^m?IQHE4~IHN=8L(ZI9zb2h@hp`jL>c7c-4mR!M5zM^4hz>f;@@5vQo|;
zAND@y1jJD4$LDvgyv$25<uRz}E9<ns3hCCVo&n8$9+(bKX}NRM5yu}dZx)muD~{KM
zt_}BO?6TLG%kvD)a6%cZMpC%SZtHcv04A{Yb{B>w>jQymLaDl_ZO(g7m<K`kT~g6+
zS)5k~x}&I(@cKPPPAFy}=i-vF@??)9nAKoO@yISj+}7S`{5win5(2@vOk1(ck`}p4
zems8tF#;;8QRhj;AFG?~Xt79E`U}LFkRZrwzJAcLo;gx!^DJM2w-Rf!)5>?pR#>yB
zrOI5i`xa|s_{hiTGBPbL3FmRTWS0+s{ztuNafc7e78L?-6~Lkm1E!=v8j?y90iS7V
zo-q>H!LXqV@X*@ehY-Z71*8;!OJ3l1{ylPJUkZrWylmn~VN&40>cR;%_Wn3y`Z~^s
z_pRVI1VJsTUzA+nATap@`HOYP5X3OEFL=*IM44kEl@sr#?{q>m#CW3osbMMclQC|y
zCi^-3u&k<)sV41g#s%li873r+5J#h1HSz(CZHPh-Mku_1ih<N!<sU2QOZ^#3ny!Sp
zACpr68PgFM)H+y^Q_tDW0?6(qM&Dr!*t+1cV0Cr=gTyazaTK*z1JE=D^U*~Tlx8hb
zig*L&rsjOM&zbF`4WUL#(^Au=Yo)@nXThK&AUUo}sW(8z8d!BHyg_Y;3+I$L91fE$
zo5XEjqt@$N(khi{;(p%c-oM1jpI$7Lx<dx0XKDBH;H+9g?rD}Ha=q@%kg2%hUH!e%
zXLIdQ)--}}Qb^n!nJD+NkZ`Vc=#*M@fpzb*g7K^J!Yzf_DVm&Wh45P{JTv95m;ci$
z=sPM<<%g`K&8C8Gn2Bc>>8LM-$;Q9QTQ4p}<Y)c{qCl0hL9e!VY2DTt<_0<CB~vcf
ziyW}!eyo3yPy4+AV41>5B~plZBgbD%%*;J?Z9_fsl;b`Yexj3w5QrHFN{sW>>nesc
z0@H<MgzAFs3y!kkoB-eoz=37qQs|plW)G!?gy9?OmVDGE;B|+mGg#!_ZMfU(IGFWq
zIH;iyxn~Oh5tf16lYH`;DJ|2R@_b%nL<6heDeP$;J2!OU-dv*Ktg8DilKE05mo7gq
z;M32fXWrLZ+JDuuO{zCD@EiOUQt;^xGdc|K;u8Sf5SP}Brkgev4jA?tUvI7od-IuW
zk79&hKwS`dXBTOXa{JXwbIQg*rU{06YdC}ZYD&CJkZr?Qd6uqP_q6=!-Tpez6x)cM
z5;(9fHU*|viZDVwhN|XkNj8D*JNY;B_(ue1^bRA#TW5F}+G+ptbJ95~3)_vnUCSzd
zs6UvvSw)3Br5v-JGA&<((W&?xxrbq4t87l$z+<G;x=E(mk$A*|b^P*QD8Q2xtJetG
zGVc<2VwA&-CvCGl2p3oEUHP5$iYdR0RUz`T_8ZPyo6PzTMqD3~L~kNqbE}SAdU+Pi
zw9UV!65@9TM`6BsgwvFFS|yd2XlM6RF547CmeG;HOy5y;<HFIZ7K`u5iq5{kLow7b
zY7Osj$Biq$H#hvHyFtKWkRdab*9j`8>;6oK33!9}`&)R=(k<cCPcM=;Iw_<SIMY;9
zd!!K2#k0)+f>wf3&FSp-9K{xEd;%LwO6^L~e8YF$9tX8OPTYwPR~)HwC?B`pKe2l@
zDC;+vt3my*Tk*WhoFOhI=JHkjbL<%??Td5nJm&vEooP7|H44;wP>4xhnEXXEm`UE5
z04ys&<B*0MplInc@C4h^sxhTCII3pKQR3SVKJ%BK5kWT}FNNbHCP5f4naDN<A3k_R
zSu^nA=3|7^KwI`zQGKDS&Ld11QGG`m(F;KKpqm2fc;h1I>iCcGy8560f}pKaCI53f
zUSCIO6ZG{D07^W(bKo5&L1VAAA!zK+%^zQ7*?o$K61>j-&zSy?#4ZOWWeMsbEbnvf
zKN~^;mY^6f0`UKfr-}c!Y8(dS>$&vvXHS;Q`)4*5e4bmZ_?ZyEk2s=|oP&Z;S?K>v
zicb{I6S6-?3SW^Rq^MI?a)0MHLG-+L@uM1zrRNokNPYKT40hBPfQkR{emaG{F=@3)
z?O}7eq@R7$O|%hTRqy0j=8Y=#K5|hrBO^)rgZu|ejs_`8j}c7r|2a3^P8#M;9ry&v
zhhDVb;u&@iGs!$vdyHLl#9{Sbf;V39d5AqmY=i!_n#?W$MPwr$_#EFBV+Ywi7_D+|
z)N<W#k002-vYVT~*|;C+Q!0%yEt^JlMXGgGd$Oe6gvt_zz#D|-UDD>M)jRb5zF=X1
zIc_ev3j!*@MWM=fFfV<m-4Pq;uR8Yk7`@Sm3;|(pqV5=R4wxaZsx<T1kNPYjtcKmA
zsx6L453iR?=|1$36hG-Zq=m<6qF5KPK{0uRl>@u2Ym-)!<&ZTrDgZAUQ_0ds(n(>u
z)QnWJgLB0LC2wgukN-SC1kilUDfJ$4$htW6bNE)XfO%i1il}}P*0#|%idYy%+i<{3
zU#o^T*8+;rDmWP40#l#Hr2^_Yj&rSil%Jqq!krJXytRI&D}%p(<ar;_M4UW&4V+Bz
zDGjS;8)Bxf{x)17hR?fZJr9(ZoYcUz(;sd^jS2C~wZw*z=5%+oq(J%bYueC@rV+zl
z#Sx*rtg@m}iYV~L^D5bCwR&|C@3uVCiA*uTOS~69PnJI%G%eodoN#<UI;6@$Y5z<F
z<h#DVp%w)!b(mel=X~^r4<e1OC7AYU0IC7^;*Yyj^Bpw#?<8)mWWR644gtw1qz|G5
zxb%P-b9pw85i$f5ZC|h%odZjBg>cy(TD6JN55FWYex~G6JvsW-{x%n#tE;x;%FNdy
zm!v602Gh{g5Nn6Quvh#7%jPt|gt1{J_|>yjZg78&8X3mU9xs$Er@m-fT*0@w-u*@&
zoF18-9TA`&GYs}Fv6K8V9a{rXmfSWd?}s0(RBuMQY&MwOFGaW`rflV|7DNyTzhlHI
zO|Ld&XFqvL<GFqxLmNW|rkL75I28-n@#N%H6~E?lF(+ddAAJ02B`S=Nw#>Fh`Gddg
zUKkzR^NDK_TJoTBS6xc8(*moqy(v!YE6=jn$B$dC!0jK!T+qsw8s^*GcY*GH{8Cug
ztZN%4r6kH<y6AOLO*&Jt`8l5vak-f{ABW)6@m5d6UVS^A2L}G0C`q#qSut7-4rZDV
zi$pe;`v#nvk5XW~rFU*D$r`i7U$)eBzjvdYRjoxRfsPgZu17q1S)Lp>F*%ti*NZ0J
z@>5>)<bn!)y=_N1^X;$WP90_*XIk;EReGY-?3@BD<~Y1@gz1IZ3iTC!4wxS2Zxhl-
zjBZ7b$?{a$kgW(pZmg;)4`?_53~+%R8E)eX<Es0=^kpS)lW@`6;b+87=;An6vWvh_
zBjQB;Ta$*MPqWXgrno%F?s&X*RI(0a=g?T=B<Qx~pby5a93%Y#s$8;(@fwOAD*<%7
z9V6TIoc173_a1)WsvK15?z*s3Jke+mwB~VA<YGie*pW*zN-;AwuLhx&Ken=rf~}J-
zU5;#y>O2#b&gY+dF0yJr@55c7{|X;7zwz&xOMUI))Z%QP7WOXznaGl8Oj)^koD+MB
zI}057{?+9Wh>(4QeQH~e!{e@xIyD;N`2}{6rPnzYzQplH&0#qg;1zZtGj4?&Kli?`
zroNdvj(Vw{_j)4w#tnD*rL>-GMBn;b=7QwLpN$UXH5S%PvrWQ-twn%L|9RR4<$M^o
zbD?a;@3qAlIj|pZg8D^&%eq$T!{F3SHcPI!v`Eh#kX`B9juIQn$BGh!kzmrTz0o}}
zU24ovQQ^NzrFjH37Ch8>WAakO&L~E@e3Pof%0?m>eJAAl$u3&aJC9K`#QLF##5f~f
zMc~Do!@xIx_{H<_5WucmtwH4<@lbiCLl98@k2c$vflRY<S>U>fSE4~#NuC=uNUmf4
zfwdIylYD@MRF_8|u5I2}?J7&~`QmGAuZ{4^sls%QljZx|pF1&xyq6KHKbBsYL7eQ|
zRON?%8XLO-ukQp0joe?nA(qsna*jdC>+%ted4xqT6WcA3=uEHr?0}Tz9syPEWI+Z+
zlbFv1GrUVtK2i&3rL8><R7M2F2>%gvGx+!I%?@V5vlGl*dGQ|H_uqK%KL$BU(@hqc
zy2Mw#!N2O@AMw%a3I5yoYI<j29Bs*}pw&hEGX<6}12ZoC$q>Ul0<*++7e)06A|Qby
z=Hw&JkmW?4yl<ZU|6s`fDn`OHAVog|Lz?(7$8)7KuY@NPMRxp%*SgQ4n(>nQ9-Yty
zJp7$yuS9=JtZUTCTNg#%)AQmL*)wQ!%mxzRP9q?j*wTsePm(#~Xep19sT!B%QA~I9
zetzlf$<IE<-i>v2oKN(llBuJgeH#Q(R=2&U#q)QfvG-a&UW#o@+@z>V-2H7igWHD5
zHf|>?H}7|s&PJsLrzGvxNW2j(xmLgTVx|E)0pM3acRRfPvSucM-j9o6{O@?P8;`IF
z=JwN1sgTb`21XN$dkU@ZiBL@HqV~$!fk$H7O6*w(KTS%vLu@gf$tA#w!ge;TxCMcX
z&(ZdNAz~?iu&jgcB9+(XN9x0uE8R;l?x>_1C=Syh3G`oI&)!&++R@2JI7kUh%m5AD
zi)9?XjMYC!(b~@({_N|em7gTlBX|`(buf<2@T-1DZ`T+lljy8XL!NGj97MrAT7F#w
z%{(%_fZi1?U7?Rcm!tqDAHRuL?Nh4Cle}xWM&|NkE-}nQv0JP%FYRG<PnqD#z}JLo
zxVL+*_m|NsYOg!9`XkXKXRN{)Ur~T4h=@{Pqwg;qs6j0U=}~zdQ*fkSbN2#f9AJf5
zgwKLZl!6FY!7#hjEOFP`HrZs<Ht&Z`mBqVH8xh{>?Li$*F-9$N0&J2h(K>G$Ic#6+
zD|$%CyYE9$Hdyt>ShK0u>ZwxJKYj{v0|kxvT)%MKx5(fK5x!MQoXN`rE%4Z^g#?F-
zZJv{w9qM~hyKvu+iYu~?4p-b@?;eDP&O`y|kvl0yMqFqPxlNL??TG{ILYJ2JKH5xB
zx230^ZR98|0mu66@KtQCnSJ8uNDa=Uaof(aU&MAQoW<p(-&D}j8ZS+3>*6oP>DMyH
zo{h1G14iYPC<Y1iOHVaY<m)rS%Ccj)VHR4h>|Gm9h$+61+7y;NbQzD<L%8593p1A_
z`WbB+$eLwj^^R~lrgM}*I5zpA!wq(f0y(jdK<G-6L=o3U&Vf9?_HE&+Ld5aS-AiI8
z=e;sKl9Vf!u6djgE0r2F+mrB!Bou;(o8qMI-#8+5(jUDjK+660#1-kTBa@gaZQ%5E
zJzCiUGrH#SQ(nQ4@e3dNy4JPQ6Sa3+P20ne=mYh|XD*d&B$gbhBiAmI61$4Efpd)g
zH(!u!=PAnsUFP%8r*K}IPVR9Ji3!d=gIo2u8{Jlm3D>n+b4_r%N!M+-4|QCGFXd!t
zxeu4MeAP=V%TiGuq_<SBnpR4pxt$)xzm{T`G#GBzJ0NEMc{}6K<*#BCheIWsSXT||
zk-dpxQ_(E`o7~}d4PKyLi`ZQ8VK|Sd1s~nI8Vg*Rfc`%R8x`QfeLfT)pM}t2+>7q7
z<13aSt9|b_G3{aufHY>4LT8xe+n&{YQK11b4?{Ir(+Ch-)_3hAtL`T=cyeN|=i@G}
zhM+14BD=FQ<5StOoAE|a4-GyfCB={Le!kdpS4JvzFTD(oT8A}N+Ch*hG8XI4E1o=)
z6w^)%@ou<MSy*AgTz|bZ#>g004SZ6ePc!rw$Q$qg4YQsdTDDm`T9L7=YG@`~5HF*&
zfJ}B6o1|3*(p1iqRcq%5NE<^gE!^;;a#!FdQD=B-3qq?z9C-fC4Ou}=Z{`)BsO{L%
z#D|n0y#<0>l&(IV8W{R(X=ScFjkL4Z*q>biwx(b2@N0n%A+flJm-2<F#FOfj2t6u9
zfH%}|P*Fj*^^Z|8hv)gn7~=aYp_XhV$l-GY*Gx6|t+2rm&^?siNF}%Fg5RL*ddziy
zH){<=VV?L95&3Z%=XB7kyS){3rl3jIYtd;1T`>^xik}t2R4xzcSv#OlM1zE<biVNJ
zD|Y)0P0TzWw|BfD#lSzB$z;Io)pI$ICtRdD2QgTxO?-+xIj6tJ;iS2&WVkGZI~Bw!
zS!9}K2gGnUerTA68aX@9FlI-ArCA@z8HcYfEsb+Qxn|RQA5Jvwc12$SvcClqstIf*
z<;1o^C&FDykXjlj2DgCehZbAdN+hY~+Kasbv^B~vmSJCX$=^*BkQ05J_lfjjD9^Rl
zHg}&$m)PQh(}ShfOcrHVU#LcPio_<0+Q1duRb_*E<Y&^M*r-@vWwtG$rxF;hZ=iv4
z<Xp|#ICo;apK<_+Z2fsZMF)!fvfB(n_tzH0;!GNi!ETSMz9QbI73+&od~YtvM~p_N
z!*$B}biwHeW2$6^WCAi3wbDJI%L2`5^F#D%uY+j<kOtO|2+0*SG<Tdlf4GL!<1c5G
zYVpZ7r^8oTLDsp3Dv=<vD%+Ev=~?E8(NF$3Mx`FTaXn;0QbU>9+%8c;0j>kcdaQTL
z_N7FEq~TpHVW*t8oY0JL&NzB;wR3myq8-vy&lV$vT%?zu5wA+O3b3I%+@pj)l<;4T
zo{l8W7a~y&)A*73O4)v2es_ubMz8vnMJB|HEwZGHX!d|~3ex>qN+*9K2UQ@Gde}XQ
z4Zo%5DDz@FTQ=YWxA>ShXEraS%hfXeto^~4b-%LjkzL}diw3<TFI2$4-kIghe9~Eb
zjk;>@Ra!1I<h}H|aCkUOJ#<!B@=F=gu2+L~VQ@gaEc9+A_aaIzt&A>k%)bi8Dvv{F
zX|*XQO*Er0cH3i^EA4^PeCOc$K>qHP>>XaYBDM5yca6Uys=Ql4OW{tRnl8=rv}F4I
zFel!IP#hbhKyJ-kk^8wNj8`-l-T7pF-UD#dm#AGPS6ThkdKx~JHjb|zlSZ7B!biBF
zr~3MAG8=wJ%~7%O6Xr&evS6h$4$~wXQ6R9-<6n+VS3mHpJlxysMm9p^Nm(nQh)0HB
za_+6({PJsT29Z)OWTE|LjbX1K@y~*j^48bPE&E?d-d<HM$vYJi_}R)Z9=OQv@J;B}
zY<6$G<naPqlzo|k>%I~a^`8CJ-7scyU`AjHl6(e76Pcu%`Hku6O}6%SmComUPw!*b
z=*Ez0Iwg1@XMd4}DCka}xoH>j9z|8F?EbqGNgjRY#a^GHJZ8R*J+g!MAtjDNYq9}@
zaVr6E1iM8-5XY88e17GK!@B|citxhTTcJiRd(U8Am(&c|Im~I#0&n<wTQgZGo8~0h
zGrQm+=>*4ssu4rgf;)`CljcQ=CE5-qq1BU{A3r`UqT$i}B2S@H<fjG|?xv*LvU*6q
zI+qkENy3|6W^NgH2`k((!zt`y7nRB>8v)poJd6L}HdDWD*cwV0QT#?a!tHi_g;dUG
z4D8<N9ID3rgFGelIUz8jSO9=B<?}i<F;IpKYzul{+*gv6%I(~#z&a!Au+!zA1}Kee
zyu)QJjc1qt(DYE*G3BZgeDju=?p`Xd*ye6dnz|V}T#}Qo;A!HuAeH}4Z2SW>%{yNz
zy9?BB6c->MlCB(w$+gh>)<{^Mu8;ec96HbUF-;Uc^~Z1R@N(SwF6?(j^&Nf}z*$M+
zPsJ<R8@N>ePW7ie`9Bill+^%1iX5J`YSx>&{AYIYe-z69r}F84?%+7%foOb!)=7Zu
zu?J~VVqSA8GJ))mN+wuc)Sjd09{r)^{=J98F`M`wJ@@~11BV=#@-R~NGysmHNa;V8
ztMTZSI+>sxGJtS5(0XC98x)&U<+X0d&Sf4v{Gl~|$EyST?t}gmq;!Aet$mjD86bcI
zo34FN4no)7eaCO1%gV=qzvcAjQv87X;|YSybS{gZ3z-*ixa@Giav(r7$&2c62V(#;
zN}F+Z9~iNO@5lR;`Yp_Kn|QicM8ue%XTS)35SnxpYeO?C<hM%K_}p1?+HWZJx-h@U
z&0>1y?8#qWpH^8gX$fmMKtK&@vr(*P2D(9breFjB?zr1`tM0wrrudn}>chS~>$7J7
z_gu*u*mssW49<k_M`i|W9HsJQBXR`JZ7@f_qzH#u9<W(zz2Yo<oNwHDq=ro7PH`Az
z`#Q!F%x64uy4s%M<P>VxOPT}<_dctb;&;zf(<*rt+HE=LoOI?hZB?H(PvQsH;WT8^
z4=3usF_=G-AP{g83eh*RZHer2fV;Qr3?I!cqxWStv!*ArE1Sc?rAx=-lccQ`_D#Pa
zi6-#rtsUWkFf)@jn5K*R2Z+_+2Y;LcQ|fo;*$?*sE`!eLD2x&}bkg9!pA^=(o@(gJ
zxFrlW3OBeP3mY@2)cOWLWfAuAYir8!*i(Y#Hx|}S&*Ov144y9Cwx$kW`LN(O<ZbXc
z_(OdhX@Kk_XO1&r!T+LUwoXaA{TuxxXMLEkLS#*@i{$QcrwIMec;Q7mewcU>*gE&Q
z%k4bVFHC3EZ@hTB0h|XNg=V@8>-1Dsn`a|rz2@;sX(U=^dtW`nb9*qEHULm(XRy9s
za#=UXldaylsH6es>b6n>&Z39f7P9#~U>g+S99yIP>HXe?yn7+OQx9$^uBQafMXELR
z8jF9G+053b6Q0&onbrV`uM}3F7_~JP+Ay~#zQRS5wIs;YBG*$lC{WX<F{#p8{1Ak_
zgBdJwGNboW;zu^womSnz4K3$?cT(Eq{j?UBE^B4%_4lFjoufjHzfvC3OXT>v=U2>c
zx<$to(+P0ffw}#IpLWJ5_F5CB*=0IH>a|!$;#pAJ2RH%u7Yd6t%yBRGbXD}J^ccKz
zs#+dECN_bK=&ZJGV7)S_;pmW<HY)y^yB>0K{H`?YJ`YrDOzTnPos8vzI7%&4nQTT#
zNCz2i!F@*EeP8~`or%u=j7ibBFaWJZzqt^~TKu?`PeR#$Z;+W}GC~fl<VT1q35$)!
zAcGvu5|!;ah6zhuMlRJj$g0i8GiMwbGX0o__cilDu(GgG&r()Vi)ZN2bM>5Nl{hw)
zwx^A6xuDUEU01F%ygj3j$p{q<bDw6<=jM=x7V7y&;KtFX1#cpT!c_Nvw8XDImMYzt
z$LZ8~goUL*oQ$UGfWZBag5J{}8&CDgbB7X#i(=iC2r%*mkwJ36)nlv&&hIf61A&ro
zv0Ktup8zk3@r>&rl{ErYWz%RRzMIzI(#@-p4#n?Mf2xJDxxp*;7Sjfmq(-ue(dQ2f
z^;1>n1%4{irY;t=i@UzCwva-}Pfdwg(m-9udNbfR3*m)FzSv(c_+Kpd{j8++8rOVb
zZvCS>2Z8{+<XI#uD*24a_*xKG0IAGHI6tjjZtl+mrDZmL0OVVi-U601@EzU<sT2}g
zv%p14@!p8g8c)yZ@$LR=wJF#DgVsvj-TU>hC;RJ^FfYeOO;w7?9?v_2In`bbY%^9l
zQ$mC>tfkVEXHHG<!|$fU3*C*1D&S&flAXF{ZB&-8J~&r+S>$7<F1T1RvUtGaI#7qh
z_JkXnilG*l^s(iHO0TF<C)4vHKAhQF$blQReZsR^C}1^He(338gN1Dh38TG2PJ<oi
zmMeW_M|m*b{#reh=x0V1#kH)2i2oiD3r%x6%|iwzt-q)7DV5%dV(>vC=cumva)PP#
zXss&p<d}?CYAQM_xd`Fqp~3_8$bM|DjAozC0d#DpP{Thm*L?~Ti1Jy_%R%sG$D=IO
zLvG4i`R{+Qc%!QiKC>Nic!@$nB;I}ZNHkP;J3@xLM#d>r$xp1~1+p#99fP3cMwWcB
znvKZ4CiRrvYBf7yO_#fPzc6zjbe|D(rNZPK+iAm%fcqyw!Bi_pZj8YFHV;k<X83Vj
zR+;oOz8|9*JexqZx@@$nbproD^T<;k5`MQ#eOWbb=xkIyzuQJt%!;<Q^}{R}8&IuJ
z(|=z8KdI6l6N}}^pdf&PW&b^&y_cg$*$$A;h#VQb3+D%vcaS@!4tK|2)iwd59*Pw}
zdv8{yLRMul&n!G90*N=S^}~9-5lva?)a4TXkQ3PZsR1a##3wb)S0_P=%Ln0SMjqw%
z)Ul=LQ^l!mN1fA-wt1;3E%m&%_dzAKD*1c$N!N}PUuwO0<$HGVUm<N@0JfC=`3@zf
z6GnN&W_C8tbGNdY?uY2Q-4YVG2@gg&MSvKPU-0>@lBNrc!;LM^EH5V?q1Lv7Mz&Md
z&+8pz-VnF#`Ca)?_H+WNq4ZduutYS%rA<CgX6u(+D|rR%`#Lgg1*=Wb@4c<lhu(8{
zyR022?5BFIkXSN4OQlXWH|32AdH4?APc@94jk`kBqap{dyOw+g?XHdn;K23=zEx>S
zTUC-igOMqC5gL7uFj_kA0isv_sX}*1mY1$BuJM5w+GifjyVZ{%cO54G%9;N6hDrS9
zPdRY3NBfxEM)fJo_?=V=XbQMDrqM5A$;ZAKR6+Nz3iBV8c6=&;7zo2ialj@=z}n_M
zmjVwY1A2RBzJ>+~G2n0d&n>I?8thpWj_A*7GzQe;aUu0vB+Wk+cfz4CJYoNRC=6fT
z`QN*s{r{V-0AC4v!AVFgZ(vAYXc9_X%q&4SV+m=;f0a0V`;T*fe<uyE&JGq5WNX6N
zzjN98|GOR0f1E{(#!w*_-@9P{YF{0~zc@(eQJ9+BX81f}ZWYTujo$vh+85s5Qlxcu
rw)34I@gIje|LZfKjx`=zr<cmDQy1+-t2+s&L6v}NPfH%Z`taWXH<h@6

literal 0
HcmV?d00001

diff --git a/docs/cudf/source/cudf_pandas/faq.md b/docs/cudf/source/cudf_pandas/faq.md
index 55976740105..cdf32216619 100644
--- a/docs/cudf/source/cudf_pandas/faq.md
+++ b/docs/cudf/source/cudf_pandas/faq.md
@@ -53,6 +53,22 @@ print(pd)
 <module 'pandas' (ModuleAccelerator(fast=cudf, slow=pandas))>
 ```
 
+## Which functions will run on the GPU?
+
+Generally, `cudf.pandas` will accelerate all the features in the
+{ref}`cuDF API <cudf-api>` on the GPU. There are some exceptions. For
+example, some functions are GPU-accelerated by cuDF but do not support
+every combination of keyword arguments. In cases like unsupported
+keyword arguments, cuDF is not able to provide GPU acceleration and
+`cudf.pandas` will fall back to the CPU.
+
+The most accurate way to assess which functions run on the GPU is to try
+running the code while using the `cudf.pandas` profiling features. The
+profiler will indicate which functions ran on GPU / CPU. To improve
+performance, try to use only functionality that can run entirely on GPU.
+This helps reduce the number of memory transfers needed to fallback to
+CPU.
+
 ## Does it work with third-party libraries?
 
 `cudf.pandas` is tested with numerous popular third-party libraries.
diff --git a/docs/cudf/source/cudf_pandas/usage.md b/docs/cudf/source/cudf_pandas/usage.md
index 376784439aa..0398a8d7086 100644
--- a/docs/cudf/source/cudf_pandas/usage.md
+++ b/docs/cudf/source/cudf_pandas/usage.md
@@ -63,16 +63,22 @@ back to CPU for certain operations. Running your code with the
 `cudf.pandas.profile` magic generates a report showing which
 operations used the GPU and which used the CPU. This can help you
 identify parts of your code that could be rewritten to be more
-GPU-friendly:
+GPU-friendly.
+
+### Using the Function Profiler
+
+First, enable `cudf.pandas`:
 
 ```python
 %load_ext cudf.pandas
 import pandas as pd
 ```
 
+Next, use the IPython/Jupyter magic `cudf.pandas.profile`:
+
 ```python
 %%cudf.pandas.profile
-df = pd.DataFrame({'a': [0, 1, 2], 'b': [3,4,3]})
+df = pd.DataFrame({'a': [0, 1, 2], 'b': [3, 4, 3]})
 
 df.min(axis=1)
 out = df.groupby('a').filter(
@@ -80,13 +86,35 @@ out = df.groupby('a').filter(
 )
 ```
 
+This gives a profiler output after the cell runs, shown below.
+
 ![cudf-pandas-profile](../_static/cudf-pandas-profile.png)
 
 When an operation falls back to using the CPU, it's typically because
 that operation isn't implemented by cuDF. The profiler generates a
 handy link to report the missing functionality to the cuDF team.
 
-To profile a script being run from the command-line, pass the
+### Using the Line Profiler
+
+There is a line profiler activated by the IPython/Jupyter magic `cudf.pandas.line_profile`:
+
+```python
+%%cudf.pandas.line_profile
+df = pd.DataFrame({'a': [0, 1, 2], 'b': [3, 4, 3]})
+
+df.min(axis=1)
+out = df.groupby('a').filter(
+    lambda group: len(group) > 1
+)
+```
+
+The output of the line profiler shows the source code and how much time each line spent executing on the GPU and CPU.
+
+![cudf-pandas-line-profile](../_static/cudf-pandas-line-profile.png)
+
+### Profiling from the command line
+
+To profile a script being run from the command line, pass the
 `--profile` argument:
 
 ```bash
diff --git a/docs/cudf/source/user_guide/api_docs/index.rst b/docs/cudf/source/user_guide/api_docs/index.rst
index b3442908531..5f26a921012 100644
--- a/docs/cudf/source/user_guide/api_docs/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/index.rst
@@ -1,3 +1,5 @@
+.. _cudf-api:
+
 =============
 API reference
 =============

From 563556e13d081a6ed07fd5b3577f64e95a1717d0 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 26 Jun 2024 13:25:04 -1000
Subject: [PATCH 09/29] Reduce (shallow) copies in DataFrame ops (#16060)

In particular for ops which only modify the axes

* Reduce multiple shallow copies in `DataFrame.rename`
* Avoid a shallow copy in `DataFrame.to_arrow` until necessary

Also fixes a bug in `DataFrame.rename` to maintain the original `dtype` of the `columns` after renaming

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16060
---
 python/cudf/cudf/core/column_accessor.py |  4 +-
 python/cudf/cudf/core/dataframe.py       | 48 +++++++++++++-----------
 python/cudf/cudf/tests/test_dataframe.py |  8 ++++
 3 files changed, 38 insertions(+), 22 deletions(-)

diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index 1bf9a393566..f30a557efb0 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -472,6 +472,7 @@ def swaplevel(self, i=-2, j=-1):
             new_keys[n][i], new_keys[n][j] = row[j], row[i]
             new_dict.update({row: tuple(new_keys[n])})
 
+        # TODO: Change to deep=False when copy-on-write is default
         new_data = {new_dict[k]: v.copy(deep=True) for k, v in self.items()}
 
         # swap level_names for i and j
@@ -669,10 +670,11 @@ def rename_column(x):
                 raise ValueError("Duplicate column names are not allowed")
 
         data = dict(zip(new_col_names, self.values()))
-        return self.__class__(
+        return type(self)(
             data=data,
             level_names=self.level_names,
             multiindex=self.multiindex,
+            label_dtype=self.label_dtype,
             verify=False,
         )
 
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index f0d8157011d..f7f5ef792d6 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -1121,8 +1121,6 @@ def _from_data(
     @staticmethod
     @_cudf_nvtx_annotate
     def _align_input_series_indices(data, index):
-        data = data.copy()
-
         input_series = [
             Series(val)
             for val in data.values()
@@ -1142,6 +1140,7 @@ def _align_input_series_indices(data, index):
                 )
                 index = aligned_input_series[0].index
 
+            data = data.copy()
             for name, val in data.items():
                 if isinstance(val, (pd.Series, Series, dict)):
                     data[name] = aligned_input_series.pop(0)
@@ -2969,6 +2968,7 @@ def set_index(
             idx = MultiIndex._from_data(dict(enumerate(data_to_add)))
             idx.names = names
 
+        # TODO: Change to deep=False when copy-on-write is default
         df = self if inplace else self.copy(deep=True)
 
         if verify_integrity and not idx.is_unique:
@@ -3565,6 +3565,9 @@ def rename(
             mapper if columns is None and axis in (1, "columns") else columns
         )
 
+        result = self if inplace else self.copy(deep=copy)
+
+        out_index = None
         if index:
             if (
                 any(isinstance(item, str) for item in index.values())
@@ -3586,36 +3589,36 @@ def rename(
                 )
                 out_index._data[level] = column.as_column(level_values)
                 out_index._compute_levels_and_codes()
-                out = DataFrame(index=out_index)
             else:
                 to_replace = list(index.keys())
                 vals = list(index.values())
                 is_all_na = vals.count(None) == len(vals)
 
                 try:
-                    index_data = {
-                        name: col.find_and_replace(to_replace, vals, is_all_na)
-                        for name, col in self.index._data.items()
-                    }
+                    out_index = _index_from_data(
+                        {
+                            name: col.find_and_replace(
+                                to_replace, vals, is_all_na
+                            )
+                            for name, col in self.index._data.items()
+                        }
+                    )
                 except OverflowError:
-                    index_data = self.index._data.copy(deep=True)
+                    pass
 
-                out = DataFrame(index=_index_from_data(index_data))
-        else:
-            out = DataFrame(index=self.index)
+        if out_index is not None:
+            result.index = out_index
 
         if columns:
-            out._data = self._data.rename_levels(mapper=columns, level=level)
-        else:
-            out._data = self._data.copy(deep=copy)
+            result._data = result._data.rename_levels(
+                mapper=columns, level=level
+            )
 
-        if inplace:
-            self._data = out._data
-        else:
-            return out.copy(deep=copy)
+        return result
 
     @_cudf_nvtx_annotate
     def add_prefix(self, prefix):
+        # TODO: Change to deep=False when copy-on-write is default
         out = self.copy(deep=True)
         out.columns = [
             prefix + col_name for col_name in list(self._data.keys())
@@ -3624,6 +3627,7 @@ def add_prefix(self, prefix):
 
     @_cudf_nvtx_annotate
     def add_suffix(self, suffix):
+        # TODO: Change to deep=False when copy-on-write is default
         out = self.copy(deep=True)
         out.columns = [
             col_name + suffix for col_name in list(self._data.keys())
@@ -3956,7 +3960,8 @@ def swaplevel(self, i=-2, j=-1, axis=0):
                            weight    1.0    0.8
                            length    0.3    0.2
         """
-        result = self.copy()
+        # TODO: Change to deep=False when copy-on-write is default
+        result = self.copy(deep=True)
 
         # To get axis number
         axis = self._get_axis_from_axis_arg(axis)
@@ -4027,7 +4032,7 @@ def transpose(self):
 
         # Set the old column names as the new index
         result = self.__class__._from_data(
-            {i: col for i, col in enumerate(result_columns)},
+            ColumnAccessor(dict(enumerate(result_columns)), verify=False),
             index=as_index(index),
         )
         # Set the old index as the new column names
@@ -5528,7 +5533,7 @@ def to_arrow(self, preserve_index=None):
         b: [[4,5,6]]
         """
 
-        data = self.copy(deep=False)
+        data = self
         index_descr = []
         write_index = preserve_index is not False
         keep_range_index = write_index and preserve_index is None
@@ -5556,6 +5561,7 @@ def to_arrow(self, preserve_index=None):
                     index_descr = (
                         index.names if index.name is not None else ("index",)
                     )
+                data = data.copy(deep=False)
                 for gen_name, col_name in zip(index_descr, index._data.names):
                     data._insert(
                         data.shape[1],
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 05ee8346afa..fc7fd87d4c5 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -10024,6 +10024,14 @@ def test_dataframe_rename_duplicate_column():
         gdf.rename(columns={"a": "b"}, inplace=True)
 
 
+def test_dataframe_rename_columns_keep_type():
+    gdf = cudf.DataFrame([[1, 2, 3]])
+    gdf.columns = cudf.Index([4, 5, 6], dtype=np.int8)
+    result = gdf.rename({4: 50}, axis="columns").columns
+    expected = pd.Index([50, 5, 6], dtype=np.int8)
+    assert_eq(result, expected)
+
+
 @pytest_unmark_spilling
 @pytest.mark.skipif(
     PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,

From 6eac9207ca0804aeca64c83c533e16ad5963b0ba Mon Sep 17 00:00:00 2001
From: Srinivas Yadav <43375352+srinivasyadav18@users.noreply.github.com>
Date: Wed, 26 Jun 2024 17:16:57 -0700
Subject: [PATCH 10/29] Refactor distinct with hashset-based algorithms
 (#15984)

Refactor **distinct** algorithm to use `cuco::static_set`.

Authors:
  - Srinivas Yadav (https://github.com/srinivasyadav18)
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/15984
---
 cpp/src/stream_compaction/distinct.cu         | 146 ++++++--------
 cpp/src/stream_compaction/distinct_count.cu   |   3 +-
 cpp/src/stream_compaction/distinct_helpers.cu | 189 ++++++++++--------
 .../stream_compaction/distinct_helpers.hpp    |  58 +++---
 .../stream_compaction_common.cuh              |   5 +-
 .../stream_compaction_common.hpp              |  35 ----
 cpp/src/stream_compaction/unique.cu           |   1 -
 7 files changed, 208 insertions(+), 229 deletions(-)
 delete mode 100644 cpp/src/stream_compaction/stream_compaction_common.hpp

diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu
index a6f15cc49ec..e5cf29f3ebf 100644
--- a/cpp/src/stream_compaction/distinct.cu
+++ b/cpp/src/stream_compaction/distinct.cu
@@ -17,28 +17,62 @@
 #include "distinct_helpers.hpp"
 
 #include <cudf/column/column_view.hpp>
+#include <cudf/detail/cuco_helpers.hpp>
 #include <cudf/detail/gather.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/stream_compaction.hpp>
+#include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-#include <cuda/functional>
-#include <thrust/copy.h>
-#include <thrust/distance.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/discard_iterator.h>
-
 #include <utility>
 #include <vector>
 
 namespace cudf {
 namespace detail {
+namespace {
+/**
+ * @brief Invokes the given `func` with desired the row equality
+ *
+ * @tparam HasNested Flag indicating whether there are nested columns in the input
+ * @tparam Func Type of the helper function doing `distinct` check
+ *
+ * @param compare_nulls Control whether nulls should be compared as equal or not
+ * @param compare_nans Control whether floating-point NaNs values should be compared as equal or not
+ * @param has_nulls Flag indicating whether the input has nulls or not
+ * @param row_equal Self table comparator
+ * @param func The input functor to invoke
+ */
+template <bool HasNested, typename Func>
+rmm::device_uvector<cudf::size_type> dipatch_row_equal(
+  null_equality compare_nulls,
+  nan_equality compare_nans,
+  bool has_nulls,
+  cudf::experimental::row::equality::self_comparator row_equal,
+  Func&& func)
+{
+  if (compare_nans == nan_equality::ALL_EQUAL) {
+    auto const d_equal = row_equal.equal_to<HasNested>(
+      nullate::DYNAMIC{has_nulls},
+      compare_nulls,
+      cudf::experimental::row::equality::nan_equal_physical_equality_comparator{});
+    return func(d_equal);
+  } else {
+    auto const d_equal = row_equal.equal_to<HasNested>(
+      nullate::DYNAMIC{has_nulls},
+      compare_nulls,
+      cudf::experimental::row::equality::physical_equality_comparator{});
+    return func(d_equal);
+  }
+}
+}  // namespace
 
 rmm::device_uvector<size_type> distinct_indices(table_view const& input,
                                                 duplicate_keep_option keep,
@@ -47,97 +81,39 @@ rmm::device_uvector<size_type> distinct_indices(table_view const& input,
                                                 rmm::cuda_stream_view stream,
                                                 rmm::device_async_resource_ref mr)
 {
-  if (input.num_rows() == 0 or input.num_columns() == 0) {
+  auto const num_rows = input.num_rows();
+
+  if (num_rows == 0 or input.num_columns() == 0) {
     return rmm::device_uvector<size_type>(0, stream, mr);
   }
 
-  auto map = hash_map_type{compute_hash_table_size(input.num_rows()),
-                           cuco::empty_key{-1},
-                           cuco::empty_value{std::numeric_limits<size_type>::min()},
-                           cudf::detail::cuco_allocator{stream},
-                           stream.value()};
-
   auto const preprocessed_input =
     cudf::experimental::row::hash::preprocessed_table::create(input, stream);
   auto const has_nulls          = nullate::DYNAMIC{cudf::has_nested_nulls(input)};
   auto const has_nested_columns = cudf::detail::has_nested_columns(input);
 
-  auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input);
-  auto const key_hasher = row_hasher.device_hasher(has_nulls);
-
-  auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input);
-
-  auto const pair_iter = cudf::detail::make_counting_transform_iterator(
-    size_type{0},
-    cuda::proclaim_return_type<cuco::pair<size_type, size_type>>(
-      [] __device__(size_type const i) { return cuco::make_pair(i, i); }));
-
-  auto const insert_keys = [&](auto const value_comp) {
-    if (has_nested_columns) {
-      auto const key_equal = row_comp.equal_to<true>(has_nulls, nulls_equal, value_comp);
-      map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value());
-    } else {
-      auto const key_equal = row_comp.equal_to<false>(has_nulls, nulls_equal, value_comp);
-      map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value());
-    }
+  auto const row_hash  = cudf::experimental::row::hash::row_hasher(preprocessed_input);
+  auto const row_equal = cudf::experimental::row::equality::self_comparator(preprocessed_input);
+
+  auto const helper_func = [&](auto const& d_equal) {
+    using RowHasher = std::decay_t<decltype(d_equal)>;
+    auto set        = hash_set_type<RowHasher>{num_rows,
+                                               0.5,  // desired load factor
+                                               cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
+                                               d_equal,
+                                               {row_hash.device_hasher(has_nulls)},
+                                               {},
+                                               {},
+                                               cudf::detail::cuco_allocator{stream},
+                                               stream.value()};
+    return detail::reduce_by_row(set, num_rows, keep, stream, mr);
   };
 
-  if (nans_equal == nan_equality::ALL_EQUAL) {
-    using nan_equal_comparator =
-      cudf::experimental::row::equality::nan_equal_physical_equality_comparator;
-    insert_keys(nan_equal_comparator{});
+  if (cudf::detail::has_nested_columns(input)) {
+    return dipatch_row_equal<true>(nulls_equal, nans_equal, has_nulls, row_equal, helper_func);
   } else {
-    using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator;
-    insert_keys(nan_unequal_comparator{});
+    return dipatch_row_equal<false>(nulls_equal, nans_equal, has_nulls, row_equal, helper_func);
   }
-
-  auto output_indices = rmm::device_uvector<size_type>(map.get_size(), stream, mr);
-
-  // If we don't care about order, just gather indices of distinct keys taken from map.
-  if (keep == duplicate_keep_option::KEEP_ANY) {
-    map.retrieve_all(output_indices.begin(), thrust::make_discard_iterator(), stream.value());
-    return output_indices;
-  }
-
-  // For other keep options, reduce by row on rows that compare equal.
-  auto const reduction_results = reduce_by_row(map,
-                                               std::move(preprocessed_input),
-                                               input.num_rows(),
-                                               has_nulls,
-                                               has_nested_columns,
-                                               keep,
-                                               nulls_equal,
-                                               nans_equal,
-                                               stream,
-                                               rmm::mr::get_current_device_resource());
-
-  // Extract the desired output indices from reduction results.
-  auto const map_end = [&] {
-    if (keep == duplicate_keep_option::KEEP_NONE) {
-      // Reduction results with `KEEP_NONE` are either group sizes of equal rows, or `0`.
-      // Thus, we only output index of the rows in the groups having group size of `1`.
-      return thrust::copy_if(rmm::exec_policy(stream),
-                             thrust::make_counting_iterator(0),
-                             thrust::make_counting_iterator(input.num_rows()),
-                             output_indices.begin(),
-                             [reduction_results = reduction_results.begin()] __device__(
-                               auto const idx) { return reduction_results[idx] == size_type{1}; });
-    }
-
-    // Reduction results with `KEEP_FIRST` and `KEEP_LAST` are row indices of the first/last row in
-    // each group of equal rows (which are the desired output indices), or the value given by
-    // `reduction_init_value()`.
-    return thrust::copy_if(rmm::exec_policy(stream),
-                           reduction_results.begin(),
-                           reduction_results.end(),
-                           output_indices.begin(),
-                           [init_value = reduction_init_value(keep)] __device__(auto const idx) {
-                             return idx != init_value;
-                           });
-  }();
-
-  output_indices.resize(thrust::distance(output_indices.begin(), map_end), stream);
-  return output_indices;
 }
 
 std::unique_ptr<table> distinct(table_view const& input,
diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu
index 99ca89cc021..9843bb889f4 100644
--- a/cpp/src/stream_compaction/distinct_count.cu
+++ b/cpp/src/stream_compaction/distinct_count.cu
@@ -15,16 +15,17 @@
  */
 
 #include "stream_compaction_common.cuh"
-#include "stream_compaction_common.hpp"
 
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/column/column_view.hpp>
+#include <cudf/detail/cuco_helpers.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/sorting.hpp>
 #include <cudf/detail/stream_compaction.hpp>
+#include <cudf/hashing/detail/helper_functions.cuh>
 #include <cudf/stream_compaction.hpp>
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/table/table_view.hpp>
diff --git a/cpp/src/stream_compaction/distinct_helpers.cu b/cpp/src/stream_compaction/distinct_helpers.cu
index 13e89b15bb7..c3a004b7f28 100644
--- a/cpp/src/stream_compaction/distinct_helpers.cu
+++ b/cpp/src/stream_compaction/distinct_helpers.cu
@@ -16,96 +16,127 @@
 
 #include "distinct_helpers.hpp"
 
-#include <cudf/detail/hash_reduce_by_row.cuh>
-
-#include <rmm/resource_ref.hpp>
+#include <cuda/functional>
+#include <cuda/std/atomic>
 
 namespace cudf::detail {
 
-namespace {
-/**
- * @brief The functor to find the first/last/all duplicate row for rows that compared equal.
- */
-template <typename MapView, typename KeyHasher, typename KeyEqual>
-struct reduce_fn : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, size_type> {
-  duplicate_keep_option const keep;
-
-  reduce_fn(MapView const& d_map,
-            KeyHasher const& d_hasher,
-            KeyEqual const& d_equal,
-            duplicate_keep_option const keep,
-            size_type* const d_output)
-    : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, size_type>{d_map,
-                                                                     d_hasher,
-                                                                     d_equal,
-                                                                     d_output},
-      keep{keep}
-  {
+template <typename RowHasher>
+rmm::device_uvector<size_type> reduce_by_row(hash_set_type<RowHasher>& set,
+                                             size_type num_rows,
+                                             duplicate_keep_option keep,
+                                             rmm::cuda_stream_view stream,
+                                             rmm::device_async_resource_ref mr)
+{
+  auto output_indices = rmm::device_uvector<size_type>(num_rows, stream, mr);
+
+  // If we don't care about order, just gather indices of distinct keys taken from set.
+  if (keep == duplicate_keep_option::KEEP_ANY) {
+    auto const iter = thrust::counting_iterator<cudf::size_type>{0};
+    set.insert_async(iter, iter + num_rows, stream.value());
+    auto const output_end = set.retrieve_all(output_indices.begin(), stream.value());
+    output_indices.resize(thrust::distance(output_indices.begin(), output_end), stream);
+    return output_indices;
   }
 
-  __device__ void operator()(size_type const idx) const
-  {
-    auto const out_ptr = this->get_output_ptr(idx);
-
-    if (keep == duplicate_keep_option::KEEP_FIRST) {
-      // Store the smallest index of all rows that are equal.
-      atomicMin(out_ptr, idx);
-    } else if (keep == duplicate_keep_option::KEEP_LAST) {
-      // Store the greatest index of all rows that are equal.
-      atomicMax(out_ptr, idx);
-    } else {
-      // Count the number of rows in each group of rows that are compared equal.
-      atomicAdd(out_ptr, size_type{1});
+  auto reduction_results = rmm::device_uvector<size_type>(num_rows, stream, mr);
+  thrust::uninitialized_fill(rmm::exec_policy_nosync(stream),
+                             reduction_results.begin(),
+                             reduction_results.end(),
+                             reduction_init_value(keep));
+
+  auto set_ref = set.ref(cuco::op::insert_and_find);
+
+  thrust::for_each(rmm::exec_policy_nosync(stream),
+                   thrust::make_counting_iterator(0),
+                   thrust::make_counting_iterator(num_rows),
+                   [set_ref, keep, reduction_results = reduction_results.begin()] __device__(
+                     size_type const idx) mutable {
+                     auto const [inserted_idx_ptr, _] = set_ref.insert_and_find(idx);
+
+                     auto ref = cuda::atomic_ref<size_type, cuda::thread_scope_device>{
+                       reduction_results[*inserted_idx_ptr]};
+                     if (keep == duplicate_keep_option::KEEP_FIRST) {
+                       // Store the smallest index of all rows that are equal.
+                       ref.fetch_min(idx, cuda::memory_order_relaxed);
+                     } else if (keep == duplicate_keep_option::KEEP_LAST) {
+                       // Store the greatest index of all rows that are equal.
+                       ref.fetch_max(idx, cuda::memory_order_relaxed);
+                     } else {
+                       // Count the number of rows in each group of rows that are compared equal.
+                       ref.fetch_add(size_type{1}, cuda::memory_order_relaxed);
+                     }
+                   });
+
+  auto const map_end = [&] {
+    if (keep == duplicate_keep_option::KEEP_NONE) {
+      // Reduction results with `KEEP_NONE` are either group sizes of equal rows, or `0`.
+      // Thus, we only output index of the rows in the groups having group size of `1`.
+      return thrust::copy_if(
+        rmm::exec_policy(stream),
+        thrust::make_counting_iterator(0),
+        thrust::make_counting_iterator(num_rows),
+        output_indices.begin(),
+        cuda::proclaim_return_type<bool>(
+          [reduction_results = reduction_results.begin()] __device__(auto const idx) {
+            return reduction_results[idx] == size_type{1};
+          }));
     }
-  }
-};
 
-/**
- * @brief The builder to construct an instance of `reduce_fn` functor base on the given
- * value of the `duplicate_keep_option` member variable.
- */
-struct reduce_func_builder {
-  duplicate_keep_option const keep;
-
-  template <typename MapView, typename KeyHasher, typename KeyEqual>
-  auto build(MapView const& d_map,
-             KeyHasher const& d_hasher,
-             KeyEqual const& d_equal,
-             size_type* const d_output)
-  {
-    return reduce_fn<MapView, KeyHasher, KeyEqual>{d_map, d_hasher, d_equal, keep, d_output};
-  }
-};
+    // Reduction results with `KEEP_FIRST` and `KEEP_LAST` are row indices of the first/last row in
+    // each group of equal rows (which are the desired output indices), or the value given by
+    // `reduction_init_value()`.
+    return thrust::copy_if(
+      rmm::exec_policy(stream),
+      reduction_results.begin(),
+      reduction_results.end(),
+      output_indices.begin(),
+      cuda::proclaim_return_type<bool>([init_value = reduction_init_value(keep)] __device__(
+                                         auto const idx) { return idx != init_value; }));
+  }();
 
-}  // namespace
+  output_indices.resize(thrust::distance(output_indices.begin(), map_end), stream);
+  return output_indices;
+}
 
-// This function is split from `distinct.cu` to improve compile time.
-rmm::device_uvector<size_type> reduce_by_row(
-  hash_map_type const& map,
-  std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> const preprocessed_input,
+template rmm::device_uvector<size_type> reduce_by_row(
+  hash_set_type<cudf::experimental::row::equality::device_row_comparator<
+    false,
+    cudf::nullate::DYNAMIC,
+    cudf::experimental::row::equality::nan_equal_physical_equality_comparator>>& set,
   size_type num_rows,
-  cudf::nullate::DYNAMIC has_nulls,
-  bool has_nested_columns,
   duplicate_keep_option keep,
-  null_equality nulls_equal,
-  nan_equality nans_equal,
   rmm::cuda_stream_view stream,
-  rmm::device_async_resource_ref mr)
-{
-  CUDF_EXPECTS(keep != duplicate_keep_option::KEEP_ANY,
-               "This function should not be called with KEEP_ANY");
-
-  return hash_reduce_by_row(map,
-                            preprocessed_input,
-                            num_rows,
-                            has_nulls,
-                            has_nested_columns,
-                            nulls_equal,
-                            nans_equal,
-                            reduce_func_builder{keep},
-                            reduction_init_value(keep),
-                            stream,
-                            mr);
-}
+  rmm::device_async_resource_ref mr);
+
+template rmm::device_uvector<size_type> reduce_by_row(
+  hash_set_type<cudf::experimental::row::equality::device_row_comparator<
+    true,
+    cudf::nullate::DYNAMIC,
+    cudf::experimental::row::equality::nan_equal_physical_equality_comparator>>& set,
+  size_type num_rows,
+  duplicate_keep_option keep,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);
+
+template rmm::device_uvector<size_type> reduce_by_row(
+  hash_set_type<cudf::experimental::row::equality::device_row_comparator<
+    false,
+    cudf::nullate::DYNAMIC,
+    cudf::experimental::row::equality::physical_equality_comparator>>& set,
+  size_type num_rows,
+  duplicate_keep_option keep,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);
+
+template rmm::device_uvector<size_type> reduce_by_row(
+  hash_set_type<cudf::experimental::row::equality::device_row_comparator<
+    true,
+    cudf::nullate::DYNAMIC,
+    cudf::experimental::row::equality::physical_equality_comparator>>& set,
+  size_type num_rows,
+  duplicate_keep_option keep,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);
 
 }  // namespace cudf::detail
diff --git a/cpp/src/stream_compaction/distinct_helpers.hpp b/cpp/src/stream_compaction/distinct_helpers.hpp
index 40f97e00ce5..fca67c98873 100644
--- a/cpp/src/stream_compaction/distinct_helpers.hpp
+++ b/cpp/src/stream_compaction/distinct_helpers.hpp
@@ -14,8 +14,7 @@
  * limitations under the License.
  */
 
-#include "stream_compaction_common.hpp"
-
+#include <cudf/detail/cuco_helpers.hpp>
 #include <cudf/stream_compaction.hpp>
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/types.hpp>
@@ -24,6 +23,12 @@
 #include <rmm/device_uvector.hpp>
 #include <rmm/resource_ref.hpp>
 
+#include <cuco/static_set.cuh>
+#include <cuda/functional>
+#include <thrust/copy.h>
+#include <thrust/distance.h>
+#include <thrust/iterator/counting_iterator.h>
+
 namespace cudf::detail {
 
 /**
@@ -42,13 +47,28 @@ auto constexpr reduction_init_value(duplicate_keep_option keep)
   }
 }
 
+template <typename RowHasher>
+using hash_set_type =
+  cuco::static_set<size_type,
+                   cuco::extent<int64_t>,
+                   cuda::thread_scope_device,
+                   RowHasher,
+                   cuco::linear_probing<1,
+                                        cudf::experimental::row::hash::device_row_hasher<
+                                          cudf::hashing::detail::default_hash,
+                                          cudf::nullate::DYNAMIC>>,
+                   cudf::detail::cuco_allocator,
+                   cuco::storage<1>>;
+
 /**
- * @brief Perform a reduction on groups of rows that are compared equal.
+ * @brief Perform a reduction on groups of rows that are compared equal and returns output indices
+ * of the occurrences of the distinct elements based on `keep` parameter.
  *
  * This is essentially a reduce-by-key operation with keys are non-contiguous rows and are compared
- * equal. A hash table is used to find groups of equal rows.
+ * equal. A hash set is used to find groups of equal rows.
  *
  * Depending on the `keep` parameter, the reduction operation for each row group is:
+ * - If `keep == KEEP_ANY` : order does not matter.
  * - If `keep == KEEP_FIRST`: min of row indices in the group.
  * - If `keep == KEEP_LAST`: max of row indices in the group.
  * - If `keep == KEEP_NONE`: count of equivalent rows (group size).
@@ -59,30 +79,18 @@ auto constexpr reduction_init_value(duplicate_keep_option keep)
  * the `reduction_init_value()` function. Then, the reduction result for each row group is written
  * into the output array at the index of an unspecified row in the group.
  *
- * @param map The auxiliary map to perform reduction
- * @param preprocessed_input The preprocessed of the input rows for computing row hashing and row
- *        comparisons
+ * @param set The auxiliary set to perform reduction
+ * @param set_size The number of elements in set
  * @param num_rows The number of all input rows
- * @param has_nulls Indicate whether the input rows has any nulls at any nested levels
- * @param has_nested_columns Indicates whether the input table has any nested columns
  * @param keep The parameter to determine what type of reduction to perform
- * @param nulls_equal Flag to specify whether null elements should be considered as equal
- * @param nans_equal Flag to specify whether NaN values in floating point column should be
- *        considered equal.
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned vector
- * @return A device_uvector containing the reduction results
+ * @return A device_uvector containing the output indices
  */
-rmm::device_uvector<size_type> reduce_by_row(
-  hash_map_type const& map,
-  std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> const preprocessed_input,
-  size_type num_rows,
-  cudf::nullate::DYNAMIC has_nulls,
-  bool has_nested_columns,
-  duplicate_keep_option keep,
-  null_equality nulls_equal,
-  nan_equality nans_equal,
-  rmm::cuda_stream_view stream,
-  rmm::device_async_resource_ref mr);
-
+template <typename RowHasher>
+rmm::device_uvector<size_type> reduce_by_row(hash_set_type<RowHasher>& set,
+                                             size_type num_rows,
+                                             duplicate_keep_option keep,
+                                             rmm::cuda_stream_view stream,
+                                             rmm::device_async_resource_ref mr);
 }  // namespace cudf::detail
diff --git a/cpp/src/stream_compaction/stream_compaction_common.cuh b/cpp/src/stream_compaction/stream_compaction_common.cuh
index 839672d6a56..0f9bc18e258 100644
--- a/cpp/src/stream_compaction/stream_compaction_common.cuh
+++ b/cpp/src/stream_compaction/stream_compaction_common.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,9 +15,8 @@
  */
 #pragma once
 
-#include "stream_compaction_common.hpp"
-
 #include <cudf/stream_compaction.hpp>
+#include <cudf/utilities/bit.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
diff --git a/cpp/src/stream_compaction/stream_compaction_common.hpp b/cpp/src/stream_compaction/stream_compaction_common.hpp
deleted file mode 100644
index 13795f49781..00000000000
--- a/cpp/src/stream_compaction/stream_compaction_common.hpp
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include <cudf/detail/cuco_helpers.hpp>
-#include <cudf/hashing/detail/helper_functions.cuh>
-#include <cudf/table/row_operators.cuh>
-#include <cudf/table/table_device_view.cuh>
-
-#include <cuco/static_map.cuh>
-#include <cuda/std/atomic>
-
-#include <limits>
-
-namespace cudf {
-namespace detail {
-
-using hash_map_type = cuco::legacy::
-  static_map<size_type, size_type, cuda::thread_scope_device, cudf::detail::cuco_allocator>;
-
-}  // namespace detail
-}  // namespace cudf
diff --git a/cpp/src/stream_compaction/unique.cu b/cpp/src/stream_compaction/unique.cu
index c1f8b17938c..edb47984d13 100644
--- a/cpp/src/stream_compaction/unique.cu
+++ b/cpp/src/stream_compaction/unique.cu
@@ -15,7 +15,6 @@
  */
 
 #include "stream_compaction_common.cuh"
-#include "stream_compaction_common.hpp"
 
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>

From f267b1f068ec3e8fd49599fc28afa2fc0464118b Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Wed, 26 Jun 2024 19:44:54 -0700
Subject: [PATCH 11/29] Kernel copy for pinned memory (#15934)

Issue https://github.com/rapidsai/cudf/issues/15620

Added an API that enables users to set the threshold under which we perform pinned memory copies using a kernel. The default threshold is zero, so there's no change in default behavior.
The API currently only impacts `hostdevice_vector` H<->D synchronization.

The PR adds wrappers for `cudaMemcpyAsync` so we can implement configurable behavior for pageable copies as well (e.g. copy to pinned + kernel copy).

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/cudf/pull/15934
---
 cpp/CMakeLists.txt                            |  1 +
 .../cudf/detail/utilities/cuda_memcpy.hpp     | 53 ++++++++++++++
 cpp/include/cudf/utilities/pinned_memory.hpp  | 16 +++++
 cpp/src/io/utilities/hostdevice_vector.hpp    | 13 ++--
 cpp/src/utilities/cuda_memcpy.cu              | 71 +++++++++++++++++++
 cpp/src/utilities/pinned_memory.cpp           | 14 ++++
 6 files changed, 160 insertions(+), 8 deletions(-)
 create mode 100644 cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
 create mode 100644 cpp/src/utilities/cuda_memcpy.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 5fd68bfb26c..35cf90411f2 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -662,6 +662,7 @@ add_library(
   src/unary/math_ops.cu
   src/unary/nan_ops.cu
   src/unary/null_ops.cu
+  src/utilities/cuda_memcpy.cu
   src/utilities/default_stream.cpp
   src/utilities/linked_column.cpp
   src/utilities/logger.cpp
diff --git a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
new file mode 100644
index 00000000000..b66c461ab12
--- /dev/null
+++ b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <rmm/cuda_stream_view.hpp>
+
+namespace cudf::detail {
+
+enum class host_memory_kind : uint8_t { PINNED, PAGEABLE };
+
+/**
+ * @brief Asynchronously copies data between the host and device.
+ *
+ * Implementation may use different strategies depending on the size and type of host data.
+ *
+ * @param dst Destination memory address
+ * @param src Source memory address
+ * @param size Number of bytes to copy
+ * @param kind Type of host memory
+ * @param stream CUDA stream used for the copy
+ */
+void cuda_memcpy_async(
+  void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);
+
+/**
+ * @brief Synchronously copies data between the host and device.
+ *
+ * Implementation may use different strategies depending on the size and type of host data.
+ *
+ * @param dst Destination memory address
+ * @param src Source memory address
+ * @param size Number of bytes to copy
+ * @param kind Type of host memory
+ * @param stream CUDA stream used for the copy
+ */
+void cuda_memcpy(
+  void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);
+
+}  // namespace cudf::detail
diff --git a/cpp/include/cudf/utilities/pinned_memory.hpp b/cpp/include/cudf/utilities/pinned_memory.hpp
index b423eab6d38..3e2fa43cb50 100644
--- a/cpp/include/cudf/utilities/pinned_memory.hpp
+++ b/cpp/include/cudf/utilities/pinned_memory.hpp
@@ -55,4 +55,20 @@ struct pinned_mr_options {
  */
 bool config_default_pinned_memory_resource(pinned_mr_options const& opts);
 
+/**
+ * @brief Set the threshold size for using kernels for pinned memory copies.
+ *
+ * @param threshold The threshold size in bytes. If the size of the copy is less than this
+ * threshold, the copy will be done using kernels. If the size is greater than or equal to this
+ * threshold, the copy will be done using cudaMemcpyAsync.
+ */
+void set_kernel_pinned_copy_threshold(size_t threshold);
+
+/**
+ * @brief Get the threshold size for using kernels for pinned memory copies.
+ *
+ * @return The threshold size in bytes.
+ */
+size_t get_kernel_pinned_copy_threshold();
+
 }  // namespace cudf
diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp
index 9acd6a1e3a9..aed745c42dd 100644
--- a/cpp/src/io/utilities/hostdevice_vector.hpp
+++ b/cpp/src/io/utilities/hostdevice_vector.hpp
@@ -18,6 +18,7 @@
 
 #include "hostdevice_span.hpp"
 
+#include <cudf/detail/utilities/cuda_memcpy.hpp>
 #include <cudf/detail/utilities/host_vector.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -124,26 +125,22 @@ class hostdevice_vector {
 
   void host_to_device_async(rmm::cuda_stream_view stream)
   {
-    CUDF_CUDA_TRY(
-      cudaMemcpyAsync(device_ptr(), host_ptr(), size_bytes(), cudaMemcpyDefault, stream.value()));
+    cuda_memcpy_async(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
   }
 
   void host_to_device_sync(rmm::cuda_stream_view stream)
   {
-    host_to_device_async(stream);
-    stream.synchronize();
+    cuda_memcpy(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
   }
 
   void device_to_host_async(rmm::cuda_stream_view stream)
   {
-    CUDF_CUDA_TRY(
-      cudaMemcpyAsync(host_ptr(), device_ptr(), size_bytes(), cudaMemcpyDefault, stream.value()));
+    cuda_memcpy_async(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
   }
 
   void device_to_host_sync(rmm::cuda_stream_view stream)
   {
-    device_to_host_async(stream);
-    stream.synchronize();
+    cuda_memcpy(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
   }
 
   /**
diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu
new file mode 100644
index 00000000000..3d0822d8545
--- /dev/null
+++ b/cpp/src/utilities/cuda_memcpy.cu
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/detail/utilities/cuda_memcpy.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/pinned_memory.hpp>
+
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/copy.h>
+
+namespace cudf::detail {
+
+namespace {
+
+void copy_pinned(void* dst, void const* src, std::size_t size, rmm::cuda_stream_view stream)
+{
+  if (size == 0) return;
+
+  if (size < get_kernel_pinned_copy_threshold()) {
+    thrust::copy_n(rmm::exec_policy_nosync(stream),
+                   static_cast<const char*>(src),
+                   size,
+                   static_cast<char*>(dst));
+  } else {
+    CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, stream));
+  }
+}
+
+void copy_pageable(void* dst, void const* src, std::size_t size, rmm::cuda_stream_view stream)
+{
+  if (size == 0) return;
+
+  CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, stream));
+}
+
+};  // namespace
+
+void cuda_memcpy_async(
+  void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream)
+{
+  if (kind == host_memory_kind::PINNED) {
+    copy_pinned(dst, src, size, stream);
+  } else if (kind == host_memory_kind::PAGEABLE) {
+    copy_pageable(dst, src, size, stream);
+  } else {
+    CUDF_FAIL("Unsupported host memory kind");
+  }
+}
+
+void cuda_memcpy(
+  void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream)
+{
+  cuda_memcpy_async(dst, src, size, kind, stream);
+  stream.synchronize();
+}
+
+}  // namespace cudf::detail
diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/pinned_memory.cpp
index e90b7969b4d..3ea4293fc60 100644
--- a/cpp/src/utilities/pinned_memory.cpp
+++ b/cpp/src/utilities/pinned_memory.cpp
@@ -211,4 +211,18 @@ bool config_default_pinned_memory_resource(pinned_mr_options const& opts)
   return did_configure;
 }
 
+CUDF_EXPORT auto& kernel_pinned_copy_threshold()
+{
+  // use cudaMemcpyAsync for all pinned copies
+  static std::atomic<size_t> threshold = 0;
+  return threshold;
+}
+
+void set_kernel_pinned_copy_threshold(size_t threshold)
+{
+  kernel_pinned_copy_threshold() = threshold;
+}
+
+size_t get_kernel_pinned_copy_threshold() { return kernel_pinned_copy_threshold(); }
+
 }  // namespace cudf

From e98d456a77621591aa6f9a3d63c191a29cf1689b Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Thu, 27 Jun 2024 02:14:08 -0700
Subject: [PATCH 12/29] Fix pylibcudf Table.num_rows for 0 columns case and add
 interop to docs (#16108)

There was a bug where Table.num_rows raised when we had 0 columns instead of returning 0.
I also added interop to the docs since that was missing.

Authors:
  - Thomas Li (https://github.com/lithomas1)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/16108
---
 .../user_guide/api_docs/pylibcudf/index.rst   |  1 +
 .../user_guide/api_docs/pylibcudf/interop.rst |  6 +++++
 python/cudf/cudf/_lib/pylibcudf/table.pyx     |  2 ++
 .../cudf/cudf/pylibcudf_tests/test_table.py   | 22 +++++++++++++++++++
 4 files changed, 31 insertions(+)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/interop.rst
 create mode 100644 python/cudf/cudf/pylibcudf_tests/test_table.py

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
index f98298ff052..e9dad705cbf 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
@@ -19,6 +19,7 @@ This page provides API documentation for pylibcudf.
     gpumemoryview
     groupby
     io/index.rst
+    interop
     join
     lists
     merge
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/interop.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/interop.rst
new file mode 100644
index 00000000000..881ab8d7be4
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/interop.rst
@@ -0,0 +1,6 @@
+=======
+interop
+=======
+
+.. automodule:: cudf._lib.pylibcudf.interop
+   :members:
diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pyx b/python/cudf/cudf/_lib/pylibcudf/table.pyx
index d93ac78721b..d91fa0474b0 100644
--- a/python/cudf/cudf/_lib/pylibcudf/table.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/table.pyx
@@ -83,6 +83,8 @@ cdef class Table:
 
     cpdef int num_rows(self):
         """The number of rows in this table."""
+        if self.num_columns() == 0:
+            return 0
         return self._columns[0].size()
 
     cpdef list columns(self):
diff --git a/python/cudf/cudf/pylibcudf_tests/test_table.py b/python/cudf/cudf/pylibcudf_tests/test_table.py
new file mode 100644
index 00000000000..cf1d51f6491
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/test_table.py
@@ -0,0 +1,22 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pytest
+
+import cudf._lib.pylibcudf as plc
+
+
+@pytest.mark.parametrize(
+    "arrow_tbl",
+    [
+        pa.table([]),
+        pa.table({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}),
+        pa.table({"a": [1, 2, 3]}),
+        pa.table({"a": [1], "b": [2], "c": [3]}),
+    ],
+)
+def test_table_shape(arrow_tbl):
+    plc_tbl = plc.interop.from_arrow(arrow_tbl)
+
+    plc_tbl_shape = (plc_tbl.num_rows(), plc_tbl.num_columns())
+    assert plc_tbl_shape == arrow_tbl.shape

From fa8284ddb2de808573d5b21cc9e650578ddf6acc Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Thu, 27 Jun 2024 11:51:41 +0100
Subject: [PATCH 13/29] Adapt to polars upstream changes and turn on CI testing
 (#16081)

They changed the semantics of join keys when those keys are expressions to more closely match SQL.

Dtype inference is also tighter, so update tests to adapt to those changes, and some other small deprecation warnings.

Finish the final missing coverage piece and turn on testing in CI (failing if we don't hit 100% coverage as well).

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Thomas Li (https://github.com/lithomas1)
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cudf/pull/16081
---
 .github/workflows/pr.yaml                     | 12 +++
 ci/test_cudf_polars.sh                        | 68 +++++++++++++++++
 python/cudf_polars/cudf_polars/dsl/ir.py      | 70 +++++++++---------
 .../cudf_polars/typing/__init__.py            | 74 ++++++++++---------
 .../cudf_polars/tests/expressions/test_agg.py |  4 +-
 .../tests/expressions/test_booleanfunction.py | 12 ++-
 .../tests/expressions/test_rolling.py         | 12 ++-
 .../tests/expressions/test_stringfunction.py  | 16 ++--
 python/cudf_polars/tests/test_groupby.py      | 11 +--
 python/cudf_polars/tests/test_join.py         | 16 +++-
 python/cudf_polars/tests/test_mapfunction.py  | 32 ++++++--
 python/cudf_polars/tests/test_python_scan.py  |  7 +-
 python/cudf_polars/tests/test_union.py        | 12 +--
 13 files changed, 234 insertions(+), 112 deletions(-)
 create mode 100755 ci/test_cudf_polars.sh

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index cb582df21e0..a35802f2ab0 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -25,6 +25,7 @@ jobs:
       - docs-build
       - wheel-build-cudf
       - wheel-tests-cudf
+      - test-cudf-polars
       - wheel-build-dask-cudf
       - wheel-tests-dask-cudf
       - devcontainer
@@ -132,6 +133,17 @@ jobs:
     with:
       build_type: pull-request
       script: ci/test_wheel_cudf.sh
+  test-cudf-polars:
+    needs: wheel-build-cudf
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
+    with:
+      # This selects "ARCH=amd64 + the latest supported Python + CUDA".
+      matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
+      build_type: pull-request
+      # This always runs, but only fails if this PR touches code in
+      # pylibcudf or cudf_polars
+      script: "ci/test_cudf_polars.sh"
   wheel-build-dask-cudf:
     needs: wheel-build-cudf
     secrets: inherit
diff --git a/ci/test_cudf_polars.sh b/ci/test_cudf_polars.sh
new file mode 100755
index 00000000000..669e049ab26
--- /dev/null
+++ b/ci/test_cudf_polars.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+set -eou pipefail
+
+# We will only fail these tests if the PR touches code in pylibcudf
+# or cudf_polars itself.
+# Note, the three dots mean we are doing diff between the merge-base
+# of upstream and HEAD. So this is asking, "does _this branch_ touch
+# files in cudf_polars/pylibcudf", rather than "are there changes
+# between upstream and this branch which touch cudf_polars/pylibcudf"
+# TODO: is the target branch exposed anywhere in an environment variable?
+if [ -n "$(git diff --name-only origin/branch-24.08...HEAD -- python/cudf_polars/ python/cudf/cudf/_lib/pylibcudf/)" ];
+then
+    HAS_CHANGES=1
+else
+    HAS_CHANGES=0
+fi
+
+RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
+RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
+
+RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"}
+RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/
+mkdir -p "${RAPIDS_TESTS_DIR}"
+
+rapids-logger "Install cudf wheel"
+# echo to expand wildcard before adding `[extra]` requires for pip
+python -m pip install $(echo ./dist/cudf*.whl)[test]
+
+rapids-logger "Install polars (allow pre-release versions)"
+python -m pip install 'polars>=1.0.0a0'
+
+rapids-logger "Install cudf_polars"
+python -m pip install --no-deps python/cudf_polars
+
+rapids-logger "Run cudf_polars tests"
+
+function set_exitcode()
+{
+    EXITCODE=$?
+}
+EXITCODE=0
+trap set_exitcode ERR
+set +e
+
+python -m pytest \
+       --cache-clear \
+       --cov cudf_polars \
+       --cov-fail-under=100 \
+       --cov-config=python/cudf_polars/pyproject.toml \
+       --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf_polars.xml" \
+       python/cudf_polars/tests
+
+trap ERR
+set -e
+
+if [ ${EXITCODE} != 0 ]; then
+    rapids-logger "Testing FAILED: exitcode ${EXITCODE}"
+else
+    rapids-logger "Testing PASSED"
+fi
+
+if [ ${HAS_CHANGES} == 1 ]; then
+    exit ${EXITCODE}
+else
+    exit 0
+fi
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index 4ad6e75fb2e..3f5f3c74050 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -123,7 +123,7 @@ def broadcast(
     ]
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class IR:
     """Abstract plan node, representing an unevaluated dataframe."""
 
@@ -157,7 +157,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         )  # pragma: no cover
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class PythonScan(IR):
     """Representation of input from a python function."""
 
@@ -171,7 +171,7 @@ def __post_init__(self):
         raise NotImplementedError("PythonScan not implemented")
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class Scan(IR):
     """Input from files."""
 
@@ -248,7 +248,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
             return df.filter(mask)
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class Cache(IR):
     """
     Return a cached plan node.
@@ -269,7 +269,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
             return cache.setdefault(self.key, self.value.evaluate(cache=cache))
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class DataFrameScan(IR):
     """
     Input from an existing polars DataFrame.
@@ -315,7 +315,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
             return df
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class Select(IR):
     """Produce a new dataframe selecting given expressions from an input."""
 
@@ -336,7 +336,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         return DataFrame(columns)
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class Reduce(IR):
     """
     Produce a new dataframe selecting given expressions from an input.
@@ -389,7 +389,7 @@ def placeholder_column(n: int) -> plc.Column:
     )
 
 
-@dataclasses.dataclass(slots=False)
+@dataclasses.dataclass
 class GroupBy(IR):
     """Perform a groupby."""
 
@@ -490,7 +490,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         return DataFrame([*result_keys, *results]).slice(self.options.slice)
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class Join(IR):
     """A join of two dataframes."""
 
@@ -518,8 +518,16 @@ class Join(IR):
     - coalesce: should key columns be coalesced (only makes sense for outer joins)
     """
 
-    @cache
+    def __post_init__(self) -> None:
+        """Validate preconditions."""
+        if any(
+            isinstance(e.value, expr.Literal)
+            for e in itertools.chain(self.left_on, self.right_on)
+        ):
+            raise NotImplementedError("Join with literal as join key.")
+
     @staticmethod
+    @cache
     def _joiners(
         how: Literal["inner", "left", "full", "leftsemi", "leftanti"],
     ) -> tuple[
@@ -582,17 +590,9 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
                 for new, old in zip(columns[left.num_columns :], right.columns)
             ]
             return DataFrame([*left_cols, *right_cols])
-        left_on = DataFrame(
-            broadcast(
-                *(e.evaluate(left) for e in self.left_on), target_length=left.num_rows
-            )
-        )
-        right_on = DataFrame(
-            broadcast(
-                *(e.evaluate(right) for e in self.right_on),
-                target_length=right.num_rows,
-            )
-        )
+        # TODO: Waiting on clarity based on https://github.com/pola-rs/polars/issues/17184
+        left_on = DataFrame(broadcast(*(e.evaluate(left) for e in self.left_on)))
+        right_on = DataFrame(broadcast(*(e.evaluate(right) for e in self.right_on)))
         null_equality = (
             plc.types.NullEquality.EQUAL
             if join_nulls
@@ -602,13 +602,10 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         if right_policy is None:
             # Semi join
             lg = join_fn(left_on.table, right_on.table, null_equality)
-            left = left.replace_columns(*left_on.columns)
             table = plc.copying.gather(left.table, lg, left_policy)
             result = DataFrame.from_table(table, left.column_names)
         else:
             lg, rg = join_fn(left_on.table, right_on.table, null_equality)
-            left = left.replace_columns(*left_on.columns)
-            right = right.replace_columns(*right_on.columns)
             if coalesce and how == "inner":
                 right = right.discard_columns(right_on.column_names_set)
             left = DataFrame.from_table(
@@ -642,7 +639,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         return result.slice(zlice)
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class HStack(IR):
     """Add new columns to a dataframe."""
 
@@ -671,7 +668,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         return df.with_columns(columns)
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class Distinct(IR):
     """Produce a new dataframe with distinct rows."""
 
@@ -741,7 +738,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         return result.slice(self.zlice)
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class Sort(IR):
     """Sort a dataframe."""
 
@@ -810,7 +807,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         return DataFrame(columns).slice(self.zlice)
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class Slice(IR):
     """Slice a dataframe."""
 
@@ -827,7 +824,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         return df.slice((self.offset, self.length))
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class Filter(IR):
     """Filter a dataframe with a boolean mask."""
 
@@ -843,7 +840,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         return df.filter(mask)
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class Projection(IR):
     """Select a subset of columns from a dataframe."""
 
@@ -860,7 +857,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         return DataFrame(columns)
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class MapFunction(IR):
     """Apply some function to a dataframe."""
 
@@ -894,6 +891,13 @@ def __post_init__(self) -> None:
                 # polars requires that all to-explode columns have the
                 # same sub-shapes
                 raise NotImplementedError("Explode with more than one column")
+        elif self.name == "rename":
+            old, new, _ = self.options
+            # TODO: perhaps polars should validate renaming in the IR?
+            if len(new) != len(set(new)) or (
+                set(new) & (set(self.df.schema.keys() - set(old)))
+            ):
+                raise NotImplementedError("Duplicate new names in rename.")
 
     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
@@ -919,7 +923,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
             raise AssertionError("Should never be reached")  # pragma: no cover
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class Union(IR):
     """Concatenate dataframes vertically."""
 
@@ -943,7 +947,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         ).slice(self.zlice)
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class HConcat(IR):
     """Concatenate dataframes horizontally."""
 
diff --git a/python/cudf_polars/cudf_polars/typing/__init__.py b/python/cudf_polars/cudf_polars/typing/__init__.py
index 6d597a91724..c04eac41bb7 100644
--- a/python/cudf_polars/cudf_polars/typing/__init__.py
+++ b/python/cudf_polars/cudf_polars/typing/__init__.py
@@ -6,7 +6,7 @@
 from __future__ import annotations
 
 from collections.abc import Mapping
-from typing import TYPE_CHECKING, Literal, Protocol, TypeAlias
+from typing import TYPE_CHECKING, Literal, Protocol, Union
 
 from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir
 
@@ -15,43 +15,45 @@
 if TYPE_CHECKING:
     from typing import Callable
 
+    from typing_extensions import TypeAlias
+
     import polars as pl
 
-IR: TypeAlias = (
-    pl_ir.PythonScan
-    | pl_ir.Scan
-    | pl_ir.Cache
-    | pl_ir.DataFrameScan
-    | pl_ir.Select
-    | pl_ir.GroupBy
-    | pl_ir.Join
-    | pl_ir.HStack
-    | pl_ir.Distinct
-    | pl_ir.Sort
-    | pl_ir.Slice
-    | pl_ir.Filter
-    | pl_ir.SimpleProjection
-    | pl_ir.MapFunction
-    | pl_ir.Union
-    | pl_ir.HConcat
-    | pl_ir.ExtContext
-)
-
-Expr: TypeAlias = (
-    pl_expr.Function
-    | pl_expr.Window
-    | pl_expr.Literal
-    | pl_expr.Sort
-    | pl_expr.SortBy
-    | pl_expr.Gather
-    | pl_expr.Filter
-    | pl_expr.Cast
-    | pl_expr.Column
-    | pl_expr.Agg
-    | pl_expr.BinaryExpr
-    | pl_expr.Len
-    | pl_expr.PyExprIR
-)
+IR: TypeAlias = Union[
+    pl_ir.PythonScan,
+    pl_ir.Scan,
+    pl_ir.Cache,
+    pl_ir.DataFrameScan,
+    pl_ir.Select,
+    pl_ir.GroupBy,
+    pl_ir.Join,
+    pl_ir.HStack,
+    pl_ir.Distinct,
+    pl_ir.Sort,
+    pl_ir.Slice,
+    pl_ir.Filter,
+    pl_ir.SimpleProjection,
+    pl_ir.MapFunction,
+    pl_ir.Union,
+    pl_ir.HConcat,
+    pl_ir.ExtContext,
+]
+
+Expr: TypeAlias = Union[
+    pl_expr.Function,
+    pl_expr.Window,
+    pl_expr.Literal,
+    pl_expr.Sort,
+    pl_expr.SortBy,
+    pl_expr.Gather,
+    pl_expr.Filter,
+    pl_expr.Cast,
+    pl_expr.Column,
+    pl_expr.Agg,
+    pl_expr.BinaryExpr,
+    pl_expr.Len,
+    pl_expr.PyExprIR,
+]
 
 Schema: TypeAlias = Mapping[str, plc.DataType]
 
diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py
index 2ffa1c4af6d..267d0a99692 100644
--- a/python/cudf_polars/tests/expressions/test_agg.py
+++ b/python/cudf_polars/tests/expressions/test_agg.py
@@ -52,7 +52,7 @@ def test_agg(df, agg):
 
     # https://github.com/rapidsai/cudf/issues/15852
     check_dtypes = agg not in {"n_unique", "median"}
-    if not check_dtypes and q.schema["a"] != pl.Float64:
+    if not check_dtypes and q.collect_schema()["a"] != pl.Float64:
         with pytest.raises(AssertionError):
             assert_gpu_result_equal(q)
     assert_gpu_result_equal(q, check_dtypes=check_dtypes, check_exact=False)
@@ -65,7 +65,7 @@ def test_agg(df, agg):
 )
 @pytest.mark.parametrize("op", ["min", "max"])
 def test_agg_float_with_nans(propagate_nans, op):
-    df = pl.LazyFrame({"a": [1, 2, float("nan")]})
+    df = pl.LazyFrame({"a": pl.Series([1, 2, float("nan")], dtype=pl.Float64())})
     op = getattr(pl.Expr, f"nan_{op}" if propagate_nans else op)
     q = df.select(op(pl.col("a")))
 
diff --git a/python/cudf_polars/tests/expressions/test_booleanfunction.py b/python/cudf_polars/tests/expressions/test_booleanfunction.py
index 951b749e670..a52fba26528 100644
--- a/python/cudf_polars/tests/expressions/test_booleanfunction.py
+++ b/python/cudf_polars/tests/expressions/test_booleanfunction.py
@@ -26,7 +26,7 @@ def has_nulls(request):
 def test_booleanfunction_reduction(ignore_nulls):
     ldf = pl.LazyFrame(
         {
-            "a": [1, 2, 3.0, 2, 5],
+            "a": pl.Series([1, 2, 3.0, 2, 5], dtype=pl.Float64()),
             "b": [0, 3, 1, -1, None],
             "c": [1, 6, 5, 3, 2],
         }
@@ -82,7 +82,9 @@ def test_boolean_function_unary(request, expr, has_nans, has_nulls):
     ],
 )
 def test_unsupported_boolean_function(expr):
-    df = pl.LazyFrame({"a": [1, float("nan"), 2, 4], "b": [1, 2, 3, 4]})
+    df = pl.LazyFrame(
+        {"a": pl.Series([1, float("nan"), 2, 4], dtype=pl.Float64()), "b": [1, 2, 3, 4]}
+    )
 
     q = df.select(expr)
 
@@ -95,7 +97,11 @@ def test_unsupported_boolean_function(expr):
 )
 def test_boolean_isbetween(closed, bounds):
     df = pl.LazyFrame(
-        {"a": [1, float("nan"), 2, 4], "lo": [1, 2, 2, 3], "hi": [10, 4, 2, 4]}
+        {
+            "a": pl.Series([1, float("nan"), 2, 4], dtype=pl.Float32()),
+            "lo": [1, 2, 2, 3],
+            "hi": [10, 4, 2, 4],
+        }
     )
 
     q = df.select(pl.col("a").is_between(*bounds, closed=closed))
diff --git a/python/cudf_polars/tests/expressions/test_rolling.py b/python/cudf_polars/tests/expressions/test_rolling.py
index d4920d35f14..992efe0ba79 100644
--- a/python/cudf_polars/tests/expressions/test_rolling.py
+++ b/python/cudf_polars/tests/expressions/test_rolling.py
@@ -3,11 +3,9 @@
 
 from __future__ import annotations
 
-import pytest
-
 import polars as pl
 
-from cudf_polars import translate_ir
+from cudf_polars.testing.asserts import assert_ir_translation_raises
 
 
 def test_rolling():
@@ -29,13 +27,13 @@ def test_rolling():
         min_a=pl.min("a").rolling(index_column="dt", period="2d"),
         max_a=pl.max("a").rolling(index_column="dt", period="2d"),
     )
-    with pytest.raises(NotImplementedError):
-        _ = translate_ir(q._ldf.visit())
+
+    assert_ir_translation_raises(q, NotImplementedError)
 
 
 def test_grouped_rolling():
     df = pl.LazyFrame({"a": [1, 2, 3, 4, 5, 6], "b": [1, 2, 1, 3, 1, 2]})
 
     q = df.select(pl.col("a").min().over("b"))
-    with pytest.raises(NotImplementedError):
-        _ = translate_ir(q._ldf.visit())
+
+    assert_ir_translation_raises(q, NotImplementedError)
diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py
index 3c498fe7286..9729e765948 100644
--- a/python/cudf_polars/tests/expressions/test_stringfunction.py
+++ b/python/cudf_polars/tests/expressions/test_stringfunction.py
@@ -8,8 +8,11 @@
 
 import polars as pl
 
-from cudf_polars import execute_with_cudf, translate_ir
-from cudf_polars.testing.asserts import assert_gpu_result_equal
+from cudf_polars import execute_with_cudf
+from cudf_polars.testing.asserts import (
+    assert_gpu_result_equal,
+    assert_ir_translation_raises,
+)
 
 
 @pytest.fixture
@@ -47,22 +50,19 @@ def test_supported_stringfunction_expression(ldf):
 def test_unsupported_stringfunction(ldf):
     q = ldf.select(pl.col("a").str.count_matches("e", literal=True))
 
-    with pytest.raises(NotImplementedError):
-        _ = translate_ir(q._ldf.visit())
+    assert_ir_translation_raises(q, NotImplementedError)
 
 
 def test_contains_re_non_strict_raises(ldf):
     q = ldf.select(pl.col("a").str.contains(".", strict=False))
 
-    with pytest.raises(NotImplementedError):
-        _ = translate_ir(q._ldf.visit())
+    assert_ir_translation_raises(q, NotImplementedError)
 
 
 def test_contains_re_non_literal_raises(ldf):
     q = ldf.select(pl.col("a").str.contains(pl.col("b"), literal=False))
 
-    with pytest.raises(NotImplementedError):
-        _ = translate_ir(q._ldf.visit())
+    assert_ir_translation_raises(q, NotImplementedError)
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py
index e70f923b097..aefad59eb91 100644
--- a/python/cudf_polars/tests/test_groupby.py
+++ b/python/cudf_polars/tests/test_groupby.py
@@ -6,8 +6,10 @@
 
 import polars as pl
 
-from cudf_polars import translate_ir
-from cudf_polars.testing.asserts import assert_gpu_result_equal
+from cudf_polars.testing.asserts import (
+    assert_gpu_result_equal,
+    assert_ir_translation_raises,
+)
 
 
 @pytest.fixture
@@ -72,7 +74,7 @@ def test_groupby(df: pl.LazyFrame, maintain_order, keys, exprs):
     q = df.group_by(*keys, maintain_order=maintain_order).agg(*exprs)
 
     if not maintain_order:
-        sort_keys = list(q.schema.keys())[: len(keys)]
+        sort_keys = list(q.collect_schema().keys())[: len(keys)]
         q = q.sort(*sort_keys)
 
     assert_gpu_result_equal(q, check_exact=False)
@@ -97,5 +99,4 @@ def test_groupby_len(df, keys):
 def test_groupby_unsupported(df, expr):
     q = df.group_by("key1").agg(expr)
 
-    with pytest.raises(NotImplementedError):
-        _ = translate_ir(q._ldf.visit())
+    assert_ir_translation_raises(q, NotImplementedError)
diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py
index 81166b0b2f6..89f6fd3455b 100644
--- a/python/cudf_polars/tests/test_join.py
+++ b/python/cudf_polars/tests/test_join.py
@@ -6,7 +6,10 @@
 
 import polars as pl
 
-from cudf_polars.testing.asserts import assert_gpu_result_equal
+from cudf_polars.testing.asserts import (
+    assert_gpu_result_equal,
+    assert_ir_translation_raises,
+)
 
 
 @pytest.mark.parametrize(
@@ -71,3 +74,14 @@ def test_cross_join():
     q = left.join(right, how="cross")
 
     assert_gpu_result_equal(q)
+
+
+@pytest.mark.parametrize(
+    "left_on,right_on", [(pl.col("a"), pl.lit(2)), (pl.lit(2), pl.col("a"))]
+)
+def test_join_literal_key_unsupported(left_on, right_on):
+    left = pl.LazyFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
+    right = pl.LazyFrame({"a": [1, 2, 3], "b": [5, 6, 7]})
+    q = left.join(right, left_on=left_on, right_on=right_on, how="inner")
+
+    assert_ir_translation_raises(q, NotImplementedError)
diff --git a/python/cudf_polars/tests/test_mapfunction.py b/python/cudf_polars/tests/test_mapfunction.py
index ec6b3f3fc0a..77032108e6f 100644
--- a/python/cudf_polars/tests/test_mapfunction.py
+++ b/python/cudf_polars/tests/test_mapfunction.py
@@ -6,8 +6,10 @@
 
 import polars as pl
 
-from cudf_polars import translate_ir
-from cudf_polars.testing.asserts import assert_gpu_result_equal
+from cudf_polars.testing.asserts import (
+    assert_gpu_result_equal,
+    assert_ir_translation_raises,
+)
 
 
 def test_merge_sorted_raises():
@@ -17,16 +19,14 @@ def test_merge_sorted_raises():
 
     q = df1.merge_sorted(df2, key="a").merge_sorted(df3, key="a")
 
-    with pytest.raises(NotImplementedError):
-        _ = translate_ir(q._ldf.visit())
+    assert_ir_translation_raises(q, NotImplementedError)
 
 
 def test_explode_multiple_raises():
     df = pl.LazyFrame({"a": [[1, 2], [3, 4]], "b": [[5, 6], [7, 8]]})
     q = df.explode("a", "b")
 
-    with pytest.raises(NotImplementedError):
-        _ = translate_ir(q._ldf.visit())
+    assert_ir_translation_raises(q, NotImplementedError)
 
 
 @pytest.mark.parametrize("column", ["a", "b"])
@@ -41,3 +41,23 @@ def test_explode_single(column):
     q = df.explode(column)
 
     assert_gpu_result_equal(q)
+
+
+@pytest.mark.parametrize("mapping", [{"b": "a"}, {"a": "c", "b": "c"}])
+def test_rename_duplicate_raises(mapping):
+    df = pl.LazyFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
+
+    q = df.rename(mapping)
+
+    assert_ir_translation_raises(q, NotImplementedError)
+
+
+@pytest.mark.parametrize(
+    "mapping", [{}, {"b": "c"}, {"b": "a", "a": "b"}, {"a": "c", "b": "d"}]
+)
+def test_rename_columns(mapping):
+    df = pl.LazyFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
+
+    q = df.rename(mapping)
+
+    assert_gpu_result_equal(q)
diff --git a/python/cudf_polars/tests/test_python_scan.py b/python/cudf_polars/tests/test_python_scan.py
index c03474e3dc8..fd8453b77c4 100644
--- a/python/cudf_polars/tests/test_python_scan.py
+++ b/python/cudf_polars/tests/test_python_scan.py
@@ -2,11 +2,9 @@
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations
 
-import pytest
-
 import polars as pl
 
-from cudf_polars import translate_ir
+from cudf_polars.testing.asserts import assert_ir_translation_raises
 
 
 def test_python_scan():
@@ -14,7 +12,6 @@ def source(with_columns, predicate, nrows):
         return pl.DataFrame({"a": pl.Series([1, 2, 3], dtype=pl.Int8())})
 
     q = pl.LazyFrame._scan_python_function({"a": pl.Int8}, source, pyarrow=False)
-    with pytest.raises(NotImplementedError):
-        _ = translate_ir(q._ldf.visit())
+    assert_ir_translation_raises(q, NotImplementedError)
 
     assert q.collect().equals(source(None, None, None))
diff --git a/python/cudf_polars/tests/test_union.py b/python/cudf_polars/tests/test_union.py
index 6c9122bc260..b021d832910 100644
--- a/python/cudf_polars/tests/test_union.py
+++ b/python/cudf_polars/tests/test_union.py
@@ -2,12 +2,12 @@
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations
 
-import pytest
-
 import polars as pl
 
-from cudf_polars import translate_ir
-from cudf_polars.testing.asserts import assert_gpu_result_equal
+from cudf_polars.testing.asserts import (
+    assert_gpu_result_equal,
+    assert_ir_translation_raises,
+)
 
 
 def test_union():
@@ -31,8 +31,8 @@ def test_union_schema_mismatch_raises():
     ).lazy()
     ldf2 = ldf.select(pl.col("a").cast(pl.Float32))
     query = pl.concat([ldf, ldf2], how="diagonal")
-    with pytest.raises(NotImplementedError):
-        _ = translate_ir(query._ldf.visit())
+
+    assert_ir_translation_raises(query, NotImplementedError)
 
 
 def test_concat_vertical():

From 5d49fe6a7fae839b2be16ae8cd6899d287855359 Mon Sep 17 00:00:00 2001
From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com>
Date: Thu, 27 Jun 2024 10:44:21 -0500
Subject: [PATCH 14/29] Fix unnecessarily strict check in parquet chunked
 reader for choosing split locations. (#16099)

This is a fix that somehow didn't make it into the initial wave of bug fixes for the parquet chunked reader earlier this year.

The code that determines where to do splits needs to be sure it always chooses a location such that the pages that are selected always enclose at least one full row for a list column.  This means that you need to see at least 1 full row (2 row boundaries) in the group of pages.  The weaklogic was only checking if you had 1 full row within the very last page in the selection, which is unnecessarily strict.  We actually ran into some data out in the wild where this was hit.

This PR changes the logic to include all pages within the chunk when doing the check instead of just the last one.

Authors:
  - https://github.com/nvdbaranec
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/16099
---
 cpp/src/io/parquet/reader_impl_chunking.cu   | 13 ++++++++-----
 cpp/src/io/parquet/reader_impl_preprocess.cu |  3 ++-
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu
index 9ad5a2d6e8d..d371ef5de93 100644
--- a/cpp/src/io/parquet/reader_impl_chunking.cu
+++ b/cpp/src/io/parquet/reader_impl_chunking.cu
@@ -337,7 +337,8 @@ int64_t find_next_split(int64_t cur_pos,
                         size_t cur_row_index,
                         size_t cur_cumulative_size,
                         cudf::host_span<cumulative_page_info const> sizes,
-                        size_t size_limit)
+                        size_t size_limit,
+                        size_t min_row_count)
 {
   auto const start = thrust::make_transform_iterator(
     sizes.begin(),
@@ -357,7 +358,7 @@ int64_t find_next_split(int64_t cur_pos,
   // this guarantees that even if we cannot fit the set of rows represented by our where our cur_pos
   // is, we will still move forward instead of failing.
   while (split_pos < (static_cast<int64_t>(sizes.size()) - 1) &&
-         (sizes[split_pos].end_row_index == cur_row_index)) {
+         (sizes[split_pos].end_row_index - cur_row_index < min_row_count)) {
     split_pos++;
   }
 
@@ -657,8 +658,10 @@ std::tuple<rmm::device_uvector<page_span>, size_t, size_t> compute_next_subpass(
   auto const start_index = find_start_index(h_aggregated_info, start_row);
   auto const cumulative_size =
     start_row == 0 || start_index == 0 ? 0 : h_aggregated_info[start_index - 1].size_bytes;
+  // when choosing subpasses, we need to guarantee at least 2 rows in the included pages so that all
+  // list columns have a clear start and end.
   auto const end_index =
-    find_next_split(start_index, start_row, cumulative_size, h_aggregated_info, size_limit);
+    find_next_split(start_index, start_row, cumulative_size, h_aggregated_info, size_limit, 2);
   auto const end_row = h_aggregated_info[end_index].end_row_index;
 
   // for each column, collect the set of pages that spans start_row / end_row
@@ -703,8 +706,8 @@ std::vector<row_range> compute_page_splits_by_row(device_span<cumulative_page_in
   size_t cur_cumulative_size = 0;
   auto const max_row         = min(skip_rows + num_rows, h_aggregated_info.back().end_row_index);
   while (cur_row_index < max_row) {
-    auto const split_pos =
-      find_next_split(cur_pos, cur_row_index, cur_cumulative_size, h_aggregated_info, size_limit);
+    auto const split_pos = find_next_split(
+      cur_pos, cur_row_index, cur_cumulative_size, h_aggregated_info, size_limit, 1);
 
     auto const start_row = cur_row_index;
     cur_row_index        = min(max_row, h_aggregated_info[split_pos].end_row_index);
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index 9df5c362cdd..f28a7311ccb 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -1436,7 +1436,8 @@ void reader::impl::preprocess_subpass_pages(read_mode mode, size_t chunk_read_li
     // subpass since we know that will safely completed.
     bool const is_list = chunk.max_level[level_type::REPETITION] > 0;
     if (is_list && max_col_row < last_pass_row) {
-      size_t const min_col_row = static_cast<size_t>(chunk.start_row + last_page.chunk_row);
+      auto const& first_page   = subpass.pages[page_index];
+      size_t const min_col_row = static_cast<size_t>(chunk.start_row + first_page.chunk_row);
       CUDF_EXPECTS((max_col_row - min_col_row) > 1, "Unexpected short subpass");
       max_col_row--;
     }

From a71c249f9f320ecb61aa8135bbda300122e43491 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Thu, 27 Jun 2024 14:29:31 -0500
Subject: [PATCH 15/29] Fix dtype errors in `StringArrays` (#16111)

This PR adds proxy classes for `ArrowStringArray` and `ArrowStringArrayNumpySemantics` that will increase the pandas test pass rate by 1%.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/16111
---
 python/cudf/cudf/pandas/_wrappers/pandas.py   | 16 +++++++++++++
 .../cudf/pandas/scripts/run-pandas-tests.sh   |  3 ++-
 .../cudf_pandas_tests/test_cudf_pandas.py     | 23 +++++++++++++++++++
 3 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py
index 0ba432d6d0e..a64bf7772fe 100644
--- a/python/cudf/cudf/pandas/_wrappers/pandas.py
+++ b/python/cudf/cudf/pandas/_wrappers/pandas.py
@@ -522,6 +522,22 @@ def Index__new__(cls, *args, **kwargs):
     },
 )
 
+ArrowStringArrayNumpySemantics = make_final_proxy_type(
+    "ArrowStringArrayNumpySemantics",
+    _Unusable,
+    pd.core.arrays.string_arrow.ArrowStringArrayNumpySemantics,
+    fast_to_slow=_Unusable(),
+    slow_to_fast=_Unusable(),
+)
+
+ArrowStringArray = make_final_proxy_type(
+    "ArrowStringArray",
+    _Unusable,
+    pd.core.arrays.string_arrow.ArrowStringArray,
+    fast_to_slow=_Unusable(),
+    slow_to_fast=_Unusable(),
+)
+
 StringDtype = make_final_proxy_type(
     "StringDtype",
     _Unusable,
diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
index cd9f90d50fe..a66f63c09b3 100755
--- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
+++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
@@ -133,7 +133,8 @@ and not test_s3_roundtrip"
 TEST_THAT_CRASH_PYTEST_WORKERS="not test_bitmasks_pyarrow \
 and not test_large_string_pyarrow \
 and not test_interchange_from_corrected_buffer_dtypes \
-and not test_eof_states"
+and not test_eof_states \
+and not test_array_tz"
 
 # TODO: Remove "not db" once a postgres & mysql container is set up on the CI
 PANDAS_CI="1" timeout 30m python -m pytest -p cudf.pandas \
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index eed5037cbea..0d46e2e9311 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -1533,3 +1533,26 @@ def test_is_proxy_object():
     assert is_proxy_object(np_arr_proxy)
     assert is_proxy_object(s1)
     assert not is_proxy_object(s2)
+
+
+def test_arrow_string_arrays():
+    cu_s = xpd.Series(["a", "b", "c"])
+    pd_s = pd.Series(["a", "b", "c"])
+
+    cu_arr = xpd.arrays.ArrowStringArray._from_sequence(
+        cu_s, dtype=xpd.StringDtype("pyarrow")
+    )
+    pd_arr = pd.arrays.ArrowStringArray._from_sequence(
+        pd_s, dtype=pd.StringDtype("pyarrow")
+    )
+
+    tm.assert_equal(cu_arr, pd_arr)
+
+    cu_arr = xpd.core.arrays.string_arrow.ArrowStringArray._from_sequence(
+        cu_s, dtype=xpd.StringDtype("pyarrow_numpy")
+    )
+    pd_arr = pd.core.arrays.string_arrow.ArrowStringArray._from_sequence(
+        pd_s, dtype=pd.StringDtype("pyarrow_numpy")
+    )
+
+    tm.assert_equal(cu_arr, pd_arr)

From 2ed69c9e830d90a8e565ea23ba1813e594a9f4d9 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 27 Jun 2024 10:11:09 -1000
Subject: [PATCH 16/29] Ensure MultiIndex.to_frame deep copies columns (#16110)

Additionally, this allows simplification in `MultiIndex.__repr__` which avoids a shallow copy and also caught a bug where `NaT` was not supposed to be quoted

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16110
---
 python/cudf/cudf/core/multiindex.py | 88 ++++++++++-------------------
 python/cudf/cudf/tests/test_repr.py | 10 ++--
 2 files changed, 35 insertions(+), 63 deletions(-)

diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index a01242d957d..547c14cdc99 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -23,6 +23,7 @@
 from cudf.api.types import is_integer, is_list_like, is_object_dtype
 from cudf.core import column
 from cudf.core._base_index import _return_get_indexer_result
+from cudf.core.column_accessor import ColumnAccessor
 from cudf.core.frame import Frame
 from cudf.core.index import (
     BaseIndex,
@@ -446,45 +447,26 @@ def __repr__(self):
             )
             preprocess = self.take(indices)
         else:
-            preprocess = self.copy(deep=False)
-
-        if any(col.has_nulls() for col in preprocess._data.columns):
-            preprocess_df = preprocess.to_frame(index=False)
-            for name, col in preprocess._data.items():
-                if isinstance(
-                    col,
-                    (
-                        column.datetime.DatetimeColumn,
-                        column.timedelta.TimeDeltaColumn,
-                    ),
-                ):
-                    preprocess_df[name] = col.astype("str").fillna(
-                        str(cudf.NaT)
-                    )
+            preprocess = self
 
-            tuples_list = list(
-                zip(
-                    *list(
-                        map(lambda val: pd.NA if val is None else val, col)
-                        for col in preprocess_df.to_arrow()
-                        .to_pydict()
-                        .values()
-                    )
-                )
-            )
+        arrays = []
+        for name, col in zip(self.names, preprocess._columns):
+            try:
+                pd_idx = col.to_pandas(nullable=True)
+            except NotImplementedError:
+                pd_idx = col.to_pandas(nullable=False)
+            pd_idx.name = name
+            arrays.append(pd_idx)
 
-            preprocess = preprocess.to_pandas(nullable=True)
-            preprocess.values[:] = tuples_list
-        else:
-            preprocess = preprocess.to_pandas(nullable=True)
+        preprocess_pd = pd.MultiIndex.from_arrays(arrays)
 
-        output = repr(preprocess)
+        output = repr(preprocess_pd)
         output_prefix = self.__class__.__name__ + "("
         output = output.lstrip(output_prefix)
         lines = output.split("\n")
 
         if len(lines) > 1:
-            if "length=" in lines[-1] and len(self) != len(preprocess):
+            if "length=" in lines[-1] and len(self) != len(preprocess_pd):
                 last_line = lines[-1]
                 length_index = last_line.index("length=")
                 last_line = last_line[:length_index] + f"length={len(self)})"
@@ -1022,42 +1004,32 @@ def to_frame(self, index=True, name=no_default, allow_duplicates=False):
         a c  a  c
         b d  b  d
         """
-        # TODO: Currently this function makes a shallow copy, which is
-        # incorrect. We want to make a deep copy, otherwise further
-        # modifications of the resulting DataFrame will affect the MultiIndex.
         if name is no_default:
             column_names = [
                 level if name is None else name
                 for level, name in enumerate(self.names)
             ]
+        elif not is_list_like(name):
+            raise TypeError(
+                "'name' must be a list / sequence of column names."
+            )
+        elif len(name) != len(self.levels):
+            raise ValueError(
+                "'name' should have the same length as "
+                "number of levels on index."
+            )
         else:
-            if not is_list_like(name):
-                raise TypeError(
-                    "'name' must be a list / sequence of column names."
-                )
-            if len(name) != len(self.levels):
-                raise ValueError(
-                    "'name' should have the same length as "
-                    "number of levels on index."
-                )
             column_names = name
 
-        all_none_names = None
-        if not (
-            all_none_names := all(x is None for x in column_names)
-        ) and len(column_names) != len(set(column_names)):
+        if len(column_names) != len(set(column_names)):
             raise ValueError("Duplicate column names are not allowed")
-        df = cudf.DataFrame._from_data(
-            data=self._data,
-            columns=column_names
-            if name is not no_default and not all_none_names
-            else None,
+        ca = ColumnAccessor(
+            dict(zip(column_names, (col.copy() for col in self._columns))),
+            verify=False,
+        )
+        return cudf.DataFrame._from_data(
+            data=ca, index=self if index else None
         )
-
-        if index:
-            df = df.set_index(self)
-
-        return df
 
     @_cudf_nvtx_annotate
     def get_level_values(self, level):
@@ -1243,7 +1215,7 @@ def values(self):
 
     @classmethod
     @_cudf_nvtx_annotate
-    def from_frame(cls, df, names=None):
+    def from_frame(cls, df: pd.DataFrame | cudf.DataFrame, names=None):
         """
         Make a MultiIndex from a DataFrame.
 
diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py
index 8f65bd26bd1..193d64a9e7f 100644
--- a/python/cudf/cudf/tests/test_repr.py
+++ b/python/cudf/cudf/tests/test_repr.py
@@ -1210,7 +1210,7 @@ def test_multiindex_repr(pmi, max_seq_items):
             .index,
             textwrap.dedent(
                 """
-                MultiIndex([('abc',                       'NaT', 0.345),
+                MultiIndex([('abc',                         NaT, 0.345),
                             ( <NA>, '0 days 00:00:00.000000001',  <NA>),
                             ('xyz', '0 days 00:00:00.000000002', 100.0),
                             ( <NA>, '0 days 00:00:00.000000003',  10.0)],
@@ -1252,10 +1252,10 @@ def test_multiindex_repr(pmi, max_seq_items):
             .index,
             textwrap.dedent(
                 """
-            MultiIndex([('NaT', <NA>),
-                        ('NaT', <NA>),
-                        ('NaT', <NA>),
-                        ('NaT', <NA>)],
+            MultiIndex([(NaT, <NA>),
+                        (NaT, <NA>),
+                        (NaT, <NA>),
+                        (NaT, <NA>)],
                     names=['b', 'a'])
             """
             ),

From c847b98291bd41f98ac417becf0c53293a392ce3 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Thu, 27 Jun 2024 21:33:29 +0100
Subject: [PATCH 17/29] Finish implementation of cudf-polars boolean function
 handlers (#16098)

The missing nodes were `is_in`, `not` (both easy), `is_finite` and `is_infinite` (obtained by translating to `contains` calls).

While here, remove the implementation of `IsBetween` and just translate to an expression with binary operations. This removes the need for special-casing scalar arguments to `IsBetween` and reproducing the code for binop evaluation.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16098
---
 python/cudf_polars/cudf_polars/dsl/expr.py    | 67 +++++++++++--------
 .../cudf_polars/cudf_polars/dsl/translate.py  | 10 +++
 .../tests/expressions/test_booleanfunction.py | 48 +++++++++++--
 3 files changed, 90 insertions(+), 35 deletions(-)

diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index 871134665af..97325161650 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -443,12 +443,12 @@ def __init__(
         ):
             # With ignore_nulls == False, polars uses Kleene logic
             raise NotImplementedError(f"Kleene logic for {self.name}")
-        if self.name in (
-            pl_expr.BooleanFunction.IsFinite,
-            pl_expr.BooleanFunction.IsInfinite,
-            pl_expr.BooleanFunction.IsIn,
+        if self.name == pl_expr.BooleanFunction.IsIn and not all(
+            c.dtype == self.children[0].dtype for c in self.children
         ):
-            raise NotImplementedError(f"{self.name}")
+            # TODO: If polars IR doesn't put the casts in, we need to
+            # mimic the supertype promotion rules.
+            raise NotImplementedError("IsIn doesn't support supertype casting")
 
     @staticmethod
     def _distinct(
@@ -506,6 +506,33 @@ def do_evaluate(
         mapping: Mapping[Expr, Column] | None = None,
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
+        if self.name in (
+            pl_expr.BooleanFunction.IsFinite,
+            pl_expr.BooleanFunction.IsInfinite,
+        ):
+            # Avoid evaluating the child if the dtype tells us it's unnecessary.
+            (child,) = self.children
+            is_finite = self.name == pl_expr.BooleanFunction.IsFinite
+            if child.dtype.id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64):
+                value = plc.interop.from_arrow(
+                    pa.scalar(value=is_finite, type=plc.interop.to_arrow(self.dtype))
+                )
+                return Column(plc.Column.from_scalar(value, df.num_rows))
+            needles = child.evaluate(df, context=context, mapping=mapping)
+            to_search = [-float("inf"), float("inf")]
+            if is_finite:
+                # NaN is neither finite not infinite
+                to_search.append(float("nan"))
+            haystack = plc.interop.from_arrow(
+                pa.array(
+                    to_search,
+                    type=plc.interop.to_arrow(needles.obj.type()),
+                )
+            )
+            result = plc.search.contains(haystack, needles.obj)
+            if is_finite:
+                result = plc.unary.unary_operation(result, plc.unary.UnaryOperator.NOT)
+            return Column(result)
         columns = [
             child.evaluate(df, context=context, mapping=mapping)
             for child in self.children
@@ -612,31 +639,13 @@ def do_evaluate(
                     (c.obj for c in columns),
                 )
             )
-        elif self.name == pl_expr.BooleanFunction.IsBetween:
-            column, lo, hi = columns
-            (closed,) = self.options
-            lop, rop = self._BETWEEN_OPS[closed]
-            lo_obj = (
-                lo.obj_scalar
-                if lo.is_scalar and lo.obj.size() != column.obj.size()
-                else lo.obj
-            )
-            hi_obj = (
-                hi.obj_scalar
-                if hi.is_scalar and hi.obj.size() != column.obj.size()
-                else hi.obj
-            )
+        elif self.name == pl_expr.BooleanFunction.IsIn:
+            needles, haystack = columns
+            return Column(plc.search.contains(haystack.obj, needles.obj))
+        elif self.name == pl_expr.BooleanFunction.Not:
+            (column,) = columns
             return Column(
-                plc.binaryop.binary_operation(
-                    plc.binaryop.binary_operation(
-                        column.obj, lo_obj, lop, output_type=self.dtype
-                    ),
-                    plc.binaryop.binary_operation(
-                        column.obj, hi_obj, rop, output_type=self.dtype
-                    ),
-                    plc.binaryop.BinaryOperator.LOGICAL_AND,
-                    self.dtype,
-                )
+                plc.unary.unary_operation(column.obj, plc.unary.UnaryOperator.NOT)
             )
         else:
             raise NotImplementedError(
diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
index 5d289885f47..742e5a591ee 100644
--- a/python/cudf_polars/cudf_polars/dsl/translate.py
+++ b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -342,6 +342,16 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex
             *(translate_expr(visitor, n=n) for n in node.input),
         )
     elif isinstance(name, pl_expr.BooleanFunction):
+        if name == pl_expr.BooleanFunction.IsBetween:
+            column, lo, hi = (translate_expr(visitor, n=n) for n in node.input)
+            (closed,) = options
+            lop, rop = expr.BooleanFunction._BETWEEN_OPS[closed]
+            return expr.BinOp(
+                dtype,
+                plc.binaryop.BinaryOperator.LOGICAL_AND,
+                expr.BinOp(dtype, lop, column, lo),
+                expr.BinOp(dtype, rop, column, hi),
+            )
         return expr.BooleanFunction(
             dtype,
             name,
diff --git a/python/cudf_polars/tests/expressions/test_booleanfunction.py b/python/cudf_polars/tests/expressions/test_booleanfunction.py
index a52fba26528..97421008669 100644
--- a/python/cudf_polars/tests/expressions/test_booleanfunction.py
+++ b/python/cudf_polars/tests/expressions/test_booleanfunction.py
@@ -6,7 +6,10 @@
 
 import polars as pl
 
-from cudf_polars.testing.asserts import assert_gpu_result_equal
+from cudf_polars.testing.asserts import (
+    assert_gpu_result_equal,
+    assert_ir_translation_raises,
+)
 
 
 @pytest.fixture(params=[False, True], ids=["no_nulls", "nulls"])
@@ -67,23 +70,26 @@ def test_boolean_function_unary(request, expr, has_nans, has_nulls):
 
     df = pl.LazyFrame({"a": pl.Series(values, dtype=pl.Float32())})
 
-    q = df.select(expr(pl.col("a")))
+    q = df.select(expr(pl.col("a")), expr(pl.col("a")).not_().alias("b"))
 
     assert_gpu_result_equal(q)
 
 
-@pytest.mark.xfail(reason="Evaluation handlers not yet implemented")
 @pytest.mark.parametrize(
     "expr",
     [
         pl.col("a").is_finite(),
         pl.col("a").is_infinite(),
-        pl.col("a").is_in(pl.col("b")),
+        [pl.col("a").is_infinite(), pl.col("b").is_finite()],
     ],
 )
-def test_unsupported_boolean_function(expr):
+def test_boolean_finite(expr):
     df = pl.LazyFrame(
-        {"a": pl.Series([1, float("nan"), 2, 4], dtype=pl.Float64()), "b": [1, 2, 3, 4]}
+        {
+            "a": pl.Series([1, float("nan"), 2, float("inf")], dtype=pl.Float64()),
+            "b": [1, 2, 3, 4],
+            "c": pl.Series([1, 2, 3, 4], dtype=pl.Float64()),
+        }
     )
 
     q = df.select(expr)
@@ -133,3 +139,33 @@ def test_boolean_horizontal(request, expr, has_nulls, wide):
     q = ldf.select(expr)
 
     assert_gpu_result_equal(q)
+
+
+@pytest.mark.parametrize(
+    "expr",
+    [
+        pl.col("a").is_in(pl.col("b")),
+        pl.col("a").is_in(pl.col("c")),
+        pl.col("c").is_in(pl.col("d")),
+    ],
+)
+def test_boolean_is_in(expr):
+    ldf = pl.LazyFrame(
+        {
+            "a": pl.Series([1, 2, 3], dtype=pl.Int64()),
+            "b": pl.Series([3, 4, 2], dtype=pl.Int64()),
+            "c": pl.Series([1, None, 3], dtype=pl.Int64()),
+            "d": pl.Series([10, None, 11], dtype=pl.Int64()),
+        }
+    )
+
+    q = ldf.select(expr)
+
+    assert_gpu_result_equal(q)
+
+
+def test_boolean_is_in_raises_unsupported():
+    ldf = pl.LazyFrame({"a": pl.Series([1, 2, 3], dtype=pl.Int64)})
+    q = ldf.select(pl.col("a").is_in(pl.lit(1, dtype=pl.Int32())))
+
+    assert_ir_translation_raises(q, NotImplementedError)

From e35da6b3df55bfa7b8d5df12c35039740566cb21 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Fri, 28 Jun 2024 09:54:03 +0100
Subject: [PATCH 18/29] Implement Ternary copy_if_else (#16114)

A straightforward evaluation using `copy_if_else`.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - https://github.com/brandon-b-miller

URL: https://github.com/rapidsai/cudf/pull/16114
---
 python/cudf_polars/cudf_polars/dsl/expr.py    | 29 +++++++++++++++++++
 .../cudf_polars/cudf_polars/dsl/translate.py  | 10 +++++++
 .../tests/expressions/test_when_then.py       | 27 +++++++++++++++++
 3 files changed, 66 insertions(+)
 create mode 100644 python/cudf_polars/tests/expressions/test_when_then.py

diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index 97325161650..17d7d15e4e5 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -51,6 +51,7 @@
     "GroupedRollingWindow",
     "Cast",
     "Agg",
+    "Ternary",
     "BinOp",
 ]
 
@@ -1112,6 +1113,34 @@ def do_evaluate(
         return self.op(child.evaluate(df, context=context, mapping=mapping))
 
 
+class Ternary(Expr):
+    __slots__ = ("children",)
+    _non_child = ("dtype",)
+    children: tuple[Expr, Expr, Expr]
+
+    def __init__(
+        self, dtype: plc.DataType, when: Expr, then: Expr, otherwise: Expr
+    ) -> None:
+        super().__init__(dtype)
+        self.children = (when, then, otherwise)
+
+    def do_evaluate(
+        self,
+        df: DataFrame,
+        *,
+        context: ExecutionContext = ExecutionContext.FRAME,
+        mapping: Mapping[Expr, Column] | None = None,
+    ) -> Column:
+        """Evaluate this expression given a dataframe for context."""
+        when, then, otherwise = (
+            child.evaluate(df, context=context, mapping=mapping)
+            for child in self.children
+        )
+        then_obj = then.obj_scalar if then.is_scalar else then.obj
+        otherwise_obj = otherwise.obj_scalar if otherwise.is_scalar else otherwise.obj
+        return Column(plc.copying.copy_if_else(then_obj, otherwise_obj, when.obj))
+
+
 class BinOp(Expr):
     __slots__ = ("op", "children")
     _non_child = ("dtype", "op")
diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
index 742e5a591ee..953ff636cce 100644
--- a/python/cudf_polars/cudf_polars/dsl/translate.py
+++ b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -446,6 +446,16 @@ def _(node: pl_expr.Agg, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Ex
     )
 
 
+@_translate_expr.register
+def _(node: pl_expr.Ternary, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+    return expr.Ternary(
+        dtype,
+        translate_expr(visitor, n=node.predicate),
+        translate_expr(visitor, n=node.truthy),
+        translate_expr(visitor, n=node.falsy),
+    )
+
+
 @_translate_expr.register
 def _(
     node: pl_expr.BinaryExpr, visitor: NodeTraverser, dtype: plc.DataType
diff --git a/python/cudf_polars/tests/expressions/test_when_then.py b/python/cudf_polars/tests/expressions/test_when_then.py
new file mode 100644
index 00000000000..cf1c0fe7fce
--- /dev/null
+++ b/python/cudf_polars/tests/expressions/test_when_then.py
@@ -0,0 +1,27 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+@pytest.mark.parametrize("then_scalar", [False, True])
+@pytest.mark.parametrize("otherwise_scalar", [False, True])
+@pytest.mark.parametrize("expr", [pl.col("c"), pl.col("c").is_not_null()])
+def test_when_then(then_scalar, otherwise_scalar, expr):
+    ldf = pl.LazyFrame(
+        {
+            "a": [1, 2, 3, 4, 5, 6, 7],
+            "b": [10, 13, 11, 15, 16, 11, 10],
+            "c": [None, True, False, False, True, True, False],
+        }
+    )
+
+    then = pl.lit(10) if then_scalar else pl.col("a")
+    otherwise = pl.lit(-2) if otherwise_scalar else pl.col("b")
+    q = ldf.select(pl.when(expr).then(then).otherwise(otherwise))
+    assert_gpu_result_equal(q)

From 6b04fd3b704efdae7d39d09beba026fcbca5f996 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Fri, 28 Jun 2024 12:31:18 +0200
Subject: [PATCH 19/29] Memory Profiling (#15866)

Use [RMM's new memory profiler](https://github.com/rapidsai/rmm/pull/1563) to profile all functions already decorated with `_cudf_nvtx_annotate`.

Example
```python
import cudf
from cudf.utils.performance_tracking import print_memory_report

cudf.set_option("memory_profiling", True)

df1 = cudf.DataFrame({"a": [1, 2, 3]})
df2 = cudf.DataFrame({"a": [2, 2, 3]})
df3 = df1.merge(df2)

print_memory_report()
```

Output:
```
Memory Profiling
================

Ordered by: memory_peak

ncalls     memory_peak    memory_total  filename:lineno(function)
     1             272             688  /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/dataframe.py:4072(DataFrame.merge)
     2              32              64  /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/dataframe.py:1043(DataFrame._init_from_dict_like)
     2              32              64  /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/dataframe.py:690(DataFrame.__init__)
     2               0               0  /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/dataframe.py:1131(DataFrame._align_input_series_indices)
     7               0               0  /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/index.py:214(RangeIndex.__init__)
     6               0               0  /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/index.py:424(RangeIndex.__len__)
     4               0               0  /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/frame.py:271(Frame.__len__)
     2               0               0  /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/dataframe.py:3195(DataFrame._insert)
     2               0               0  /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/index.py:270(RangeIndex.name)
     2               0               0  /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/index.py:369(RangeIndex.copy)
     5               0               0  /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/frame.py:134(Frame._from_data)
     2               0               0  /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/frame.py:1039(Frame._copy_type_metadata)
     2               0               0  /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/indexed_frame.py:315(IndexedFrame._from_columns_like_self)
```

Authors:
  - Mads R. B. Kristensen (https://github.com/madsbk)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Lawrence Mitchell (https://github.com/wence-)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15866
---
 .../cudf/source/user_guide/api_docs/index.rst |   1 +
 .../api_docs/performance_tracking.rst         |  12 +
 docs/cudf/source/user_guide/index.md          |   1 +
 .../source/user_guide/memory-profiling.md     |  44 ++++
 python/cudf/cudf/core/buffer/spill_manager.py |   4 +-
 .../cudf/cudf/core/buffer/spillable_buffer.py |   7 +-
 python/cudf/cudf/core/dataframe.py            | 180 +++++++-------
 python/cudf/cudf/core/frame.py                | 110 ++++-----
 python/cudf/cudf/core/groupby/groupby.py      |  60 ++---
 python/cudf/cudf/core/index.py                | 228 +++++++++---------
 python/cudf/cudf/core/indexed_frame.py        | 144 +++++------
 python/cudf/cudf/core/multiindex.py           | 130 +++++-----
 python/cudf/cudf/core/series.py               | 228 +++++++++---------
 python/cudf/cudf/core/single_column_frame.py  |  42 ++--
 python/cudf/cudf/core/udf/groupby_utils.py    |   4 +-
 python/cudf/cudf/core/udf/utils.py            |   6 +-
 python/cudf/cudf/io/csv.py                    |   6 +-
 python/cudf/cudf/io/parquet.py                |  28 +--
 python/cudf/cudf/io/text.py                   |   6 +-
 python/cudf/cudf/options.py                   |  14 ++
 .../cudf/tests/test_performance_tracking.py   |  41 ++++
 python/cudf/cudf/utils/nvtx_annotation.py     |  30 ---
 .../cudf/cudf/utils/performance_tracking.py   |  82 +++++++
 python/cudf/cudf/utils/utils.py               |   5 +-
 python/dask_cudf/dask_cudf/backends.py        |  40 +--
 python/dask_cudf/dask_cudf/core.py            |  62 ++---
 python/dask_cudf/dask_cudf/groupby.py         |  72 +++---
 python/dask_cudf/dask_cudf/sorting.py         |  16 +-
 28 files changed, 885 insertions(+), 718 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/performance_tracking.rst
 create mode 100644 docs/cudf/source/user_guide/memory-profiling.md
 create mode 100644 python/cudf/cudf/tests/test_performance_tracking.py
 delete mode 100644 python/cudf/cudf/utils/nvtx_annotation.py
 create mode 100644 python/cudf/cudf/utils/performance_tracking.py

diff --git a/docs/cudf/source/user_guide/api_docs/index.rst b/docs/cudf/source/user_guide/api_docs/index.rst
index 5f26a921012..d05501f4a4a 100644
--- a/docs/cudf/source/user_guide/api_docs/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/index.rst
@@ -26,3 +26,4 @@ This page provides a list of all publicly accessible modules, methods and classe
     options
     extension_dtypes
     pylibcudf/index.rst
+    performance_tracking
diff --git a/docs/cudf/source/user_guide/api_docs/performance_tracking.rst b/docs/cudf/source/user_guide/api_docs/performance_tracking.rst
new file mode 100644
index 00000000000..9da79e69fb2
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/performance_tracking.rst
@@ -0,0 +1,12 @@
+.. _api.performance_tracking:
+
+====================
+Performance Tracking
+====================
+
+.. currentmodule:: cudf.utils.performance_tracking
+.. autosummary::
+   :toctree: api/
+
+   get_memory_records
+   print_memory_report
diff --git a/docs/cudf/source/user_guide/index.md b/docs/cudf/source/user_guide/index.md
index 486368c3b8b..df4e4795a08 100644
--- a/docs/cudf/source/user_guide/index.md
+++ b/docs/cudf/source/user_guide/index.md
@@ -16,5 +16,6 @@ options
 performance-comparisons/index
 PandasCompat
 copy-on-write
+memory-profiling
 pandas-2.0-breaking-changes
 ```
diff --git a/docs/cudf/source/user_guide/memory-profiling.md b/docs/cudf/source/user_guide/memory-profiling.md
new file mode 100644
index 00000000000..ab5433685e6
--- /dev/null
+++ b/docs/cudf/source/user_guide/memory-profiling.md
@@ -0,0 +1,44 @@
+(memory-profiling-user-doc)=
+
+# Memory Profiling
+
+Peak memory usage is a common concern in GPU programming because GPU memory is typically smaller than available CPU memory. To easily identify memory hotspots, cuDF provides a memory profiler. It comes with an overhead so avoid using it in performance-sensitive code.
+
+## Enabling Memory Profiling
+
+First, enable memory profiling in RMM by calling {py:func}`rmm.statistics.enable_statistics()`. This adds a statistics resource adaptor to the current RMM memory resource, which enables cuDF to access memory profiling information. See the [RMM documentation](https://docs.rapids.ai/api/rmm/stable/guide/#memory-statistics-and-profiling) for more details.
+
+Second, enable memory profiling in cuDF by setting the `memory_profiling` option to `True`. Use {py:func}`cudf.set_option` or set the environment variable ``CUDF_MEMORY_PROFILING=1`` prior to the launch of the Python interpreter.
+
+To get the result of the profiling, use {py:func}`cudf.utils.performance_tracking.print_memory_report` or access the raw profiling data by using: {py:func}`cudf.utils.performance_tracking.get_memory_records`.
+
+### Example
+In the following, we enable profiling, do some work, and then print the profiling results:
+
+```python
+>>> import cudf
+>>> from cudf.utils.performance_tracking import print_memory_report
+>>> from rmm.statistics import enable_statistics
+>>> enable_statistics()
+>>> cudf.set_option("memory_profiling", True)
+>>> cudf.DataFrame({"a": [1, 2, 3]})  # Some work
+   a
+0  1
+1  2
+2  3
+>>> print_memory_report()  # Pretty print the result of the profiling
+Memory Profiling
+================
+
+Legends:
+ncalls       - number of times the function or code block was called
+memory_peak  - peak memory allocated in function or code block (in bytes)
+memory_total - total memory allocated in function or code block (in bytes)
+
+Ordered by: memory_peak
+
+ncalls memory_peak memory_total filename:lineno(function)
+     1          32           32 cudf/core/dataframe.py:690(DataFrame.__init__)
+     2           0            0 cudf/core/index.py:214(RangeIndex.__init__)
+     6           0            0 cudf/core/index.py:424(RangeIndex.__len__)
+```
diff --git a/python/cudf/cudf/core/buffer/spill_manager.py b/python/cudf/cudf/core/buffer/spill_manager.py
index 762cd7f9e86..ed351a6b107 100644
--- a/python/cudf/cudf/core/buffer/spill_manager.py
+++ b/python/cudf/cudf/core/buffer/spill_manager.py
@@ -18,14 +18,14 @@
 import rmm.mr
 
 from cudf.options import get_option
-from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _performance_tracking
 from cudf.utils.string import format_bytes
 
 if TYPE_CHECKING:
     from cudf.core.buffer.spillable_buffer import SpillableBufferOwner
 
 _spill_cudf_nvtx_annotate = partial(
-    _cudf_nvtx_annotate, domain="cudf_python-spill"
+    _performance_tracking, domain="cudf_python-spill"
 )
 
 
diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py
index eb57a371965..4c9e524ee05 100644
--- a/python/cudf/cudf/core/buffer/spillable_buffer.py
+++ b/python/cudf/cudf/core/buffer/spillable_buffer.py
@@ -10,6 +10,7 @@
 from typing import TYPE_CHECKING, Any, Literal
 
 import numpy
+import nvtx
 from typing_extensions import Self
 
 import rmm
@@ -21,7 +22,7 @@
     host_memory_allocation,
 )
 from cudf.core.buffer.exposure_tracked_buffer import ExposureTrackedBuffer
-from cudf.utils.nvtx_annotation import _get_color_for_nvtx, annotate
+from cudf.utils.performance_tracking import _get_color_for_nvtx
 from cudf.utils.string import format_bytes
 
 if TYPE_CHECKING:
@@ -200,7 +201,7 @@ def spill(self, target: str = "cpu") -> None:
                 )
 
             if (ptr_type, target) == ("gpu", "cpu"):
-                with annotate(
+                with nvtx.annotate(
                     message="SpillDtoH",
                     color=_get_color_for_nvtx("SpillDtoH"),
                     domain="cudf_python-spill",
@@ -218,7 +219,7 @@ def spill(self, target: str = "cpu") -> None:
                 # trigger a new call to this buffer's `spill()`.
                 # Therefore, it is important that spilling-on-demand doesn't
                 # try to unspill an already locked buffer!
-                with annotate(
+                with nvtx.annotate(
                     message="SpillHtoD",
                     color=_get_color_for_nvtx("SpillHtoD"),
                     domain="cudf_python-spill",
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index f7f5ef792d6..3fc29582c4c 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -83,7 +83,7 @@
     min_scalar_type,
     numeric_normalize_types,
 )
-from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _performance_tracking
 from cudf.utils.utils import GetAttrGetItemMixin, _external_only_api
 
 if TYPE_CHECKING:
@@ -145,7 +145,7 @@ def __setitem__(self, key, value):
             key = (key, slice(None))
         return self._setitem_tuple_arg(key, value)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _can_downcast_to_series(self, df, arg):
         """
         This method encapsulates the logic used
@@ -188,7 +188,7 @@ def _can_downcast_to_series(self, df, arg):
                 return True
         return False
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _downcast_to_series(self, df, arg):
         """
         "Downcast" from a DataFrame to a Series
@@ -233,11 +233,11 @@ class _DataFrameLocIndexer(_DataFrameIndexer):
     For selection by label.
     """
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _getitem_scalar(self, arg):
         return self._frame[arg[1]].loc[arg[0]]
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _getitem_tuple_arg(self, arg):
         from uuid import uuid4
 
@@ -363,7 +363,7 @@ def _getitem_tuple_arg(self, arg):
             return self._downcast_to_series(df, arg)
         return df
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _setitem_tuple_arg(self, key, value):
         if (
             isinstance(self._frame.index, MultiIndex)
@@ -532,7 +532,7 @@ def __getitem__(self, arg):
             return frame._empty_like(keep_index=True)
         assert_never(row_spec)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _setitem_tuple_arg(self, key, value):
         columns_df = self._frame._from_data(
             self._frame._data.select_by_index(key[1]), self._frame.index
@@ -677,7 +677,7 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin):
     _groupby = DataFrameGroupBy
     _resampler = DataFrameResampler
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __init__(
         self,
         data=None,
@@ -859,7 +859,7 @@ def __init__(
             columns, pd.MultiIndex
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _init_from_series_list(self, data, columns, index):
         if index is None:
             # When `index` is `None`, the final index of
@@ -972,7 +972,7 @@ def _init_from_series_list(self, data, columns, index):
         else:
             self._data.rangeindex = True
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _init_from_list_like(self, data, index=None, columns=None):
         if index is None:
             index = RangeIndex(start=0, stop=len(data))
@@ -1030,7 +1030,7 @@ def _init_from_list_like(self, data, index=None, columns=None):
             )
             self._data.label_dtype = getattr(columns, "dtype", None)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _init_from_dict_like(
         self, data, index=None, columns=None, nan_as_null=None
     ):
@@ -1119,7 +1119,7 @@ def _from_data(
         return out
 
     @staticmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _align_input_series_indices(data, index):
         input_series = [
             Series(val)
@@ -1187,7 +1187,7 @@ def deserialize(cls, header, frames):
         return obj
 
     @property
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def shape(self):
         """Returns a tuple representing the dimensionality of the DataFrame."""
         return self._num_rows, self._num_columns
@@ -1270,7 +1270,7 @@ def __setattr__(self, key, col):
         else:
             super().__setattr__(key, col)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __getitem__(self, arg):
         """
         If *arg* is a ``str`` or ``int`` type, return the column Series.
@@ -1364,7 +1364,7 @@ def __getitem__(self, arg):
                 f"__getitem__ on type {type(arg)} is not supported"
             )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __setitem__(self, arg, value):
         """Add/set column by *arg or DataFrame*"""
         if isinstance(arg, DataFrame):
@@ -1482,7 +1482,7 @@ def __setitem__(self, arg, value):
     def __delitem__(self, name):
         self._drop_column(name)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def memory_usage(self, index=True, deep=False):
         mem_usage = [col.memory_usage for col in self._data.columns]
         names = [str(name) for name in self._data.names]
@@ -1494,7 +1494,7 @@ def memory_usage(self, index=True, deep=False):
             index=as_index(names),
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __array_function__(self, func, types, args, kwargs):
         if "out" in kwargs or not all(
             issubclass(t, (Series, DataFrame)) for t in types
@@ -1528,7 +1528,7 @@ def __array_function__(self, func, types, args, kwargs):
         return NotImplemented
 
     # The _get_numeric_data method is necessary for dask compatibility.
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _get_numeric_data(self):
         """Return a dataframe with only numeric data types"""
         columns = [
@@ -1538,7 +1538,7 @@ def _get_numeric_data(self):
         ]
         return self[columns]
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def assign(self, **kwargs: Callable[[Self], Any] | Any):
         """
         Assign columns to DataFrame from keyword arguments.
@@ -1571,7 +1571,7 @@ def assign(self, **kwargs: Callable[[Self], Any] | Any):
         return new_df
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _concat(
         cls, objs, axis=0, join="outer", ignore_index=False, sort=False
     ):
@@ -1963,12 +1963,12 @@ def _get_renderable_dataframe(self):
 
         return output
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __repr__(self):
         output = self._get_renderable_dataframe()
         return self._clean_renderable_dataframe(output)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _repr_html_(self):
         lines = (
             self._get_renderable_dataframe()
@@ -1984,7 +1984,7 @@ def _repr_html_(self):
             lines.append("</div>")
         return "\n".join(lines)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _repr_latex_(self):
         return self._get_renderable_dataframe().to_pandas()._repr_latex_()
 
@@ -2098,7 +2098,7 @@ def _make_operands_and_index_for_binop(
         return operands, index, can_use_self_column_name
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def from_dict(
         cls,
         data: dict,
@@ -2233,7 +2233,7 @@ def from_dict(
                 f"parameter. Got '{orient}' instead"
             )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_dict(
         self,
         orient: str = "dict",
@@ -2354,7 +2354,7 @@ def to_dict(
 
         return self.to_pandas().to_dict(orient=orient, into=into)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def scatter_by_map(
         self, map_index, map_size=None, keep_index=True, debug: bool = False
     ):
@@ -2447,7 +2447,7 @@ def scatter_by_map(
 
         return result
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def update(
         self,
         other,
@@ -2542,23 +2542,23 @@ def update(
 
         self._mimic_inplace(source_df, inplace=True)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __iter__(self):
         return iter(self._column_names)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __contains__(self, item):
         # This must check against containment in the pandas Index and not
         # self._column_names to handle NA, None, nan, etc. correctly.
         return item in self._data.to_pandas_index()
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def items(self):
         """Iterate over column names and series pairs"""
         for k in self:
             yield (k, self[k])
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def equals(self, other) -> bool:
         ret = super().equals(other)
         # If all other checks matched, validate names.
@@ -2591,13 +2591,13 @@ def at(self):
         "index is absolutely necessary. For checking if the columns are a "
         "MultiIndex, use _data.multiindex."
     )
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def columns(self):
         """Returns a tuple of columns"""
         return self._data.to_pandas_index()
 
     @columns.setter  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def columns(self, columns):
         multiindex = False
         rangeindex = False
@@ -2665,7 +2665,7 @@ def _set_columns_like(self, other: ColumnAccessor) -> None:
             verify=False,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def reindex(
         self,
         labels=None,
@@ -2813,7 +2813,7 @@ def reindex(
             fill_value=fill_value,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def set_index(
         self,
         keys,
@@ -2980,7 +2980,7 @@ def set_index(
         df.index = idx
         return df if not inplace else None
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def fillna(
         self, value=None, method=None, axis=None, inplace=False, limit=None
     ):  # noqa: D102
@@ -3006,7 +3006,7 @@ def fillna(
             value=value, method=method, axis=axis, inplace=inplace, limit=limit
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def where(self, cond, other=None, inplace=False):
         from cudf.core._internals.where import (
             _check_and_cast_columns_with_other,
@@ -3163,7 +3163,7 @@ def reset_index(
             inplace=inplace,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def insert(self, loc, name, value, nan_as_null=no_default):
         """Add a column to DataFrame at the index specified by loc.
 
@@ -3189,7 +3189,7 @@ def insert(self, loc, name, value, nan_as_null=no_default):
             ignore_index=False,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True):
         """
         Same as `insert`, with additional `ignore_index` param.
@@ -3271,7 +3271,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True):
         self._data.insert(name, value, loc=loc)
 
     @property  # type:ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def axes(self):
         """
         Return a list representing the axes of the DataFrame.
@@ -3363,7 +3363,7 @@ def diff(self, periods=1, axis=0):
 
         return self - self.shift(periods=periods)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def drop_duplicates(
         self,
         subset=None,
@@ -3451,14 +3451,14 @@ def drop_duplicates(
 
         return self._mimic_inplace(outdf, inplace=inplace)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def pop(self, item):
         """Return a column and drop it from the DataFrame."""
         popped = self[item]
         del self[item]
         return popped
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def rename(
         self,
         mapper=None,
@@ -3616,7 +3616,7 @@ def rename(
 
         return result
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def add_prefix(self, prefix):
         # TODO: Change to deep=False when copy-on-write is default
         out = self.copy(deep=True)
@@ -3625,7 +3625,7 @@ def add_prefix(self, prefix):
         ]
         return out
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def add_suffix(self, suffix):
         # TODO: Change to deep=False when copy-on-write is default
         out = self.copy(deep=True)
@@ -3634,7 +3634,7 @@ def add_suffix(self, suffix):
         ]
         return out
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def agg(self, aggs, axis=None):
         """
         Aggregate using one or more operations over the specified axis.
@@ -3770,7 +3770,7 @@ def agg(self, aggs, axis=None):
         else:
             raise ValueError("argument must be a string, list or dict")
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def nlargest(self, n, columns, keep="first"):
         """Return the first *n* rows ordered by *columns* in descending order.
 
@@ -3910,7 +3910,7 @@ def nsmallest(self, n, columns, keep="first"):
         """
         return self._n_largest_or_smallest(False, n, columns, keep)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def swaplevel(self, i=-2, j=-1, axis=0):
         """
         Swap level i with level j.
@@ -3977,7 +3977,7 @@ def swaplevel(self, i=-2, j=-1, axis=0):
 
         return result
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def transpose(self):
         """Transpose index and columns.
 
@@ -4041,7 +4041,7 @@ def transpose(self):
 
     T = property(transpose, doc=transpose.__doc__)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def melt(self, **kwargs):
         """Unpivots a DataFrame from wide format to long format,
         optionally leaving identifier variables set.
@@ -4071,7 +4071,7 @@ def melt(self, **kwargs):
 
         return melt(self, **kwargs)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def merge(
         self,
         right,
@@ -4224,7 +4224,7 @@ def merge(
             suffixes=suffixes,
         ).perform_merge()
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def join(
         self,
         other,
@@ -4273,7 +4273,7 @@ def join(
         )
         return df
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         groupby_doc_template.format(
             ret=textwrap.dedent(
@@ -4407,7 +4407,7 @@ def query(self, expr, local_dict=None):
                 BooleanMask.from_column_unchecked(boolmask)
             )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def apply(
         self, func, axis=1, raw=False, result_type=None, args=(), **kwargs
     ):
@@ -4691,7 +4691,7 @@ def _func(x):  # pragma: no cover
 
         return DataFrame._from_data(result, index=self.index)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @applyutils.doc_apply()
     def apply_rows(
         self,
@@ -4770,7 +4770,7 @@ def apply_rows(
             cache_key=cache_key,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @applyutils.doc_applychunks()
     def apply_chunks(
         self,
@@ -4837,7 +4837,7 @@ def apply_chunks(
             tpb=tpb,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def partition_by_hash(self, columns, nparts, keep_index=True):
         """Partition the dataframe by the hashed value of data in *columns*.
 
@@ -5181,7 +5181,7 @@ def _sizeof_fmt(num, size_qualifier):
 
         cudf.utils.ioutils.buffer_write_lines(buf, lines)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_describe()
     def describe(
         self,
@@ -5243,7 +5243,7 @@ def describe(
                 )
             return res
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_pandas(
         self, *, nullable: bool = False, arrow_type: bool = False
     ) -> pd.DataFrame:
@@ -5333,7 +5333,7 @@ def to_pandas(
         return out_df
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def from_pandas(cls, dataframe, nan_as_null=no_default):
         """
         Convert from a Pandas DataFrame.
@@ -5406,7 +5406,7 @@ def from_pandas(cls, dataframe, nan_as_null=no_default):
             )
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def from_arrow(cls, table):
         """
         Convert from PyArrow Table to DataFrame.
@@ -5492,7 +5492,7 @@ def from_arrow(cls, table):
 
         return out
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_arrow(self, preserve_index=None):
         """
         Convert to a PyArrow Table.
@@ -5582,7 +5582,7 @@ def to_arrow(self, preserve_index=None):
 
         return out.replace_schema_metadata(metadata)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_records(self, index=True):
         """Convert to a numpy recarray
 
@@ -5606,7 +5606,7 @@ def to_records(self, index=True):
         return ret
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def from_records(cls, data, index=None, columns=None, nan_as_null=False):
         """
         Convert structured or record ndarray to DataFrame.
@@ -5685,7 +5685,7 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False):
         return df
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False):
         """Convert a numpy/cupy array to DataFrame.
 
@@ -5763,7 +5763,7 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False):
             index=index,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def interpolate(
         self,
         method="linear",
@@ -5793,7 +5793,7 @@ def interpolate(
             **kwargs,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def quantile(
         self,
         q=0.5,
@@ -5936,7 +5936,7 @@ def quantile(
         result.index = cudf.Index(list(map(float, qs)), dtype="float64")
         return result
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def isin(self, values):
         """
         Whether each element in the DataFrame is contained in values.
@@ -6080,7 +6080,7 @@ def make_false_column_like_self():
     #
     # Stats
     #
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _prepare_for_rowwise_op(self, method, skipna, numeric_only):
         """Prepare a DataFrame for CuPy-based row-wise operations."""
 
@@ -6132,7 +6132,7 @@ def _prepare_for_rowwise_op(self, method, skipna, numeric_only):
             coerced = coerced.astype("int64", copy=False)
         return coerced, mask, common_dtype
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def count(self, axis=0, numeric_only=False):
         """
         Count ``non-NA`` cells for each column or row.
@@ -6184,7 +6184,7 @@ def count(self, axis=0, numeric_only=False):
         "columns": 1,
     }
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _reduce(
         self,
         op,
@@ -6308,7 +6308,7 @@ def _reduce(
         else:
             raise ValueError(f"Invalid value of {axis=} received for {op}")
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _scan(
         self,
         op,
@@ -6325,7 +6325,7 @@ def _scan(
         elif axis == 1:
             return self._apply_cupy_method_axis_1(op, **kwargs)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def mode(self, axis=0, numeric_only=False, dropna=True):
         """
         Get the mode(s) of each element along the selected axis.
@@ -6432,17 +6432,17 @@ def mode(self, axis=0, numeric_only=False, dropna=True):
 
         return df
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def all(self, axis=0, bool_only=None, skipna=True, **kwargs):
         obj = self.select_dtypes(include="bool") if bool_only else self
         return super(DataFrame, obj).all(axis, skipna, **kwargs)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def any(self, axis=0, bool_only=None, skipna=True, **kwargs):
         obj = self.select_dtypes(include="bool") if bool_only else self
         return super(DataFrame, obj).any(axis, skipna, **kwargs)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _apply_cupy_method_axis_1(self, method, *args, **kwargs):
         # This method uses cupy to perform scans and reductions along rows of a
         # DataFrame. Since cuDF is designed around columnar storage and
@@ -6542,7 +6542,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs):
             result_df._set_columns_like(prepared._data)
             return result_df
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _columns_view(self, columns):
         """
         Return a subset of the DataFrame's columns as a view.
@@ -6551,7 +6551,7 @@ def _columns_view(self, columns):
             {col: self._data[col] for col in columns}, index=self.index
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def select_dtypes(self, include=None, exclude=None):
         """Return a subset of the DataFrame's columns based on the column dtypes.
 
@@ -6816,7 +6816,7 @@ def to_orc(
             index=index,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def stack(self, level=-1, dropna=no_default, future_stack=False):
         """Stack the prescribed level(s) from columns to index
 
@@ -7161,7 +7161,7 @@ def unnamed_group_generator():
         else:
             return result
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def cov(self, **kwargs):
         """Compute the covariance matrix of a DataFrame.
 
@@ -7216,7 +7216,7 @@ def corr(self, method="pearson", min_periods=None):
         df._set_columns_like(self._data)
         return df
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_struct(self, name=None):
         """
         Return a struct Series composed of the columns of the DataFrame.
@@ -7250,7 +7250,7 @@ def to_struct(self, name=None):
             name=name,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def keys(self):
         """
         Get the columns.
@@ -7310,14 +7310,14 @@ def iterrows(self):
             "if you wish to iterate over each row."
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @copy_docstring(reshape.pivot)
     def pivot(self, *, columns, index=no_default, values=no_default):
         return cudf.core.reshape.pivot(
             self, index=index, columns=columns, values=values
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @copy_docstring(reshape.pivot_table)
     def pivot_table(
         self,
@@ -7346,14 +7346,14 @@ def pivot_table(
             sort=sort,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @copy_docstring(reshape.unstack)
     def unstack(self, level=-1, fill_value=None):
         return cudf.core.reshape.unstack(
             self, level=level, fill_value=fill_value
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def explode(self, column, ignore_index=False):
         """
         Transform each element of a list-like to a row, replicating index
@@ -7549,7 +7549,7 @@ def _from_columns_like_self(
         result._set_columns_like(self._data)
         return result
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def interleave_columns(self):
         """
         Interleave Series columns of a table into a single column.
@@ -7597,7 +7597,7 @@ def interleave_columns(self):
             {None: libcudf.reshape.interleave_columns([*self._columns])}
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def eval(self, expr: str, inplace: bool = False, **kwargs):
         """Evaluate a string describing operations on DataFrame columns.
 
@@ -7953,7 +7953,7 @@ def func(left, right, output):
     )
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def from_pandas(obj, nan_as_null=no_default):
     """
     Convert certain Pandas objects into the cudf equivalent.
@@ -8080,7 +8080,7 @@ def from_pandas(obj, nan_as_null=no_default):
         )
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def merge(left, right, *args, **kwargs):
     if isinstance(left, Series):
         left = left.to_frame()
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 8ca71180c00..9bac75dc6ac 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -32,7 +32,7 @@
 from cudf.core.mixins import BinaryOperand, Scannable
 from cudf.utils import ioutils
 from cudf.utils.dtypes import find_common_type
-from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _performance_tracking
 from cudf.utils.utils import _array_ufunc, _warn_no_dask_cudf
 
 if TYPE_CHECKING:
@@ -86,7 +86,7 @@ def _dtypes(self) -> abc.Iterable:
     def ndim(self) -> int:
         raise NotImplementedError()
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def serialize(self):
         # TODO: See if self._data can be serialized outright
         header = {
@@ -101,7 +101,7 @@ def serialize(self):
         return header, frames
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def deserialize(cls, header, frames):
         cls_deserialize = pickle.loads(header["type-serialized"])
         column_names = pickle.loads(header["column_names"])
@@ -122,7 +122,7 @@ def deserialize(cls, header, frames):
         return cls_deserialize._from_data(col_accessor)
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _from_data(cls, data: MutableMapping) -> Self:
         """
         Construct cls from a ColumnAccessor-like mapping.
@@ -131,7 +131,7 @@ def _from_data(cls, data: MutableMapping) -> Self:
         Frame.__init__(obj, data)
         return obj
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _from_data_like_self(self, data: MutableMapping) -> Self:
         """
         Return type(self) from a ColumnAccessor-like mapping but
@@ -139,7 +139,7 @@ def _from_data_like_self(self, data: MutableMapping) -> Self:
         """
         return self._from_data(data)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _from_columns_like_self(
         self,
         columns: list[ColumnBase],
@@ -155,7 +155,7 @@ def _from_columns_like_self(
         frame = self.__class__._from_data(data)
         return frame._copy_type_metadata(self)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _mimic_inplace(
         self, result: Self, inplace: bool = False
     ) -> Self | None:
@@ -171,7 +171,7 @@ def _mimic_inplace(
             return result
 
     @property
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def size(self) -> int:
         """
         Return the number of elements in the underlying data.
@@ -263,11 +263,11 @@ def memory_usage(self, deep=False):
         """
         raise NotImplementedError
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __len__(self) -> int:
         return self._num_rows
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def astype(self, dtype: dict[Any, Dtype], copy: bool = False) -> Self:
         casted = (
             col.astype(dtype.get(col_name, col.dtype), copy=copy)
@@ -276,7 +276,7 @@ def astype(self, dtype: dict[Any, Dtype], copy: bool = False) -> Self:
         ca = self._data._from_columns_like_self(casted, verify=False)
         return self._from_data_like_self(ca)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def equals(self, other) -> bool:
         """
         Test whether two objects contain the same elements.
@@ -347,7 +347,7 @@ def equals(self, other) -> bool:
             )
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _get_columns_by_label(self, labels) -> Self:
         """
         Returns columns of the Frame specified by `labels`.
@@ -357,7 +357,7 @@ def _get_columns_by_label(self, labels) -> Self:
         return self._from_data_like_self(self._data.select_by_label(labels))
 
     @property
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def values(self) -> cupy.ndarray:
         """
         Return a CuPy representation of the DataFrame.
@@ -373,7 +373,7 @@ def values(self) -> cupy.ndarray:
         return self.to_cupy()
 
     @property
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def values_host(self) -> np.ndarray:
         """
         Return a NumPy representation of the data.
@@ -388,7 +388,7 @@ def values_host(self) -> np.ndarray:
         """
         return self.to_numpy()
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __array__(self, dtype=None):
         raise TypeError(
             "Implicit conversion to a host NumPy array via __array__ is not "
@@ -397,14 +397,14 @@ def __array__(self, dtype=None):
             "using .to_numpy()."
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __arrow_array__(self, type=None):
         raise TypeError(
             "Implicit conversion to a host PyArrow object via __arrow_array__ "
             "is not allowed. Consider using .to_arrow()"
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _to_array(
         self,
         get_array: Callable,
@@ -468,7 +468,7 @@ def to_array(
     # particular, we need to benchmark how much of the overhead is coming from
     # (potentially unavoidable) local copies in to_cupy and how much comes from
     # inefficiencies in the implementation.
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_cupy(
         self,
         dtype: Dtype | None = None,
@@ -502,7 +502,7 @@ def to_cupy(
             na_value,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_numpy(
         self,
         dtype: Dtype | None = None,
@@ -537,7 +537,7 @@ def to_numpy(
             lambda col: col.values_host, numpy, copy, dtype, na_value
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def where(self, cond, other=None, inplace: bool = False) -> Self | None:
         """
         Replace values where the condition is False.
@@ -610,7 +610,7 @@ def where(self, cond, other=None, inplace: bool = False) -> Self | None:
         """
         raise NotImplementedError
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def fillna(
         self,
         value: None | ScalarLike | cudf.Series = None,
@@ -767,14 +767,14 @@ def fillna(
             inplace=inplace,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _drop_column(self, name):
         """Drop a column by *name*"""
         if name not in self._data:
             raise KeyError(f"column '{name}' does not exist")
         del self._data[name]
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _quantile_table(
         self,
         q: float,
@@ -808,7 +808,7 @@ def _quantile_table(
         )
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def from_arrow(cls, data: pa.Table) -> Self:
         """Convert from PyArrow Table to Frame
 
@@ -968,7 +968,7 @@ def from_arrow(cls, data: pa.Table) -> Self:
 
         return cls._from_data({name: result[name] for name in column_names})
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_arrow(self):
         """
         Convert to arrow Table
@@ -992,7 +992,7 @@ def to_arrow(self):
             {str(name): col.to_arrow() for name, col in self._data.items()}
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _positions_from_column_names(self, column_names) -> list[int]:
         """Map each column name into their positions in the frame.
 
@@ -1005,7 +1005,7 @@ def _positions_from_column_names(self, column_names) -> list[int]:
             if name in set(column_names)
         ]
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _copy_type_metadata(self: Self, other: Self) -> Self:
         """
         Copy type metadata from each column of `other` to the corresponding
@@ -1020,7 +1020,7 @@ def _copy_type_metadata(self: Self, other: Self) -> Self:
 
         return self
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def isna(self):
         """
         Identify missing values.
@@ -1101,7 +1101,7 @@ def isna(self):
     # Alias for isna
     isnull = isna
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def notna(self):
         """
         Identify non-missing values.
@@ -1182,7 +1182,7 @@ def notna(self):
     # Alias for notna
     notnull = notna
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def searchsorted(
         self,
         values,
@@ -1296,7 +1296,7 @@ def searchsorted(
         else:
             return result
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def argsort(
         self,
         by=None,
@@ -1383,7 +1383,7 @@ def argsort(
             by=by, ascending=ascending, na_position=na_position
         ).values
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _get_sorted_inds(
         self,
         by=None,
@@ -1411,7 +1411,7 @@ def _get_sorted_inds(
             stable=True,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _split(self, splits):
         """Split a frame with split points in ``splits``. Returns a list of
         Frames of length `len(splits) + 1`.
@@ -1426,13 +1426,13 @@ def _split(self, splits):
             for split_idx in range(len(splits) + 1)
         ]
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _encode(self):
         columns, indices = libcudf.transform.table_encode([*self._columns])
         keys = self._from_columns_like_self(columns)
         return keys, indices
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _unaryop(self, op):
         data_columns = (col.unary_operator(op) for col in self._columns)
         return self._from_data_like_self(
@@ -1440,7 +1440,7 @@ def _unaryop(self, op):
         )
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _colwise_binop(
         cls,
         operands: dict[str | None, tuple[ColumnBase, Any, bool, Any]],
@@ -1519,11 +1519,11 @@ def _colwise_binop(
 
         return output
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
         return _array_ufunc(self, ufunc, method, inputs, kwargs)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @acquire_spill_lock()
     def _apply_cupy_ufunc_to_operands(
         self, ufunc, cupy_func, operands, **kwargs
@@ -1565,7 +1565,7 @@ def _apply_cupy_ufunc_to_operands(
         return data
 
     # Unary logical operators
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __neg__(self):
         """Negate for integral dtypes, logical NOT for bools."""
         return self._from_data_like_self(
@@ -1579,30 +1579,30 @@ def __neg__(self):
             )
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __pos__(self):
         return self.copy(deep=True)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __abs__(self):
         return self._unaryop("abs")
 
     # Reductions
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _get_axis_from_axis_arg(cls, axis):
         try:
             return cls._SUPPORT_AXIS_LOOKUP[axis]
         except KeyError:
             raise ValueError(f"No axis named {axis} for object type {cls}")
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _reduce(self, *args, **kwargs):
         raise NotImplementedError(
             f"Reductions are not supported for objects of type {type(self)}."
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def min(
         self,
         axis=0,
@@ -1653,7 +1653,7 @@ def min(
             **kwargs,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def max(
         self,
         axis=0,
@@ -1701,7 +1701,7 @@ def max(
             **kwargs,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def all(self, axis=0, skipna=True, **kwargs):
         """
         Return whether all elements are True in DataFrame.
@@ -1754,7 +1754,7 @@ def all(self, axis=0, skipna=True, **kwargs):
             **kwargs,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def any(self, axis=0, skipna=True, **kwargs):
         """
         Return whether any elements is True in DataFrame.
@@ -1807,26 +1807,26 @@ def any(self, axis=0, skipna=True, **kwargs):
             **kwargs,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @ioutils.doc_to_dlpack()
     def to_dlpack(self):
         """{docstring}"""
 
         return cudf.io.dlpack.to_dlpack(self)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __str__(self):
         return repr(self)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __deepcopy__(self, memo):
         return self.copy(deep=True)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __copy__(self):
         return self.copy(deep=False)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __invert__(self):
         """Bitwise invert (~) for integral dtypes, logical NOT for bools."""
         return self._from_data_like_self(
@@ -1835,7 +1835,7 @@ def __invert__(self):
             )
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def nunique(self, dropna: bool = True):
         """
         Returns a per column mapping with counts of unique values for
@@ -1856,7 +1856,7 @@ def nunique(self, dropna: bool = True):
         )
 
     @staticmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _repeat(
         columns: list[ColumnBase], repeats, axis=None
     ) -> list[ColumnBase]:
@@ -1870,7 +1870,7 @@ def _repeat(
 
         return libcudf.filling.repeat(columns, repeats)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @_warn_no_dask_cudf
     def __dask_tokenize__(self):
         from dask.base import normalize_token
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 77b54a583d3..eccb3acabf6 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -31,7 +31,7 @@
 from cudf.core.mixins import Reducible, Scannable
 from cudf.core.multiindex import MultiIndex
 from cudf.core.udf.groupby_utils import _can_be_jitted, jit_groupby_apply
-from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _performance_tracking
 from cudf.utils.utils import GetAttrGetItemMixin
 
 if TYPE_CHECKING:
@@ -392,7 +392,7 @@ def indices(self):
             zip(index.to_pandas(), cp.split(indices.values, offsets[1:-1]))
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def get_group(self, name, obj=None):
         """
         Construct DataFrame from group with provided name.
@@ -436,7 +436,7 @@ def get_group(self, name, obj=None):
             )
         return obj.iloc[self.indices[name]]
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def size(self):
         """
         Return the size of each group.
@@ -451,7 +451,7 @@ def size(self):
             .agg("size")
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def cumcount(self):
         """
         Return the cumulative count of keys in each group.
@@ -467,7 +467,7 @@ def cumcount(self):
             .agg("cumcount")
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def rank(
         self,
         method="average",
@@ -521,7 +521,7 @@ def _groupby(self):
             [*self.grouping.keys._columns], dropna=self._dropna
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def agg(self, func):
         """
         Apply aggregation(s) to the groups.
@@ -821,7 +821,7 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool):
         else:
             return result
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def head(self, n: int = 5, *, preserve_order: bool = True):
         """Return first n rows of each group
 
@@ -874,7 +874,7 @@ def head(self, n: int = 5, *, preserve_order: bool = True):
             n, take_head=True, preserve_order=preserve_order
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def tail(self, n: int = 5, *, preserve_order: bool = True):
         """Return last n rows of each group
 
@@ -928,7 +928,7 @@ def tail(self, n: int = 5, *, preserve_order: bool = True):
             n, take_head=False, preserve_order=preserve_order
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def nth(self, n):
         """
         Return the nth row from each group.
@@ -949,7 +949,7 @@ def nth(self, n):
         del self.obj._data["__groupbynth_order__"]
         return result
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def ngroup(self, ascending=True):
         """
         Number each group from 0 to the number of groups - 1.
@@ -1261,7 +1261,7 @@ def _normalize_aggs(
         ]
         return column_names, columns, normalized_aggs
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def pipe(self, func, *args, **kwargs):
         """
         Apply a function `func` with arguments to this GroupBy
@@ -1316,7 +1316,7 @@ def pipe(self, func, *args, **kwargs):
         """
         return cudf.core.common.pipe(self, func, *args, **kwargs)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _jit_groupby_apply(
         self, function, group_names, offsets, group_keys, grouped_values, *args
     ):
@@ -1327,7 +1327,7 @@ def _jit_groupby_apply(
             chunk_results, group_names, group_keys, grouped_values
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _iterative_groupby_apply(
         self, function, group_names, offsets, group_keys, grouped_values, *args
     ):
@@ -1415,7 +1415,7 @@ def _post_process_chunk_results(
                 result.index = cudf.MultiIndex._from_data(index_data)
         return result
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def apply(
         self, function, *args, engine="auto", include_groups: bool = True
     ):
@@ -1573,7 +1573,7 @@ def mult(df):
             result = result.reset_index()
         return result
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def apply_grouped(self, function, **kwargs):
         """Apply a transformation function over the grouped chunk.
 
@@ -1712,7 +1712,7 @@ def rolling_avg(val, avg):
         kwargs.update({"chunks": offsets})
         return grouped_values.apply_chunks(function, **kwargs)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _broadcast(self, values):
         """
         Broadcast the results of an aggregation to the group
@@ -1736,7 +1736,7 @@ def _broadcast(self, values):
             values.index = self.obj.index
         return values
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def transform(self, function):
         """Apply an aggregation, then broadcast the result to the group size.
 
@@ -1801,7 +1801,7 @@ def rolling(self, *args, **kwargs):
         """
         return cudf.core.window.rolling.RollingGroupby(self, *args, **kwargs)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def count(self, dropna=True):
         """Compute the number of values in each column.
 
@@ -1816,7 +1816,7 @@ def func(x):
 
         return self.agg(func)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def describe(self, include=None, exclude=None):
         """
         Generate descriptive statistics that summarizes the central tendency,
@@ -1888,7 +1888,7 @@ def describe(self, include=None, exclude=None):
         )
         return res
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def corr(self, method="pearson", min_periods=1):
         """
         Compute pairwise correlation of columns, excluding NA/null values.
@@ -1950,7 +1950,7 @@ def corr(self, method="pearson", min_periods=1):
             lambda x: x.corr(method, min_periods), "Correlation"
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def cov(self, min_periods=0, ddof=1):
         """
         Compute the pairwise covariance among the columns of a DataFrame,
@@ -2129,7 +2129,7 @@ def _cov_or_corr(self, func, method_name):
 
         return res
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def var(self, ddof=1):
         """Compute the column-wise variance of the values in each group.
 
@@ -2145,7 +2145,7 @@ def func(x):
 
         return self.agg(func)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def std(self, ddof=1):
         """Compute the column-wise std of the values in each group.
 
@@ -2161,7 +2161,7 @@ def func(x):
 
         return self.agg(func)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def quantile(self, q=0.5, interpolation="linear"):
         """Compute the column-wise quantiles of the values in each group.
 
@@ -2179,18 +2179,18 @@ def func(x):
 
         return self.agg(func)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def collect(self):
         """Get a list of all the values for each column in each group."""
         _deprecate_collect()
         return self.agg(list)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def unique(self):
         """Get a list of the unique values for each column in each group."""
         return self.agg("unique")
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def diff(self, periods=1, axis=0):
         """Get the difference between the values in each group.
 
@@ -2258,7 +2258,7 @@ def bfill(self, limit=None):
 
         return self._scan_fill("bfill", limit)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def fillna(
         self,
         value=None,
@@ -2325,7 +2325,7 @@ def fillna(
             value=value, inplace=inplace, axis=axis, limit=limit
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def shift(self, periods=1, freq=None, axis=0, fill_value=None):
         """
         Shift each group by ``periods`` positions.
@@ -2388,7 +2388,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
         result = self._mimic_pandas_order(result)
         return result._copy_type_metadata(values)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def pct_change(
         self,
         periods=1,
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 71658695b80..e069f8d0ea6 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -58,7 +58,7 @@
     is_mixed_with_object_dtype,
     numeric_normalize_types,
 )
-from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _performance_tracking
 from cudf.utils.utils import _warn_no_dask_cudf, search_range
 
 if TYPE_CHECKING:
@@ -204,7 +204,7 @@ class RangeIndex(BaseIndex, BinaryOperand):
 
     _range: range
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __init__(
         self, start, stop=None, step=1, dtype=None, copy=False, name=None
     ):
@@ -259,17 +259,17 @@ def factorize(self, sort: bool = False, use_na_sentinel: bool = True):
         return codes, uniques
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def name(self):
         return self._name
 
     @name.setter  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def name(self, value):
         self._name = value
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def start(self) -> int:
         """
         The value of the `start` parameter (0 if this was not supplied).
@@ -277,7 +277,7 @@ def start(self) -> int:
         return self._range.start
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def stop(self) -> int:
         """
         The value of the stop parameter.
@@ -285,7 +285,7 @@ def stop(self) -> int:
         return self._range.stop
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def step(self) -> int:
         """
         The value of the step parameter.
@@ -293,12 +293,12 @@ def step(self) -> int:
         return self._range.step
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _num_rows(self) -> int:
         return len(self)
 
     @cached_property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _values(self):
         if len(self) > 0:
             return column.as_column(self._range, dtype=self.dtype)
@@ -330,18 +330,18 @@ def _is_interval(self) -> bool:
         return False
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def hasnans(self) -> bool:
         return False
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _data(self):
         return cudf.core.column_accessor.ColumnAccessor(
             {self.name: self._values}
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __contains__(self, item):
         hash(item)
         if isinstance(item, bool) or not isinstance(
@@ -357,7 +357,7 @@ def __contains__(self, item):
         except (ValueError, OverflowError):
             return False
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def copy(self, name=None, deep=False):
         """
         Make a copy of this object.
@@ -377,7 +377,7 @@ def copy(self, name=None, deep=False):
 
         return RangeIndex(self._range, name=name)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def astype(self, dtype, copy: bool = True):
         if is_dtype_equal(dtype, self.dtype):
             return self
@@ -386,15 +386,15 @@ def astype(self, dtype, copy: bool = True):
     def fillna(self, value, downcast=None):
         return self.copy()
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def drop_duplicates(self, keep="first"):
         return self
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def duplicated(self, keep="first") -> cupy.ndarray:
         return cupy.zeros(len(self), dtype=bool)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __repr__(self):
         return (
             f"{self.__class__.__name__}(start={self.start}, stop={self.stop}"
@@ -408,15 +408,15 @@ def __repr__(self):
         )
 
     @property
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def size(self) -> int:
         return len(self)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __len__(self):
         return len(self._range)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __getitem__(self, index):
         if isinstance(index, slice):
             sl_start, sl_stop, sl_step = index.indices(len(self))
@@ -435,13 +435,13 @@ def __getitem__(self, index):
             return self.start + index * self.step
         return self._as_int_index()[index]
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def equals(self, other) -> bool:
         if isinstance(other, RangeIndex):
             return self._range == other._range
         return self._as_int_index().equals(other)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def serialize(self):
         header = {}
         header["index_column"] = {}
@@ -462,7 +462,7 @@ def serialize(self):
         return header, frames
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def deserialize(cls, header, frames):
         h = header["index_column"]
         name = pickle.loads(header["name"])
@@ -472,7 +472,7 @@ def deserialize(cls, header, frames):
         return RangeIndex(start=start, stop=stop, step=step, name=name)
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def dtype(self):
         """
         `dtype` of the range of values in RangeIndex.
@@ -487,7 +487,7 @@ def dtype(self):
     def _dtypes(self) -> Iterable:
         return [(self.name, self.dtype)]
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_pandas(
         self, *, nullable: bool = False, arrow_type: bool = False
     ) -> pd.RangeIndex:
@@ -508,16 +508,16 @@ def is_unique(self) -> bool:
         return True
 
     @cached_property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_monotonic_increasing(self) -> bool:
         return self.step > 0 or len(self) <= 1
 
     @cached_property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_monotonic_decreasing(self):
         return self.step < 0 or len(self) <= 1
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def memory_usage(self, deep: bool = False) -> int:
         if deep:
             warnings.warn(
@@ -530,7 +530,7 @@ def unique(self) -> Self:
         # RangeIndex always has unique values
         return self.copy()
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __mul__(self, other):
         # Multiplication by raw ints must return a RangeIndex to match pandas.
         if isinstance(other, cudf.Scalar) and other.dtype.kind in "iu":
@@ -547,24 +547,24 @@ def __mul__(self, other):
             )
         return self._as_int_index().__mul__(other)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __rmul__(self, other):
         # Multiplication is commutative.
         return self.__mul__(other)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _as_int_index(self):
         # Convert self to an integer index. This method is used to perform ops
         # that are not defined directly on RangeIndex.
         return cudf.Index._from_data(self._data)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
         return self._as_int_index().__array_ufunc__(
             ufunc, method, *inputs, **kwargs
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def get_indexer(self, target, limit=None, method=None, tolerance=None):
         target_col = cudf.core.column.as_column(target)
         if method is not None or not isinstance(
@@ -594,7 +594,7 @@ def get_indexer(self, target, limit=None, method=None, tolerance=None):
             locs[valid] = len(self) - 1 - locs[valid]
         return locs
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def get_loc(self, key):
         if not is_scalar(key):
             raise TypeError("Should be a scalar-like")
@@ -608,7 +608,7 @@ def get_loc(self, key):
             raise KeyError(key)
         return idx_int
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _union(self, other, sort=None):
         if isinstance(other, RangeIndex):
             # Variable suffixes are of the
@@ -685,7 +685,7 @@ def _union(self, other, sort=None):
             self._as_int_index()._union(other, sort=sort)
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _intersection(self, other, sort=None):
         if not isinstance(other, RangeIndex):
             return self._try_reconstruct_range_index(
@@ -733,7 +733,7 @@ def _intersection(self, other, sort=None):
 
         return self._try_reconstruct_range_index(new_index)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def difference(self, other, sort=None):
         if isinstance(other, RangeIndex) and self.equals(other):
             return self[:0]._get_reconciled_name_object(other)
@@ -785,14 +785,14 @@ def sort_values(
         else:
             return sorted_index
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _gather(self, gather_map, nullify=False, check_bounds=True):
         gather_map = cudf.core.column.as_column(gather_map)
         return cudf.Index._from_data(
             {self.name: self._values.take(gather_map, nullify, check_bounds)}
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _apply_boolean_mask(self, boolean_mask):
         return cudf.Index._from_data(
             {self.name: self._values.apply_boolean_mask(boolean_mask)}
@@ -838,21 +838,21 @@ def join(
         )
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _column(self):
         return self._as_int_index()._column
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _columns(self):
         return self._as_int_index()._columns
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def values_host(self) -> np.ndarray:
         return np.arange(start=self.start, stop=self.stop, step=self.step)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def argsort(
         self,
         ascending=True,
@@ -865,19 +865,19 @@ def argsort(
         else:
             return cupy.arange(len(self))
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def where(self, cond, other=None, inplace=False):
         return self._as_int_index().where(cond, other, inplace)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_numpy(self) -> np.ndarray:
         return self.values_host
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_cupy(self) -> cupy.ndarray:
         return self.values
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_arrow(self) -> pa.Array:
         return pa.array(self._range, type=pa.from_numpy_dtype(self.dtype))
 
@@ -889,23 +889,23 @@ def __array__(self, dtype=None):
             "using .to_numpy()."
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def nunique(self, dropna: bool = True) -> int:
         return len(self)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def isna(self) -> cupy.ndarray:
         return cupy.zeros(len(self), dtype=bool)
 
     isnull = isna
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def notna(self) -> cupy.ndarray:
         return cupy.ones(len(self), dtype=bool)
 
     notnull = isna
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _minmax(self, meth: str):
         no_steps = len(self) - 1
         if no_steps == -1:
@@ -1004,12 +1004,12 @@ class Index(SingleColumnFrame, BaseIndex, metaclass=IndexMeta):
         Column's, the data Column will be cloned to adopt this name.
     """
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __init__(self, data, **kwargs):
         name = _getdefault_name(data, name=kwargs.get("name"))
         super().__init__({name: data})
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
         ret = super().__array_ufunc__(ufunc, method, *inputs, **kwargs)
 
@@ -1046,7 +1046,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
         return NotImplemented
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _from_data(cls, data: MutableMapping, name: Any = no_default) -> Self:
         out = super()._from_data(data=data)
         if name is not no_default:
@@ -1054,7 +1054,7 @@ def _from_data(cls, data: MutableMapping, name: Any = no_default) -> Self:
         return out
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _from_data_like_self(
         cls, data: MutableMapping, name: Any = no_default
     ) -> Self:
@@ -1064,7 +1064,7 @@ def _from_data_like_self(
         return out
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def from_arrow(cls, obj):
         try:
             return cls(ColumnBase.from_arrow(obj))
@@ -1118,12 +1118,12 @@ def _binaryop(
         return ret
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _values(self):
         return self._column
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _concat(cls, objs):
         non_empties = [index for index in objs if len(index)]
         if len(objs) != len(non_empties):
@@ -1166,16 +1166,16 @@ def _concat(cls, objs):
         result.name = name
         return result
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def memory_usage(self, deep=False):
         return self._column.memory_usage
 
     @cached_property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_unique(self):
         return self._column.is_unique
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def equals(self, other) -> bool:
         if not isinstance(other, BaseIndex) or len(self) != len(other):
             return False
@@ -1198,7 +1198,7 @@ def equals(self, other) -> bool:
         except TypeError:
             return False
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def copy(self, name=None, deep=False):
         """
         Make a copy of this object.
@@ -1221,11 +1221,11 @@ def copy(self, name=None, deep=False):
             {name: self._values.copy(True) if deep else self._values}
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def astype(self, dtype, copy: bool = True):
         return super().astype({self.name: dtype}, copy)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def get_indexer(self, target, method=None, limit=None, tolerance=None):
         if is_scalar(target):
             raise TypeError("Should be a sequence")
@@ -1297,7 +1297,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
 
         return _return_get_indexer_result(result_series.to_cupy())
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def get_loc(self, key):
         if not is_scalar(key):
             raise TypeError("Should be a scalar-like")
@@ -1333,7 +1333,7 @@ def get_loc(self, key):
         mask[true_inds] = True
         return mask
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __repr__(self):
         max_seq_items = pd.get_option("max_seq_items") or len(self)
         mr = 0
@@ -1419,7 +1419,7 @@ def __repr__(self):
         lines.append(f"{prior_to_dtype} {keywords})")
         return "\n".join(lines)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __getitem__(self, index):
         res = self._get_elements_from_column(index)
         if isinstance(res, ColumnBase):
@@ -1427,20 +1427,20 @@ def __getitem__(self, index):
         return res
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def dtype(self):
         """
         `dtype` of the underlying values in Index.
         """
         return self._values.dtype
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def isna(self):
         return self._column.isnull().values
 
     isnull = isna
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def notna(self):
         return self._column.notnull().values
 
@@ -1470,11 +1470,11 @@ def _is_interval(self):
         return False
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def hasnans(self):
         return self._column.has_nulls(include_nan=True)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def argsort(
         self,
         axis=0,
@@ -1518,7 +1518,7 @@ def repeat(self, repeats, axis=None):
             Frame._repeat([*self._columns], repeats, axis), self._column_names
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def where(self, cond, other=None, inplace=False):
         result_col = super().where(cond, other, inplace)
         return self._mimic_inplace(
@@ -1615,7 +1615,7 @@ def _indices_of(self, value):
 
     @copy_docstring(StringMethods)  # type: ignore
     @property
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def str(self):
         if is_string_dtype(self.dtype):
             return StringMethods(parent=self)
@@ -1698,7 +1698,7 @@ class DatetimeIndex(Index):
                   dtype='datetime64[ns]', name='a')
     """
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __init__(
         self,
         data=None,
@@ -1761,7 +1761,7 @@ def __init__(
             ):
                 raise ValueError("No unique frequency found")
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _copy_type_metadata(self: Self, other: Self) -> Self:
         super()._copy_type_metadata(other)
         self._freq = _validate_freq(other._freq)
@@ -1783,7 +1783,7 @@ def __getitem__(self, index):
             return pd.Timestamp(value)
         return value
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def copy(self, name=None, deep=False):
         idx_copy = super().copy(name=name, deep=deep)
         return idx_copy._copy_type_metadata(self)
@@ -1801,7 +1801,7 @@ def searchsorted(
         )
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def year(self):
         """
         The year of the datetime.
@@ -1820,7 +1820,7 @@ def year(self):
         return self._get_dt_field("year")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def month(self):
         """
         The month as January=1, December=12.
@@ -1839,7 +1839,7 @@ def month(self):
         return self._get_dt_field("month")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def day(self):
         """
         The day of the datetime.
@@ -1858,7 +1858,7 @@ def day(self):
         return self._get_dt_field("day")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def hour(self):
         """
         The hours of the datetime.
@@ -1879,7 +1879,7 @@ def hour(self):
         return self._get_dt_field("hour")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def minute(self):
         """
         The minutes of the datetime.
@@ -1900,7 +1900,7 @@ def minute(self):
         return self._get_dt_field("minute")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def second(self):
         """
         The seconds of the datetime.
@@ -1921,7 +1921,7 @@ def second(self):
         return self._get_dt_field("second")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def microsecond(self):
         """
         The microseconds of the datetime.
@@ -1952,7 +1952,7 @@ def microsecond(self):
         )
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def nanosecond(self):
         """
         The nanoseconds of the datetime.
@@ -1974,7 +1974,7 @@ def nanosecond(self):
         return self._get_dt_field("nanosecond")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def weekday(self):
         """
         The day of the week with Monday=0, Sunday=6.
@@ -1996,7 +1996,7 @@ def weekday(self):
         return self._get_dt_field("weekday")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def dayofweek(self):
         """
         The day of the week with Monday=0, Sunday=6.
@@ -2018,7 +2018,7 @@ def dayofweek(self):
         return self._get_dt_field("weekday")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def dayofyear(self):
         """
         The day of the year, from 1-365 in non-leap years and
@@ -2041,7 +2041,7 @@ def dayofyear(self):
         return self._get_dt_field("day_of_year")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def day_of_year(self):
         """
         The day of the year, from 1-365 in non-leap years and
@@ -2064,7 +2064,7 @@ def day_of_year(self):
         return self._get_dt_field("day_of_year")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_leap_year(self):
         """
         Boolean indicator if the date belongs to a leap year.
@@ -2083,7 +2083,7 @@ def is_leap_year(self):
         return cupy.asarray(res)
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def quarter(self):
         """
         Integer indicator for which quarter of the year the date belongs in.
@@ -2108,7 +2108,7 @@ def quarter(self):
         res = extract_quarter(self._values)
         return Index(res, dtype="int8")
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def day_name(self, locale: str | None = None) -> Index:
         """
         Return the day names. Currently supports English locale only.
@@ -2128,7 +2128,7 @@ def day_name(self, locale: str | None = None) -> Index:
         day_names = self._column.get_day_names(locale)
         return Index._from_data({self.name: day_names})
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def month_name(self, locale: str | None = None) -> Index:
         """
         Return the month names. Currently supports English locale only.
@@ -2147,7 +2147,7 @@ def month_name(self, locale: str | None = None) -> Index:
         month_names = self._column.get_month_names(locale)
         return Index._from_data({self.name: month_names})
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def isocalendar(self) -> cudf.DataFrame:
         """
         Returns a DataFrame with the year, week, and day
@@ -2172,7 +2172,7 @@ def isocalendar(self) -> cudf.DataFrame:
         )
         return cudf.DataFrame._from_data(ca, index=self)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_pandas(
         self, *, nullable: bool = False, arrow_type: bool = False
     ) -> pd.DatetimeIndex:
@@ -2181,7 +2181,7 @@ def to_pandas(
             result.freq = self._freq._maybe_as_fast_pandas_offset()
         return result
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _get_dt_field(self, field):
         out_column = self._values.get_dt_field(field)
         # column.column_empty_like always returns a Column object
@@ -2198,7 +2198,7 @@ def _get_dt_field(self, field):
     def _is_boolean(self):
         return False
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def ceil(self, freq):
         """
         Perform ceil operation on the data to the specified freq.
@@ -2231,7 +2231,7 @@ def ceil(self, freq):
 
         return self.__class__._from_data({self.name: out_column})
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def floor(self, freq):
         """
         Perform floor operation on the data to the specified freq.
@@ -2264,7 +2264,7 @@ def floor(self, freq):
 
         return self.__class__._from_data({self.name: out_column})
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def round(self, freq):
         """
         Perform round operation on the data to the specified freq.
@@ -2452,7 +2452,7 @@ class TimedeltaIndex(Index):
                   dtype='timedelta64[s]', name='delta-index')
     """
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __init__(
         self,
         data=None,
@@ -2500,7 +2500,7 @@ def __getitem__(self, index):
         return value
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def days(self):
         """
         Number of days for each element.
@@ -2509,7 +2509,7 @@ def days(self):
         return Index(self._values.days, name=self.name, dtype="int64")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def seconds(self):
         """
         Number of seconds (>= 0 and less than 1 day) for each element.
@@ -2517,7 +2517,7 @@ def seconds(self):
         return Index(self._values.seconds, name=self.name, dtype="int32")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def microseconds(self):
         """
         Number of microseconds (>= 0 and less than 1 second) for each element.
@@ -2525,7 +2525,7 @@ def microseconds(self):
         return Index(self._values.microseconds, name=self.name, dtype="int32")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def nanoseconds(self):
         """
         Number of nanoseconds (>= 0 and less than 1 microsecond) for each
@@ -2534,7 +2534,7 @@ def nanoseconds(self):
         return Index(self._values.nanoseconds, name=self.name, dtype="int32")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def components(self):
         """
         Return a dataframe of the components (days, hours, minutes,
@@ -2612,7 +2612,7 @@ class CategoricalIndex(Index):
     CategoricalIndex([1, 2, 3, <NA>], categories=[1, 2, 3], ordered=False, dtype='category', name='a')
     """  # noqa: E501
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __init__(
         self,
         data=None,
@@ -2667,7 +2667,7 @@ def __init__(
         super().__init__(data, name=name)
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def codes(self):
         """
         The category codes of this categorical.
@@ -2675,7 +2675,7 @@ def codes(self):
         return Index(self._values.codes)
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def categories(self):
         """
         The categories of this categorical.
@@ -2689,7 +2689,7 @@ def _is_categorical(self):
         return True
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def interval_range(
     start=None,
     end=None,
@@ -2841,7 +2841,7 @@ class IntervalIndex(Index):
     IntervalIndex
     """
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __init__(
         self,
         data,
@@ -2900,7 +2900,7 @@ def closed(self):
         return self.dtype.closed
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def from_breaks(
         cls,
         breaks,
@@ -2975,7 +2975,7 @@ def _clean_nulls_from_index(self):
         return self
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def as_index(
     arbitrary, nan_as_null=no_default, copy=False, name=no_default, dtype=None
 ) -> BaseIndex:
@@ -3090,7 +3090,7 @@ def _getdefault_name(values, name):
     return name
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def _concat_range_index(indexes: list[RangeIndex]) -> BaseIndex:
     """
     An internal Utility function to concat RangeIndex objects.
@@ -3131,7 +3131,7 @@ def _concat_range_index(indexes: list[RangeIndex]) -> BaseIndex:
     return RangeIndex(start, stop, step)
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def _extended_gcd(a: int, b: int) -> tuple[int, int, int]:
     """
     Extended Euclidean algorithms to solve Bezout's identity:
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 280a6e92eab..72bd3c45fa6 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -56,7 +56,7 @@
 from cudf.utils import docutils, ioutils
 from cudf.utils._numba import _CUDFNumbaConfig
 from cudf.utils.docutils import copy_docstring
-from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _performance_tracking
 from cudf.utils.utils import _warn_no_dask_cudf
 
 if TYPE_CHECKING:
@@ -301,13 +301,13 @@ def _from_data(
         out._index = RangeIndex(out._data.nrows) if index is None else index
         return out
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _from_data_like_self(self, data: MutableMapping):
         out = super()._from_data_like_self(data)
         out.index = self.index
         return out
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _from_columns_like_self(
         self,
         columns: list[ColumnBase],
@@ -363,7 +363,7 @@ def _mimic_inplace(
             self._index = result.index
         return super()._mimic_inplace(result, inplace)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _scan(self, op, axis=None, skipna=True):
         """
         Return {op_name} of the {cls}.
@@ -439,7 +439,7 @@ def _check_data_index_length_match(self) -> None:
             )
 
     @property
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def empty(self):
         """
         Indicator whether DataFrame or Series is empty.
@@ -501,7 +501,7 @@ def empty(self):
         """
         return self.size == 0
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @ioutils.doc_to_json()
     def to_json(self, path_or_buf=None, *args, **kwargs):
         """{docstring}"""
@@ -510,14 +510,14 @@ def to_json(self, path_or_buf=None, *args, **kwargs):
             self, path_or_buf=path_or_buf, *args, **kwargs
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @ioutils.doc_to_hdf()
     def to_hdf(self, path_or_buf, key, *args, **kwargs):
         """{docstring}"""
 
         cudf.io.hdf.to_hdf(path_or_buf, key, self, *args, **kwargs)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_string(self):
         r"""
         Convert to string
@@ -606,7 +606,7 @@ def copy(self, deep: bool = True) -> Self:
             self.index.copy(deep=False),
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def equals(self, other) -> bool:  # noqa: D102
         return super().equals(other) and self.index.equals(other.index)
 
@@ -632,7 +632,7 @@ def index(self, value):
 
         self._index = value
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def replace(
         self,
         to_replace=None,
@@ -900,7 +900,7 @@ def replace(
 
         return self._mimic_inplace(result, inplace=inplace)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def clip(self, lower=None, upper=None, inplace=False, axis=1):
         """
         Trim values at input threshold(s).
@@ -1026,7 +1026,7 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1):
         )
         return self._mimic_inplace(output, inplace=inplace)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def abs(self):
         """
         Return a Series/DataFrame with absolute numeric value of each element.
@@ -1052,7 +1052,7 @@ def abs(self):
         """
         return self._unaryop("abs")
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def dot(self, other, reflect=False):
         """
         Get dot product of frame and other, (binary operator `dot`).
@@ -1159,15 +1159,15 @@ def dot(self, other, reflect=False):
             )
         return result.item()
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __matmul__(self, other):
         return self.dot(other)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __rmatmul__(self, other):
         return self.dot(other, reflect=True)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def head(self, n=5):
         """
         Return the first `n` rows.
@@ -1246,7 +1246,7 @@ def head(self, n=5):
         """
         return self.iloc[:n]
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def tail(self, n=5):
         """
         Returns the last n rows as a new DataFrame or Series
@@ -1277,7 +1277,7 @@ def tail(self, n=5):
 
         return self.iloc[-n:]
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def pipe(self, func, *args, **kwargs):
         """
         Apply ``func(self, *args, **kwargs)``.
@@ -1324,7 +1324,7 @@ def pipe(self, func, *args, **kwargs):
         """
         return cudf.core.common.pipe(self, func, *args, **kwargs)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def sum(
         self,
         axis=no_default,
@@ -1385,7 +1385,7 @@ def sum(
             **kwargs,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def product(
         self,
         axis=no_default,
@@ -1452,7 +1452,7 @@ def product(
     # Alias for pandas compatibility.
     prod = product
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def mean(self, axis=0, skipna=True, numeric_only=False, **kwargs):
         """
         Return the mean of the values for the requested axis.
@@ -1541,7 +1541,7 @@ def median(
             **kwargs,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def std(
         self,
         axis=no_default,
@@ -1600,7 +1600,7 @@ def std(
             **kwargs,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def var(
         self,
         axis=no_default,
@@ -1658,7 +1658,7 @@ def var(
             **kwargs,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def kurtosis(self, axis=0, skipna=True, numeric_only=False, **kwargs):
         """
         Return Fisher's unbiased kurtosis of a sample.
@@ -1718,7 +1718,7 @@ def kurtosis(self, axis=0, skipna=True, numeric_only=False, **kwargs):
     # Alias for kurtosis.
     kurt = kurtosis
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def skew(self, axis=0, skipna=True, numeric_only=False, **kwargs):
         """
         Return unbiased Fisher-Pearson skew of a sample.
@@ -1777,7 +1777,7 @@ def skew(self, axis=0, skipna=True, numeric_only=False, **kwargs):
             **kwargs,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def mask(self, cond, other=None, inplace: bool = False) -> Self | None:
         """
         Replace values where the condition is True.
@@ -1839,7 +1839,7 @@ def mask(self, cond, other=None, inplace: bool = False) -> Self | None:
 
         return self.where(cond=~cond, other=other, inplace=inplace)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @copy_docstring(Rolling)
     def rolling(
         self, window, min_periods=None, center=False, axis=0, win_type=None
@@ -1879,7 +1879,7 @@ def ewm(
             times=times,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def nans_to_nulls(self):
         """
         Convert nans (if any) to nulls
@@ -1935,7 +1935,7 @@ def nans_to_nulls(self):
             self._data._from_columns_like_self(result)
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def interpolate(
         self,
         method="linear",
@@ -2034,7 +2034,7 @@ def interpolate(
             )
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def shift(self, periods=1, freq=None, axis=0, fill_value=None):
         """Shift values by `periods` positions."""
         axis = self._get_axis_from_axis_arg(axis)
@@ -2050,7 +2050,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
             self._data._from_columns_like_self(data_columns)
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def truncate(self, before=None, after=None, axis=0, copy=True):
         """
         Truncate a Series or DataFrame before and after some index value.
@@ -2398,7 +2398,7 @@ def iloc(self):
         return self._iloc_indexer_type(self)
 
     @property  # type:ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def axes(self):
         """
         Return a list representing the axes of the Series.
@@ -2530,7 +2530,7 @@ def squeeze(self, axis: Literal["index", "columns", 0, 1, None] = None):
         )
         return self.iloc[indexer]
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def scale(self):
         """
         Scale values to [0, 1] in float64
@@ -2565,7 +2565,7 @@ def scale(self):
         scaled.index = self.index.copy(deep=False)
         return scaled
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def sort_index(
         self,
         axis=0,
@@ -3070,7 +3070,7 @@ def drop_duplicates(
             self.index.names if not ignore_index else None,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def duplicated(self, subset=None, keep="first"):
         """
         Return boolean Series denoting duplicate rows.
@@ -3180,7 +3180,7 @@ def duplicated(self, subset=None, keep="first"):
         )
         return cudf.Series(result, index=self.index)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _empty_like(self, keep_index=True) -> Self:
         result = self._from_columns_like_self(
             libcudf.copying.columns_empty_like(
@@ -3217,7 +3217,7 @@ def _split(self, splits, keep_index=True):
             for i in range(len(splits) + 1)
         ]
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def bfill(self, value=None, axis=None, inplace=None, limit=None):
         """
         Synonym for :meth:`Series.fillna` with ``method='bfill'``.
@@ -3236,7 +3236,7 @@ def bfill(self, value=None, axis=None, inplace=None, limit=None):
                 limit=limit,
             )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def backfill(self, value=None, axis=None, inplace=None, limit=None):
         """
         Synonym for :meth:`Series.fillna` with ``method='bfill'``.
@@ -3256,7 +3256,7 @@ def backfill(self, value=None, axis=None, inplace=None, limit=None):
         )
         return self.bfill(value=value, axis=axis, inplace=inplace, limit=limit)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def ffill(self, value=None, axis=None, inplace=None, limit=None):
         """
         Synonym for :meth:`Series.fillna` with ``method='ffill'``.
@@ -3275,7 +3275,7 @@ def ffill(self, value=None, axis=None, inplace=None, limit=None):
                 limit=limit,
             )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def pad(self, value=None, axis=None, inplace=None, limit=None):
         """
         Synonym for :meth:`Series.fillna` with ``method='ffill'``.
@@ -3415,7 +3415,7 @@ def add_suffix(self, suffix):
         raise NotImplementedError
 
     @acquire_spill_lock()
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _apply(self, func, kernel_getter, *args, **kwargs):
         """Apply `func` across the rows of the frame."""
         if kwargs:
@@ -3626,7 +3626,7 @@ def _align_to_index(
         out.index.names = self.index.names
         return out
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _reindex(
         self,
         column_names,
@@ -4154,7 +4154,7 @@ def dropna(
 
         return self._mimic_inplace(result, inplace=inplace)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _drop_na_columns(self, how="any", subset=None, thresh=None):
         """
         Drop columns containing nulls
@@ -4471,7 +4471,7 @@ def last(self, offset):
             slice_func=lambda i: self.iloc[i:],
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def sample(
         self,
         n=None,
@@ -4751,7 +4751,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
 
         return NotImplemented
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def repeat(self, repeats, axis=None):
         """Repeats elements consecutively.
 
@@ -4949,7 +4949,7 @@ def astype(
                 raise e
             return self
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def drop(
         self,
         labels=None,
@@ -5161,7 +5161,7 @@ def drop(
         if not inplace:
             return out
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _explode(self, explode_column: Any, ignore_index: bool):
         # Helper function for `explode` in `Series` and `Dataframe`, explodes a
         # specified nested column. Other columns' corresponding rows are
@@ -5200,7 +5200,7 @@ def _explode(self, explode_column: Any, ignore_index: bool):
             self.index.names if not ignore_index else None,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def tile(self, count):
         """Repeats the rows `count` times to form a new Frame.
 
@@ -5233,7 +5233,7 @@ def tile(self, count):
             index_names=self._index_names,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def groupby(
         self,
         by=None,
@@ -5283,7 +5283,7 @@ def groupby(
             )
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Addition",
@@ -5324,7 +5324,7 @@ def add(self, other, axis, level=None, fill_value=None):  # noqa: D102
 
         return self._binaryop(other, "__add__", fill_value)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Addition",
@@ -5365,7 +5365,7 @@ def radd(self, other, axis, level=None, fill_value=None):  # noqa: D102
 
         return self._binaryop(other, "__radd__", fill_value)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Subtraction",
@@ -5408,7 +5408,7 @@ def subtract(self, other, axis, level=None, fill_value=None):  # noqa: D102
 
     sub = subtract
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Subtraction",
@@ -5449,7 +5449,7 @@ def rsub(self, other, axis, level=None, fill_value=None):  # noqa: D102
 
         return self._binaryop(other, "__rsub__", fill_value)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Multiplication",
@@ -5492,7 +5492,7 @@ def multiply(self, other, axis, level=None, fill_value=None):  # noqa: D102
 
     mul = multiply
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Multiplication",
@@ -5533,7 +5533,7 @@ def rmul(self, other, axis, level=None, fill_value=None):  # noqa: D102
 
         return self._binaryop(other, "__rmul__", fill_value)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Modulo",
@@ -5574,7 +5574,7 @@ def mod(self, other, axis, level=None, fill_value=None):  # noqa: D102
 
         return self._binaryop(other, "__mod__", fill_value)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Modulo",
@@ -5615,7 +5615,7 @@ def rmod(self, other, axis, level=None, fill_value=None):  # noqa: D102
 
         return self._binaryop(other, "__rmod__", fill_value)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Exponential",
@@ -5656,7 +5656,7 @@ def pow(self, other, axis, level=None, fill_value=None):  # noqa: D102
 
         return self._binaryop(other, "__pow__", fill_value)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Exponential",
@@ -5697,7 +5697,7 @@ def rpow(self, other, axis, level=None, fill_value=None):  # noqa: D102
 
         return self._binaryop(other, "__rpow__", fill_value)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Integer division",
@@ -5738,7 +5738,7 @@ def floordiv(self, other, axis, level=None, fill_value=None):  # noqa: D102
 
         return self._binaryop(other, "__floordiv__", fill_value)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Integer division",
@@ -5779,7 +5779,7 @@ def rfloordiv(self, other, axis, level=None, fill_value=None):  # noqa: D102
 
         return self._binaryop(other, "__rfloordiv__", fill_value)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Floating division",
@@ -5824,7 +5824,7 @@ def truediv(self, other, axis, level=None, fill_value=None):  # noqa: D102
     div = truediv
     divide = truediv
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Floating division",
@@ -5868,7 +5868,7 @@ def rtruediv(self, other, axis, level=None, fill_value=None):  # noqa: D102
     # Alias for rtruediv
     rdiv = rtruediv
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Equal to",
@@ -5908,7 +5908,7 @@ def eq(self, other, axis="columns", level=None, fill_value=None):  # noqa: D102
             other=other, op="__eq__", fill_value=fill_value, can_reindex=True
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Not equal to",
@@ -5948,7 +5948,7 @@ def ne(self, other, axis="columns", level=None, fill_value=None):  # noqa: D102
             other=other, op="__ne__", fill_value=fill_value, can_reindex=True
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Less than",
@@ -5988,7 +5988,7 @@ def lt(self, other, axis="columns", level=None, fill_value=None):  # noqa: D102
             other=other, op="__lt__", fill_value=fill_value, can_reindex=True
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Less than or equal to",
@@ -6028,7 +6028,7 @@ def le(self, other, axis="columns", level=None, fill_value=None):  # noqa: D102
             other=other, op="__le__", fill_value=fill_value, can_reindex=True
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Greater than",
@@ -6068,7 +6068,7 @@ def gt(self, other, axis="columns", level=None, fill_value=None):  # noqa: D102
             other=other, op="__gt__", fill_value=fill_value, can_reindex=True
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Greater than or equal to",
@@ -6123,7 +6123,7 @@ def _preprocess_subset(self, subset):
             raise KeyError(f"columns {diff} do not exist")
         return subset
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def rank(
         self,
         axis=0,
@@ -6291,7 +6291,7 @@ def _check_duplicate_level_names(specified, level_names):
         )
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def _get_replacement_values_for_columns(
     to_replace: Any, value: Any, columns_dtype_map: dict[Any, Any]
 ) -> tuple[dict[Any, bool], dict[Any, Any], dict[Any, Any]]:
@@ -6458,7 +6458,7 @@ def _is_series(obj):
     return isinstance(obj, Frame) and obj.ndim == 1 and obj.index is not None
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def _drop_rows_by_labels(
     obj: DataFrameOrSeries,
     labels: ColumnLike | abc.Iterable | str,
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 547c14cdc99..7657fa9e234 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -32,7 +32,7 @@
 )
 from cudf.core.join._join_helpers import _match_join_keys
 from cudf.utils.dtypes import is_column_like
-from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _performance_tracking
 from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name
 
 if TYPE_CHECKING:
@@ -126,7 +126,7 @@ class MultiIndex(Frame, BaseIndex, NotIterable):
                )
     """
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __init__(
         self,
         levels=None,
@@ -211,12 +211,12 @@ def __init__(
         self.names = names
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def names(self):
         return self._names
 
     @names.setter  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def names(self, value):
         if value is None:
             value = [None] * self.nlevels
@@ -242,13 +242,13 @@ def names(self, value):
             )
         self._names = pd.core.indexes.frozen.FrozenList(value)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_series(self, index=None, name=None):
         raise NotImplementedError(
             "MultiIndex.to_series isn't implemented yet."
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def astype(self, dtype, copy: bool = True):
         if not is_object_dtype(dtype):
             raise TypeError(
@@ -257,7 +257,7 @@ def astype(self, dtype, copy: bool = True):
             )
         return self
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def rename(self, names, inplace=False):
         """
         Alter MultiIndex level names
@@ -304,7 +304,7 @@ def rename(self, names, inplace=False):
         """
         return self.set_names(names, level=None, inplace=inplace)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def set_names(self, names, level=None, inplace=False):
         names_is_list_like = is_list_like(names)
         level_is_list_like = is_list_like(level)
@@ -342,7 +342,7 @@ def set_names(self, names, level=None, inplace=False):
         return self._set_names(names=names, inplace=inplace)
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _from_data(
         cls,
         data: MutableMapping,
@@ -354,16 +354,16 @@ def _from_data(
         return obj
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def name(self):
         return self._name
 
     @name.setter  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def name(self, value):
         self._name = value
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def copy(
         self,
         names=None,
@@ -432,7 +432,7 @@ def copy(
 
         return mi
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __repr__(self):
         max_seq_items = pd.get_option("display.max_seq_items") or len(self)
 
@@ -484,7 +484,7 @@ def _codes_frame(self):
 
     @property  # type: ignore
     @_external_only_api("Use ._codes_frame instead")
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def codes(self):
         """
         Returns the codes of the underlying MultiIndex.
@@ -510,13 +510,13 @@ def get_slice_bound(self, label, side, kind=None):
         raise NotImplementedError()
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def nlevels(self):
         """Integer number of levels in this MultiIndex."""
         return self._num_columns
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def levels(self):
         """
         Returns list of levels in the MultiIndex
@@ -548,12 +548,12 @@ def levels(self):
         return self._levels
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def ndim(self) -> int:
         """Dimension of the data. For MultiIndex ndim is always 2."""
         return 2
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _get_level_label(self, level):
         """Get name of the level.
 
@@ -570,7 +570,7 @@ def _get_level_label(self, level):
         else:
             return self._data.names[level]
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def isin(self, values, level=None):
         """Return a boolean array where the index values are in values.
 
@@ -669,7 +669,7 @@ def where(self, cond, other=None, inplace=False):
             ".where is not supported for MultiIndex operations"
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _compute_levels_and_codes(self):
         levels = []
 
@@ -683,7 +683,7 @@ def _compute_levels_and_codes(self):
         self._levels = levels
         self._codes = cudf.DataFrame._from_data(codes)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _compute_validity_mask(self, index, row_tuple, max_length):
         """Computes the valid set of indices of values in the lookup"""
         lookup = cudf.DataFrame()
@@ -731,7 +731,7 @@ def _compute_validity_mask(self, index, row_tuple, max_length):
                     raise KeyError(row)
         return result
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _get_valid_indices_by_tuple(self, index, row_tuple, max_length):
         # Instructions for Slicing
         # if tuple, get first and last elements of tuple
@@ -761,7 +761,7 @@ def _get_valid_indices_by_tuple(self, index, row_tuple, max_length):
             return row_tuple
         return self._compute_validity_mask(index, row_tuple, max_length)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _index_and_downcast(self, result, index, index_key):
         if isinstance(index_key, (numbers.Number, slice)):
             index_key = [index_key]
@@ -829,7 +829,7 @@ def _index_and_downcast(self, result, index, index_key):
             result.index = index
         return result
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _get_row_major(
         self,
         df: DataFrameOrSeries,
@@ -856,7 +856,7 @@ def _get_row_major(
         final = self._index_and_downcast(result, result.index, row_tuple)
         return final
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _validate_indexer(
         self,
         indexer: numbers.Number
@@ -884,7 +884,7 @@ def _validate_indexer(
             for i in indexer:
                 self._validate_indexer(i)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __eq__(self, other):
         if isinstance(other, MultiIndex):
             return np.array(
@@ -898,12 +898,12 @@ def __eq__(self, other):
         return NotImplemented
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def size(self):
         # The size of a MultiIndex is only dependent on the number of rows.
         return self._num_rows
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def take(self, indices):
         if isinstance(indices, cudf.Series) and indices.has_nulls:
             raise ValueError("Column must have no nulls.")
@@ -911,7 +911,7 @@ def take(self, indices):
         obj.names = self.names
         return obj
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def serialize(self):
         header, frames = super().serialize()
         # Overwrite the names in _data with the true names.
@@ -919,7 +919,7 @@ def serialize(self):
         return header, frames
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def deserialize(cls, header, frames):
         # Spoof the column names to construct the frame, then set manually.
         column_names = pickle.loads(header["column_names"])
@@ -927,7 +927,7 @@ def deserialize(cls, header, frames):
         obj = super().deserialize(header, frames)
         return obj._set_names(column_names)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __getitem__(self, index):
         flatten = isinstance(index, int)
 
@@ -954,7 +954,7 @@ def __getitem__(self, index):
             result._levels = self._levels
         return result
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_frame(self, index=True, name=no_default, allow_duplicates=False):
         """
         Create a DataFrame with the levels of the MultiIndex as columns.
@@ -1031,7 +1031,7 @@ def to_frame(self, index=True, name=no_default, allow_duplicates=False):
             data=ca, index=self if index else None
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def get_level_values(self, level):
         """
         Return the values at the requested level
@@ -1087,7 +1087,7 @@ def _is_interval(self):
         return False
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _concat(cls, objs):
         source_data = [o.to_frame(index=False) for o in objs]
 
@@ -1107,7 +1107,7 @@ def _concat(cls, objs):
         return cudf.MultiIndex.from_frame(source_data, names=names)
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def from_tuples(cls, tuples, names=None):
         """
         Convert list of tuples to MultiIndex.
@@ -1145,12 +1145,12 @@ def from_tuples(cls, tuples, names=None):
         pdi = pd.MultiIndex.from_tuples(tuples, names=names)
         return cls.from_pandas(pdi)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_numpy(self):
         return self.values_host
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def values_host(self):
         """
         Return a numpy representation of the MultiIndex.
@@ -1178,7 +1178,7 @@ def values_host(self):
         return self.to_pandas().values
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def values(self):
         """
         Return a CuPy representation of the MultiIndex.
@@ -1214,7 +1214,7 @@ def values(self):
         return self.to_frame(index=False).values
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def from_frame(cls, df: pd.DataFrame | cudf.DataFrame, names=None):
         """
         Make a MultiIndex from a DataFrame.
@@ -1289,7 +1289,7 @@ def from_frame(cls, df: pd.DataFrame | cudf.DataFrame, names=None):
         return obj
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def from_product(cls, arrays, names=None):
         """
         Make a MultiIndex from the cartesian product of multiple iterables.
@@ -1331,7 +1331,7 @@ def from_product(cls, arrays, names=None):
         return cls.from_pandas(pdi)
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def from_arrays(
         cls,
         arrays,
@@ -1390,7 +1390,7 @@ def from_arrays(
             codes=codes, levels=levels, sortorder=sortorder, names=names
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _poplevels(self, level):
         """
         Remove and return the specified levels from self.
@@ -1441,7 +1441,7 @@ def _poplevels(self, level):
 
         return popped
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def swaplevel(self, i=-2, j=-1):
         """
         Swap level i with level j.
@@ -1492,7 +1492,7 @@ def swaplevel(self, i=-2, j=-1):
             midx = midx.set_names(self.names)
         return midx
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def droplevel(self, level=-1):
         """
         Removes the specified levels from the MultiIndex.
@@ -1555,7 +1555,7 @@ def droplevel(self, level=-1):
         else:
             return mi
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_pandas(
         self, *, nullable: bool = False, arrow_type: bool = False
     ) -> pd.MultiIndex:
@@ -1572,7 +1572,7 @@ def to_pandas(
         )
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def from_pandas(cls, multiindex: pd.MultiIndex, nan_as_null=no_default):
         """
         Convert from a Pandas MultiIndex
@@ -1607,7 +1607,7 @@ def from_pandas(cls, multiindex: pd.MultiIndex, nan_as_null=no_default):
         )
 
     @cached_property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_unique(self):
         return len(self) == len(self.unique())
 
@@ -1615,7 +1615,7 @@ def is_unique(self):
     def dtype(self):
         return np.dtype("O")
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _is_sorted(self, ascending=None, null_position=None) -> bool:
         """
         Returns a boolean indicating whether the data of the MultiIndex are sorted
@@ -1661,7 +1661,7 @@ def _is_sorted(self, ascending=None, null_position=None) -> bool:
         )
 
     @cached_property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_monotonic_increasing(self) -> bool:
         """
         Return if the index is monotonic increasing
@@ -1670,7 +1670,7 @@ def is_monotonic_increasing(self) -> bool:
         return self._is_sorted(ascending=None, null_position=None)
 
     @cached_property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_monotonic_decreasing(self) -> bool:
         """
         Return if the index is monotonic decreasing
@@ -1680,7 +1680,7 @@ def is_monotonic_decreasing(self) -> bool:
             ascending=[False] * len(self.levels), null_position=None
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def fillna(self, value):
         """
         Fill null values with the specified value.
@@ -1721,11 +1721,11 @@ def fillna(self, value):
 
         return super().fillna(value=value)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def unique(self):
         return self.drop_duplicates(keep="first")
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def nunique(self, dropna: bool = True) -> int:
         mi = self.dropna(how="all") if dropna else self
         return len(mi.unique())
@@ -1740,7 +1740,7 @@ def _clean_nulls_from_index(self):
             index_df._clean_nulls_from_dataframe(index_df), names=self.names
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def memory_usage(self, deep=False):
         usage = sum(col.memory_usage for col in self._data.columns)
         if self.levels:
@@ -1751,13 +1751,13 @@ def memory_usage(self, deep=False):
                 usage += col.memory_usage
         return usage
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def difference(self, other, sort=None):
         if hasattr(other, "to_pandas"):
             other = other.to_pandas()
         return cudf.from_pandas(self.to_pandas().difference(other, sort))
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def append(self, other):
         """
         Append a collection of MultiIndex objects together
@@ -1820,7 +1820,7 @@ def append(self, other):
 
         return MultiIndex._concat(to_concat)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __array_function__(self, func, types, args, kwargs):
         cudf_df_module = MultiIndex
 
@@ -1867,7 +1867,7 @@ def _level_index_from_level(self, level):
                 ) from None
             return level
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def get_indexer(self, target, method=None, limit=None, tolerance=None):
         if tolerance is not None:
             raise NotImplementedError(
@@ -1926,7 +1926,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
 
         return _return_get_indexer_result(result_series.to_cupy())
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def get_loc(self, key):
         is_sorted = (
             self.is_monotonic_increasing or self.is_monotonic_decreasing
@@ -2000,7 +2000,7 @@ def _maybe_match_names(self, other):
             for self_name, other_name in zip(self.names, other.names)
         ]
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def union(self, other, sort=None):
         if not isinstance(other, MultiIndex):
             msg = "other must be a MultiIndex or a list of tuples"
@@ -2024,7 +2024,7 @@ def union(self, other, sort=None):
 
         return self._union(other, sort=sort)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _union(self, other, sort=None):
         # TODO: When to_frame is refactored to return a
         # deep copy in future, we should push most of the common
@@ -2050,7 +2050,7 @@ def _union(self, other, sort=None):
             return midx.sort_values()
         return midx
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _intersection(self, other, sort=None):
         if self.names != other.names:
             deep = True
@@ -2073,14 +2073,14 @@ def _intersection(self, other, sort=None):
             return midx.sort_values()
         return midx
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _copy_type_metadata(self: Self, other: Self) -> Self:
         res = super()._copy_type_metadata(other)
         if isinstance(other, MultiIndex):
             res._names = other._names
         return res
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _split_columns_by_levels(
         self, levels: tuple, *, in_levels: bool
     ) -> Generator[tuple[Any, column.ColumnBase], None, None]:
@@ -2099,7 +2099,7 @@ def _split_columns_by_levels(
             elif not in_levels and i not in level_indices:
                 yield name, col
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _new_index_for_reset_index(
         self, levels: tuple | None, name
     ) -> None | BaseIndex:
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index ea25d482578..9acf5294b72 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -68,7 +68,7 @@
     is_mixed_with_object_dtype,
     to_cudf_compatible_scalar,
 )
-from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _performance_tracking
 
 if TYPE_CHECKING:
     from cudf._typing import (
@@ -179,7 +179,7 @@ class _SeriesIlocIndexer(_FrameIndexer):
 
     _frame: cudf.Series
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __getitem__(self, arg):
         indexing_spec = indexing_utils.parse_row_iloc_indexer(
             indexing_utils.destructure_series_iloc_indexer(arg, self._frame),
@@ -187,7 +187,7 @@ def __getitem__(self, arg):
         )
         return self._frame._getitem_preprocessed(indexing_spec)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __setitem__(self, key, value):
         if isinstance(key, tuple):
             key = list(key)
@@ -274,7 +274,7 @@ class _SeriesLocIndexer(_FrameIndexer):
     Label-based selection
     """
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __getitem__(self, arg: Any) -> ScalarLike | DataFrameOrSeries:
         if isinstance(arg, pd.MultiIndex):
             arg = cudf.from_pandas(arg)
@@ -301,7 +301,7 @@ def __getitem__(self, arg: Any) -> ScalarLike | DataFrameOrSeries:
 
         return self._frame.iloc[arg]
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __setitem__(self, key, value):
         try:
             key = self._loc_to_iloc(key)
@@ -476,7 +476,7 @@ def _constructor_expanddim(self):
         return cudf.DataFrame
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def from_categorical(cls, categorical, codes=None):
         """Creates from a pandas.Categorical
 
@@ -517,7 +517,7 @@ def from_categorical(cls, categorical, codes=None):
         return Series(data=col)
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def from_masked_array(cls, data, mask, null_count=None):
         """Create a Series with null-mask.
         This is equivalent to:
@@ -566,7 +566,7 @@ def from_masked_array(cls, data, mask, null_count=None):
         col = as_column(data).set_mask(mask)
         return cls(data=col)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __init__(
         self,
         data=None,
@@ -663,7 +663,7 @@ def __init__(
         self._check_data_index_length_match()
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _from_data(
         cls,
         data: MutableMapping,
@@ -675,18 +675,18 @@ def _from_data(
             out.name = name
         return out
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _from_data_like_self(self, data: MutableMapping):
         out = super()._from_data_like_self(data)
         out.name = self.name
         return out
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __contains__(self, item):
         return item in self.index
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def from_pandas(cls, s: pd.Series, nan_as_null=no_default):
         """
         Convert from a Pandas Series.
@@ -735,7 +735,7 @@ def from_pandas(cls, s: pd.Series, nan_as_null=no_default):
         return result
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_unique(self):
         """Return boolean if values in the object are unique.
 
@@ -746,7 +746,7 @@ def is_unique(self):
         return self._column.is_unique
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def dt(self):
         """
         Accessor object for datetime-like properties of the Series values.
@@ -788,7 +788,7 @@ def dt(self):
             )
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def hasnans(self):
         """
         Return True if there are any NaNs or nulls.
@@ -829,7 +829,7 @@ def hasnans(self):
         """
         return self._column.has_nulls(include_nan=True)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def serialize(self):
         header, frames = super().serialize()
 
@@ -842,7 +842,7 @@ def serialize(self):
         return header, frames
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def deserialize(cls, header, frames):
         index_nframes = header["index_frame_count"]
         obj = super().deserialize(
@@ -855,7 +855,7 @@ def deserialize(cls, header, frames):
 
         return obj
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def drop(
         self,
         labels=None,
@@ -884,7 +884,7 @@ def tolist(self):  # noqa: D102
 
     to_list = tolist
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_dict(self, into: type[dict] = dict) -> dict:
         """
         Convert Series to {label -> value} dict or dict-like object.
@@ -923,7 +923,7 @@ def to_dict(self, into: type[dict] = dict) -> dict:
         """
         return self.to_pandas().to_dict(into=into)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def reindex(self, *args, **kwargs):
         """
         Conform Series to new index.
@@ -996,7 +996,7 @@ def reindex(self, *args, **kwargs):
         series.name = self.name
         return series
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_reset_index_template.format(
             klass="Series",
@@ -1081,7 +1081,7 @@ def reset_index(
             inplace=inplace,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_frame(self, name=None):
         """Convert Series into a DataFrame
 
@@ -1124,13 +1124,13 @@ def to_frame(self, name=None):
 
         return cudf.DataFrame({col: self._column}, index=self.index)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def memory_usage(self, index=True, deep=False):
         return self._column.memory_usage + (
             self.index.memory_usage() if index else 0
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __array_function__(self, func, types, args, kwargs):
         if "out" in kwargs or not all(issubclass(t, Series) for t in types):
             return NotImplemented
@@ -1191,7 +1191,7 @@ def __array_function__(self, func, types, args, kwargs):
 
         return NotImplemented
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def map(self, arg, na_action=None) -> "Series":
         """
         Map values of Series according to input correspondence.
@@ -1333,7 +1333,7 @@ def _getitem_preprocessed(
             return self._empty_like(keep_index=True)
         assert_never(spec)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __getitem__(self, arg):
         if isinstance(arg, slice):
             return self.iloc[arg]
@@ -1344,7 +1344,7 @@ def __getitem__(self, arg):
 
     items = SingleColumnFrame.__iter__
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __setitem__(self, key, value):
         if isinstance(key, slice):
             self.iloc[key] = value
@@ -1495,36 +1495,36 @@ def _make_operands_and_index_for_binop(
 
     @copy_docstring(CategoricalAccessor)  # type: ignore
     @property
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def cat(self):
         return CategoricalAccessor(parent=self)
 
     @copy_docstring(StringMethods)  # type: ignore
     @property
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def str(self):
         return StringMethods(parent=self)
 
     @copy_docstring(ListMethods)  # type: ignore
     @property
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def list(self):
         return ListMethods(parent=self)
 
     @copy_docstring(StructMethods)  # type: ignore
     @property
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def struct(self):
         return StructMethods(parent=self)
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def dtype(self):
         """The dtype of the Series."""
         return self._column.dtype
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _concat(cls, objs, axis=0, index=True):
         # Concatenate index if not provided
         if index is True:
@@ -1590,25 +1590,25 @@ def _concat(cls, objs, axis=0, index=True):
         return cls(data=col, index=index, name=name)
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def valid_count(self):
         """Number of non-null values"""
         return len(self) - self._column.null_count
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def null_count(self):
         """Number of null values"""
         return self._column.null_count
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def nullable(self):
         """A boolean indicating whether a null-mask is needed"""
         return self._column.nullable
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def has_nulls(self):
         """
         Indicator whether Series contains null values.
@@ -1637,7 +1637,7 @@ def has_nulls(self):
         """
         return self._column.has_nulls()
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def dropna(self, axis=0, inplace=False, how=None):
         """
         Return a Series with null values removed.
@@ -1717,7 +1717,7 @@ def dropna(self, axis=0, inplace=False, how=None):
 
         return self._mimic_inplace(result, inplace=inplace)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def drop_duplicates(self, keep="first", inplace=False, ignore_index=False):
         """
         Return Series with duplicate values removed.
@@ -1791,7 +1791,7 @@ def drop_duplicates(self, keep="first", inplace=False, ignore_index=False):
 
         return self._mimic_inplace(result, inplace=inplace)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def fillna(
         self, value=None, method=None, axis=None, inplace=False, limit=None
     ):
@@ -1896,7 +1896,7 @@ def between(self, left, right, inclusive="both") -> Series:
             )
         return self._from_data({self.name: lmask & rmask}, self.index)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def all(self, axis=0, bool_only=None, skipna=True, **kwargs):
         if bool_only not in (None, True):
             raise NotImplementedError(
@@ -1904,7 +1904,7 @@ def all(self, axis=0, bool_only=None, skipna=True, **kwargs):
             )
         return super().all(axis, skipna, **kwargs)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def any(self, axis=0, bool_only=None, skipna=True, **kwargs):
         if bool_only not in (None, True):
             raise NotImplementedError(
@@ -1912,7 +1912,7 @@ def any(self, axis=0, bool_only=None, skipna=True, **kwargs):
             )
         return super().any(axis, skipna, **kwargs)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_pandas(
         self,
         *,
@@ -2004,7 +2004,7 @@ def to_pandas(
         )
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def data(self):
         """The gpu buffer for the data
 
@@ -2029,12 +2029,12 @@ def data(self):
         return self._column.data
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def nullmask(self):
         """The gpu buffer for the null-mask"""
         return cudf.Series(self._column.nullmask)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def astype(
         self,
         dtype,
@@ -2051,13 +2051,13 @@ def astype(
             dtype = {self.name: dtype}
         return super().astype(dtype, copy, errors)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def sort_index(self, axis=0, *args, **kwargs):
         if axis not in (0, "index"):
             raise ValueError("Only axis=0 is valid for Series.")
         return super().sort_index(axis=axis, *args, **kwargs)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def sort_values(
         self,
         axis=0,
@@ -2112,7 +2112,7 @@ def sort_values(
             ignore_index=ignore_index,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def nlargest(self, n=5, keep="first"):
         """Returns a new Series of the *n* largest element.
 
@@ -2175,7 +2175,7 @@ def nlargest(self, n=5, keep="first"):
         """
         return self._n_largest_or_smallest(True, n, [self.name], keep)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def nsmallest(self, n=5, keep="first"):
         """
         Returns a new Series of the *n* smallest element.
@@ -2251,7 +2251,7 @@ def nsmallest(self, n=5, keep="first"):
         """
         return self._n_largest_or_smallest(False, n, [self.name], keep)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def argsort(
         self,
         axis=0,
@@ -2274,7 +2274,7 @@ def argsort(
         obj.name = self.name
         return obj
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def replace(self, to_replace=None, value=no_default, *args, **kwargs):
         if is_dict_like(to_replace) and value not in {None, no_default}:
             raise ValueError(
@@ -2284,7 +2284,7 @@ def replace(self, to_replace=None, value=no_default, *args, **kwargs):
 
         return super().replace(to_replace, value, *args, **kwargs)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def update(self, other):
         """
         Modify Series in place using values from passed Series.
@@ -2390,7 +2390,7 @@ def update(self, other):
         self.mask(mask, other, inplace=True)
 
     # UDF related
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def apply(self, func, convert_dtype=True, args=(), **kwargs):
         """
         Apply a scalar function to the values of a Series.
@@ -2535,7 +2535,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs):
     #
     # Stats
     #
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def count(self):
         """
         Return number of non-NA/null observations in the Series
@@ -2559,7 +2559,7 @@ def count(self):
         """
         return self.valid_count
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def mode(self, dropna=True):
         """
         Return the mode(s) of the dataset.
@@ -2630,7 +2630,7 @@ def mode(self, dropna=True):
             {self.name: val_counts.index.sort_values()}, name=self.name
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def round(self, decimals=0, how="half_even"):
         if not is_integer(decimals):
             raise ValueError(
@@ -2639,7 +2639,7 @@ def round(self, decimals=0, how="half_even"):
         decimals = int(decimals)
         return super().round(decimals, how)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def cov(self, other, min_periods=None):
         """
         Compute covariance with Series, excluding missing values.
@@ -2690,7 +2690,7 @@ def cov(self, other, min_periods=None):
                 f"{other.dtype}"
             )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def transpose(self):
         """Return the transpose, which is by definition self."""
 
@@ -2698,7 +2698,7 @@ def transpose(self):
 
     T = property(transpose, doc=transpose.__doc__)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def duplicated(self, keep="first"):
         """
         Indicate duplicate Series values.
@@ -2778,7 +2778,7 @@ def duplicated(self, keep="first"):
         """
         return super().duplicated(keep=keep)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def corr(self, other, method="pearson", min_periods=None):
         """Calculates the sample correlation between two Series,
         excluding missing values.
@@ -2830,7 +2830,7 @@ def corr(self, other, method="pearson", min_periods=None):
                 f"cannot perform corr with types {self.dtype}, {other.dtype}"
             )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def autocorr(self, lag=1):
         """Compute the lag-N autocorrelation. This method computes the Pearson
         correlation between the Series and its shifted self.
@@ -2856,7 +2856,7 @@ def autocorr(self, lag=1):
         """
         return self.corr(self.shift(lag))
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def isin(self, values):
         """Check whether values are contained in Series.
 
@@ -2926,7 +2926,7 @@ def isin(self, values):
             {self.name: self._column.isin(values)}, index=self.index
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def unique(self):
         """
         Returns unique values of this Series.
@@ -2961,7 +2961,7 @@ def unique(self):
             return res.values
         return Series(res, name=self.name)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def value_counts(
         self,
         normalize=False,
@@ -3116,7 +3116,7 @@ def value_counts(
         res.name = result_name
         return res
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def quantile(
         self, q=0.5, interpolation="linear", exact=True, quant_index=True
     ):
@@ -3195,7 +3195,7 @@ def quantile(
         )
 
     @docutils.doc_describe()
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def describe(
         self,
         percentiles=None,
@@ -3240,7 +3240,7 @@ def describe(
             name=self.name,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def digitize(self, bins, right=False):
         """Return the indices of the bins to which each value belongs.
 
@@ -3276,7 +3276,7 @@ def digitize(self, bins, right=False):
             cudf.core.column.numerical.digitize(self._column, bins, right)
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def diff(self, periods=1):
         """First discrete difference of element.
 
@@ -3347,7 +3347,7 @@ def diff(self, periods=1):
 
         return self - self.shift(periods=periods)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         groupby_doc_template.format(
             ret=textwrap.dedent(
@@ -3385,7 +3385,7 @@ def groupby(
             dropna,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def rename(self, index=None, copy=True):
         """
         Alter Series name
@@ -3431,7 +3431,7 @@ def rename(self, index=None, copy=True):
         out_data = self._data.copy(deep=copy)
         return Series._from_data(out_data, self.index, name=index)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def add_prefix(self, prefix):
         return Series._from_data(
             # TODO: Change to deep=False when copy-on-write is default
@@ -3439,7 +3439,7 @@ def add_prefix(self, prefix):
             index=prefix + self.index.astype(str),
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def add_suffix(self, suffix):
         return Series._from_data(
             # TODO: Change to deep=False when copy-on-write is default
@@ -3447,7 +3447,7 @@ def add_suffix(self, suffix):
             index=self.index.astype(str) + suffix,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def keys(self):
         """
         Return alias for index.
@@ -3491,7 +3491,7 @@ def keys(self):
         """
         return self.index
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def explode(self, ignore_index=False):
         """
         Transform each element of a list-like to a row, replicating index
@@ -3528,7 +3528,7 @@ def explode(self, ignore_index=False):
         """
         return super()._explode(self.name, ignore_index)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def pct_change(
         self, periods=1, fill_method=no_default, limit=no_default, freq=None
     ):
@@ -3602,7 +3602,7 @@ def pct_change(
         change = diff / data.shift(periods=periods, freq=freq)
         return change
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def where(self, cond, other=None, inplace=False):
         result_col = super().where(cond, other, inplace)
         return self._mimic_inplace(
@@ -3736,7 +3736,7 @@ class DatetimeProperties(BaseDatelikeProperties):
     """
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def year(self) -> Series:
         """
         The year of the datetime.
@@ -3761,7 +3761,7 @@ def year(self) -> Series:
         return self._get_dt_field("year")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def month(self) -> Series:
         """
         The month as January=1, December=12.
@@ -3786,7 +3786,7 @@ def month(self) -> Series:
         return self._get_dt_field("month")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def day(self) -> Series:
         """
         The day of the datetime.
@@ -3811,7 +3811,7 @@ def day(self) -> Series:
         return self._get_dt_field("day")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def hour(self) -> Series:
         """
         The hours of the datetime.
@@ -3836,7 +3836,7 @@ def hour(self) -> Series:
         return self._get_dt_field("hour")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def minute(self) -> Series:
         """
         The minutes of the datetime.
@@ -3861,7 +3861,7 @@ def minute(self) -> Series:
         return self._get_dt_field("minute")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def second(self) -> Series:
         """
         The seconds of the datetime.
@@ -3886,7 +3886,7 @@ def second(self) -> Series:
         return self._get_dt_field("second")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def microsecond(self) -> Series:
         """
         The microseconds of the datetime.
@@ -3918,7 +3918,7 @@ def microsecond(self) -> Series:
         return self._return_result_like_self(micro + extra)
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def nanosecond(self) -> Series:
         """
         The nanoseconds of the datetime.
@@ -3943,7 +3943,7 @@ def nanosecond(self) -> Series:
         return self._get_dt_field("nanosecond")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def weekday(self) -> Series:
         """
         The day of the week with Monday=0, Sunday=6.
@@ -3980,7 +3980,7 @@ def weekday(self) -> Series:
         return self._get_dt_field("weekday")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def dayofweek(self) -> Series:
         """
         The day of the week with Monday=0, Sunday=6.
@@ -4017,7 +4017,7 @@ def dayofweek(self) -> Series:
         return self._get_dt_field("weekday")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def dayofyear(self) -> Series:
         """
         The day of the year, from 1-365 in non-leap years and
@@ -4055,7 +4055,7 @@ def dayofyear(self) -> Series:
         return self._get_dt_field("day_of_year")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def day_of_year(self) -> Series:
         """
         The day of the year, from 1-365 in non-leap years and
@@ -4093,7 +4093,7 @@ def day_of_year(self) -> Series:
         return self._get_dt_field("day_of_year")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_leap_year(self) -> Series:
         """
         Boolean indicator if the date belongs to a leap year.
@@ -4148,7 +4148,7 @@ def is_leap_year(self) -> Series:
         return self._return_result_like_self(res)
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def quarter(self) -> Series:
         """
         Integer indicator for which quarter of the year the date belongs in.
@@ -4177,7 +4177,7 @@ def quarter(self) -> Series:
         )
         return self._return_result_like_self(res)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def day_name(self, locale: str | None = None) -> Series:
         """
         Return the day names. Currently supports English locale only.
@@ -4213,7 +4213,7 @@ def day_name(self, locale: str | None = None) -> Series:
             self.series._column.get_day_names(locale)
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def month_name(self, locale: str | None = None) -> Series:
         """
         Return the month names. Currently supports English locale only.
@@ -4243,7 +4243,7 @@ def month_name(self, locale: str | None = None) -> Series:
             self.series._column.get_month_names(locale)
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def isocalendar(self) -> cudf.DataFrame:
         """
         Returns a DataFrame with the year, week, and day
@@ -4291,7 +4291,7 @@ def isocalendar(self) -> cudf.DataFrame:
         )
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_month_start(self) -> Series:
         """
         Booleans indicating if dates are the first day of the month.
@@ -4299,7 +4299,7 @@ def is_month_start(self) -> Series:
         return (self.day == 1).fillna(False)
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def days_in_month(self) -> Series:
         """
         Get the total number of days in the month that the date falls on.
@@ -4348,7 +4348,7 @@ def days_in_month(self) -> Series:
         )
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_month_end(self) -> Series:
         """
         Boolean indicator if the date is the last day of the month.
@@ -4391,7 +4391,7 @@ def is_month_end(self) -> Series:
         return (self.day == last_day.dt.day).fillna(False)
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_quarter_start(self) -> Series:
         """
         Boolean indicator if the date is the first day of a quarter.
@@ -4436,7 +4436,7 @@ def is_quarter_start(self) -> Series:
         return self._return_result_like_self(result)
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_quarter_end(self) -> Series:
         """
         Boolean indicator if the date is the last day of a quarter.
@@ -4483,7 +4483,7 @@ def is_quarter_end(self) -> Series:
         return self._return_result_like_self(result)
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_year_start(self) -> Series:
         """
         Boolean indicator if the date is the first day of the year.
@@ -4514,7 +4514,7 @@ def is_year_start(self) -> Series:
         return self._return_result_like_self(outcol.fillna(False))
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_year_end(self) -> Series:
         """
         Boolean indicator if the date is the last day of the year.
@@ -4547,13 +4547,13 @@ def is_year_end(self) -> Series:
         result = cudf._lib.copying.copy_if_else(leap, non_leap, leap_dates)
         return self._return_result_like_self(result.fillna(False))
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _get_dt_field(self, field: str) -> Series:
         return self._return_result_like_self(
             self.series._column.get_dt_field(field)
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def ceil(self, freq: str) -> Series:
         """
         Perform ceil operation on the data to the specified freq.
@@ -4586,7 +4586,7 @@ def ceil(self, freq: str) -> Series:
         """
         return self._return_result_like_self(self.series._column.ceil(freq))
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def floor(self, freq: str) -> Series:
         """
         Perform floor operation on the data to the specified freq.
@@ -4619,7 +4619,7 @@ def floor(self, freq: str) -> Series:
         """
         return self._return_result_like_self(self.series._column.floor(freq))
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def round(self, freq: str) -> Series:
         """
         Perform round operation on the data to the specified freq.
@@ -4655,7 +4655,7 @@ def round(self, freq: str) -> Series:
         """
         return self._return_result_like_self(self.series._column.round(freq))
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def strftime(self, date_format: str, *args, **kwargs) -> Series:
         """
         Convert to Series using specified ``date_format``.
@@ -4832,7 +4832,7 @@ class TimedeltaProperties(BaseDatelikeProperties):
     """
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def days(self) -> Series:
         """
         Number of days.
@@ -4864,7 +4864,7 @@ def days(self) -> Series:
         return self._get_td_field("days")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def seconds(self) -> Series:
         """
         Number of seconds (>= 0 and less than 1 day).
@@ -4903,7 +4903,7 @@ def seconds(self) -> Series:
         return self._get_td_field("seconds")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def microseconds(self) -> Series:
         """
         Number of microseconds (>= 0 and less than 1 second).
@@ -4935,7 +4935,7 @@ def microseconds(self) -> Series:
         return self._get_td_field("microseconds")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def nanoseconds(self) -> Series:
         """
         Return the number of nanoseconds (n), where 0 <= n < 1 microsecond.
@@ -4967,7 +4967,7 @@ def nanoseconds(self) -> Series:
         return self._get_td_field("nanoseconds")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def components(self) -> cudf.DataFrame:
         """
         Return a Dataframe of the components of the Timedeltas.
@@ -4999,14 +4999,14 @@ def components(self) -> cudf.DataFrame:
             ca, index=self.series.index
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _get_td_field(self, field: str) -> Series:
         return self._return_result_like_self(
             getattr(self.series._column, field)
         )
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def _align_indices(series_list, how="outer", allow_non_unique=False):
     """
     Internal util to align the indices of a list of Series objects
@@ -5069,7 +5069,7 @@ def _align_indices(series_list, how="outer", allow_non_unique=False):
 
 
 @acquire_spill_lock()
-@_cudf_nvtx_annotate
+@_performance_tracking
 def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False):
     r"""Returns a boolean array where two arrays are equal within a tolerance.
 
diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py
index 23a2c828a04..f9555aee6a2 100644
--- a/python/cudf/cudf/core/single_column_frame.py
+++ b/python/cudf/cudf/core/single_column_frame.py
@@ -18,7 +18,7 @@
 )
 from cudf.core.column import ColumnBase, as_column
 from cudf.core.frame import Frame
-from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _performance_tracking
 from cudf.utils.utils import NotIterable
 
 if TYPE_CHECKING:
@@ -41,7 +41,7 @@ class SingleColumnFrame(Frame, NotIterable):
         "index": 0,
     }
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _reduce(
         self,
         op,
@@ -62,7 +62,7 @@ def _reduce(
         except AttributeError:
             raise TypeError(f"cannot perform {op} with type {self.dtype}")
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _scan(self, op, axis=None, *args, **kwargs):
         if axis not in (None, 0):
             raise NotImplementedError("axis parameter is not implemented yet")
@@ -70,24 +70,24 @@ def _scan(self, op, axis=None, *args, **kwargs):
         return super()._scan(op, axis=axis, *args, **kwargs)
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def name(self):
         """Get the name of this object."""
         return next(iter(self._column_names))
 
     @name.setter  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def name(self, value):
         self._data[value] = self._data.pop(self.name)
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def ndim(self) -> int:  # noqa: D401
         """Number of dimensions of the underlying data, by definition 1."""
         return 1
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def shape(self) -> tuple[int]:
         """Get a tuple representing the dimensionality of the Index."""
         return (len(self),)
@@ -99,27 +99,27 @@ def __bool__(self):
         )
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _num_columns(self) -> int:
         return 1
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _column(self) -> ColumnBase:
         return next(iter(self._columns))
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def values(self) -> cupy.ndarray:  # noqa: D102
         return self._column.values
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def values_host(self) -> numpy.ndarray:  # noqa: D102
         return self._column.values_host
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def from_arrow(cls, array) -> Self:
         """Create from PyArrow Array/ChunkedArray.
 
@@ -150,7 +150,7 @@ def from_arrow(cls, array) -> Self:
         """
         return cls(ColumnBase.from_arrow(array))
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_arrow(self) -> pa.Array:
         """
         Convert to a PyArrow Array.
@@ -182,7 +182,7 @@ def to_arrow(self) -> pa.Array:
         return self._column.to_arrow()
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_unique(self) -> bool:
         """Return boolean if values in the object are unique.
 
@@ -193,7 +193,7 @@ def is_unique(self) -> bool:
         return self._column.is_unique
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_monotonic_increasing(self) -> bool:
         """Return boolean if values in the object are monotonically increasing.
 
@@ -204,7 +204,7 @@ def is_monotonic_increasing(self) -> bool:
         return self._column.is_monotonic_increasing
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_monotonic_decreasing(self) -> bool:
         """Return boolean if values in the object are monotonically decreasing.
 
@@ -215,7 +215,7 @@ def is_monotonic_decreasing(self) -> bool:
         return self._column.is_monotonic_decreasing
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __cuda_array_interface__(self):
         # While the parent column class has a `__cuda_array_interface__` method
         # defined, it is not implemented for all column types. When it is not
@@ -229,7 +229,7 @@ def __cuda_array_interface__(self):
                 "'__cuda_array_interface__'"
             )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def factorize(
         self, sort: bool = False, use_na_sentinel: bool = True
     ) -> tuple[cupy.ndarray, cudf.Index]:
@@ -268,7 +268,7 @@ def factorize(
             use_na_sentinel=use_na_sentinel,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _make_operands_for_binop(
         self,
         other: Any,
@@ -323,7 +323,7 @@ def _make_operands_for_binop(
 
         return {result_name: (self._column, other, reflect, fill_value)}
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def nunique(self, dropna: bool = True) -> int:
         """
         Return count of unique values for the column.
@@ -369,7 +369,7 @@ def _get_elements_from_column(self, arg) -> ScalarLike | ColumnBase:
                 return self._column.apply_boolean_mask(arg)
             raise NotImplementedError(f"Unknown indexer {type(arg)}")
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def where(self, cond, other=None, inplace=False):
         from cudf.core._internals.where import (
             _check_and_cast_columns_with_other,
diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py
index 06d9296ca0f..265b87350ae 100644
--- a/python/cudf/cudf/core/udf/groupby_utils.py
+++ b/python/cudf/cudf/core/udf/groupby_utils.py
@@ -30,7 +30,7 @@
     _supported_dtypes_from_frame,
 )
 from cudf.utils._numba import _CUDFNumbaConfig
-from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _performance_tracking
 
 
 def _get_frame_groupby_type(dtype, index_dtype):
@@ -126,7 +126,7 @@ def _get_groupby_apply_kernel(frame, func, args):
     return kernel, return_type
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def jit_groupby_apply(offsets, grouped_values, function, *args):
     """
     Main entrypoint for JIT Groupby.apply via Numba.
diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py
index f1704e4ea78..d616761cb3b 100644
--- a/python/cudf/cudf/core/udf/utils.py
+++ b/python/cudf/cudf/core/udf/utils.py
@@ -38,7 +38,7 @@
     STRING_TYPES,
     TIMEDELTA_TYPES,
 )
-from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _performance_tracking
 from cudf.utils.utils import initfunc
 
 # Maximum size of a string column is 2 GiB
@@ -71,7 +71,7 @@ def _ptx_file():
     )
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def _get_udf_return_type(argty, func: Callable, args=()):
     """
     Get the return type of a masked UDF for a given set of argument dtypes. It
@@ -236,7 +236,7 @@ def _generate_cache_key(frame, func: Callable, args, suffix="__APPLY_UDF"):
     )
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def _compile_or_get(
     frame, func, args, kernel_getter=None, suffix="__APPLY_UDF"
 ):
diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py
index f07764e2ce4..e909d96309e 100644
--- a/python/cudf/cudf/io/csv.py
+++ b/python/cudf/cudf/io/csv.py
@@ -12,10 +12,10 @@
 from cudf.api.types import is_scalar
 from cudf.utils import ioutils
 from cudf.utils.dtypes import _maybe_convert_to_default_type
-from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _performance_tracking
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 @ioutils.doc_read_csv()
 def read_csv(
     filepath_or_buffer,
@@ -151,7 +151,7 @@ def read_csv(
     return df
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 @ioutils.doc_to_csv()
 def to_csv(
     df,
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 2a838ca7417..7733e770d99 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -22,7 +22,7 @@
 from cudf.api.types import is_list_like
 from cudf.core.column import as_column, build_categorical_column, column_empty
 from cudf.utils import ioutils
-from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _performance_tracking
 
 BYTE_SIZES = {
     "kb": 1000,
@@ -50,7 +50,7 @@
 }
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def _write_parquet(
     df,
     paths,
@@ -130,7 +130,7 @@ def _write_parquet(
 
 # Logic chosen to match: https://arrow.apache.org/
 # docs/_modules/pyarrow/parquet.html#write_to_dataset
-@_cudf_nvtx_annotate
+@_performance_tracking
 def write_to_dataset(
     df,
     root_path,
@@ -318,7 +318,7 @@ def write_to_dataset(
 
 
 @ioutils.doc_read_parquet_metadata()
-@_cudf_nvtx_annotate
+@_performance_tracking
 def read_parquet_metadata(filepath_or_buffer):
     """{docstring}"""
     # Multiple sources are passed as a list. If a single source is passed,
@@ -360,7 +360,7 @@ def read_parquet_metadata(filepath_or_buffer):
     return libparquet.read_parquet_metadata(filepaths_or_buffers)
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def _process_dataset(
     paths,
     fs,
@@ -515,7 +515,7 @@ def _process_dataset(
 
 
 @ioutils.doc_read_parquet()
-@_cudf_nvtx_annotate
+@_performance_tracking
 def read_parquet(
     filepath_or_buffer,
     engine="cudf",
@@ -785,7 +785,7 @@ def _handle_is(column: cudf.Series, value, *, negate) -> cudf.Series:
         return df
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def _parquet_to_frame(
     paths_or_buffers,
     *args,
@@ -885,7 +885,7 @@ def _parquet_to_frame(
         return dfs[0]
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def _read_parquet(
     filepaths_or_buffers,
     engine,
@@ -941,7 +941,7 @@ def _read_parquet(
 
 
 @ioutils.doc_to_parquet()
-@_cudf_nvtx_annotate
+@_performance_tracking
 def to_parquet(
     df,
     path,
@@ -1107,7 +1107,7 @@ def _get_estimated_file_size(df):
     return file_size
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def _get_partitioned(
     df,
     root_path,
@@ -1145,7 +1145,7 @@ def _get_partitioned(
     return full_paths, metadata_file_paths, grouped_df, part_offsets, filename
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def _get_groups_and_offsets(
     df,
     partition_cols,
@@ -1305,7 +1305,7 @@ class ParquetDatasetWriter:
 
     """
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __init__(
         self,
         path,
@@ -1355,7 +1355,7 @@ def __init__(
 
         self._file_sizes: dict[str, int] = {}
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def write_table(self, df):
         """
         Write a dataframe to the file/dataset
@@ -1486,7 +1486,7 @@ def write_table(self, df):
             self.path_cw_map.update({k: new_cw_idx for k in new_paths})
             self._chunked_writers[-1][0].write_table(grouped_df, part_info)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def close(self, return_metadata=False):
         """
         Close all open files and optionally return footer metadata as a binary
diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py
index 0e19972f6e0..4329480bb2c 100644
--- a/python/cudf/cudf/io/text.py
+++ b/python/cudf/cudf/io/text.py
@@ -1,14 +1,14 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 from io import BytesIO, StringIO
 
 import cudf
 from cudf._lib import text as libtext
 from cudf.utils import ioutils
-from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _performance_tracking
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 @ioutils.doc_read_text()
 def read_text(
     filepath_or_buffer,
diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py
index fb5a963f008..1f539e7f266 100644
--- a/python/cudf/cudf/options.py
+++ b/python/cudf/cudf/options.py
@@ -311,6 +311,20 @@ def _integer_and_none_validator(val):
     _make_contains_validator([False, True]),
 )
 
+_register_option(
+    "memory_profiling",
+    _env_get_bool("CUDF_MEMORY_PROFILING", False),
+    textwrap.dedent(
+        """
+        If set to `False`, disables memory profiling.
+        If set to `True`, enables memory profiling.
+        Read more at: :ref:`memory-profiling-user-doc`
+        \tValid values are True or False. Default is False.
+    """
+    ),
+    _make_contains_validator([False, True]),
+)
+
 
 class option_context(ContextDecorator):
     """
diff --git a/python/cudf/cudf/tests/test_performance_tracking.py b/python/cudf/cudf/tests/test_performance_tracking.py
new file mode 100644
index 00000000000..e886b77af3f
--- /dev/null
+++ b/python/cudf/cudf/tests/test_performance_tracking.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from io import StringIO
+
+import pytest
+
+import rmm.mr
+import rmm.statistics
+
+import cudf
+from cudf.utils.performance_tracking import (
+    get_memory_records,
+    print_memory_report,
+)
+
+
+@pytest.fixture
+def rmm_reset():
+    """Fixture to reset the RMM resource before and after the test"""
+    mr = rmm.mr.get_current_device_resource()
+    try:
+        rmm.mr.set_current_device_resource(rmm.mr.CudaMemoryResource())
+        yield
+    finally:
+        rmm.mr.set_current_device_resource(mr)
+
+
+def test_memory_profiling(rmm_reset):
+    df1 = cudf.DataFrame({"a": [1, 2, 3]})
+    assert len(get_memory_records()) == 0
+
+    rmm.statistics.enable_statistics()
+    cudf.set_option("memory_profiling", True)
+
+    df1.merge(df1)
+
+    assert len(get_memory_records()) > 0
+
+    out = StringIO()
+    print_memory_report(file=out)
+    assert "DataFrame.merge" in out.getvalue()
diff --git a/python/cudf/cudf/utils/nvtx_annotation.py b/python/cudf/cudf/utils/nvtx_annotation.py
deleted file mode 100644
index a4404e51232..00000000000
--- a/python/cudf/cudf/utils/nvtx_annotation.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
-
-import hashlib
-from functools import partial
-
-from nvtx import annotate
-
-_NVTX_COLORS = ["green", "blue", "purple", "rapids"]
-
-
-def _get_color_for_nvtx(name):
-    m = hashlib.sha256()
-    m.update(name.encode())
-    hash_value = int(m.hexdigest(), 16)
-    idx = hash_value % len(_NVTX_COLORS)
-    return _NVTX_COLORS[idx]
-
-
-def _cudf_nvtx_annotate(func, domain="cudf_python"):
-    """Decorator for applying nvtx annotations to methods in cudf."""
-    return annotate(
-        message=func.__qualname__,
-        color=_get_color_for_nvtx(func.__qualname__),
-        domain=domain,
-    )(func)
-
-
-_dask_cudf_nvtx_annotate = partial(
-    _cudf_nvtx_annotate, domain="dask_cudf_python"
-)
diff --git a/python/cudf/cudf/utils/performance_tracking.py b/python/cudf/cudf/utils/performance_tracking.py
new file mode 100644
index 00000000000..30c891d0d5a
--- /dev/null
+++ b/python/cudf/cudf/utils/performance_tracking.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from __future__ import annotations
+
+import contextlib
+import functools
+import hashlib
+import sys
+
+import nvtx
+
+import rmm.statistics
+
+from cudf.options import get_option
+
+_NVTX_COLORS = ["green", "blue", "purple", "rapids"]
+
+
+def _get_color_for_nvtx(name):
+    m = hashlib.sha256()
+    m.update(name.encode())
+    hash_value = int(m.hexdigest(), 16)
+    idx = hash_value % len(_NVTX_COLORS)
+    return _NVTX_COLORS[idx]
+
+
+def _performance_tracking(func, domain="cudf_python"):
+    """Decorator for applying performance tracking (if enabled)."""
+
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        with contextlib.ExitStack() as stack:
+            if get_option("memory_profiling"):
+                # NB: the user still needs to call `rmm.statistics.enable_statistics()`
+                #     to enable memory profiling.
+                stack.enter_context(
+                    rmm.statistics.profiler(
+                        name=rmm.statistics._get_descriptive_name_of_object(
+                            func
+                        )
+                    )
+                )
+            if nvtx.enabled():
+                stack.enter_context(
+                    nvtx.annotate(
+                        message=func.__qualname__,
+                        color=_get_color_for_nvtx(func.__qualname__),
+                        domain=domain,
+                    )
+                )
+            return func(*args, **kwargs)
+
+    return wrapper
+
+
+_dask_cudf_performance_tracking = functools.partial(
+    _performance_tracking, domain="dask_cudf_python"
+)
+
+
+def get_memory_records() -> (
+    dict[str, rmm.statistics.ProfilerRecords.MemoryRecord]
+):
+    """Get the memory records from the memory profiling
+
+    Returns
+    -------
+    Dict that maps function names to memory records. Empty if
+    memory profiling is disabled
+    """
+    return rmm.statistics.default_profiler_records.records
+
+
+def print_memory_report(file=sys.stdout) -> None:
+    """Pretty print the result of the memory profiling
+
+    Parameters
+    ----------
+    file
+        The output stream
+    """
+    print(rmm.statistics.default_profiler_records.report(), file=file)
diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py
index 2e4dfc4bb14..7347ec7866a 100644
--- a/python/cudf/cudf/utils/utils.py
+++ b/python/cudf/cudf/utils/utils.py
@@ -159,8 +159,9 @@ def _external_only_api(func, alternative=""):
     @functools.wraps(func)
     def wrapper(*args, **kwargs):
         # Check the immediately preceding frame to see if it's in cudf.
-        frame, lineno = next(traceback.walk_stack(None))
-        fn = frame.f_code.co_filename
+        pre_frame = traceback.extract_stack(limit=2)[0]
+        fn = pre_frame.filename
+        lineno = pre_frame.lineno
         if _cudf_root in fn and _tests_root not in fn:
             raise RuntimeError(
                 f"External-only API called in {fn} at line {lineno}. "
diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index d250589e389..1f55a59ea55 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -43,7 +43,7 @@
 
 import cudf
 from cudf.api.types import is_string_dtype
-from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _dask_cudf_performance_tracking
 
 from .core import DataFrame, Index, Series
 
@@ -53,7 +53,7 @@
 
 
 @meta_nonempty.register(cudf.BaseIndex)
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _nonempty_index(idx):
     if isinstance(idx, cudf.core.index.RangeIndex):
         return cudf.core.index.RangeIndex(2, name=idx.name)
@@ -100,7 +100,7 @@ def _nest_list_data(data, leaf_type):
     return data
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _get_non_empty_data(s):
     if isinstance(s, cudf.core.column.CategoricalColumn):
         categories = (
@@ -147,7 +147,7 @@ def _get_non_empty_data(s):
 
 
 @meta_nonempty.register(cudf.Series)
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _nonempty_series(s, idx=None):
     if idx is None:
         idx = _nonempty_index(s.index)
@@ -157,7 +157,7 @@ def _nonempty_series(s, idx=None):
 
 
 @meta_nonempty.register(cudf.DataFrame)
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def meta_nonempty_cudf(x):
     idx = meta_nonempty(x.index)
     columns_with_dtype = dict()
@@ -182,18 +182,18 @@ def meta_nonempty_cudf(x):
 
 
 @make_meta_dispatch.register((cudf.Series, cudf.DataFrame))
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def make_meta_cudf(x, index=None):
     return x.head(0)
 
 
 @make_meta_dispatch.register(cudf.BaseIndex)
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def make_meta_cudf_index(x, index=None):
     return x[:0]
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _empty_series(name, dtype, index=None):
     if isinstance(dtype, str) and dtype == "category":
         return cudf.Series(
@@ -203,7 +203,7 @@ def _empty_series(name, dtype, index=None):
 
 
 @make_meta_obj.register(object)
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def make_meta_object_cudf(x, index=None):
     """Create an empty cudf object containing the desired metadata.
 
@@ -274,7 +274,7 @@ def make_meta_object_cudf(x, index=None):
 
 
 @concat_dispatch.register((cudf.DataFrame, cudf.Series, cudf.BaseIndex))
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def concat_cudf(
     dfs,
     axis=0,
@@ -299,13 +299,13 @@ def concat_cudf(
 @categorical_dtype_dispatch.register(
     (cudf.DataFrame, cudf.Series, cudf.BaseIndex)
 )
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def categorical_dtype_cudf(categories=None, ordered=False):
     return cudf.CategoricalDtype(categories=categories, ordered=ordered)
 
 
 @tolist_dispatch.register((cudf.Series, cudf.BaseIndex))
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def tolist_cudf(obj):
     return obj.to_pandas().tolist()
 
@@ -313,7 +313,7 @@ def tolist_cudf(obj):
 @is_categorical_dtype_dispatch.register(
     (cudf.Series, cudf.BaseIndex, cudf.CategoricalDtype, Series)
 )
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def is_categorical_dtype_cudf(obj):
     return cudf.api.types._is_categorical_dtype(obj)
 
@@ -324,7 +324,7 @@ def get_grouper_cudf(obj):
 
 
 @percentile_lookup.register((cudf.Series, cp.ndarray, cudf.BaseIndex))
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def percentile_cudf(a, q, interpolation="linear"):
     # Cudf dispatch to the equivalent of `np.percentile`:
     # https://numpy.org/doc/stable/reference/generated/numpy.percentile.html
@@ -400,7 +400,7 @@ def _table_to_cudf(obj, table, self_destruct=None, **kwargs):
 
 
 @union_categoricals_dispatch.register((cudf.Series, cudf.BaseIndex))
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def union_categoricals_cudf(
     to_union, sort_categories=False, ignore_order=False
 ):
@@ -410,7 +410,7 @@ def union_categoricals_cudf(
 
 
 @hash_object_dispatch.register((cudf.DataFrame, cudf.Series))
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def hash_object_cudf(frame, index=True):
     if index:
         frame = frame.reset_index()
@@ -418,7 +418,7 @@ def hash_object_cudf(frame, index=True):
 
 
 @hash_object_dispatch.register(cudf.BaseIndex)
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def hash_object_cudf_index(ind, index=None):
     if isinstance(ind, cudf.MultiIndex):
         return ind.to_frame(index=False).hash_values()
@@ -428,7 +428,7 @@ def hash_object_cudf_index(ind, index=None):
 
 
 @group_split_dispatch.register((cudf.Series, cudf.DataFrame))
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def group_split_cudf(df, c, k, ignore_index=False):
     return dict(
         zip(
@@ -443,7 +443,7 @@ def group_split_cudf(df, c, k, ignore_index=False):
 
 
 @sizeof_dispatch.register(cudf.DataFrame)
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def sizeof_cudf_dataframe(df):
     return int(
         sum(col.memory_usage for col in df._data.columns)
@@ -452,7 +452,7 @@ def sizeof_cudf_dataframe(df):
 
 
 @sizeof_dispatch.register((cudf.Series, cudf.BaseIndex))
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def sizeof_cudf_series_index(obj):
     return obj.memory_usage()
 
diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py
index 3bd455a3a57..aab56e3a1b0 100644
--- a/python/dask_cudf/dask_cudf/core.py
+++ b/python/dask_cudf/dask_cudf/core.py
@@ -22,7 +22,7 @@
 
 import cudf
 from cudf import _lib as libcudf
-from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _dask_cudf_performance_tracking
 
 from dask_cudf import sorting
 from dask_cudf.accessors import ListMethods, StructMethods
@@ -53,7 +53,7 @@ def __repr__(self):
         s = "<dask_cudf.%s | %d tasks | %d npartitions>"
         return s % (type(self).__name__, len(self.dask), self.npartitions)
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def to_dask_dataframe(self, **kwargs):
         """Create a dask.dataframe object from a dask_cudf object
 
@@ -92,7 +92,7 @@ class DataFrame(_Frame, dd.core.DataFrame):
 
     _partition_type = cudf.DataFrame
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def _assign_column(self, k, v):
         def assigner(df, k, v):
             out = df.copy()
@@ -102,7 +102,7 @@ def assigner(df, k, v):
         meta = assigner(self._meta, k, dask_make_meta(v))
         return self.map_partitions(assigner, k, v, meta=meta)
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def apply_rows(self, func, incols, outcols, kwargs=None, cache_key=None):
         import uuid
 
@@ -123,7 +123,7 @@ def do_apply_rows(df, func, incols, outcols, kwargs):
         )
 
     @_deprecate_shuffle_kwarg
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def merge(self, other, shuffle_method=None, **kwargs):
         on = kwargs.pop("on", None)
         if isinstance(on, tuple):
@@ -136,7 +136,7 @@ def merge(self, other, shuffle_method=None, **kwargs):
         )
 
     @_deprecate_shuffle_kwarg
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def join(self, other, shuffle_method=None, **kwargs):
         # CuDF doesn't support "right" join yet
         how = kwargs.pop("how", "left")
@@ -155,7 +155,7 @@ def join(self, other, shuffle_method=None, **kwargs):
         )
 
     @_deprecate_shuffle_kwarg
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def set_index(
         self,
         other,
@@ -237,7 +237,7 @@ def set_index(
         )
 
     @_deprecate_shuffle_kwarg
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def sort_values(
         self,
         by,
@@ -275,14 +275,14 @@ def sort_values(
             return df.reset_index(drop=True)
         return df
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def to_parquet(self, path, *args, **kwargs):
         """Calls dask.dataframe.io.to_parquet with CudfEngine backend"""
         from dask_cudf.io import to_parquet
 
         return to_parquet(self, path, *args, **kwargs)
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def to_orc(self, path, **kwargs):
         """Calls dask_cudf.io.to_orc"""
         from dask_cudf.io import to_orc
@@ -290,7 +290,7 @@ def to_orc(self, path, **kwargs):
         return to_orc(self, path, **kwargs)
 
     @derived_from(pd.DataFrame)
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def var(
         self,
         axis=None,
@@ -324,28 +324,28 @@ def var(
             return _parallel_var(self, meta, skipna, split_every, out)
 
     @_deprecate_shuffle_kwarg
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def shuffle(self, *args, shuffle_method=None, **kwargs):
         """Wraps dask.dataframe DataFrame.shuffle method"""
         return super().shuffle(
             *args, shuffle_method=_get_shuffle_method(shuffle_method), **kwargs
         )
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def groupby(self, by=None, **kwargs):
         from .groupby import CudfDataFrameGroupBy
 
         return CudfDataFrameGroupBy(self, by=by, **kwargs)
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def sum_of_squares(x):
     x = x.astype("f8")._column
     outcol = libcudf.reduce.reduce("sum_of_squares", x)
     return cudf.Series(outcol)
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def var_aggregate(x2, x, n, ddof):
     try:
         with warnings.catch_warnings(record=True):
@@ -358,12 +358,12 @@ def var_aggregate(x2, x, n, ddof):
         return np.float64(np.nan)
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def nlargest_agg(x, **kwargs):
     return cudf.concat(x).nlargest(**kwargs)
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def nsmallest_agg(x, **kwargs):
     return cudf.concat(x).nsmallest(**kwargs)
 
@@ -371,7 +371,7 @@ def nsmallest_agg(x, **kwargs):
 class Series(_Frame, dd.core.Series):
     _partition_type = cudf.Series
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def count(self, split_every=False):
         return reduction(
             [self],
@@ -381,14 +381,14 @@ def count(self, split_every=False):
             meta="i8",
         )
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def mean(self, split_every=False):
         sum = self.sum(split_every=split_every)
         n = self.count(split_every=split_every)
         return sum / n
 
     @derived_from(pd.DataFrame)
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def var(
         self,
         axis=None,
@@ -417,19 +417,19 @@ def var(
         else:
             return _parallel_var(self, meta, skipna, split_every, out)
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def groupby(self, *args, **kwargs):
         from .groupby import CudfSeriesGroupBy
 
         return CudfSeriesGroupBy(self, *args, **kwargs)
 
     @property  # type: ignore
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def list(self):
         return ListMethods(self)
 
     @property  # type: ignore
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def struct(self):
         return StructMethods(self)
 
@@ -438,7 +438,7 @@ class Index(Series, dd.core.Index):
     _partition_type = cudf.Index  # type: ignore
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _naive_var(ddf, meta, skipna, ddof, split_every, out):
     num = ddf._get_numeric_data()
     x = 1.0 * num.sum(skipna=skipna, split_every=split_every)
@@ -453,7 +453,7 @@ def _naive_var(ddf, meta, skipna, ddof, split_every, out):
     return handle_out(out, result)
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _parallel_var(ddf, meta, skipna, split_every, out):
     def _local_var(x, skipna):
         if skipna:
@@ -520,7 +520,7 @@ def _finalize_var(vals):
     return handle_out(out, result)
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _extract_meta(x):
     """
     Extract internal cache data (``_meta``) from dask_cudf objects
@@ -536,7 +536,7 @@ def _extract_meta(x):
     return x
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _emulate(func, *args, **kwargs):
     """
     Apply a function using args / kwargs. If arguments contain dd.DataFrame /
@@ -546,7 +546,7 @@ def _emulate(func, *args, **kwargs):
         return func(*_extract_meta(args), **_extract_meta(kwargs))
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def align_partitions(args):
     """Align partitions between dask_cudf objects.
 
@@ -563,7 +563,7 @@ def align_partitions(args):
     return args
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def reduction(
     args,
     chunk=None,
@@ -702,7 +702,7 @@ def reduction(
     return dd.core.new_dd_object(graph, b, meta, (None, None))
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None):
     from dask_cudf import QUERY_PLANNING_ON
 
@@ -746,7 +746,7 @@ def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None):
 )
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def from_dask_dataframe(df):
     """
     Convert a Dask :class:`dask.dataframe.DataFrame` to a Dask-cuDF
diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py
index 2e72461b43d..bbbcde17b51 100644
--- a/python/dask_cudf/dask_cudf/groupby.py
+++ b/python/dask_cudf/dask_cudf/groupby.py
@@ -16,7 +16,7 @@
 
 import cudf
 from cudf.core.groupby.groupby import _deprecate_collect
-from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _dask_cudf_performance_tracking
 
 from dask_cudf.sorting import _deprecate_shuffle_kwarg
 
@@ -56,13 +56,13 @@ def wrapper(*args, **kwargs):
 
 
 class CudfDataFrameGroupBy(DataFrameGroupBy):
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def __init__(self, *args, sort=None, **kwargs):
         self.sep = kwargs.pop("sep", "___")
         self.as_index = kwargs.pop("as_index", True)
         super().__init__(*args, sort=sort, **kwargs)
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def __getitem__(self, key):
         if isinstance(key, list):
             g = CudfDataFrameGroupBy(
@@ -84,7 +84,7 @@ def __getitem__(self, key):
         g._meta = g._meta[key]
         return g
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def _make_groupby_method_aggs(self, agg_name):
         """Create aggs dictionary for aggregation methods"""
 
@@ -92,7 +92,7 @@ def _make_groupby_method_aggs(self, agg_name):
             return {c: agg_name for c in self.obj.columns if c not in self.by}
         return {c: agg_name for c in self.obj.columns if c != self.by}
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def count(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -102,7 +102,7 @@ def count(self, split_every=None, split_out=1):
             split_out,
         )
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def mean(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -112,7 +112,7 @@ def mean(self, split_every=None, split_out=1):
             split_out,
         )
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def std(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -122,7 +122,7 @@ def std(self, split_every=None, split_out=1):
             split_out,
         )
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def var(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -132,7 +132,7 @@ def var(self, split_every=None, split_out=1):
             split_out,
         )
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def sum(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -142,7 +142,7 @@ def sum(self, split_every=None, split_out=1):
             split_out,
         )
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def min(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -152,7 +152,7 @@ def min(self, split_every=None, split_out=1):
             split_out,
         )
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def max(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -162,7 +162,7 @@ def max(self, split_every=None, split_out=1):
             split_out,
         )
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def collect(self, split_every=None, split_out=1):
         _deprecate_collect()
@@ -173,7 +173,7 @@ def collect(self, split_every=None, split_out=1):
             split_out,
         )
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def first(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -183,7 +183,7 @@ def first(self, split_every=None, split_out=1):
             split_out,
         )
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def last(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -194,7 +194,7 @@ def last(self, split_every=None, split_out=1):
         )
 
     @_deprecate_shuffle_kwarg
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def aggregate(
         self, arg, split_every=None, split_out=1, shuffle_method=None
     ):
@@ -231,13 +231,13 @@ def aggregate(
 
 
 class CudfSeriesGroupBy(SeriesGroupBy):
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def __init__(self, *args, sort=None, **kwargs):
         self.sep = kwargs.pop("sep", "___")
         self.as_index = kwargs.pop("as_index", True)
         super().__init__(*args, sort=sort, **kwargs)
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def count(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -247,7 +247,7 @@ def count(self, split_every=None, split_out=1):
             split_out,
         )[self._slice]
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def mean(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -257,7 +257,7 @@ def mean(self, split_every=None, split_out=1):
             split_out,
         )[self._slice]
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def std(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -267,7 +267,7 @@ def std(self, split_every=None, split_out=1):
             split_out,
         )[self._slice]
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def var(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -277,7 +277,7 @@ def var(self, split_every=None, split_out=1):
             split_out,
         )[self._slice]
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def sum(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -287,7 +287,7 @@ def sum(self, split_every=None, split_out=1):
             split_out,
         )[self._slice]
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def min(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -297,7 +297,7 @@ def min(self, split_every=None, split_out=1):
             split_out,
         )[self._slice]
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def max(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -307,7 +307,7 @@ def max(self, split_every=None, split_out=1):
             split_out,
         )[self._slice]
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def collect(self, split_every=None, split_out=1):
         _deprecate_collect()
@@ -318,7 +318,7 @@ def collect(self, split_every=None, split_out=1):
             split_out,
         )[self._slice]
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def first(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -328,7 +328,7 @@ def first(self, split_every=None, split_out=1):
             split_out,
         )[self._slice]
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def last(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -339,7 +339,7 @@ def last(self, split_every=None, split_out=1):
         )[self._slice]
 
     @_deprecate_shuffle_kwarg
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def aggregate(
         self, arg, split_every=None, split_out=1, shuffle_method=None
     ):
@@ -429,7 +429,7 @@ def _shuffle_aggregate(
     return result
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def groupby_agg(
     ddf,
     gb_cols,
@@ -641,7 +641,7 @@ def groupby_agg(
     )
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _make_groupby_agg_call(
     gb, aggs, split_every, split_out, shuffle_method=None
 ):
@@ -663,7 +663,7 @@ def _make_groupby_agg_call(
     )
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _redirect_aggs(arg):
     """Redirect aggregations to their corresponding name in cuDF"""
     redirects = {
@@ -690,7 +690,7 @@ def _redirect_aggs(arg):
     return redirects.get(arg, arg)
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _aggs_optimized(arg, supported: set):
     """Check that aggregations in `arg` are a subset of `supported`"""
     if isinstance(arg, (list, dict)):
@@ -712,7 +712,7 @@ def _aggs_optimized(arg, supported: set):
     return False
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _groupby_optimized(gb):
     """Check that groupby input can use dask-cudf optimized codepath"""
     return isinstance(gb.obj, DaskDataFrame) and (
@@ -730,7 +730,7 @@ def _make_name(col_name, sep="_"):
     return sep.join(name for name in col_name if name != "")
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _groupby_partition_agg(df, gb_cols, aggs, columns, dropna, sort, sep):
     """Initial partition-level aggregation task.
 
@@ -768,7 +768,7 @@ def _groupby_partition_agg(df, gb_cols, aggs, columns, dropna, sort, sep):
     return gb[sorted(output_columns)]
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _tree_node_agg(df, gb_cols, dropna, sort, sep):
     """Node in groupby-aggregation reduction tree.
 
@@ -807,7 +807,7 @@ def _tree_node_agg(df, gb_cols, dropna, sort, sep):
     return gb[sorted(output_columns)]
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1):
     """Calculate variance (given count, sum, and sum-squared columns)."""
 
@@ -829,7 +829,7 @@ def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1):
     return var
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _finalize_gb_agg(
     gb_in,
     gb_cols,
diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py
index f3774e20d32..a2ba4d1878e 100644
--- a/python/dask_cudf/dask_cudf/sorting.py
+++ b/python/dask_cudf/dask_cudf/sorting.py
@@ -18,7 +18,7 @@
 
 import cudf
 from cudf.api.types import _is_categorical_dtype
-from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _dask_cudf_performance_tracking
 
 _SHUFFLE_SUPPORT = ("tasks", "p2p")  # "disk" not supported
 
@@ -48,14 +48,14 @@ def wrapper(*args, **kwargs):
     return wrapper
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def set_index_post(df, index_name, drop, column_dtype):
     df2 = df.set_index(index_name, drop=drop)
     df2.columns = df2.columns.astype(column_dtype)
     return df2
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _set_partitions_pre(s, divisions, ascending=True, na_position="last"):
     if ascending:
         partitions = divisions.searchsorted(s, side="right") - 1
@@ -72,7 +72,7 @@ def _set_partitions_pre(s, divisions, ascending=True, na_position="last"):
     return partitions
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _quantile(a, q):
     n = len(a)
     if not len(a):
@@ -83,7 +83,7 @@ def _quantile(a, q):
     )
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def merge_quantiles(finalq, qs, vals):
     """Combine several quantile calculations of different data.
     [NOTE: Same logic as dask.array merge_percentiles]
@@ -146,7 +146,7 @@ def _append_counts(val, count):
     return rv.reset_index(drop=True)
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _approximate_quantile(df, q):
     """Approximate quantiles of DataFrame or Series.
     [NOTE: Same logic as dask.dataframe Series quantile]
@@ -220,7 +220,7 @@ def set_quantile_index(df):
     return df
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def quantile_divisions(df, by, npartitions):
     qn = np.linspace(0.0, 1.0, npartitions + 1).tolist()
     divisions = _approximate_quantile(df[by], qn).compute()
@@ -257,7 +257,7 @@ def quantile_divisions(df, by, npartitions):
 
 
 @_deprecate_shuffle_kwarg
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def sort_values(
     df,
     by,

From 57862a3ab1324bc8dbea4133485bb99044bc2742 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Fri, 28 Jun 2024 08:43:12 -0400
Subject: [PATCH 20/29] stable_distinct public api now has a stream parameter
 (#16068)

As part of https://github.com/rapidsai/cudf/pull/15982 we determined that the cudf  `stable_distinct` public API needs to be updated so that a user provided stream can be provided.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Srinivas Yadav (https://github.com/srinivasyadav18)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16068
---
 cpp/include/cudf/detail/stream_compaction.hpp |   2 -
 cpp/include/cudf/stream_compaction.hpp        |   2 +
 cpp/src/stream_compaction/stable_distinct.cu  |   4 +-
 cpp/tests/CMakeLists.txt                      |   1 +
 cpp/tests/streams/stream_compaction_test.cpp  | 235 ++++++++++++++++++
 5 files changed, 240 insertions(+), 4 deletions(-)
 create mode 100644 cpp/tests/streams/stream_compaction_test.cpp

diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp
index e2974789ea1..e3ef4190fd2 100644
--- a/cpp/include/cudf/detail/stream_compaction.hpp
+++ b/cpp/include/cudf/detail/stream_compaction.hpp
@@ -88,8 +88,6 @@ std::unique_ptr<table> distinct(table_view const& input,
 
 /**
  * @copydoc cudf::stable_distinct
- *
- * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<table> stable_distinct(table_view const& input,
                                        std::vector<size_type> const& keys,
diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp
index c386b3a22b4..181af11adb8 100644
--- a/cpp/include/cudf/stream_compaction.hpp
+++ b/cpp/include/cudf/stream_compaction.hpp
@@ -320,6 +320,7 @@ std::unique_ptr<column> distinct_indices(
  * @param keep Copy any, first, last, or none of the found duplicates
  * @param nulls_equal Flag to specify whether null elements should be considered as equal
  * @param nans_equal Flag to specify whether NaN elements should be considered as equal
+ * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned table
  * @return Table with distinct rows, preserving input order
  */
@@ -329,6 +330,7 @@ std::unique_ptr<table> stable_distinct(
   duplicate_keep_option keep        = duplicate_keep_option::KEEP_ANY,
   null_equality nulls_equal         = null_equality::EQUAL,
   nan_equality nans_equal           = nan_equality::ALL_EQUAL,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
diff --git a/cpp/src/stream_compaction/stable_distinct.cu b/cpp/src/stream_compaction/stable_distinct.cu
index 27b5a92ab69..074d4fd7d1a 100644
--- a/cpp/src/stream_compaction/stable_distinct.cu
+++ b/cpp/src/stream_compaction/stable_distinct.cu
@@ -79,11 +79,11 @@ std::unique_ptr<table> stable_distinct(table_view const& input,
                                        duplicate_keep_option keep,
                                        null_equality nulls_equal,
                                        nan_equality nans_equal,
+                                       rmm::cuda_stream_view stream,
                                        rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::stable_distinct(
-    input, keys, keep, nulls_equal, nans_equal, cudf::get_default_stream(), mr);
+  return detail::stable_distinct(input, keys, keep, nulls_equal, nans_equal, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 9f14455f42d..eef09954647 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -700,6 +700,7 @@ ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_ROLLING_TEST streams/rolling_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing)
+ConfigureTest(STREAM_STREAM_COMPACTION_TEST streams/stream_compaction_test.cpp STREAM_MODE testing)
 ConfigureTest(
   STREAM_STRINGS_TEST
   streams/strings/case_test.cpp
diff --git a/cpp/tests/streams/stream_compaction_test.cpp b/cpp/tests/streams/stream_compaction_test.cpp
new file mode 100644
index 00000000000..56443870602
--- /dev/null
+++ b/cpp/tests/streams/stream_compaction_test.cpp
@@ -0,0 +1,235 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
+#include <cudf_test/table_utilities.hpp>
+
+#include <cudf/copying.hpp>
+#include <cudf/sorting.hpp>
+#include <cudf/stream_compaction.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/table/table_view.hpp>
+#include <cudf/types.hpp>
+
+#include <cmath>
+
+auto constexpr null{0};  // null at current level
+auto constexpr XXX{0};   // null pushed down from parent level
+auto constexpr NaN          = std::numeric_limits<double>::quiet_NaN();
+auto constexpr KEEP_ANY     = cudf::duplicate_keep_option::KEEP_ANY;
+auto constexpr KEEP_FIRST   = cudf::duplicate_keep_option::KEEP_FIRST;
+auto constexpr KEEP_LAST    = cudf::duplicate_keep_option::KEEP_LAST;
+auto constexpr KEEP_NONE    = cudf::duplicate_keep_option::KEEP_NONE;
+auto constexpr NULL_EQUAL   = cudf::null_equality::EQUAL;
+auto constexpr NULL_UNEQUAL = cudf::null_equality::UNEQUAL;
+auto constexpr NAN_EQUAL    = cudf::nan_equality::ALL_EQUAL;
+auto constexpr NAN_UNEQUAL  = cudf::nan_equality::UNEQUAL;
+
+using int32s_col = cudf::test::fixed_width_column_wrapper<int32_t>;
+using floats_col = cudf::test::fixed_width_column_wrapper<float>;
+
+using cudf::nan_policy;
+using cudf::null_equality;
+using cudf::null_policy;
+using cudf::test::iterators::no_nulls;
+using cudf::test::iterators::null_at;
+using cudf::test::iterators::nulls_at;
+
+struct StableDistinctKeepAny : public cudf::test::BaseFixture {};
+
+struct StableDistinctKeepFirstLastNone : public cudf::test::BaseFixture {};
+
+TEST_F(StableDistinctKeepAny, NoNullsTableWithNaNs)
+{
+  // Column(s) used to test KEEP_ANY needs to have same rows in contiguous
+  // groups for equivalent keys because KEEP_ANY is nondeterministic.
+  auto const col1  = int32s_col{6, 6, 6, 1, 1, 1, 3, 5, 8, 5};
+  auto const col2  = floats_col{6, 6, 6, 1, 1, 1, 3, 4, 9, 4};
+  auto const keys1 = int32s_col{20, 20, 20, 15, 15, 15, 20, 19, 21, 9};
+  auto const keys2 = floats_col{19., 19., 19., NaN, NaN, NaN, 20., 20., 9., 21.};
+
+  auto const input   = cudf::table_view{{col1, col2, keys1, keys2}};
+  auto const key_idx = std::vector<cudf::size_type>{2, 3};
+
+  // NaNs are unequal.
+  {
+    auto const exp_col1  = int32s_col{6, 1, 1, 1, 3, 5, 8, 5};
+    auto const exp_col2  = floats_col{6, 1, 1, 1, 3, 4, 9, 4};
+    auto const exp_keys1 = int32s_col{20, 15, 15, 15, 20, 19, 21, 9};
+    auto const exp_keys2 = floats_col{19., NaN, NaN, NaN, 20., 20., 9., 21.};
+    auto const expected  = cudf::table_view{{exp_col1, exp_col2, exp_keys1, exp_keys2}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_ANY, NULL_EQUAL, NAN_UNEQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+
+  // NaNs are equal.
+  {
+    auto const exp_col1  = int32s_col{6, 1, 3, 5, 8, 5};
+    auto const exp_col2  = floats_col{6, 1, 3, 4, 9, 4};
+    auto const exp_keys1 = int32s_col{20, 15, 20, 19, 21, 9};
+    auto const exp_keys2 = floats_col{19., NaN, 20., 20., 9., 21.};
+    auto const expected  = cudf::table_view{{exp_col1, exp_col2, exp_keys1, exp_keys2}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_ANY, NULL_EQUAL, NAN_EQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+}
+
+TEST_F(StableDistinctKeepAny, InputWithNullsAndNaNs)
+{
+  auto constexpr null{0.0};  // shadow the global `null` variable of type int
+
+  // Column(s) used to test KEEP_ANY needs to have same rows in contiguous
+  // groups for equivalent keys because KEEP_ANY is nondeterministic.
+  auto const col   = int32s_col{5, 4, 4, 1, 1, 1, 8, 8, 1};
+  auto const keys  = floats_col{{20., null, null, NaN, NaN, NaN, 19., 19., 21.}, nulls_at({1, 2})};
+  auto const input = cudf::table_view{{col, keys}};
+  auto const key_idx = std::vector<cudf::size_type>{1};
+
+  // Nulls are equal, NaNs are unequal.
+  {
+    auto const exp_col  = int32s_col{5, 4, 1, 1, 1, 8, 1};
+    auto const exp_keys = floats_col{{20., null, NaN, NaN, NaN, 19., 21.}, null_at(1)};
+    auto const expected = cudf::table_view{{exp_col, exp_keys}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_ANY, NULL_EQUAL, NAN_UNEQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+
+  // Nulls are equal, NaNs are equal.
+  {
+    auto const exp_col  = int32s_col{5, 4, 1, 8, 1};
+    auto const exp_keys = floats_col{{20., null, NaN, 19., 21.}, null_at(1)};
+    auto const expected = cudf::table_view{{exp_col, exp_keys}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_ANY, NULL_EQUAL, NAN_EQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+
+  // Nulls are unequal, NaNs are unequal.
+  {
+    auto const exp_col  = int32s_col{5, 4, 4, 1, 1, 1, 8, 1};
+    auto const exp_keys = floats_col{{20., null, null, NaN, NaN, NaN, 19., 21.}, nulls_at({1, 2})};
+    auto const expected = cudf::table_view{{exp_col, exp_keys}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_ANY, NULL_UNEQUAL, NAN_UNEQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+
+  // Nulls are unequal, NaNs are equal.
+  {
+    auto const exp_col  = int32s_col{5, 4, 4, 1, 8, 1};
+    auto const exp_keys = floats_col{{20., null, null, NaN, 19., 21.}, nulls_at({1, 2})};
+    auto const expected = cudf::table_view{{exp_col, exp_keys}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_ANY, NULL_UNEQUAL, NAN_EQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+}
+
+TEST_F(StableDistinctKeepFirstLastNone, InputWithNaNsEqual)
+{
+  // Column(s) used to test needs to have different rows for the same keys.
+  auto const col     = int32s_col{0, 1, 2, 3, 4, 5, 6};
+  auto const keys    = floats_col{20., NaN, NaN, 19., 21., 19., 22.};
+  auto const input   = cudf::table_view{{col, keys}};
+  auto const key_idx = std::vector<cudf::size_type>{1};
+
+  // KEEP_FIRST
+  {
+    auto const exp_col  = int32s_col{0, 1, 3, 4, 6};
+    auto const exp_keys = floats_col{20., NaN, 19., 21., 22.};
+    auto const expected = cudf::table_view{{exp_col, exp_keys}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_FIRST, NULL_EQUAL, NAN_EQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+
+  // KEEP_LAST
+  {
+    auto const exp_col  = int32s_col{0, 2, 4, 5, 6};
+    auto const exp_keys = floats_col{20., NaN, 21., 19., 22.};
+    auto const expected = cudf::table_view{{exp_col, exp_keys}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_LAST, NULL_EQUAL, NAN_EQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+
+  // KEEP_NONE
+  {
+    auto const exp_col  = int32s_col{0, 4, 6};
+    auto const exp_keys = floats_col{20., 21., 22.};
+    auto const expected = cudf::table_view{{exp_col, exp_keys}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_NONE, NULL_EQUAL, NAN_EQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+}
+
+TEST_F(StableDistinctKeepFirstLastNone, InputWithNaNsUnequal)
+{
+  // Column(s) used to test needs to have different rows for the same keys.
+  auto const col     = int32s_col{0, 1, 2, 3, 4, 5, 6, 7};
+  auto const keys    = floats_col{20., NaN, NaN, 19., 21., 19., 22., 20.};
+  auto const input   = cudf::table_view{{col, keys}};
+  auto const key_idx = std::vector<cudf::size_type>{1};
+
+  // KEEP_FIRST
+  {
+    auto const exp_col  = int32s_col{0, 1, 2, 3, 4, 6};
+    auto const exp_keys = floats_col{20., NaN, NaN, 19., 21., 22.};
+    auto const expected = cudf::table_view{{exp_col, exp_keys}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_FIRST, NULL_UNEQUAL, NAN_UNEQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+
+  // KEEP_LAST
+  {
+    auto const exp_col  = int32s_col{1, 2, 4, 5, 6, 7};
+    auto const exp_keys = floats_col{NaN, NaN, 21., 19., 22., 20.};
+    auto const expected = cudf::table_view{{exp_col, exp_keys}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_LAST, NULL_UNEQUAL, NAN_UNEQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+
+  // KEEP_NONE
+  {
+    auto const exp_col  = int32s_col{1, 2, 4, 6};
+    auto const exp_keys = floats_col{NaN, NaN, 21., 22.};
+    auto const expected = cudf::table_view{{exp_col, exp_keys}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_NONE, NULL_UNEQUAL, NAN_UNEQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+}

From 2b547dc70c7f42b671cdc3e75946b123301779f0 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 28 Jun 2024 03:11:01 -1000
Subject: [PATCH 21/29] Add ensure_index to not unnecessarily shallow copy
 cudf.Index (#16117)

The `cudf.Index` constructor will shallow copy a `cudf.Index` input. Sometimes, we just need to make sure an input is a `cudf.Index`, so created `ensure_index` (pandas has something similar) so we don't shallow copy these inputs unnecessarily

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16117
---
 python/cudf/cudf/core/_base_index.py     |  6 ++++-
 python/cudf/cudf/core/algorithms.py      |  4 ++--
 python/cudf/cudf/core/cut.py             |  2 +-
 python/cudf/cudf/core/dataframe.py       | 29 ++++++++++++++----------
 python/cudf/cudf/core/index.py           | 13 ++++++++++-
 python/cudf/cudf/core/indexed_frame.py   | 11 ++++-----
 python/cudf/cudf/core/multiindex.py      |  3 ++-
 python/cudf/cudf/core/series.py          | 12 ++++------
 python/cudf/cudf/tests/test_dataframe.py | 24 ++++++++++++++++++++
 9 files changed, 73 insertions(+), 31 deletions(-)

diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index caf07b286cd..e160fa697ee 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -1104,7 +1104,11 @@ def difference(self, other, sort=None):
                 f"of [None, False, True]; {sort} was passed."
             )
 
-        other = cudf.Index(other, name=getattr(other, "name", self.name))
+        if not isinstance(other, BaseIndex):
+            other = cudf.Index(
+                other,
+                name=getattr(other, "name", self.name),
+            )
 
         if not len(other):
             res = self._get_reconciled_name_object(other).unique()
diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py
index 51a32e29886..e8b82ff60c2 100644
--- a/python/cudf/cudf/core/algorithms.py
+++ b/python/cudf/cudf/core/algorithms.py
@@ -6,7 +6,7 @@
 
 from cudf.core.column import as_column
 from cudf.core.copy_types import BooleanMask
-from cudf.core.index import Index, RangeIndex
+from cudf.core.index import RangeIndex, ensure_index
 from cudf.core.indexed_frame import IndexedFrame
 from cudf.core.scalar import Scalar
 from cudf.options import get_option
@@ -107,7 +107,7 @@ def factorize(values, sort=False, use_na_sentinel=True, size_hint=None):
         dtype="int64" if get_option("mode.pandas_compatible") else None,
     ).values
 
-    return labels, cats.values if return_cupy_array else Index(cats)
+    return labels, cats.values if return_cupy_array else ensure_index(cats)
 
 
 def _linear_interpolation(column, index=None):
diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py
index 54c5e829e8a..d9f62f51f92 100644
--- a/python/cudf/cudf/core/cut.py
+++ b/python/cudf/cudf/core/cut.py
@@ -292,7 +292,7 @@ def cut(
     )
 
     # we return a categorical index, as we don't have a Categorical method
-    categorical_index = cudf.Index(col)
+    categorical_index = cudf.CategoricalIndex._from_data({None: col})
 
     if isinstance(orig_x, (pd.Series, cudf.Series)):
         # if we have a series input we return a series output
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 3fc29582c4c..4dfeb68b7ba 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -58,7 +58,12 @@
 from cudf.core.column_accessor import ColumnAccessor
 from cudf.core.copy_types import BooleanMask
 from cudf.core.groupby.groupby import DataFrameGroupBy, groupby_doc_template
-from cudf.core.index import BaseIndex, RangeIndex, _index_from_data, as_index
+from cudf.core.index import (
+    BaseIndex,
+    RangeIndex,
+    _index_from_data,
+    ensure_index,
+)
 from cudf.core.indexed_frame import (
     IndexedFrame,
     _FrameIndexer,
@@ -338,7 +343,7 @@ def _getitem_tuple_arg(self, arg):
                                 range(len(tmp_arg[0]))
                             )
                         },
-                        index=as_index(tmp_arg[0]),
+                        index=cudf.Index(tmp_arg[0]),
                     )
                     columns_df[cantor_name] = column.as_column(
                         range(len(columns_df))
@@ -702,7 +707,7 @@ def __init__(
                     data = data.reindex(index)
                     index = data.index
                 else:
-                    index = cudf.Index(index)
+                    index = ensure_index(index)
             else:
                 index = data.index
 
@@ -751,7 +756,7 @@ def __init__(
             if index is None:
                 self._index = RangeIndex(0)
             else:
-                self._index = cudf.Index(index)
+                self._index = ensure_index(index)
             if columns is not None:
                 rangeindex = isinstance(
                     columns, (range, pd.RangeIndex, cudf.RangeIndex)
@@ -909,7 +914,7 @@ def _init_from_series_list(self, data, columns, index):
                         f"not match length of index ({index_length})"
                     )
 
-            final_index = cudf.Index(index)
+            final_index = ensure_index(index)
 
         series_lengths = list(map(len, data))
         data = numeric_normalize_types(*data)
@@ -977,9 +982,9 @@ def _init_from_list_like(self, data, index=None, columns=None):
         if index is None:
             index = RangeIndex(start=0, stop=len(data))
         else:
-            index = cudf.Index(index)
+            index = ensure_index(index)
 
-        self._index = cudf.Index(index)
+        self._index = index
         # list-of-dicts case
         if len(data) > 0 and isinstance(data[0], dict):
             data = DataFrame.from_pandas(pd.DataFrame(data))
@@ -1085,7 +1090,7 @@ def _init_from_dict_like(
 
             self._index = RangeIndex(0, num_rows)
         else:
-            self._index = cudf.Index(index)
+            self._index = ensure_index(index)
 
         if len(data):
             self._data.multiindex = True
@@ -1491,7 +1496,7 @@ def memory_usage(self, index=True, deep=False):
             names.append("Index")
         return Series._from_data(
             data={None: as_column(mem_usage)},
-            index=as_index(names),
+            index=cudf.Index(names),
         )
 
     @_performance_tracking
@@ -4033,7 +4038,7 @@ def transpose(self):
         # Set the old column names as the new index
         result = self.__class__._from_data(
             ColumnAccessor(dict(enumerate(result_columns)), verify=False),
-            index=as_index(index),
+            index=cudf.Index(index),
         )
         # Set the old index as the new column names
         result.columns = columns
@@ -5657,7 +5662,7 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False):
             }
 
         if not is_scalar(index):
-            new_index = cudf.Index(index)
+            new_index = ensure_index(index)
         else:
             new_index = None
 
@@ -5741,7 +5746,7 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False):
             }
 
         if index is not None:
-            index = cudf.Index(index)
+            index = ensure_index(index)
 
         if isinstance(columns, (pd.Index, cudf.Index)):
             level_names = tuple(columns.names)
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index e069f8d0ea6..b398ee2343e 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -65,6 +65,17 @@
     from collections.abc import Generator, Iterable
 
 
+def ensure_index(index_like: Any) -> BaseIndex:
+    """
+    Ensure an Index is returned.
+
+    Avoids a shallow copy compared to calling cudf.Index(...)
+    """
+    if not isinstance(index_like, BaseIndex):
+        return cudf.Index(index_like)
+    return index_like
+
+
 class IndexMeta(type):
     """Custom metaclass for Index that overrides instance/subclass tests."""
 
@@ -1569,7 +1580,7 @@ def append(self, other):
                 to_concat.append(obj)
         else:
             this = self
-            other = cudf.Index(other)
+            other = ensure_index(other)
 
             if len(this) == 0 or len(other) == 0:
                 # we'll filter out empties later in ._concat
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 72bd3c45fa6..ff10051c52d 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -33,7 +33,6 @@
     is_list_like,
     is_scalar,
 )
-from cudf.core._base_index import BaseIndex
 from cudf.core._compat import PANDAS_LT_300
 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column import ColumnBase, as_column
@@ -42,7 +41,7 @@
 from cudf.core.dtypes import ListDtype
 from cudf.core.frame import Frame
 from cudf.core.groupby.groupby import GroupBy
-from cudf.core.index import Index, RangeIndex, _index_from_data
+from cudf.core.index import RangeIndex, _index_from_data, ensure_index
 from cudf.core.missing import NA
 from cudf.core.multiindex import MultiIndex
 from cudf.core.resample import _Resampler
@@ -66,6 +65,8 @@
         Dtype,
         NotImplementedType,
     )
+    from cudf.core._base_index import BaseIndex
+
 
 doc_reset_index_template = """
         Reset the index of the {klass}, or a level of it.
@@ -627,9 +628,7 @@ def index(self, value):
                 f"new values have {len(value)} elements"
             )
         # avoid unnecessary cast to Index
-        if not isinstance(value, BaseIndex):
-            value = Index(value)
-
+        value = ensure_index(value)
         self._index = value
 
     @_performance_tracking
@@ -3595,7 +3594,7 @@ def _align_to_index(
         sort: bool = True,
         allow_non_unique: bool = False,
     ) -> Self:
-        index = cudf.Index(index)
+        index = ensure_index(index)
 
         if self.index.equals(index):
             return self
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 7657fa9e234..9cbe863142b 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -29,6 +29,7 @@
     BaseIndex,
     _get_indexer_basic,
     _lexsorted_equal_range,
+    ensure_index,
 )
 from cudf.core.join._join_helpers import _match_join_keys
 from cudf.utils.dtypes import is_column_like
@@ -173,7 +174,7 @@ def __init__(
                     "codes and is inconsistent!"
                 )
 
-        levels = [cudf.Index(level) for level in levels]
+        levels = [ensure_index(level) for level in levels]
 
         if len(levels) != len(codes._data):
             raise ValueError(
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 9acf5294b72..97b6bbec2d4 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -48,7 +48,7 @@
 from cudf.core.column.struct import StructMethods
 from cudf.core.column_accessor import ColumnAccessor
 from cudf.core.groupby.groupby import SeriesGroupBy, groupby_doc_template
-from cudf.core.index import BaseIndex, DatetimeIndex, RangeIndex, as_index
+from cudf.core.index import BaseIndex, DatetimeIndex, RangeIndex, ensure_index
 from cudf.core.indexed_frame import (
     IndexedFrame,
     _FrameIndexer,
@@ -588,10 +588,8 @@ def __init__(
                 data = data.copy(deep=True)
             name_from_data = data.name
             column = as_column(data, nan_as_null=nan_as_null, dtype=dtype)
-            if isinstance(data, pd.Series):
-                index_from_data = cudf.Index(data.index)
-            elif isinstance(data, Series):
-                index_from_data = data.index
+            if isinstance(data, (pd.Series, Series)):
+                index_from_data = ensure_index(data.index)
         elif isinstance(data, ColumnAccessor):
             raise TypeError(
                 "Use cudf.Series._from_data for constructing a Series from "
@@ -642,7 +640,7 @@ def __init__(
             name = name_from_data
 
         if index is not None:
-            index = cudf.Index(index)
+            index = ensure_index(index)
 
         if index_from_data is not None:
             first_index = index_from_data
@@ -3191,7 +3189,7 @@ def quantile(
 
         return Series._from_data(
             data={self.name: result},
-            index=as_index(np_array_q) if quant_index else None,
+            index=cudf.Index(np_array_q) if quant_index else None,
         )
 
     @docutils.doc_describe()
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index fc7fd87d4c5..f40106a30f4 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -11078,3 +11078,27 @@ def test_dataframe_loc_int_float(dtype1, dtype2):
     expected = pdf.loc[pidx]
 
     assert_eq(actual, expected, check_index_type=True, check_dtype=True)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        cudf.DataFrame(range(2)),
+        None,
+        [cudf.Series(range(2))],
+        [[0], [1]],
+        {1: range(2)},
+        cupy.arange(2),
+    ],
+)
+def test_init_with_index_no_shallow_copy(data):
+    idx = cudf.RangeIndex(2)
+    df = cudf.DataFrame(data, index=idx)
+    assert df.index is idx
+
+
+def test_from_records_with_index_no_shallow_copy():
+    idx = cudf.RangeIndex(2)
+    data = np.array([(1.0, 2), (3.0, 4)], dtype=[("x", "<f8"), ("y", "<i8")])
+    df = cudf.DataFrame(data.view(np.recarray), index=idx)
+    assert df.index is idx

From 224ac5bad11465d0486af80e7935eac482269805 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 28 Jun 2024 09:26:37 -0400
Subject: [PATCH 22/29] Add libcudf public/detail API pattern to developer
 guide (#16086)

Adds specific description for the public API to detail API function pattern to the libcudf developer guide.
Also fixes some formatting issues and broken link.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Shruti Shivakumar (https://github.com/shrshi)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/16086
---
 .../developer_guide/DEVELOPER_GUIDE.md        | 60 +++++++++++--------
 1 file changed, 34 insertions(+), 26 deletions(-)

diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
index ff80c2daab8..0d097541692 100644
--- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
+++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
@@ -1,4 +1,4 @@
-# libcudf C++ Developer Guide
+# libcudf C++ Developer Guide {#DEVELOPER_GUIDE}
 
 This document serves as a guide for contributors to libcudf C++ code. Developers should also refer
 to these additional files for further documentation of libcudf best practices.
@@ -469,7 +469,7 @@ libcudf throws under different circumstances, see the [section on error handling
 
 # libcudf API and Implementation
 
-## Streams
+## Streams {#streams}
 
 libcudf is in the process of adding support for asynchronous execution using
 CUDA streams. In order to facilitate the usage of streams, all new libcudf APIs
@@ -486,33 +486,37 @@ use only asynchronous versions of CUDA APIs with the stream parameter.
 
 In order to make the `detail` API callable from other libcudf functions, it should be exposed in a
 header placed in the `cudf/cpp/include/detail/` directory.
+The declaration is not necessary if no other libcudf functions call the `detail` function.
 
 For example:
 
 ```c++
 // cpp/include/cudf/header.hpp
-void external_function(...);
+void external_function(...,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 // cpp/include/cudf/detail/header.hpp
 namespace detail{
-void external_function(..., rmm::cuda_stream_view stream)
+void external_function(..., rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
 } // namespace detail
 
 // cudf/src/implementation.cpp
 namespace detail{
-    // Use the stream parameter in the detail implementation.
-    void external_function(..., rmm::cuda_stream_view stream){
-        // Implementation uses the stream with async APIs.
-        rmm::device_buffer buff(...,stream);
-        CUDF_CUDA_TRY(cudaMemcpyAsync(...,stream.value()));
-        kernel<<<..., stream>>>(...);
-        thrust::algorithm(rmm::exec_policy(stream), ...);
-    }
+// Use the stream parameter in the detail implementation.
+void external_function(..., rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr){
+  // Implementation uses the stream with async APIs.
+  rmm::device_buffer buff(..., stream, mr);
+  CUDF_CUDA_TRY(cudaMemcpyAsync(...,stream.value()));
+  kernel<<<..., stream>>>(...);
+  thrust::algorithm(rmm::exec_policy(stream), ...);
+}
 } // namespace detail
 
-void external_function(...){
-    CUDF_FUNC_RANGE(); // Generates an NVTX range for the lifetime of this function.
-    detail::external_function(..., cudf::get_default_stream());
+void external_function(..., rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE(); // Generates an NVTX range for the lifetime of this function.
+  detail::external_function(..., stream, mr);
 }
 ```
 
@@ -703,28 +707,28 @@ The preferred style for how inputs are passed in and outputs are returned is the
     - `column_view const&`
   - Tables:
     - `table_view const&`
-    - Scalar:
-        - `scalar const&`
-    - Everything else:
-       - Trivial or inexpensively copied types
-          - Pass by value
-       - Non-trivial or expensive to copy types
-          - Pass by `const&`
+  - Scalar:
+    - `scalar const&`
+  - Everything else:
+    - Trivial or inexpensively copied types
+      - Pass by value
+    - Non-trivial or expensive to copy types
+      - Pass by `const&`
 - In/Outs
   - Columns:
     - `mutable_column_view&`
   - Tables:
     - `mutable_table_view&`
-    - Everything else:
-        - Pass by via raw pointer
+  - Everything else:
+    - Pass by via raw pointer
 - Outputs
   - Outputs should be *returned*, i.e., no output parameters
   - Columns:
     - `std::unique_ptr<column>`
   - Tables:
     - `std::unique_ptr<table>`
-    - Scalars:
-        - `std::unique_ptr<scalar>`
+  - Scalars:
+    - `std::unique_ptr<scalar>`
 
 
 ### Multiple Return Values
@@ -908,6 +912,10 @@ functions that are specific to columns of Strings. These functions reside in the
 namespace. Similarly, functionality used exclusively for unit testing is in the `cudf::test::`
 namespace.
 
+The public function is expected to contain a call to `CUDF_FUNC_RANGE()` followed by a call to
+a `detail` function with same name and parameters as the public function.
+See the [Streams](#streams) section for an example of this pattern.
+
 ### Internal
 
 Many functions are not meant for public use, so place them in either the `detail` or an *anonymous*

From 673d766836b7e6e8c80afe32cd9a4b4da2cecf58 Mon Sep 17 00:00:00 2001
From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com>
Date: Fri, 28 Jun 2024 09:38:57 -0400
Subject: [PATCH 23/29] Make binary operators work between fixed-point and
 floating args (#16116)

Some of the binary operators in cuDF don't work between fixed_point and floating-point numbers after [this earlier PR](https://github.com/rapidsai/cudf/pull/15438) removed the ability to construct and implicitly cast fixed_point numbers from floating point numbers. This PR restores that functionality by detecting and performing the necessary explicit casts, and adds tests for the supported operators.

Note that the `binary_op_has_common_type` code is modeled after `has_common_type` found in traits.hpp.

This closes [issue 16090](https://github.com/rapidsai/cudf/issues/16090)

Authors:
  - Paul Mattione (https://github.com/pmattione-nvidia)

Approvers:
  - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/16116
---
 cpp/include/cudf/binaryop.hpp                 | 50 ++++++++++++++++
 cpp/src/binaryop/compiled/binary_ops.cuh      | 14 ++++-
 cpp/src/binaryop/compiled/util.cpp            | 12 ++--
 .../binop-compiled-fixed_point-test.cpp       | 58 +++++++++++++++++++
 4 files changed, 125 insertions(+), 9 deletions(-)

diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp
index 5e41a871f32..22dad11e109 100644
--- a/cpp/include/cudf/binaryop.hpp
+++ b/cpp/include/cudf/binaryop.hpp
@@ -91,6 +91,56 @@ enum class binary_operator : int32_t {
                      ///< (null, false) is null, and (valid, valid) == LOGICAL_OR(valid, valid)
   INVALID_BINARY     ///< invalid operation
 };
+
+/// Binary operation common type default
+template <typename L, typename R, typename = void>
+struct binary_op_common_type {};
+
+/// Binary operation common type specialization
+template <typename L, typename R>
+struct binary_op_common_type<L, R, std::enable_if_t<has_common_type_v<L, R>>> {
+  /// The common type of the template parameters
+  using type = std::common_type_t<L, R>;
+};
+
+/// Binary operation common type specialization
+template <typename L, typename R>
+struct binary_op_common_type<
+  L,
+  R,
+  std::enable_if_t<is_fixed_point<L>() && cuda::std::is_floating_point_v<R>>> {
+  /// The common type of the template parameters
+  using type = L;
+};
+
+/// Binary operation common type specialization
+template <typename L, typename R>
+struct binary_op_common_type<
+  L,
+  R,
+  std::enable_if_t<is_fixed_point<R>() && cuda::std::is_floating_point_v<L>>> {
+  /// The common type of the template parameters
+  using type = R;
+};
+
+/// Binary operation common type helper
+template <typename L, typename R>
+using binary_op_common_type_t = typename binary_op_common_type<L, R>::type;
+
+namespace detail {
+template <typename AlwaysVoid, typename L, typename R>
+struct binary_op_has_common_type_impl : std::false_type {};
+
+template <typename L, typename R>
+struct binary_op_has_common_type_impl<std::void_t<binary_op_common_type_t<L, R>>, L, R>
+  : std::true_type {};
+}  // namespace detail
+
+/// Checks if binary operation types have a common type
+template <typename L, typename R>
+constexpr inline bool binary_op_has_common_type_v =
+  detail::binary_op_has_common_type_impl<void, L, R>::value;
+
 /**
  * @brief Performs a binary operation between a scalar and a column.
  *
diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh
index 5177e7d4bda..c6af0c3c58a 100644
--- a/cpp/src/binaryop/compiled/binary_ops.cuh
+++ b/cpp/src/binaryop/compiled/binary_ops.cuh
@@ -49,9 +49,16 @@ struct type_casted_accessor {
                                         column_device_view const& col,
                                         bool is_scalar) const
   {
-    if constexpr (column_device_view::has_element_accessor<Element>() and
-                  std::is_convertible_v<Element, CastType>)
-      return static_cast<CastType>(col.element<Element>(is_scalar ? 0 : i));
+    if constexpr (column_device_view::has_element_accessor<Element>()) {
+      auto const element = col.element<Element>(is_scalar ? 0 : i);
+      if constexpr (std::is_convertible_v<Element, CastType>) {
+        return static_cast<CastType>(element);
+      } else if constexpr (is_fixed_point<Element>() && cuda::std::is_floating_point_v<CastType>) {
+        return convert_fixed_to_floating<CastType>(element);
+      } else if constexpr (is_fixed_point<CastType>() && cuda::std::is_floating_point_v<Element>) {
+        return convert_floating_to_fixed<CastType>(element, numeric::scale_type{0});
+      }
+    }
     return {};
   }
 };
@@ -159,6 +166,7 @@ struct ops2_wrapper {
       TypeRhs y   = rhs.element<TypeRhs>(is_rhs_scalar ? 0 : i);
       auto result = [&]() {
         if constexpr (std::is_same_v<BinaryOperator, ops::NullEquals> or
+                      std::is_same_v<BinaryOperator, ops::NullNotEquals> or
                       std::is_same_v<BinaryOperator, ops::NullLogicalAnd> or
                       std::is_same_v<BinaryOperator, ops::NullLogicalOr> or
                       std::is_same_v<BinaryOperator, ops::NullMax> or
diff --git a/cpp/src/binaryop/compiled/util.cpp b/cpp/src/binaryop/compiled/util.cpp
index 2b6a4f58895..b62c5f1f4e1 100644
--- a/cpp/src/binaryop/compiled/util.cpp
+++ b/cpp/src/binaryop/compiled/util.cpp
@@ -31,8 +31,8 @@ struct common_type_functor {
   template <typename TypeLhs, typename TypeRhs>
   std::optional<data_type> operator()() const
   {
-    if constexpr (cudf::has_common_type_v<TypeLhs, TypeRhs>) {
-      using TypeCommon = std::common_type_t<TypeLhs, TypeRhs>;
+    if constexpr (binary_op_has_common_type_v<TypeLhs, TypeRhs>) {
+      using TypeCommon = binary_op_common_type_t<TypeLhs, TypeRhs>;
       return data_type{type_to_id<TypeCommon>()};
     }
 
@@ -85,8 +85,8 @@ struct is_binary_operation_supported {
   {
     if constexpr (column_device_view::has_element_accessor<TypeLhs>() and
                   column_device_view::has_element_accessor<TypeRhs>()) {
-      if constexpr (has_common_type_v<TypeLhs, TypeRhs>) {
-        using common_t = std::common_type_t<TypeLhs, TypeRhs>;
+      if constexpr (binary_op_has_common_type_v<TypeLhs, TypeRhs>) {
+        using common_t = binary_op_common_type_t<TypeLhs, TypeRhs>;
         return std::is_invocable_v<BinaryOperator, common_t, common_t>;
       } else {
         return std::is_invocable_v<BinaryOperator, TypeLhs, TypeRhs>;
@@ -102,8 +102,8 @@ struct is_binary_operation_supported {
     if constexpr (column_device_view::has_element_accessor<TypeLhs>() and
                   column_device_view::has_element_accessor<TypeRhs>()) {
       if (has_mutable_element_accessor(out_type) or is_fixed_point(out_type)) {
-        if constexpr (has_common_type_v<TypeLhs, TypeRhs>) {
-          using common_t = std::common_type_t<TypeLhs, TypeRhs>;
+        if constexpr (binary_op_has_common_type_v<TypeLhs, TypeRhs>) {
+          using common_t = binary_op_common_type_t<TypeLhs, TypeRhs>;
           if constexpr (std::is_invocable_v<BinaryOperator, common_t, common_t>) {
             using ReturnType = std::invoke_result_t<BinaryOperator, common_t, common_t>;
             return is_constructible<ReturnType>(out_type) or
diff --git a/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp b/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp
index 6d097b2ff12..89824eb6511 100644
--- a/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp
+++ b/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp
@@ -843,3 +843,61 @@ TYPED_TEST(FixedPointTest_64_128_Reps, FixedPoint_64_128_ComparisonTests)
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(h->view(), falses);
   }
 }
+
+template <typename ResultType>
+void test_fixed_floating(cudf::binary_operator op,
+                         double floating_value,
+                         int decimal_value,
+                         int decimal_scale,
+                         ResultType expected)
+{
+  auto const scale       = numeric::scale_type{decimal_scale};
+  auto const result_type = cudf::data_type(cudf::type_to_id<ResultType>());
+  auto const nullable =
+    (op == cudf::binary_operator::NULL_EQUALS || op == cudf::binary_operator::NULL_NOT_EQUALS ||
+     op == cudf::binary_operator::NULL_MIN || op == cudf::binary_operator::NULL_MAX);
+
+  cudf::test::fixed_width_column_wrapper<double> floating_col({floating_value});
+  cudf::test::fixed_point_column_wrapper<int> decimal_col({decimal_value}, scale);
+
+  auto result = binary_operation(floating_col, decimal_col, op, result_type);
+
+  if constexpr (cudf::is_fixed_point<ResultType>()) {
+    using wrapper_type      = cudf::test::fixed_point_column_wrapper<typename ResultType::rep>;
+    auto const expected_col = nullable ? wrapper_type({expected.value()}, {true}, expected.scale())
+                                       : wrapper_type({expected.value()}, expected.scale());
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col, *result.get());
+  } else {
+    using wrapper_type = cudf::test::fixed_width_column_wrapper<ResultType>;
+    auto const expected_col =
+      nullable ? wrapper_type({expected}, {true}) : wrapper_type({expected});
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col, *result.get());
+  }
+}
+
+TYPED_TEST(FixedPointCompiledTest, FixedPointWithFloating)
+{
+  using namespace numeric;
+
+  // BOOLEAN
+  test_fixed_floating(cudf::binary_operator::EQUAL, 1.0, 10, -1, true);
+  test_fixed_floating(cudf::binary_operator::NOT_EQUAL, 1.0, 10, -1, false);
+  test_fixed_floating(cudf::binary_operator::LESS, 2.0, 10, -1, false);
+  test_fixed_floating(cudf::binary_operator::GREATER, 2.0, 10, -1, true);
+  test_fixed_floating(cudf::binary_operator::LESS_EQUAL, 2.0, 20, -1, true);
+  test_fixed_floating(cudf::binary_operator::GREATER_EQUAL, 2.0, 30, -1, false);
+  test_fixed_floating(cudf::binary_operator::NULL_EQUALS, 1.0, 10, -1, true);
+  test_fixed_floating(cudf::binary_operator::NULL_NOT_EQUALS, 1.0, 10, -1, false);
+
+  // PRIMARY ARITHMETIC
+  auto const decimal_result = numeric::decimal32(4, numeric::scale_type{0});
+  test_fixed_floating(cudf::binary_operator::ADD, 1.0, 30, -1, decimal_result);
+  test_fixed_floating(cudf::binary_operator::SUB, 6.0, 20, -1, decimal_result);
+  test_fixed_floating(cudf::binary_operator::MUL, 2.0, 20, -1, decimal_result);
+  test_fixed_floating(cudf::binary_operator::DIV, 8.0, 2, 0, decimal_result);
+  test_fixed_floating(cudf::binary_operator::MOD, 9.0, 50, -1, decimal_result);
+
+  // OTHER ARITHMETIC
+  test_fixed_floating(cudf::binary_operator::NULL_MAX, 4.0, 20, -1, decimal_result);
+  test_fixed_floating(cudf::binary_operator::NULL_MIN, 4.0, 200, -1, decimal_result);
+}

From c40e0cc8dae8922c2633f5359609a1d063ae7f26 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Fri, 28 Jun 2024 10:10:31 -0400
Subject: [PATCH 24/29] Add support for proxy `np.flatiter` objects (#16107)

Closes #15388

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/16107
---
 python/cudf/cudf/pandas/_wrappers/numpy.py        | 13 +++++++++++++
 python/cudf/cudf_pandas_tests/test_cudf_pandas.py | 10 ++++++++++
 2 files changed, 23 insertions(+)

diff --git a/python/cudf/cudf/pandas/_wrappers/numpy.py b/python/cudf/cudf/pandas/_wrappers/numpy.py
index c445be46f58..3b012169676 100644
--- a/python/cudf/cudf/pandas/_wrappers/numpy.py
+++ b/python/cudf/cudf/pandas/_wrappers/numpy.py
@@ -129,6 +129,19 @@ def wrap_ndarray(cls, arr: cupy.ndarray | numpy.ndarray, constructor):
     },
 )
 
+
+flatiter = make_final_proxy_type(
+    "flatiter",
+    cupy.flatiter,
+    numpy.flatiter,
+    fast_to_slow=lambda fast: cupy.asnumpy(fast.base).flat,
+    slow_to_fast=lambda slow: cupy.asarray(slow).flat,
+    additional_attributes={
+        "__array__": array_method,
+    },
+)
+
+
 # Mapping flags between slow and fast types
 _ndarray_flags = make_intermediate_proxy_type(
     "_ndarray_flags",
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index 0d46e2e9311..f51ce103677 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -1535,6 +1535,16 @@ def test_is_proxy_object():
     assert not is_proxy_object(s2)
 
 
+def test_numpy_cupy_flatiter(series):
+    cp = pytest.importorskip("cupy")
+
+    _, s = series
+    arr = s.values
+
+    assert type(arr.flat._fsproxy_fast) == cp.flatiter
+    assert type(arr.flat._fsproxy_slow) == np.flatiter
+
+
 def test_arrow_string_arrays():
     cu_s = xpd.Series(["a", "b", "c"])
     pd_s = pd.Series(["a", "b", "c"])

From 565c0d1c3a08c9bd7eafa70278a8744097f8ef04 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Fri, 28 Jun 2024 10:16:55 -0400
Subject: [PATCH 25/29] Migrate lists/contains to pylibcudf (#15981)

Part of #15162.

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15981
---
 cpp/include/cudf/lists/lists_column_view.hpp  |   3 +-
 python/cudf/cudf/_lib/lists.pyx               |  72 +++-------
 python/cudf/cudf/_lib/pylibcudf/column.pxd    |   4 +
 python/cudf/cudf/_lib/pylibcudf/column.pyx    |   9 ++
 .../_lib/pylibcudf/libcudf/lists/contains.pxd |  29 +++-
 .../libcudf/lists/lists_column_view.pxd       |   1 +
 python/cudf/cudf/_lib/pylibcudf/lists.pxd     |  10 ++
 python/cudf/cudf/_lib/pylibcudf/lists.pyx     | 124 +++++++++++++++++-
 .../cudf/cudf/pylibcudf_tests/test_lists.py   |  98 +++++++++++++-
 9 files changed, 281 insertions(+), 69 deletions(-)

diff --git a/cpp/include/cudf/lists/lists_column_view.hpp b/cpp/include/cudf/lists/lists_column_view.hpp
index 57a4f724c2d..3397cb0ca1d 100644
--- a/cpp/include/cudf/lists/lists_column_view.hpp
+++ b/cpp/include/cudf/lists/lists_column_view.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -38,6 +38,7 @@ namespace cudf {
  */
 class lists_column_view : private column_view {
  public:
+  lists_column_view() = default;
   /**
    * @brief Construct a new lists column view object from a column view.
    *
diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx
index 5d406f5c85f..0ad09dba717 100644
--- a/python/cudf/cudf/_lib/lists.pyx
+++ b/python/cudf/cudf/_lib/lists.pyx
@@ -9,10 +9,6 @@ from libcpp.utility cimport move
 from cudf._lib.column cimport Column
 from cudf._lib.pylibcudf.libcudf.column.column cimport column
 from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.lists.contains cimport (
-    contains,
-    index_of as cpp_index_of,
-)
 from cudf._lib.pylibcudf.libcudf.lists.count_elements cimport (
     count_elements as cpp_count_elements,
 )
@@ -26,7 +22,6 @@ from cudf._lib.pylibcudf.libcudf.lists.sorting cimport (
 from cudf._lib.pylibcudf.libcudf.lists.stream_compaction cimport (
     distinct as cpp_distinct,
 )
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
 from cudf._lib.pylibcudf.libcudf.types cimport (
     nan_equality,
     null_equality,
@@ -34,11 +29,12 @@ from cudf._lib.pylibcudf.libcudf.types cimport (
     order,
     size_type,
 )
-from cudf._lib.scalar cimport DeviceScalar
 from cudf._lib.utils cimport columns_from_pylibcudf_table
 
 from cudf._lib import pylibcudf
 
+from cudf._lib.pylibcudf cimport Scalar
+
 
 @acquire_spill_lock()
 def count_elements(Column col):
@@ -153,64 +149,36 @@ def extract_element_column(Column col, Column index):
 
 
 @acquire_spill_lock()
-def contains_scalar(Column col, object py_search_key):
-
-    cdef DeviceScalar search_key = py_search_key.device_value
-
-    cdef shared_ptr[lists_column_view] list_view = (
-        make_shared[lists_column_view](col.view())
+def contains_scalar(Column col, py_search_key):
+    return Column.from_pylibcudf(
+        pylibcudf.lists.contains(
+            col.to_pylibcudf(mode="read"),
+            <Scalar> py_search_key.device_value.c_value,
+        )
     )
-    cdef const scalar* search_key_value = search_key.get_raw_ptr()
-
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(contains(
-            list_view.get()[0],
-            search_key_value[0],
-        ))
-    result = Column.from_unique_ptr(move(c_result))
-    return result
 
 
 @acquire_spill_lock()
 def index_of_scalar(Column col, object py_search_key):
-
-    cdef DeviceScalar search_key = py_search_key.device_value
-
-    cdef shared_ptr[lists_column_view] list_view = (
-        make_shared[lists_column_view](col.view())
+    return Column.from_pylibcudf(
+        pylibcudf.lists.index_of(
+            col.to_pylibcudf(mode="read"),
+            <Scalar> py_search_key.device_value.c_value,
+            True,
+        )
     )
-    cdef const scalar* search_key_value = search_key.get_raw_ptr()
-
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(cpp_index_of(
-            list_view.get()[0],
-            search_key_value[0],
-        ))
-    return Column.from_unique_ptr(move(c_result))
 
 
 @acquire_spill_lock()
 def index_of_column(Column col, Column search_keys):
-
-    cdef column_view keys_view = search_keys.view()
-
-    cdef shared_ptr[lists_column_view] list_view = (
-        make_shared[lists_column_view](col.view())
+    return Column.from_pylibcudf(
+        pylibcudf.lists.index_of(
+            col.to_pylibcudf(mode="read"),
+            search_keys.to_pylibcudf(mode="read"),
+            True,
+        )
     )
 
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(cpp_index_of(
-            list_view.get()[0],
-            keys_view,
-        ))
-    return Column.from_unique_ptr(move(c_result))
-
 
 @acquire_spill_lock()
 def concatenate_rows(list source_columns):
diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pxd b/python/cudf/cudf/_lib/pylibcudf/column.pxd
index e121e856865..d13791d95cf 100644
--- a/python/cudf/cudf/_lib/pylibcudf/column.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/column.pxd
@@ -8,6 +8,9 @@ from cudf._lib.pylibcudf.libcudf.column.column_view cimport (
     column_view,
     mutable_column_view,
 )
+from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
+    lists_column_view,
+)
 from cudf._lib.pylibcudf.libcudf.types cimport bitmask_type, size_type
 
 from .gpumemoryview cimport gpumemoryview
@@ -56,3 +59,4 @@ cdef class ListColumnView:
     cdef Column _column
     cpdef child(self)
     cpdef offsets(self)
+    cdef lists_column_view view(self) nogil
diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx
index e726eca154f..e0cf8b7ee32 100644
--- a/python/cudf/cudf/_lib/pylibcudf/column.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/column.pyx
@@ -348,6 +348,15 @@ cdef class ListColumnView:
         """The offsets column of the underlying list column."""
         return self._column.child(1)
 
+    cdef lists_column_view view(self) nogil:
+        """Generate a libcudf lists_column_view to pass to libcudf algorithms.
+
+        This method is for pylibcudf's functions to use to generate inputs when
+        calling libcudf algorithms, and should generally not be needed by users
+        (even direct pylibcudf Cython users).
+        """
+        return lists_column_view(self._column.view())
+
 
 @functools.cache
 def _datatype_from_dtype_desc(desc):
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd
index 721679f35c7..82aed7d70a0 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd
@@ -1,5 +1,6 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
+from libc.stdint cimport int32_t
 from libcpp.memory cimport unique_ptr
 
 from cudf._lib.exception_handler cimport cudf_exception_handler
@@ -12,17 +13,33 @@ from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
 
 
 cdef extern from "cudf/lists/contains.hpp" namespace "cudf::lists" nogil:
+
+    cpdef enum class duplicate_find_option(int32_t):
+        FIND_FIRST
+        FIND_LAST
+
     cdef unique_ptr[column] contains(
-        lists_column_view lists,
-        scalar search_key,
+        const lists_column_view& lists,
+        const scalar& search_key,
+    ) except +cudf_exception_handler
+
+    cdef unique_ptr[column] contains(
+        const lists_column_view& lists,
+        const column_view& search_keys,
+    ) except +cudf_exception_handler
+
+    cdef unique_ptr[column] contains_nulls(
+        const lists_column_view& lists,
     ) except +cudf_exception_handler
 
     cdef unique_ptr[column] index_of(
-        lists_column_view lists,
-        scalar search_key,
+        const lists_column_view& lists,
+        const scalar& search_key,
+        duplicate_find_option find_option,
     ) except +cudf_exception_handler
 
     cdef unique_ptr[column] index_of(
-        lists_column_view lists,
-        column_view search_keys,
+        const lists_column_view& lists,
+        const column_view& search_keys,
+        duplicate_find_option find_option,
     ) except +cudf_exception_handler
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd
index dbafc415e45..fd21e7b334b 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd
@@ -9,6 +9,7 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 cdef extern from "cudf/lists/lists_column_view.hpp" namespace "cudf" nogil:
     cdef cppclass lists_column_view(column_view):
+        lists_column_view() except +
         lists_column_view(const column_view& lists_column) except +
         column_view parent() except +
         column_view offsets() except +
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd
index 2d2a5b2a9ea..2ccf0139e90 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd
@@ -5,11 +5,21 @@ from libcpp cimport bool
 from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 from .column cimport Column
+from .scalar cimport Scalar
 from .table cimport Table
 
+ctypedef fused ColumnOrScalar:
+    Column
+    Scalar
 
 cpdef Table explode_outer(Table, size_type explode_column_idx)
 
 cpdef Column concatenate_rows(Table)
 
 cpdef Column concatenate_list_elements(Column, bool dropna)
+
+cpdef Column contains(Column, ColumnOrScalar)
+
+cpdef Column contains_nulls(Column)
+
+cpdef Column index_of(Column, ColumnOrScalar, bool)
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx
index 069c9da31c2..a94d940accd 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx
@@ -1,11 +1,15 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
+from cython.operator cimport dereference
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
 from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.lists cimport explode as cpp_explode
+from cudf._lib.pylibcudf.libcudf.lists cimport (
+    contains as cpp_contains,
+    explode as cpp_explode,
+)
 from cudf._lib.pylibcudf.libcudf.lists.combine cimport (
     concatenate_list_elements as cpp_concatenate_list_elements,
     concatenate_null_policy,
@@ -13,8 +17,10 @@ from cudf._lib.pylibcudf.libcudf.lists.combine cimport (
 )
 from cudf._lib.pylibcudf.libcudf.table.table cimport table
 from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from cudf._lib.pylibcudf.lists cimport ColumnOrScalar
 
-from .column cimport Column
+from .column cimport Column, ListColumnView
+from .scalar cimport Scalar
 from .table cimport Table
 
 
@@ -71,15 +77,15 @@ cpdef Column concatenate_list_elements(Column input, bool dropna):
     ----------
     input : Column
         The input column
+    dropna : bool
+        If true, null list elements will be ignored
+        from concatenation. Otherwise any input null values will result in
+        the corresponding output row being set to null.
 
     Returns
     -------
     Column
         A new Column of concatenated list elements
-    dropna : bool
-        If true, null list elements will be ignored
-        from concatenation. Otherwise any input null values will result in
-        the corresponding output row being set to null.
     """
     cdef concatenate_null_policy null_policy = (
         concatenate_null_policy.IGNORE if dropna
@@ -94,3 +100,109 @@ cpdef Column concatenate_list_elements(Column input, bool dropna):
         ))
 
     return Column.from_libcudf(move(c_result))
+
+
+cpdef Column contains(Column input, ColumnOrScalar search_key):
+    """Create a column of bool values indicating whether
+    the search_key is contained in the input.
+
+    ``search_key`` may be a
+    :py:class:`~cudf._lib.pylibcudf.column.Column` or a
+    :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`.
+
+    For details, see :cpp:func:`contains`.
+
+    Parameters
+    ----------
+    input : Column
+        The input column.
+    search_key : Union[Column, Scalar]
+        The search key.
+
+    Returns
+    -------
+    Column
+        A new Column of bools indicating if the search_key was
+        found in the list column.
+    """
+    cdef unique_ptr[column] c_result
+    cdef ListColumnView list_view = input.list_view()
+
+    if not isinstance(search_key, (Column, Scalar)):
+        raise TypeError("Must pass a Column or Scalar")
+
+    with nogil:
+        c_result = move(cpp_contains.contains(
+            list_view.view(),
+            search_key.view() if ColumnOrScalar is Column else dereference(
+                search_key.get()
+            ),
+        ))
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column contains_nulls(Column input):
+    """Create a column of bool values indicating whether
+    each row in the lists column contains a null value.
+
+    Parameters
+    ----------
+    input : Column
+        The input column.
+
+    Returns
+    -------
+    Column
+        A new Column of bools indicating if the list column
+        contains a null value.
+    """
+    cdef unique_ptr[column] c_result
+    cdef ListColumnView list_view = input.list_view()
+    with nogil:
+        c_result = move(cpp_contains.contains_nulls(list_view.view()))
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_option):
+    """Create a column of index values indicating the position of a search
+    key row within the corresponding list row in the lists column.
+
+    ``search_key`` may be a
+    :py:class:`~cudf._lib.pylibcudf.column.Column` or a
+    :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`.
+
+    For details, see :cpp:func:`index_of`.
+
+    Parameters
+    ----------
+    input : Column
+        The input column.
+    search_key : Union[Column, Scalar]
+        The search key.
+    find_first_option : bool
+        If true, index_of returns the first match.
+        Otherwise the last match is returned.
+
+    Returns
+    -------
+    Column
+        A new Column of index values that indicate where in the
+        list column tthe search_key was found. An index value
+        of -1 indicates that the search_key was not found.
+    """
+    cdef unique_ptr[column] c_result
+    cdef ListColumnView list_view = input.list_view()
+    cdef cpp_contains.duplicate_find_option find_option = (
+        cpp_contains.duplicate_find_option.FIND_FIRST if find_first_option
+        else cpp_contains.duplicate_find_option.FIND_LAST
+    )
+
+    with nogil:
+        c_result = move(cpp_contains.index_of(
+            list_view.view(),
+            search_key.view() if ColumnOrScalar is Column else dereference(
+                search_key.get()
+            ),
+            find_option,
+        ))
+    return Column.from_libcudf(move(c_result))
diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py
index b21af8ea11c..c781126e388 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_lists.py
+++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py
@@ -7,15 +7,28 @@
 from cudf._lib import pylibcudf as plc
 
 
-def test_concatenate_rows():
-    test_data = [[[0, 1], [2], [5], [6, 7]], [[8], [9], [], [13, 14, 15]]]
+@pytest.fixture
+def test_data():
+    return [[[[0, 1], [2], [5], [6, 7]], [[8], [9], [], [13, 14, 15]]]]
 
-    arrow_tbl = pa.Table.from_arrays(test_data, names=["a", "b"])
+
+@pytest.fixture
+def scalar():
+    return pa.scalar(1)
+
+
+@pytest.fixture
+def column():
+    return pa.array([3, 2, 5, 6]), pa.array([-1, 0, 0, 0], type=pa.int32())
+
+
+def test_concatenate_rows(test_data):
+    arrow_tbl = pa.Table.from_arrays(test_data[0], names=["a", "b"])
     plc_tbl = plc.interop.from_arrow(arrow_tbl)
 
     res = plc.lists.concatenate_rows(plc_tbl)
 
-    expect = pa.array([pair[0] + pair[1] for pair in zip(*test_data)])
+    expect = pa.array([pair[0] + pair[1] for pair in zip(*test_data[0])])
 
     assert_column_eq(expect, res)
 
@@ -44,3 +57,80 @@ def test_concatenate_list_elements(test_data, dropna, expected):
     expect = pa.array(expected)
 
     assert_column_eq(expect, res)
+
+
+def test_contains_scalar(test_data, scalar):
+    list_column = test_data[0][0]
+    arr = pa.array(list_column)
+
+    plc_column = plc.interop.from_arrow(arr)
+    plc_scalar = plc.interop.from_arrow(scalar)
+    res = plc.lists.contains(plc_column, plc_scalar)
+
+    expect = pa.array([True, False, False, False])
+
+    assert_column_eq(expect, res)
+
+
+def test_contains_list_column(test_data):
+    list_column1 = test_data[0][0]
+    list_column2 = [1, 3, 5, 1]
+    arr1 = pa.array(list_column1)
+    arr2 = pa.array(list_column2)
+
+    plc_column1 = plc.interop.from_arrow(arr1)
+    plc_column2 = plc.interop.from_arrow(arr2)
+    res = plc.lists.contains(plc_column1, plc_column2)
+
+    expect = pa.array([True, False, True, False])
+
+    assert_column_eq(expect, res)
+
+
+@pytest.mark.parametrize(
+    "list_column, expected",
+    [
+        (
+            [[1, None], [1, 3, 4], [5, None]],
+            [True, False, True],
+        ),
+        (
+            [[1, None], None, [5]],
+            [True, None, False],
+        ),
+    ],
+)
+def test_contains_nulls(list_column, expected):
+    arr = pa.array(list_column)
+    plc_column = plc.interop.from_arrow(arr)
+    res = plc.lists.contains_nulls(plc_column)
+
+    expect = pa.array(expected)
+
+    assert_column_eq(expect, res)
+
+
+def test_index_of_scalar(test_data, scalar):
+    list_column = test_data[0][0]
+    arr = pa.array(list_column)
+
+    plc_column = plc.interop.from_arrow(arr)
+    plc_scalar = plc.interop.from_arrow(scalar)
+    res = plc.lists.index_of(plc_column, plc_scalar, True)
+
+    expect = pa.array([1, -1, -1, -1], type=pa.int32())
+
+    assert_column_eq(expect, res)
+
+
+def test_index_of_list_column(test_data, column):
+    list_column = test_data[0][0]
+    arr1 = pa.array(list_column)
+    arr2, expect = column
+    plc_column1 = plc.interop.from_arrow(arr1)
+    plc_column2 = plc.interop.from_arrow(arr2)
+    res = plc.lists.index_of(plc_column1, plc_column2, True)
+
+    expect = pa.array(column[1], type=pa.int32())
+
+    assert_column_eq(expect, res)

From e434fdbc546dd1810c750abdd086f07b694782b2 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 28 Jun 2024 10:57:01 -0400
Subject: [PATCH 26/29] Update libcudf compiler requirements in contributing
 doc (#16103)

Updates the compiler requirements in the contributing document.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/16103
---
 CONTRIBUTING.md | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 98c2ec0a22e..4fbc28fa6e1 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -71,15 +71,14 @@ for a minimal build of libcudf without using conda are also listed below.
 
 Compilers:
 
-* `gcc` version 9.3+
-* `nvcc` version 11.5+
-* `cmake` version 3.26.4+
+* `gcc` version 11.4+
+* `nvcc` version 11.8+
+* `cmake` version 3.29.6+
 
-CUDA/GPU:
+CUDA/GPU Runtime:
 
-* CUDA 11.5+
-* NVIDIA driver 450.80.02+
-* Volta architecture or better (Compute Capability >=7.0)
+* CUDA 11.4+
+* Volta architecture or better ([Compute Capability](https://docs.nvidia.com/deploy/cuda-compatibility/) >=7.0)
 
 You can obtain CUDA from
 [https://developer.nvidia.com/cuda-downloads](https://developer.nvidia.com/cuda-downloads).

From a4b951a6c140c05178edb61d8e28f51a4b430e15 Mon Sep 17 00:00:00 2001
From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com>
Date: Fri, 28 Jun 2024 10:20:42 -0500
Subject: [PATCH 27/29] Templatization of fixed-width parquet decoding kernels.
 (#15911)

This PR merges all of the fixed-width parquet decoding kernels into a single templatized kernel that can be selectively instantiated with desired features (dictionary/no-dictionary, nested/non-nested, etc).  It also adds support for (non-list) nested columns in this path. So structs do not have to use the much slower general decode kernel any more.

A new benchmark was added specific to structs containing only fixed width columns.  I added this because the performance improvement is fairly high (+20%) but we don't see it in the normal struct benchmarks because they include (and are dominated by) string decode times.  The new benchmark shows:

Before this PR:
```
| data_type |    io_type    | cardinality | run_length | bytes_per_second | peak_memory_usage | encoded_file_size |
|-----------|---------------|-------------|------------|------------------|-------------------|-------------------|
|    STRUCT | DEVICE_BUFFER |           0 |          1 |      21071216823 |         1.047 GiB |       511.675 MiB |
|    STRUCT | DEVICE_BUFFER |        1000 |          1 |      18974392387 |       821.312 MiB |       128.884 MiB |
|    STRUCT | DEVICE_BUFFER |           0 |         32 |      20429356824 |      621.787 MiB  |        28.141 MiB |
|    STRUCT | DEVICE_BUFFER |        1000 |         32 |      20572327813 |       598.421 MiB |        16.475 MiB |
```

After this PR:

```
| data_type |    io_type    | cardinality | run_length | bytes_per_second | peak_memory_usage | encoded_file_size |
|-----------|---------------|-------------|------------|------------------|-------------------|-------------------|
|    STRUCT | DEVICE_BUFFER |           0 |          1 |      25805996399 |         1.047 GiB |       511.675 MiB |
|    STRUCT | DEVICE_BUFFER |        1000 |          1 |      22422306660 |       821.312 MiB |       128.884 MiB |
|    STRUCT | DEVICE_BUFFER |           0 |         32 |      24460694014 |       621.787 MiB |        28.141 MiB |
|    STRUCT | DEVICE_BUFFER |        1000 |         32 |      24674861214 |       598.421 MiB |        16.475 MiB |
```

Split-page decoding for fixed-width types + structs are also going through this new path. New test added.

This brings us closer to eliminating the "general" kernel.  The only things left that run through it are lists and booleans.

This is PR 1 of 2, with the followup moving a lot of code around.  At this point, I think it makes sense to start consolidating our files a bit.

I also left some breadcrumbs (a few small commented out code blocks) in the core kernel `gpuDecodePageDataGeneric` for the next step of adding list support. They can be removed if people don't like them.

Authors:
  - https://github.com/nvdbaranec

Approvers:
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: https://github.com/rapidsai/cudf/pull/15911
---
 .../io/parquet/parquet_reader_input.cpp       |  50 +-
 cpp/src/io/parquet/decode_fixed.cu            | 896 ++++++++++--------
 cpp/src/io/parquet/page_hdr.cu                |  16 +-
 cpp/src/io/parquet/parquet_gpu.hpp            |  46 +-
 cpp/src/io/parquet/reader_impl.cpp            |  57 +-
 cpp/tests/io/parquet_writer_test.cpp          |  97 +-
 6 files changed, 703 insertions(+), 459 deletions(-)

diff --git a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp
index 019e0f30fe9..7563c823454 100644
--- a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp
+++ b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -59,20 +59,18 @@ void parquet_read_common(cudf::size_type num_rows_to_read,
 }
 
 template <data_type DataType>
-void BM_parquet_read_data(nvbench::state& state, nvbench::type_list<nvbench::enum_type<DataType>>)
+void BM_parquet_read_data_common(nvbench::state& state,
+                                 data_profile const& profile,
+                                 nvbench::type_list<nvbench::enum_type<DataType>>)
 {
   auto const d_type      = get_type_or_group(static_cast<int32_t>(DataType));
-  auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
-  auto const run_length  = static_cast<cudf::size_type>(state.get_int64("run_length"));
   auto const source_type = retrieve_io_type_enum(state.get_string("io_type"));
   auto const compression = cudf::io::compression_type::SNAPPY;
   cuio_source_sink_pair source_sink(source_type);
 
   auto const num_rows_written = [&]() {
-    auto const tbl = create_random_table(
-      cycle_dtypes(d_type, num_cols),
-      table_size_bytes{data_size},
-      data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
+    auto const tbl =
+      create_random_table(cycle_dtypes(d_type, num_cols), table_size_bytes{data_size}, profile);
     auto const view = tbl->view();
 
     cudf::io::parquet_writer_options write_opts =
@@ -85,6 +83,32 @@ void BM_parquet_read_data(nvbench::state& state, nvbench::type_list<nvbench::enu
   parquet_read_common(num_rows_written, num_cols, source_sink, state);
 }
 
+template <data_type DataType>
+void BM_parquet_read_data(nvbench::state& state,
+                          nvbench::type_list<nvbench::enum_type<DataType>> type_list)
+{
+  auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
+  auto const run_length  = static_cast<cudf::size_type>(state.get_int64("run_length"));
+  BM_parquet_read_data_common<DataType>(
+    state, data_profile_builder().cardinality(cardinality).avg_run_length(run_length), type_list);
+}
+
+template <data_type DataType>
+void BM_parquet_read_fixed_width_struct(nvbench::state& state,
+                                        nvbench::type_list<nvbench::enum_type<DataType>> type_list)
+{
+  auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
+  auto const run_length  = static_cast<cudf::size_type>(state.get_int64("run_length"));
+  std::vector<cudf::type_id> s_types{
+    cudf::type_id::INT32, cudf::type_id::FLOAT32, cudf::type_id::INT64};
+  BM_parquet_read_data_common<DataType>(state,
+                                        data_profile_builder()
+                                          .cardinality(cardinality)
+                                          .avg_run_length(run_length)
+                                          .struct_types(s_types),
+                                        type_list);
+}
+
 void BM_parquet_read_io_compression(nvbench::state& state)
 {
   auto const d_type = get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL),
@@ -247,3 +271,13 @@ NVBENCH_BENCH(BM_parquet_read_io_small_mixed)
   .add_int64_axis("cardinality", {0, 1000})
   .add_int64_axis("run_length", {1, 32})
   .add_int64_axis("num_string_cols", {1, 2, 3});
+
+// a benchmark for structs that only contain fixed-width types
+using d_type_list_struct_only = nvbench::enum_type_list<data_type::STRUCT>;
+NVBENCH_BENCH_TYPES(BM_parquet_read_fixed_width_struct, NVBENCH_TYPE_AXES(d_type_list_struct_only))
+  .set_name("parquet_read_fixed_width_struct")
+  .set_type_axes_names({"data_type"})
+  .add_string_axis("io_type", {"DEVICE_BUFFER"})
+  .set_min_samples(4)
+  .add_int64_axis("cardinality", {0, 1000})
+  .add_int64_axis("run_length", {1, 32});
diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu
index bfd89200786..ea80ae73c2f 100644
--- a/cpp/src/io/parquet/decode_fixed.cu
+++ b/cpp/src/io/parquet/decode_fixed.cu
@@ -24,136 +24,11 @@ namespace cudf::io::parquet::detail {
 
 namespace {
 
-constexpr int decode_block_size = 128;
-constexpr int rolling_buf_size  = decode_block_size * 2;
-// the required number of runs in shared memory we will need to provide the
-// rle_stream object
-constexpr int rle_run_buffer_size = rle_stream_required_run_buffer_size<decode_block_size>();
-
-template <bool nullable, typename level_t, typename state_buf>
-static __device__ int gpuUpdateValidityOffsetsAndRowIndicesFlat(
-  int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, int t)
-{
-  constexpr int num_warps      = decode_block_size / cudf::detail::warp_size;
-  constexpr int max_batch_size = num_warps * cudf::detail::warp_size;
-
-  auto& ni = s->nesting_info[0];
-
-  // how many (input) values we've processed in the page so far
-  int value_count = s->input_value_count;
-  int valid_count = ni.valid_count;
-
-  // cap by last row so that we don't process any rows past what we want to output.
-  int const first_row                 = s->first_row;
-  int const last_row                  = first_row + s->num_rows;
-  int const capped_target_value_count = min(target_value_count, last_row);
-
-  int const valid_map_offset      = ni.valid_map_offset;
-  int const row_index_lower_bound = s->row_index_lower_bound;
-
-  __syncthreads();
-
-  while (value_count < capped_target_value_count) {
-    int const batch_size = min(max_batch_size, capped_target_value_count - value_count);
-
-    // definition level. only need to process for nullable columns
-    int d = 0;
-    if constexpr (nullable) {
-      d = t < batch_size
-            ? static_cast<int>(def[rolling_index<state_buf::nz_buf_size>(value_count + t)])
-            : -1;
-    }
-
-    int const thread_value_count = t + 1;
-    int const block_value_count  = batch_size;
-
-    // compute our row index, whether we're in row bounds, and validity
-    int const row_index     = (thread_value_count + value_count) - 1;
-    int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row);
-    int is_valid;
-    if constexpr (nullable) {
-      is_valid = ((d > 0) && in_row_bounds) ? 1 : 0;
-    } else {
-      is_valid = in_row_bounds;
-    }
-
-    // thread and block validity count
-    int thread_valid_count, block_valid_count;
-    if constexpr (nullable) {
-      using block_scan = cub::BlockScan<int, decode_block_size>;
-      __shared__ typename block_scan::TempStorage scan_storage;
-      block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count);
-      __syncthreads();
-
-      // validity is processed per-warp
-      //
-      // nested schemas always read and write to the same bounds (that is, read and write
-      // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading
-      // at the first value, even if that is before first_row, because we cannot trivially jump to
-      // the correct position to start reading. since we are about to write the validity vector
-      // here we need to adjust our computed mask to take into account the write row bounds.
-      int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row);
-      int const write_start = __ffs(in_write_row_bounds) - 1;  // first bit in the warp to store
-      int warp_null_count   = 0;
-      if (write_start >= 0) {
-        uint32_t const warp_validity_mask = ballot(is_valid);
-        // lane 0 from each warp writes out validity
-        if ((t % cudf::detail::warp_size) == 0) {
-          int const vindex = (value_count + thread_value_count) - 1;  // absolute input value index
-          int const bit_offset = (valid_map_offset + vindex + write_start) -
-                                 first_row;  // absolute bit offset into the output validity map
-          int const write_end =
-            cudf::detail::warp_size - __clz(in_write_row_bounds);  // last bit in the warp to store
-          int const bit_count = write_end - write_start;
-          warp_null_count     = bit_count - __popc(warp_validity_mask >> write_start);
-
-          store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count);
-        }
-      }
-
-      // sum null counts. we have to do it this way instead of just incrementing by (value_count -
-      // valid_count) because valid_count also includes rows that potentially start before our row
-      // bounds. if we could come up with a way to clean that up, we could remove this and just
-      // compute it directly at the end of the kernel.
-      size_type const block_null_count =
-        cudf::detail::single_lane_block_sum_reduce<decode_block_size, 0>(warp_null_count);
-      if (t == 0) { ni.null_count += block_null_count; }
-    }
-    // trivial for non-nullable columns
-    else {
-      thread_valid_count = thread_value_count;
-      block_valid_count  = block_value_count;
-    }
-
-    // output offset
-    if (is_valid) {
-      int const dst_pos = (value_count + thread_value_count) - 1;
-      int const src_pos = (valid_count + thread_valid_count) - 1;
-      sb->nz_idx[rolling_index<state_buf::nz_buf_size>(src_pos)] = dst_pos;
-    }
-
-    // update stuff
-    value_count += block_value_count;
-    valid_count += block_valid_count;
-  }
-
-  if (t == 0) {
-    // update valid value count for decoding and total # of values we've processed
-    ni.valid_count       = valid_count;
-    ni.value_count       = value_count;
-    s->nz_count          = valid_count;
-    s->input_value_count = value_count;
-    s->input_row_count   = value_count;
-  }
-
-  return valid_count;
-}
-
-template <typename state_buf>
-__device__ inline void gpuDecodeValues(
+template <int block_size, typename state_buf>
+__device__ inline void gpuDecodeFixedWidthValues(
   page_state_s* s, state_buf* const sb, int start, int end, int t)
 {
-  constexpr int num_warps      = decode_block_size / cudf::detail::warp_size;
+  constexpr int num_warps      = block_size / cudf::detail::warp_size;
   constexpr int max_batch_size = num_warps * cudf::detail::warp_size;
 
   PageNestingDecodeInfo* nesting_info_base = s->nesting_info;
@@ -217,18 +92,22 @@ __device__ inline void gpuDecodeValues(
   }
 }
 
-template <typename state_buf>
-__device__ inline void gpuDecodeSplitValues(page_state_s* s,
-                                            state_buf* const sb,
-                                            int start,
-                                            int end)
+template <int block_size, typename state_buf>
+struct decode_fixed_width_values_func {
+  __device__ inline void operator()(page_state_s* s, state_buf* const sb, int start, int end, int t)
+  {
+    gpuDecodeFixedWidthValues<block_size, state_buf>(s, sb, start, end, t);
+  }
+};
+
+template <int block_size, typename state_buf>
+__device__ inline void gpuDecodeFixedWidthSplitValues(
+  page_state_s* s, state_buf* const sb, int start, int end, int t)
 {
   using cudf::detail::warp_size;
-  constexpr int num_warps      = decode_block_size / warp_size;
+  constexpr int num_warps      = block_size / warp_size;
   constexpr int max_batch_size = num_warps * warp_size;
 
-  auto const t = threadIdx.x;
-
   PageNestingDecodeInfo* nesting_info_base = s->nesting_info;
   int const dtype                          = s->col.physical_type;
   auto const data_len                      = thrust::distance(s->data_start, s->data_end);
@@ -307,266 +186,293 @@ __device__ inline void gpuDecodeSplitValues(page_state_s* s,
   }
 }
 
-// is the page marked nullable or not
-__device__ inline bool is_nullable(page_state_s* s)
-{
-  auto const lvl           = level_type::DEFINITION;
-  auto const max_def_level = s->col.max_level[lvl];
-  return max_def_level > 0;
-}
+template <int block_size, typename state_buf>
+struct decode_fixed_width_split_values_func {
+  __device__ inline void operator()(page_state_s* s, state_buf* const sb, int start, int end, int t)
+  {
+    gpuDecodeFixedWidthSplitValues<block_size, state_buf>(s, sb, start, end, t);
+  }
+};
 
-// for a nullable page, check to see if it could have nulls
-__device__ inline bool has_nulls(page_state_s* s)
+template <int decode_block_size, bool nullable, typename level_t, typename state_buf>
+static __device__ int gpuUpdateValidityAndRowIndicesNested(
+  int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, int t)
 {
-  auto const lvl      = level_type::DEFINITION;
-  auto const init_run = s->initial_rle_run[lvl];
-  // literal runs, lets assume they could hold nulls
-  if (is_literal_run(init_run)) { return true; }
-
-  // repeated run with number of items in the run not equal
-  // to the rows in the page, assume that means we could have nulls
-  if (s->page.num_input_values != (init_run >> 1)) { return true; }
-
-  auto const lvl_bits = s->col.level_bits[lvl];
-  auto const run_val  = lvl_bits == 0 ? 0 : s->initial_rle_value[lvl];
-
-  // the encoded repeated value isn't valid, we have (all) nulls
-  return run_val != s->col.max_level[lvl];
-}
+  constexpr int num_warps      = decode_block_size / cudf::detail::warp_size;
+  constexpr int max_batch_size = num_warps * cudf::detail::warp_size;
 
-/**
- * @brief Kernel for computing fixed width non dictionary column data stored in the pages
- *
- * This function will write the page data and the page data's validity to the
- * output specified in the page's column chunk. If necessary, additional
- * conversion will be performed to translate from the Parquet datatype to
- * desired output datatype.
- *
- * @param pages List of pages
- * @param chunks List of column chunks
- * @param min_row Row index to start reading at
- * @param num_rows Maximum number of rows to read
- * @param error_code Error code to set if an error is encountered
- */
-template <typename level_t>
-CUDF_KERNEL void __launch_bounds__(decode_block_size)
-  gpuDecodePageDataFixed(PageInfo* pages,
-                         device_span<ColumnChunkDesc const> chunks,
-                         size_t min_row,
-                         size_t num_rows,
-                         kernel_error::pointer error_code)
-{
-  __shared__ __align__(16) page_state_s state_g;
-  __shared__ __align__(16) page_state_buffers_s<rolling_buf_size,  // size of nz_idx buffer
-                                                1,                 // unused in this kernel
-                                                1>                 // unused in this kernel
-    state_buffers;
+  // how many (input) values we've processed in the page so far
+  int value_count = s->input_value_count;
 
-  page_state_s* const s = &state_g;
-  auto* const sb        = &state_buffers;
-  int const page_idx    = blockIdx.x;
-  int const t           = threadIdx.x;
-  PageInfo* pp          = &pages[page_idx];
+  // cap by last row so that we don't process any rows past what we want to output.
+  int const first_row                 = s->first_row;
+  int const last_row                  = first_row + s->num_rows;
+  int const capped_target_value_count = min(target_value_count, last_row);
 
-  if (!(BitAnd(pages[page_idx].kernel_mask, decode_kernel_mask::FIXED_WIDTH_NO_DICT))) { return; }
+  int const row_index_lower_bound = s->row_index_lower_bound;
 
-  // must come after the kernel mask check
-  [[maybe_unused]] null_count_back_copier _{s, t};
+  int const max_depth = s->col.max_nesting_depth - 1;
+  __syncthreads();
 
-  if (!setupLocalPageInfo(s,
-                          pp,
-                          chunks,
-                          min_row,
-                          num_rows,
-                          mask_filter{decode_kernel_mask::FIXED_WIDTH_NO_DICT},
-                          page_processing_stage::DECODE)) {
-    return;
-  }
+  while (value_count < capped_target_value_count) {
+    int const batch_size = min(max_batch_size, capped_target_value_count - value_count);
 
-  // the level stream decoders
-  __shared__ rle_run<level_t> def_runs[rle_run_buffer_size];
-  rle_stream<level_t, decode_block_size, rolling_buf_size> def_decoder{def_runs};
+    // definition level. only need to process for nullable columns
+    int d = 0;
+    if constexpr (nullable) {
+      if (def) {
+        d = t < batch_size
+              ? static_cast<int>(def[rolling_index<state_buf::nz_buf_size>(value_count + t)])
+              : -1;
+      } else {
+        d = t < batch_size ? 1 : -1;
+      }
+    }
 
-  // if we have no work to do (eg, in a skip_rows/num_rows case) in this page.
-  if (s->num_rows == 0) { return; }
+    int const thread_value_count = t + 1;
+    int const block_value_count  = batch_size;
 
-  bool const nullable            = is_nullable(s);
-  bool const nullable_with_nulls = nullable && has_nulls(s);
+    // compute our row index, whether we're in row bounds, and validity
+    int const row_index           = (thread_value_count + value_count) - 1;
+    int const in_row_bounds       = (row_index >= row_index_lower_bound) && (row_index < last_row);
+    int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row);
+    int const write_start = __ffs(in_write_row_bounds) - 1;  // first bit in the warp to store
+
+    // iterate by depth
+    for (int d_idx = 0; d_idx <= max_depth; d_idx++) {
+      auto& ni = s->nesting_info[d_idx];
+
+      int is_valid;
+      if constexpr (nullable) {
+        is_valid = ((d >= ni.max_def_level) && in_row_bounds) ? 1 : 0;
+      } else {
+        is_valid = in_row_bounds;
+      }
 
-  // initialize the stream decoders (requires values computed in setupLocalPageInfo)
-  level_t* const def = reinterpret_cast<level_t*>(pp->lvl_decode_buf[level_type::DEFINITION]);
-  if (nullable_with_nulls) {
-    def_decoder.init(s->col.level_bits[level_type::DEFINITION],
-                     s->abs_lvl_start[level_type::DEFINITION],
-                     s->abs_lvl_end[level_type::DEFINITION],
-                     def,
-                     s->page.num_input_values);
-  }
-  __syncthreads();
+      // thread and block validity count
+      int thread_valid_count, block_valid_count;
+      if constexpr (nullable) {
+        using block_scan = cub::BlockScan<int, decode_block_size>;
+        __shared__ typename block_scan::TempStorage scan_storage;
+        block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count);
+        __syncthreads();
+
+        // validity is processed per-warp
+        //
+        // nested schemas always read and write to the same bounds (that is, read and write
+        // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading
+        // at the first value, even if that is before first_row, because we cannot trivially jump to
+        // the correct position to start reading. since we are about to write the validity vector
+        // here we need to adjust our computed mask to take into account the write row bounds.
+        int warp_null_count = 0;
+        if (write_start >= 0 && ni.valid_map != nullptr) {
+          int const valid_map_offset        = ni.valid_map_offset;
+          uint32_t const warp_validity_mask = ballot(is_valid);
+          // lane 0 from each warp writes out validity
+          if ((t % cudf::detail::warp_size) == 0) {
+            int const vindex =
+              (value_count + thread_value_count) - 1;  // absolute input value index
+            int const bit_offset = (valid_map_offset + vindex + write_start) -
+                                   first_row;  // absolute bit offset into the output validity map
+            int const write_end = cudf::detail::warp_size -
+                                  __clz(in_write_row_bounds);  // last bit in the warp to store
+            int const bit_count = write_end - write_start;
+            warp_null_count     = bit_count - __popc(warp_validity_mask >> write_start);
+
+            store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count);
+          }
+        }
 
-  // We use two counters in the loop below: processed_count and valid_count.
-  // - processed_count: number of rows out of num_input_values that we have decoded so far.
-  //   the definition stream returns the number of total rows it has processed in each call
-  //   to decode_next and we accumulate in process_count.
-  // - valid_count: number of non-null rows we have decoded so far. In each iteration of the
-  //   loop below, we look at the number of valid items (which could be all for non-nullable),
-  //   and valid_count is that running count.
-  int processed_count = 0;
-  int valid_count     = 0;
-  // the core loop. decode batches of level stream data using rle_stream objects
-  // and pass the results to gpuDecodeValues
-  while (s->error == 0 && processed_count < s->page.num_input_values) {
-    int next_valid_count;
+        // sum null counts. we have to do it this way instead of just incrementing by (value_count -
+        // valid_count) because valid_count also includes rows that potentially start before our row
+        // bounds. if we could come up with a way to clean that up, we could remove this and just
+        // compute it directly at the end of the kernel.
+        size_type const block_null_count =
+          cudf::detail::single_lane_block_sum_reduce<decode_block_size, 0>(warp_null_count);
+        if (t == 0) { ni.null_count += block_null_count; }
+      }
+      // trivial for non-nullable columns
+      else {
+        thread_valid_count = thread_value_count;
+        block_valid_count  = block_value_count;
+      }
 
-    // only need to process definition levels if the column has nulls
-    if (nullable_with_nulls) {
-      processed_count += def_decoder.decode_next(t);
-      __syncthreads();
+      // if this is valid and we're at the leaf, output dst_pos
+      __syncthreads();  // handle modification of ni.value_count from below
+      if (is_valid && d_idx == max_depth) {
+        // for non-list types, the value count is always the same across
+        int const dst_pos = (value_count + thread_value_count) - 1;
+        int const src_pos = (ni.valid_count + thread_valid_count) - 1;
+        sb->nz_idx[rolling_index<state_buf::nz_buf_size>(src_pos)] = dst_pos;
+      }
+      __syncthreads();  // handle modification of ni.value_count from below
 
-      next_valid_count =
-        gpuUpdateValidityOffsetsAndRowIndicesFlat<true, level_t>(processed_count, s, sb, def, t);
+      // update stuff
+      if (t == 0) { ni.valid_count += block_valid_count; }
     }
-    // if we wanted to split off the skip_rows/num_rows case into a separate kernel, we could skip
-    // this function call entirely since all it will ever generate is a mapping of (i -> i) for
-    // nz_idx.  gpuDecodeValues would be the only work that happens.
-    else {
-      processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count);
-      next_valid_count = gpuUpdateValidityOffsetsAndRowIndicesFlat<false, level_t>(
-        processed_count, s, sb, nullptr, t);
-    }
-    __syncthreads();
 
-    // decode the values themselves
-    gpuDecodeValues(s, sb, valid_count, next_valid_count, t);
-    __syncthreads();
+    value_count += block_value_count;
+  }
 
-    valid_count = next_valid_count;
+  if (t == 0) {
+    // update valid value count for decoding and total # of values we've processed
+    s->nz_count          = s->nesting_info[max_depth].valid_count;
+    s->input_value_count = value_count;
+    s->input_row_count   = value_count;
   }
-  if (t == 0 and s->error != 0) { set_error(s->error, error_code); }
+
+  __syncthreads();
+  return s->nesting_info[max_depth].valid_count;
 }
 
-/**
- * @brief Kernel for computing fixed width dictionary column data stored in the pages
- *
- * This function will write the page data and the page data's validity to the
- * output specified in the page's column chunk. If necessary, additional
- * conversion will be performed to translate from the Parquet datatype to
- * desired output datatype.
- *
- * @param pages List of pages
- * @param chunks List of column chunks
- * @param min_row Row index to start reading at
- * @param num_rows Maximum number of rows to read
- * @param error_code Error code to set if an error is encountered
- */
-template <typename level_t>
-CUDF_KERNEL void __launch_bounds__(decode_block_size)
-  gpuDecodePageDataFixedDict(PageInfo* pages,
-                             device_span<ColumnChunkDesc const> chunks,
-                             size_t min_row,
-                             size_t num_rows,
-                             kernel_error::pointer error_code)
+template <int decode_block_size, bool nullable, typename level_t, typename state_buf>
+static __device__ int gpuUpdateValidityAndRowIndicesFlat(
+  int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, int t)
 {
-  __shared__ __align__(16) page_state_s state_g;
-  __shared__ __align__(16) page_state_buffers_s<rolling_buf_size,  // size of nz_idx buffer
-                                                rolling_buf_size,  // dictionary
-                                                1>                 // unused in this kernel
-    state_buffers;
-
-  page_state_s* const s = &state_g;
-  auto* const sb        = &state_buffers;
-  int const page_idx    = blockIdx.x;
-  int const t           = threadIdx.x;
-  PageInfo* pp          = &pages[page_idx];
+  constexpr int num_warps      = decode_block_size / cudf::detail::warp_size;
+  constexpr int max_batch_size = num_warps * cudf::detail::warp_size;
 
-  if (!(BitAnd(pages[page_idx].kernel_mask, decode_kernel_mask::FIXED_WIDTH_DICT))) { return; }
+  auto& ni = s->nesting_info[0];
 
-  // must come after the kernel mask check
-  [[maybe_unused]] null_count_back_copier _{s, t};
+  // how many (input) values we've processed in the page so far
+  int value_count = s->input_value_count;
+  int valid_count = ni.valid_count;
 
-  if (!setupLocalPageInfo(s,
-                          pp,
-                          chunks,
-                          min_row,
-                          num_rows,
-                          mask_filter{decode_kernel_mask::FIXED_WIDTH_DICT},
-                          page_processing_stage::DECODE)) {
-    return;
-  }
+  // cap by last row so that we don't process any rows past what we want to output.
+  int const first_row                 = s->first_row;
+  int const last_row                  = first_row + s->num_rows;
+  int const capped_target_value_count = min(target_value_count, last_row);
 
-  __shared__ rle_run<level_t> def_runs[rle_run_buffer_size];
-  rle_stream<level_t, decode_block_size, rolling_buf_size> def_decoder{def_runs};
+  int const valid_map_offset      = ni.valid_map_offset;
+  int const row_index_lower_bound = s->row_index_lower_bound;
 
-  __shared__ rle_run<uint32_t> dict_runs[rle_run_buffer_size];
-  rle_stream<uint32_t, decode_block_size, rolling_buf_size> dict_stream{dict_runs};
+  __syncthreads();
 
-  // if we have no work to do (eg, in a skip_rows/num_rows case) in this page.
-  if (s->num_rows == 0) { return; }
+  while (value_count < capped_target_value_count) {
+    int const batch_size = min(max_batch_size, capped_target_value_count - value_count);
 
-  bool const nullable            = is_nullable(s);
-  bool const nullable_with_nulls = nullable && has_nulls(s);
+    // definition level. only need to process for nullable columns
+    int d = 0;
+    if constexpr (nullable) {
+      if (def) {
+        d = t < batch_size
+              ? static_cast<int>(def[rolling_index<state_buf::nz_buf_size>(value_count + t)])
+              : -1;
+      } else {
+        d = t < batch_size ? 1 : -1;
+      }
+    }
 
-  // initialize the stream decoders (requires values computed in setupLocalPageInfo)
-  level_t* const def = reinterpret_cast<level_t*>(pp->lvl_decode_buf[level_type::DEFINITION]);
-  if (nullable_with_nulls) {
-    def_decoder.init(s->col.level_bits[level_type::DEFINITION],
-                     s->abs_lvl_start[level_type::DEFINITION],
-                     s->abs_lvl_end[level_type::DEFINITION],
-                     def,
-                     s->page.num_input_values);
-  }
+    int const thread_value_count = t + 1;
+    int const block_value_count  = batch_size;
 
-  dict_stream.init(
-    s->dict_bits, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values);
-  __syncthreads();
+    // compute our row index, whether we're in row bounds, and validity
+    int const row_index     = (thread_value_count + value_count) - 1;
+    int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row);
+    int is_valid;
+    if constexpr (nullable) {
+      is_valid = ((d > 0) && in_row_bounds) ? 1 : 0;
+    } else {
+      is_valid = in_row_bounds;
+    }
 
-  // We use two counters in the loop below: processed_count and valid_count.
-  // - processed_count: number of rows out of num_input_values that we have decoded so far.
-  //   the definition stream returns the number of total rows it has processed in each call
-  //   to decode_next and we accumulate in process_count.
-  // - valid_count: number of non-null rows we have decoded so far. In each iteration of the
-  //   loop below, we look at the number of valid items (which could be all for non-nullable),
-  //   and valid_count is that running count.
-  int processed_count = 0;
-  int valid_count     = 0;
+    // thread and block validity count
+    int thread_valid_count, block_valid_count;
+    if constexpr (nullable) {
+      using block_scan = cub::BlockScan<int, decode_block_size>;
+      __shared__ typename block_scan::TempStorage scan_storage;
+      block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count);
+      __syncthreads();
 
-  // the core loop. decode batches of level stream data using rle_stream objects
-  // and pass the results to gpuDecodeValues
-  while (s->error == 0 && processed_count < s->page.num_input_values) {
-    int next_valid_count;
+      // validity is processed per-warp
+      //
+      // nested schemas always read and write to the same bounds (that is, read and write
+      // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading
+      // at the first value, even if that is before first_row, because we cannot trivially jump to
+      // the correct position to start reading. since we are about to write the validity vector
+      // here we need to adjust our computed mask to take into account the write row bounds.
+      int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row);
+      int const write_start = __ffs(in_write_row_bounds) - 1;  // first bit in the warp to store
+      int warp_null_count   = 0;
+      if (write_start >= 0) {
+        uint32_t const warp_validity_mask = ballot(is_valid);
+        // lane 0 from each warp writes out validity
+        if ((t % cudf::detail::warp_size) == 0) {
+          int const vindex = (value_count + thread_value_count) - 1;  // absolute input value index
+          int const bit_offset = (valid_map_offset + vindex + write_start) -
+                                 first_row;  // absolute bit offset into the output validity map
+          int const write_end =
+            cudf::detail::warp_size - __clz(in_write_row_bounds);  // last bit in the warp to store
+          int const bit_count = write_end - write_start;
+          warp_null_count     = bit_count - __popc(warp_validity_mask >> write_start);
 
-    // only need to process definition levels if the column has nulls
-    if (nullable_with_nulls) {
-      processed_count += def_decoder.decode_next(t);
-      __syncthreads();
+          store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count);
+        }
+      }
 
-      // count of valid items in this batch
-      next_valid_count =
-        gpuUpdateValidityOffsetsAndRowIndicesFlat<true, level_t>(processed_count, s, sb, def, t);
+      // sum null counts. we have to do it this way instead of just incrementing by (value_count -
+      // valid_count) because valid_count also includes rows that potentially start before our row
+      // bounds. if we could come up with a way to clean that up, we could remove this and just
+      // compute it directly at the end of the kernel.
+      size_type const block_null_count =
+        cudf::detail::single_lane_block_sum_reduce<decode_block_size, 0>(warp_null_count);
+      if (t == 0) { ni.null_count += block_null_count; }
     }
-    // if we wanted to split off the skip_rows/num_rows case into a separate kernel, we could skip
-    // this function call entirely since all it will ever generate is a mapping of (i -> i) for
-    // nz_idx.  gpuDecodeValues would be the only work that happens.
+    // trivial for non-nullable columns
     else {
-      processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count);
-      next_valid_count = gpuUpdateValidityOffsetsAndRowIndicesFlat<false, level_t>(
-        processed_count, s, sb, nullptr, t);
+      thread_valid_count = thread_value_count;
+      block_valid_count  = block_value_count;
     }
-    __syncthreads();
 
-    // We want to limit the number of dictionary items we decode, that correspond to
-    // the rows we have processed in this iteration that are valid.
-    // We know the number of valid rows to process with: next_valid_count - valid_count.
-    dict_stream.decode_next(t, next_valid_count - valid_count);
-    __syncthreads();
+    // output offset
+    if (is_valid) {
+      int const dst_pos = (value_count + thread_value_count) - 1;
+      int const src_pos = (valid_count + thread_valid_count) - 1;
+      sb->nz_idx[rolling_index<state_buf::nz_buf_size>(src_pos)] = dst_pos;
+    }
 
-    // decode the values themselves
-    gpuDecodeValues(s, sb, valid_count, next_valid_count, t);
-    __syncthreads();
+    // update stuff
+    value_count += block_value_count;
+    valid_count += block_valid_count;
+  }
 
-    valid_count = next_valid_count;
+  if (t == 0) {
+    // update valid value count for decoding and total # of values we've processed
+    ni.valid_count       = valid_count;
+    ni.value_count       = value_count;  // TODO: remove? this is unused in the non-list path
+    s->nz_count          = valid_count;
+    s->input_value_count = value_count;
+    s->input_row_count   = value_count;
   }
-  if (t == 0 and s->error != 0) { set_error(s->error, error_code); }
+
+  return valid_count;
+}
+
+// is the page marked nullable or not
+__device__ inline bool is_nullable(page_state_s* s)
+{
+  auto const lvl           = level_type::DEFINITION;
+  auto const max_def_level = s->col.max_level[lvl];
+  return max_def_level > 0;
+}
+
+// for a nullable page, check to see if it could have nulls
+__device__ inline bool maybe_has_nulls(page_state_s* s)
+{
+  auto const lvl      = level_type::DEFINITION;
+  auto const init_run = s->initial_rle_run[lvl];
+  // literal runs, lets assume they could hold nulls
+  if (is_literal_run(init_run)) { return true; }
+
+  // repeated run with number of items in the run not equal
+  // to the rows in the page, assume that means we could have nulls
+  if (s->page.num_input_values != (init_run >> 1)) { return true; }
+
+  auto const lvl_bits = s->col.level_bits[lvl];
+  auto const run_val  = lvl_bits == 0 ? 0 : s->initial_rle_value[lvl];
+
+  // the encoded repeated value isn't valid, we have (all) nulls
+  return run_val != s->col.max_level[lvl];
 }
 
 /**
@@ -583,19 +489,28 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
  * @param num_rows Maximum number of rows to read
  * @param error_code Error code to set if an error is encountered
  */
-template <typename level_t>
-CUDF_KERNEL void __launch_bounds__(decode_block_size)
-  gpuDecodeSplitPageDataFlat(PageInfo* pages,
-                             device_span<ColumnChunkDesc const> chunks,
-                             size_t min_row,
-                             size_t num_rows,
-                             kernel_error::pointer error_code)
+template <typename level_t,
+          int decode_block_size_t,
+          decode_kernel_mask kernel_mask_t,
+          bool has_dict_t,
+          bool has_nesting_t,
+          template <int block_size, typename state_buf>
+          typename DecodeValuesFunc>
+CUDF_KERNEL void __launch_bounds__(decode_block_size_t)
+  gpuDecodePageDataGeneric(PageInfo* pages,
+                           device_span<ColumnChunkDesc const> chunks,
+                           size_t min_row,
+                           size_t num_rows,
+                           kernel_error::pointer error_code)
 {
+  constexpr int rolling_buf_size    = decode_block_size_t * 2;
+  constexpr int rle_run_buffer_size = rle_stream_required_run_buffer_size<decode_block_size_t>();
+
   __shared__ __align__(16) page_state_s state_g;
-  __shared__ __align__(16) page_state_buffers_s<rolling_buf_size,  // size of nz_idx buffer
-                                                1,                 // unused in this kernel
-                                                1>                 // unused in this kernel
-    state_buffers;
+  using state_buf_t = page_state_buffers_s<rolling_buf_size,  // size of nz_idx buffer
+                                           has_dict_t ? rolling_buf_size : 1,
+                                           1>;
+  __shared__ __align__(16) state_buf_t state_buffers;
 
   page_state_s* const s = &state_g;
   auto* const sb        = &state_buffers;
@@ -603,9 +518,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
   int const t           = threadIdx.x;
   PageInfo* pp          = &pages[page_idx];
 
-  if (!(BitAnd(pages[page_idx].kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FLAT))) {
-    return;
-  }
+  if (!(BitAnd(pages[page_idx].kernel_mask, kernel_mask_t))) { return; }
 
   // must come after the kernel mask check
   [[maybe_unused]] null_count_back_copier _{s, t};
@@ -615,30 +528,70 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
                           chunks,
                           min_row,
                           num_rows,
-                          mask_filter{decode_kernel_mask::BYTE_STREAM_SPLIT_FLAT},
+                          mask_filter{kernel_mask_t},
                           page_processing_stage::DECODE)) {
     return;
   }
 
-  // the level stream decoders
-  __shared__ rle_run<level_t> def_runs[rle_run_buffer_size];
-  rle_stream<level_t, decode_block_size, rolling_buf_size> def_decoder{def_runs};
-
   // if we have no work to do (eg, in a skip_rows/num_rows case) in this page.
   if (s->num_rows == 0) { return; }
 
-  bool const nullable            = is_nullable(s);
-  bool const nullable_with_nulls = nullable && has_nulls(s);
+  DecodeValuesFunc<decode_block_size_t, state_buf_t> decode_values;
+
+  bool const nullable             = is_nullable(s);
+  bool const should_process_nulls = nullable && maybe_has_nulls(s);
+
+  // shared buffer. all shared memory is suballocated out of here
+  // constexpr int shared_rep_size = has_lists_t ? cudf::util::round_up_unsafe(rle_run_buffer_size *
+  // sizeof(rle_run<level_t>), size_t{16}) : 0;
+  constexpr int shared_dict_size =
+    has_dict_t
+      ? cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run<uint32_t>), size_t{16})
+      : 0;
+  constexpr int shared_def_size =
+    cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run<level_t>), size_t{16});
+  constexpr int shared_buf_size = /*shared_rep_size +*/ shared_dict_size + shared_def_size;
+  __shared__ __align__(16) uint8_t shared_buf[shared_buf_size];
+
+  // setup all shared memory buffers
+  int shared_offset = 0;
+  /*
+  rle_run<level_t> *rep_runs = reinterpret_cast<rle_run<level_t>*>(shared_buf + shared_offset);
+  if constexpr (has_lists_t){
+    shared_offset += shared_rep_size;
+  }
+  */
+  rle_run<uint32_t>* dict_runs = reinterpret_cast<rle_run<uint32_t>*>(shared_buf + shared_offset);
+  if constexpr (has_dict_t) { shared_offset += shared_dict_size; }
+  rle_run<level_t>* def_runs = reinterpret_cast<rle_run<level_t>*>(shared_buf + shared_offset);
 
   // initialize the stream decoders (requires values computed in setupLocalPageInfo)
+  rle_stream<level_t, decode_block_size_t, rolling_buf_size> def_decoder{def_runs};
   level_t* const def = reinterpret_cast<level_t*>(pp->lvl_decode_buf[level_type::DEFINITION]);
-  if (nullable_with_nulls) {
+  if (should_process_nulls) {
     def_decoder.init(s->col.level_bits[level_type::DEFINITION],
                      s->abs_lvl_start[level_type::DEFINITION],
                      s->abs_lvl_end[level_type::DEFINITION],
                      def,
                      s->page.num_input_values);
   }
+  /*
+  rle_stream<level_t, decode_block_size_t, rolling_buf_size> rep_decoder{rep_runs};
+  level_t* const rep = reinterpret_cast<level_t*>(pp->lvl_decode_buf[level_type::REPETITION]);
+  if constexpr(has_lists_t){
+    rep_decoder.init(s->col.level_bits[level_type::REPETITION],
+                     s->abs_lvl_start[level_type::REPETITION],
+                     s->abs_lvl_end[level_type::REPETITION],
+                     rep,
+                     s->page.num_input_values);
+  }
+  */
+
+  rle_stream<uint32_t, decode_block_size_t, rolling_buf_size> dict_stream{dict_runs};
+  if constexpr (has_dict_t) {
+    dict_stream.init(
+      s->dict_bits, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values);
+  }
   __syncthreads();
 
   // We use two counters in the loop below: processed_count and valid_count.
@@ -655,26 +608,47 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
   while (s->error == 0 && processed_count < s->page.num_input_values) {
     int next_valid_count;
 
-    // only need to process definition levels if the column has nulls
-    if (nullable_with_nulls) {
+    // only need to process definition levels if this is a nullable column
+    if (should_process_nulls) {
       processed_count += def_decoder.decode_next(t);
       __syncthreads();
 
-      next_valid_count =
-        gpuUpdateValidityOffsetsAndRowIndicesFlat<true, level_t>(processed_count, s, sb, def, t);
+      if constexpr (has_nesting_t) {
+        next_valid_count = gpuUpdateValidityAndRowIndicesNested<decode_block_size_t, true, level_t>(
+          processed_count, s, sb, def, t);
+      } else {
+        next_valid_count = gpuUpdateValidityAndRowIndicesFlat<decode_block_size_t, true, level_t>(
+          processed_count, s, sb, def, t);
+      }
     }
     // if we wanted to split off the skip_rows/num_rows case into a separate kernel, we could skip
     // this function call entirely since all it will ever generate is a mapping of (i -> i) for
-    // nz_idx.  gpuDecodeValues would be the only work that happens.
+    // nz_idx.  gpuDecodeFixedWidthValues would be the only work that happens.
     else {
       processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count);
-      next_valid_count = gpuUpdateValidityOffsetsAndRowIndicesFlat<false, level_t>(
-        processed_count, s, sb, nullptr, t);
+
+      if constexpr (has_nesting_t) {
+        next_valid_count =
+          gpuUpdateValidityAndRowIndicesNested<decode_block_size_t, false, level_t>(
+            processed_count, s, sb, nullptr, t);
+      } else {
+        next_valid_count = gpuUpdateValidityAndRowIndicesFlat<decode_block_size_t, false, level_t>(
+          processed_count, s, sb, nullptr, t);
+      }
     }
     __syncthreads();
 
+    // if we have dictionary data
+    if constexpr (has_dict_t) {
+      // We want to limit the number of dictionary items we decode, that correspond to
+      // the rows we have processed in this iteration that are valid.
+      // We know the number of valid rows to process with: next_valid_count - valid_count.
+      dict_stream.decode_next(t, next_valid_count - valid_count);
+      __syncthreads();
+    }
+
     // decode the values themselves
-    gpuDecodeSplitValues(s, sb, valid_count, next_valid_count);
+    decode_values(s, sb, valid_count, next_valid_count, t);
     __syncthreads();
 
     valid_count = next_valid_count;
@@ -689,18 +663,55 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span<PageInfo> pages,
                                   size_t num_rows,
                                   size_t min_row,
                                   int level_type_size,
+                                  bool has_nesting,
                                   kernel_error::pointer error_code,
                                   rmm::cuda_stream_view stream)
 {
+  constexpr int decode_block_size = 128;
+
   dim3 dim_block(decode_block_size, 1);
   dim3 dim_grid(pages.size(), 1);  // 1 threadblock per page
 
   if (level_type_size == 1) {
-    gpuDecodePageDataFixed<uint8_t><<<dim_grid, dim_block, 0, stream.value()>>>(
-      pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    if (has_nesting) {
+      gpuDecodePageDataGeneric<uint8_t,
+                               decode_block_size,
+                               decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED,
+                               false,
+                               true,
+                               decode_fixed_width_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    } else {
+      gpuDecodePageDataGeneric<uint8_t,
+                               decode_block_size,
+                               decode_kernel_mask::FIXED_WIDTH_NO_DICT,
+                               false,
+                               false,
+                               decode_fixed_width_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    }
   } else {
-    gpuDecodePageDataFixed<uint16_t><<<dim_grid, dim_block, 0, stream.value()>>>(
-      pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    if (has_nesting) {
+      gpuDecodePageDataGeneric<uint16_t,
+                               decode_block_size,
+                               decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED,
+                               false,
+                               true,
+                               decode_fixed_width_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    } else {
+      gpuDecodePageDataGeneric<uint16_t,
+                               decode_block_size,
+                               decode_kernel_mask::FIXED_WIDTH_NO_DICT,
+                               false,
+                               false,
+                               decode_fixed_width_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    }
   }
 }
 
@@ -709,40 +720,113 @@ void __host__ DecodePageDataFixedDict(cudf::detail::hostdevice_span<PageInfo> pa
                                       size_t num_rows,
                                       size_t min_row,
                                       int level_type_size,
+                                      bool has_nesting,
                                       kernel_error::pointer error_code,
                                       rmm::cuda_stream_view stream)
 {
-  //  dim3 dim_block(decode_block_size, 1); // decode_block_size = 128 threads per block
-  // 1 full warp, and 1 warp of 1 thread
+  constexpr int decode_block_size = 128;
+
   dim3 dim_block(decode_block_size, 1);  // decode_block_size = 128 threads per block
   dim3 dim_grid(pages.size(), 1);        // 1 thread block per page => # blocks
 
   if (level_type_size == 1) {
-    gpuDecodePageDataFixedDict<uint8_t><<<dim_grid, dim_block, 0, stream.value()>>>(
-      pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    if (has_nesting) {
+      gpuDecodePageDataGeneric<uint8_t,
+                               decode_block_size,
+                               decode_kernel_mask::FIXED_WIDTH_DICT_NESTED,
+                               true,
+                               true,
+                               decode_fixed_width_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    } else {
+      gpuDecodePageDataGeneric<uint8_t,
+                               decode_block_size,
+                               decode_kernel_mask::FIXED_WIDTH_DICT,
+                               true,
+                               false,
+                               decode_fixed_width_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    }
   } else {
-    gpuDecodePageDataFixedDict<uint16_t><<<dim_grid, dim_block, 0, stream.value()>>>(
-      pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    if (has_nesting) {
+      gpuDecodePageDataGeneric<uint16_t,
+                               decode_block_size,
+                               decode_kernel_mask::FIXED_WIDTH_DICT_NESTED,
+                               true,
+                               true,
+                               decode_fixed_width_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    } else {
+      gpuDecodePageDataGeneric<uint16_t,
+                               decode_block_size,
+                               decode_kernel_mask::FIXED_WIDTH_DICT,
+                               true,
+                               false,
+                               decode_fixed_width_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    }
   }
 }
 
-void __host__ DecodeSplitPageDataFlat(cudf::detail::hostdevice_span<PageInfo> pages,
-                                      cudf::detail::hostdevice_span<ColumnChunkDesc const> chunks,
-                                      size_t num_rows,
-                                      size_t min_row,
-                                      int level_type_size,
-                                      kernel_error::pointer error_code,
-                                      rmm::cuda_stream_view stream)
+void __host__
+DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span<PageInfo> pages,
+                              cudf::detail::hostdevice_span<ColumnChunkDesc const> chunks,
+                              size_t num_rows,
+                              size_t min_row,
+                              int level_type_size,
+                              bool has_nesting,
+                              kernel_error::pointer error_code,
+                              rmm::cuda_stream_view stream)
 {
+  constexpr int decode_block_size = 128;
+
   dim3 dim_block(decode_block_size, 1);  // decode_block_size = 128 threads per block
   dim3 dim_grid(pages.size(), 1);        // 1 thread block per page => # blocks
 
   if (level_type_size == 1) {
-    gpuDecodeSplitPageDataFlat<uint8_t><<<dim_grid, dim_block, 0, stream.value()>>>(
-      pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    if (has_nesting) {
+      gpuDecodePageDataGeneric<uint8_t,
+                               decode_block_size,
+                               decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED,
+                               true,
+                               true,
+                               decode_fixed_width_split_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    } else {
+      gpuDecodePageDataGeneric<uint8_t,
+                               decode_block_size,
+                               decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT,
+                               true,
+                               false,
+                               decode_fixed_width_split_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    }
   } else {
-    gpuDecodeSplitPageDataFlat<uint16_t><<<dim_grid, dim_block, 0, stream.value()>>>(
-      pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    if (has_nesting) {
+      gpuDecodePageDataGeneric<uint16_t,
+                               decode_block_size,
+                               decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED,
+                               true,
+                               true,
+                               decode_fixed_width_split_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    } else {
+      gpuDecodePageDataGeneric<uint16_t,
+                               decode_block_size,
+                               decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT,
+                               true,
+                               false,
+                               decode_fixed_width_split_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    }
   }
 }
 
diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu
index cf0dd85e490..d604642be54 100644
--- a/cpp/src/io/parquet/page_hdr.cu
+++ b/cpp/src/io/parquet/page_hdr.cu
@@ -145,6 +145,11 @@ __device__ inline bool is_nested(ColumnChunkDesc const& chunk)
   return chunk.max_nesting_depth > 1;
 }
 
+__device__ inline bool is_list(ColumnChunkDesc const& chunk)
+{
+  return chunk.max_level[level_type::REPETITION] > 0;
+}
+
 __device__ inline bool is_byte_array(ColumnChunkDesc const& chunk)
 {
   return chunk.physical_type == BYTE_ARRAY;
@@ -178,14 +183,17 @@ __device__ decode_kernel_mask kernel_mask_for_page(PageInfo const& page,
     return decode_kernel_mask::STRING;
   }
 
-  if (!is_nested(chunk) && !is_byte_array(chunk) && !is_boolean(chunk)) {
+  if (!is_list(chunk) && !is_byte_array(chunk) && !is_boolean(chunk)) {
     if (page.encoding == Encoding::PLAIN) {
-      return decode_kernel_mask::FIXED_WIDTH_NO_DICT;
+      return is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED
+                              : decode_kernel_mask::FIXED_WIDTH_NO_DICT;
     } else if (page.encoding == Encoding::PLAIN_DICTIONARY ||
                page.encoding == Encoding::RLE_DICTIONARY) {
-      return decode_kernel_mask::FIXED_WIDTH_DICT;
+      return is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_DICT_NESTED
+                              : decode_kernel_mask::FIXED_WIDTH_DICT;
     } else if (page.encoding == Encoding::BYTE_STREAM_SPLIT) {
-      return decode_kernel_mask::BYTE_STREAM_SPLIT_FLAT;
+      return is_nested(chunk) ? decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED
+                              : decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT;
     }
   }
 
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index d82c6f0de59..efc1f5ebab1 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -207,16 +207,20 @@ enum level_type {
  * Used to control which decode kernels to run.
  */
 enum class decode_kernel_mask {
-  NONE                   = 0,
-  GENERAL                = (1 << 0),  // Run catch-all decode kernel
-  STRING                 = (1 << 1),  // Run decode kernel for string data
-  DELTA_BINARY           = (1 << 2),  // Run decode kernel for DELTA_BINARY_PACKED data
-  DELTA_BYTE_ARRAY       = (1 << 3),  // Run decode kernel for DELTA_BYTE_ARRAY encoded data
-  DELTA_LENGTH_BA        = (1 << 4),  // Run decode kernel for DELTA_LENGTH_BYTE_ARRAY encoded data
-  FIXED_WIDTH_NO_DICT    = (1 << 5),  // Run decode kernel for fixed width non-dictionary pages
-  FIXED_WIDTH_DICT       = (1 << 6),  // Run decode kernel for fixed width dictionary pages
-  BYTE_STREAM_SPLIT      = (1 << 7),  // Run decode kernel for BYTE_STREAM_SPLIT encoded data
-  BYTE_STREAM_SPLIT_FLAT = (1 << 8),  // Same as above but with a flat schema
+  NONE                = 0,
+  GENERAL             = (1 << 0),  // Run catch-all decode kernel
+  STRING              = (1 << 1),  // Run decode kernel for string data
+  DELTA_BINARY        = (1 << 2),  // Run decode kernel for DELTA_BINARY_PACKED data
+  DELTA_BYTE_ARRAY    = (1 << 3),  // Run decode kernel for DELTA_BYTE_ARRAY encoded data
+  DELTA_LENGTH_BA     = (1 << 4),  // Run decode kernel for DELTA_LENGTH_BYTE_ARRAY encoded data
+  FIXED_WIDTH_NO_DICT = (1 << 5),  // Run decode kernel for fixed width non-dictionary pages
+  FIXED_WIDTH_DICT    = (1 << 6),  // Run decode kernel for fixed width dictionary pages
+  BYTE_STREAM_SPLIT   = (1 << 7),  // Run decode kernel for BYTE_STREAM_SPLIT encoded data
+  BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT = (1 << 8),  // Same as above but for flat, fixed-width data
+  BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED =
+    (1 << 9),                              // Same as above but for nested, fixed-width data
+  FIXED_WIDTH_NO_DICT_NESTED = (1 << 10),  // Run decode kernel for fixed width non-dictionary pages
+  FIXED_WIDTH_DICT_NESTED    = (1 << 11),  // Run decode kernel for fixed width dictionary pages
 };
 
 // mask representing all the ways in which a string can be encoded
@@ -888,6 +892,7 @@ void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span<PageInfo> pages,
  * @param[in] num_rows Total number of rows to read
  * @param[in] min_row Minimum number of rows to read
  * @param[in] level_type_size Size in bytes of the type for level decoding
+ * @param[in] has_nesting Whether or not the data contains nested (but not list) data.
  * @param[out] error_code Error code for kernel failures
  * @param[in] stream CUDA stream to use
  */
@@ -896,6 +901,7 @@ void DecodePageDataFixed(cudf::detail::hostdevice_span<PageInfo> pages,
                          std::size_t num_rows,
                          size_t min_row,
                          int level_type_size,
+                         bool has_nesting,
                          kernel_error::pointer error_code,
                          rmm::cuda_stream_view stream);
 
@@ -910,6 +916,7 @@ void DecodePageDataFixed(cudf::detail::hostdevice_span<PageInfo> pages,
  * @param[in] num_rows Total number of rows to read
  * @param[in] min_row Minimum number of rows to read
  * @param[in] level_type_size Size in bytes of the type for level decoding
+ * @param[in] has_nesting Whether or not the data contains nested (but not list) data.
  * @param[out] error_code Error code for kernel failures
  * @param[in] stream CUDA stream to use
  */
@@ -918,11 +925,12 @@ void DecodePageDataFixedDict(cudf::detail::hostdevice_span<PageInfo> pages,
                              std::size_t num_rows,
                              size_t min_row,
                              int level_type_size,
+                             bool has_nesting,
                              kernel_error::pointer error_code,
                              rmm::cuda_stream_view stream);
 
 /**
- * @brief Launches kernel for reading dictionary fixed width column data stored in the pages
+ * @brief Launches kernel for reading fixed width column data stored in the pages
  *
  * The page data will be written to the output pointed to in the page's
  * associated column chunk.
@@ -932,16 +940,18 @@ void DecodePageDataFixedDict(cudf::detail::hostdevice_span<PageInfo> pages,
  * @param[in] num_rows Total number of rows to read
  * @param[in] min_row Minimum number of rows to read
  * @param[in] level_type_size Size in bytes of the type for level decoding
+ * @param[in] has_nesting Whether or not the data contains nested (but not list) data.
  * @param[out] error_code Error code for kernel failures
  * @param[in] stream CUDA stream to use
  */
-void DecodeSplitPageDataFlat(cudf::detail::hostdevice_span<PageInfo> pages,
-                             cudf::detail::hostdevice_span<ColumnChunkDesc const> chunks,
-                             std::size_t num_rows,
-                             size_t min_row,
-                             int level_type_size,
-                             kernel_error::pointer error_code,
-                             rmm::cuda_stream_view stream);
+void DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span<PageInfo> pages,
+                                   cudf::detail::hostdevice_span<ColumnChunkDesc const> chunks,
+                                   std::size_t num_rows,
+                                   size_t min_row,
+                                   int level_type_size,
+                                   bool has_nesting,
+                                   kernel_error::pointer error_code,
+                                   rmm::cuda_stream_view stream);
 
 /**
  * @brief Launches kernel for initializing encoder row group fragments
diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp
index 1bd2fae281c..f705f6626e7 100644
--- a/cpp/src/io/parquet/reader_impl.cpp
+++ b/cpp/src/io/parquet/reader_impl.cpp
@@ -267,14 +267,27 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
   }
 
   // launch byte stream split decoder
-  if (BitAnd(kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FLAT) != 0) {
-    DecodeSplitPageDataFlat(subpass.pages,
-                            pass.chunks,
-                            num_rows,
-                            skip_rows,
-                            level_type_size,
-                            error_code.data(),
-                            streams[s_idx++]);
+  if (BitAnd(kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT) != 0) {
+    DecodeSplitPageFixedWidthData(subpass.pages,
+                                  pass.chunks,
+                                  num_rows,
+                                  skip_rows,
+                                  level_type_size,
+                                  false,
+                                  error_code.data(),
+                                  streams[s_idx++]);
+  }
+
+  // launch byte stream split decoder, for nested columns
+  if (BitAnd(kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED) != 0) {
+    DecodeSplitPageFixedWidthData(subpass.pages,
+                                  pass.chunks,
+                                  num_rows,
+                                  skip_rows,
+                                  level_type_size,
+                                  true,
+                                  error_code.data(),
+                                  streams[s_idx++]);
   }
 
   // launch byte stream split decoder
@@ -288,22 +301,50 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
                         streams[s_idx++]);
   }
 
+  // launch fixed width type decoder
   if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_NO_DICT) != 0) {
     DecodePageDataFixed(subpass.pages,
                         pass.chunks,
                         num_rows,
                         skip_rows,
                         level_type_size,
+                        false,
+                        error_code.data(),
+                        streams[s_idx++]);
+  }
+
+  // launch fixed width type decoder, for nested columns
+  if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED) != 0) {
+    DecodePageDataFixed(subpass.pages,
+                        pass.chunks,
+                        num_rows,
+                        skip_rows,
+                        level_type_size,
+                        true,
                         error_code.data(),
                         streams[s_idx++]);
   }
 
+  // launch fixed width type decoder with dictionaries
   if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_DICT) != 0) {
     DecodePageDataFixedDict(subpass.pages,
                             pass.chunks,
                             num_rows,
                             skip_rows,
                             level_type_size,
+                            false,
+                            error_code.data(),
+                            streams[s_idx++]);
+  }
+
+  // launch fixed width type decoder with dictionaries, for nested columns
+  if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_DICT_NESTED) != 0) {
+    DecodePageDataFixedDict(subpass.pages,
+                            pass.chunks,
+                            num_rows,
+                            skip_rows,
+                            level_type_size,
+                            true,
                             error_code.data(),
                             streams[s_idx++]);
   }
diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp
index 84ab83e33d0..a1f4c7b81d8 100644
--- a/cpp/tests/io/parquet_writer_test.cpp
+++ b/cpp/tests/io/parquet_writer_test.cpp
@@ -1785,7 +1785,8 @@ TEST_F(ParquetWriterTest, DeltaBinaryStartsWithNulls)
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
 }
 
-TEST_F(ParquetWriterTest, ByteStreamSplit)
+std::pair<std::unique_ptr<cudf::table>, cudf::io::table_input_metadata>
+make_byte_stream_split_table(bool as_struct)
 {
   constexpr auto num_rows = 100;
   std::mt19937 engine{31337};
@@ -1802,24 +1803,73 @@ TEST_F(ParquetWriterTest, ByteStreamSplit)
   // throw in a list to make sure both decoders are working
   auto col4 = make_parquet_list_col<int32_t>(engine, num_rows, 5, true);
 
-  auto expected = table_view{{col0, col1, col2, col3, *col4}};
+  std::vector<std::unique_ptr<cudf::column>> columns;
+  columns.reserve(5);
+  columns.push_back(col0.release());
+  columns.push_back(col1.release());
+  columns.push_back(col2.release());
+  columns.push_back(col3.release());
+  columns.push_back(std::move(col4));
+
+  return [&]() -> std::pair<std::unique_ptr<cudf::table>, cudf::io::table_input_metadata> {
+    auto const encoding = cudf::io::column_encoding::BYTE_STREAM_SPLIT;
+
+    // make as a nested struct
+    if (as_struct) {
+      auto valids =
+        cudf::detail::make_counting_transform_iterator(0, [](int i) { return i % 2 == 0; });
+      auto [null_mask, null_count] = cudf::test::detail::make_null_mask(valids, valids + num_rows);
+
+      std::vector<std::unique_ptr<cudf::column>> table_cols;
+      table_cols.push_back(
+        cudf::make_structs_column(num_rows, std::move(columns), null_count, std::move(null_mask)));
+
+      auto tbl      = std::make_unique<cudf::table>(std::move(table_cols));
+      auto expected = table_view{*tbl};
+
+      cudf::io::table_input_metadata expected_metadata(expected);
+      expected_metadata.column_metadata[0].set_name("struct");
+      expected_metadata.column_metadata[0].set_encoding(encoding);
+
+      expected_metadata.column_metadata[0].child(0).set_name("int32s");
+      expected_metadata.column_metadata[0].child(1).set_name("int64s");
+      expected_metadata.column_metadata[0].child(2).set_name("floats");
+      expected_metadata.column_metadata[0].child(3).set_name("doubles");
+      expected_metadata.column_metadata[0].child(4).set_name("int32list");
+      for (int idx = 0; idx <= 3; idx++) {
+        expected_metadata.column_metadata[0].child(idx).set_encoding(encoding);
+      }
+      expected_metadata.column_metadata[0].child(4).child(1).set_encoding(encoding);
 
-  cudf::io::table_input_metadata expected_metadata(expected);
-  expected_metadata.column_metadata[0].set_name("int32s");
-  expected_metadata.column_metadata[1].set_name("int64s");
-  expected_metadata.column_metadata[2].set_name("floats");
-  expected_metadata.column_metadata[3].set_name("doubles");
-  expected_metadata.column_metadata[4].set_name("int32list");
-  auto const encoding = cudf::io::column_encoding::BYTE_STREAM_SPLIT;
-  for (int i = 0; i <= 3; i++) {
-    expected_metadata.column_metadata[i].set_encoding(encoding);
-  }
+      return {std::move(tbl), expected_metadata};
+    }
+
+    // make flat
+    auto tbl      = std::make_unique<cudf::table>(std::move(columns));
+    auto expected = table_view{*tbl};
 
-  expected_metadata.column_metadata[4].child(1).set_encoding(encoding);
+    cudf::io::table_input_metadata expected_metadata(expected);
+    expected_metadata.column_metadata[0].set_name("int32s");
+    expected_metadata.column_metadata[1].set_name("int64s");
+    expected_metadata.column_metadata[2].set_name("floats");
+    expected_metadata.column_metadata[3].set_name("doubles");
+    expected_metadata.column_metadata[4].set_name("int32list");
+    for (int idx = 0; idx <= 3; idx++) {
+      expected_metadata.column_metadata[idx].set_encoding(encoding);
+    }
+
+    expected_metadata.column_metadata[4].child(1).set_encoding(encoding);
+    return {std::move(tbl), expected_metadata};
+  }();
+}
+
+TEST_F(ParquetWriterTest, ByteStreamSplit)
+{
+  auto [expected, expected_metadata] = make_byte_stream_split_table(false);
 
   auto const filepath = temp_env->get_temp_filepath("ByteStreamSplit.parquet");
   cudf::io::parquet_writer_options out_opts =
-    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected)
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, *expected)
       .metadata(expected_metadata);
   cudf::io::write_parquet(out_opts);
 
@@ -1827,7 +1877,24 @@ TEST_F(ParquetWriterTest, ByteStreamSplit)
     cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath});
   auto result = cudf::io::read_parquet(in_opts);
 
-  CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
+  CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, result.tbl->view());
+}
+
+TEST_F(ParquetWriterTest, ByteStreamSplitStruct)
+{
+  auto [expected, expected_metadata] = make_byte_stream_split_table(true);
+
+  auto const filepath = temp_env->get_temp_filepath("ByteStreamSplitStruct.parquet");
+  cudf::io::parquet_writer_options out_opts =
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, *expected)
+      .metadata(expected_metadata);
+  cudf::io::write_parquet(out_opts);
+
+  cudf::io::parquet_reader_options in_opts =
+    cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath});
+  auto result = cudf::io::read_parquet(in_opts);
+
+  CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, result.tbl->view());
 }
 
 TEST_F(ParquetWriterTest, DecimalByteStreamSplit)

From 78f4a8a3f639677358bce83a699f92c90476ae75 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Fri, 28 Jun 2024 11:26:27 -0400
Subject: [PATCH 28/29] Move common string utilities to public api (#16070)

As part of https://github.com/rapidsai/cudf/pull/15982 a subset of the strings utility functions have been identified as being worth expsosing as part of the cudf public API.

The `create_string_vector_from_column`, `get_offset64_threshold`, and `is_large_strings_enabled` are now made part of the public `cudf::strings` api.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - MithunR (https://github.com/mythrocks)
  - David Wendt (https://github.com/davidwendt)
  - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16070
---
 .../cudf/strings/detail/strings_children.cuh  |  7 ++-
 cpp/include/cudf/strings/utilities.hpp        | 62 +++++++++++++++++++
 cpp/src/strings/utilities.cu                  | 22 +++++--
 cpp/tests/column/factories_test.cpp           |  4 +-
 cpp/tests/copying/concatenate_tests.cpp       |  8 +--
 cpp/tests/strings/array_tests.cpp             |  4 +-
 cpp/tests/strings/repeat_strings_tests.cpp    |  4 +-
 .../strings/src/strings/udf/udf_apis.cu       |  4 +-
 8 files changed, 95 insertions(+), 20 deletions(-)
 create mode 100644 cpp/include/cudf/strings/utilities.hpp

diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh
index f105a6dc546..f5f3982a5d6 100644
--- a/cpp/include/cudf/strings/detail/strings_children.cuh
+++ b/cpp/include/cudf/strings/detail/strings_children.cuh
@@ -21,6 +21,7 @@
 #include <cudf/detail/sizes_to_offsets_iterator.cuh>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/strings/detail/utilities.hpp>
+#include <cudf/strings/utilities.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -81,11 +82,11 @@ std::pair<std::unique_ptr<column>, int64_t> make_offsets_child_column(
   auto const total_bytes =
     cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets, stream);
 
-  auto const threshold = get_offset64_threshold();
-  CUDF_EXPECTS(is_large_strings_enabled() || (total_bytes < threshold),
+  auto const threshold = cudf::strings::get_offset64_threshold();
+  CUDF_EXPECTS(cudf::strings::is_large_strings_enabled() || (total_bytes < threshold),
                "Size of output exceeds the column size limit",
                std::overflow_error);
-  if (total_bytes >= get_offset64_threshold()) {
+  if (total_bytes >= cudf::strings::get_offset64_threshold()) {
     // recompute as int64 offsets when above the threshold
     offsets_column = make_numeric_column(
       data_type{type_id::INT64}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
diff --git a/cpp/include/cudf/strings/utilities.hpp b/cpp/include/cudf/strings/utilities.hpp
new file mode 100644
index 00000000000..ae445282382
--- /dev/null
+++ b/cpp/include/cudf/strings/utilities.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/export.hpp>
+
+#include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
+
+namespace CUDF_EXPORT cudf {
+namespace strings {
+
+/**
+ * @brief Creates a string_view vector from a strings column.
+ *
+ * @param strings Strings column instance.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param mr Device memory resource used to allocate the returned vector's device memory.
+ * @return Device vector of string_views
+ */
+rmm::device_uvector<string_view> create_string_vector_from_column(
+  cudf::strings_column_view const strings,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Return the threshold size for a strings column to use int64 offsets
+ *
+ * A computed size above this threshold should using int64 offsets, otherwise
+ * int32 offsets. By default this function will return std::numeric_limits<int32_t>::max().
+ * This value can be overridden at runtime using the environment variable
+ * LIBCUDF_LARGE_STRINGS_THRESHOLD.
+ *
+ * @return size in bytes
+ */
+int64_t get_offset64_threshold();
+
+/**
+ * @brief Checks if large strings is enabled
+ *
+ * This checks the setting in the environment variable LIBCUDF_LARGE_STRINGS_ENABLED.
+ *
+ * @return true if large strings are supported
+ */
+bool is_large_strings_enabled();
+
+}  // namespace strings
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu
index 18e726a6d7d..101004a5d06 100644
--- a/cpp/src/strings/utilities.cu
+++ b/cpp/src/strings/utilities.cu
@@ -13,16 +13,17 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 #include "strings/char_types/char_cases.h"
 #include "strings/char_types/char_flags.h"
 
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/get_value.cuh>
+#include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/detail/char_tables.hpp>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/detail/utilities.hpp>
+#include <cudf/strings/utilities.hpp>
 #include <cudf/utilities/error.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -36,8 +37,7 @@
 #include <cstdlib>
 #include <string>
 
-namespace cudf {
-namespace strings {
+namespace cudf::strings {
 namespace detail {
 
 /**
@@ -175,5 +175,17 @@ int64_t get_offset_value(cudf::column_view const& offsets,
 }
 
 }  // namespace detail
-}  // namespace strings
-}  // namespace cudf
+
+rmm::device_uvector<string_view> create_string_vector_from_column(
+  cudf::strings_column_view const strings,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::create_string_vector_from_column(strings, stream, mr);
+}
+
+int64_t get_offset64_threshold() { return detail::get_offset64_threshold(); }
+bool is_large_strings_enabled() { return detail::is_large_strings_enabled(); }
+
+}  // namespace cudf::strings
diff --git a/cpp/tests/column/factories_test.cpp b/cpp/tests/column/factories_test.cpp
index dca36eaa4e7..603187f0330 100644
--- a/cpp/tests/column/factories_test.cpp
+++ b/cpp/tests/column/factories_test.cpp
@@ -24,7 +24,7 @@
 #include <cudf/null_mask.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
-#include <cudf/strings/detail/utilities.hpp>
+#include <cudf/strings/utilities.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
@@ -762,7 +762,7 @@ TEST_F(ColumnFactoryTest, FromStructScalarNull) { struct_from_scalar(false); }
 
 TEST_F(ColumnFactoryTest, FromScalarErrors)
 {
-  if (cudf::strings::detail::is_large_strings_enabled()) { return; }
+  if (cudf::strings::is_large_strings_enabled()) { return; }
   cudf::string_scalar ss("hello world");
   EXPECT_THROW(cudf::make_column_from_scalar(ss, 214748365), std::overflow_error);
 
diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp
index 078e0ef9bae..054441788d0 100644
--- a/cpp/tests/copying/concatenate_tests.cpp
+++ b/cpp/tests/copying/concatenate_tests.cpp
@@ -29,7 +29,7 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/dictionary/encode.hpp>
 #include <cudf/filling.hpp>
-#include <cudf/strings/detail/utilities.hpp>
+#include <cudf/strings/utilities.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
@@ -189,7 +189,7 @@ TEST_F(StringColumnTest, ConcatenateManyColumns)
 
 TEST_F(StringColumnTest, ConcatenateTooLarge)
 {
-  if (cudf::strings::detail::is_large_strings_enabled()) { return; }
+  if (cudf::strings::is_large_strings_enabled()) { return; }
 
   std::string big_str(1000000, 'a');  // 1 million bytes x 5 = 5 million bytes
   cudf::test::strings_column_wrapper input{big_str, big_str, big_str, big_str, big_str};
@@ -379,7 +379,7 @@ TEST_F(OverflowTest, OverflowTest)
   }
 
   // string column, overflow on chars
-  if (!cudf::strings::detail::is_large_strings_enabled()) {
+  if (!cudf::strings::is_large_strings_enabled()) {
     constexpr auto size = static_cast<cudf::size_type>(static_cast<uint32_t>(1024) * 1024 * 1024);
 
     // try and concatenate 6 string columns of with 1 billion chars in each
@@ -502,7 +502,7 @@ TEST_F(OverflowTest, Presliced)
   }
 
   // strings, overflow on chars
-  if (!cudf::strings::detail::is_large_strings_enabled()) {
+  if (!cudf::strings::is_large_strings_enabled()) {
     constexpr cudf::size_type total_chars_size = 1024 * 1024 * 1024;
     constexpr cudf::size_type string_size      = 64;
     constexpr cudf::size_type num_rows         = total_chars_size / string_size;
diff --git a/cpp/tests/strings/array_tests.cpp b/cpp/tests/strings/array_tests.cpp
index a1bb87a43fb..9c0ecaa52c0 100644
--- a/cpp/tests/strings/array_tests.cpp
+++ b/cpp/tests/strings/array_tests.cpp
@@ -23,8 +23,8 @@
 #include <cudf/copying.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/sorting.hpp>
-#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/strings_column_view.hpp>
+#include <cudf/strings/utilities.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
@@ -153,7 +153,7 @@ TEST_F(StringsColumnTest, GatherZeroSizeStringsColumn)
 
 TEST_F(StringsColumnTest, GatherTooBig)
 {
-  if (cudf::strings::detail::is_large_strings_enabled()) { return; }
+  if (cudf::strings::is_large_strings_enabled()) { return; }
 
   std::vector<int8_t> h_chars(3000000);
   cudf::test::fixed_width_column_wrapper<int8_t> chars(h_chars.begin(), h_chars.end());
diff --git a/cpp/tests/strings/repeat_strings_tests.cpp b/cpp/tests/strings/repeat_strings_tests.cpp
index 0539895c5f4..aa4d9320d7c 100644
--- a/cpp/tests/strings/repeat_strings_tests.cpp
+++ b/cpp/tests/strings/repeat_strings_tests.cpp
@@ -20,9 +20,9 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/scalar/scalar.hpp>
-#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/repeat_strings.hpp>
 #include <cudf/strings/strings_column_view.hpp>
+#include <cudf/strings/utilities.hpp>
 
 using namespace cudf::test::iterators;
 
@@ -221,7 +221,7 @@ TEST_F(RepeatStringsTest, StringsColumnWithColumnRepeatTimesInvalidInput)
 
 TEST_F(RepeatStringsTest, StringsColumnWithColumnRepeatTimesOverflowOutput)
 {
-  if (cudf::strings::detail::is_large_strings_enabled()) { return; }
+  if (cudf::strings::is_large_strings_enabled()) { return; }
 
   auto const strs    = strs_col{"1", "12", "123", "1234", "12345", "123456", "1234567"};
   auto const strs_cv = cudf::strings_column_view(strs);
diff --git a/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu b/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu
index 941e61e6787..b924995cf4b 100644
--- a/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu
+++ b/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu
@@ -15,10 +15,10 @@
  */
 
 #include <cudf/column/column_factories.hpp>
-#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/udf/udf_apis.hpp>
 #include <cudf/strings/udf/udf_string.cuh>
+#include <cudf/strings/utilities.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/device_uvector.hpp>
@@ -57,7 +57,7 @@ std::unique_ptr<rmm::device_buffer> to_string_view_array(cudf::column_view const
                                                          rmm::cuda_stream_view stream)
 {
   return std::make_unique<rmm::device_buffer>(
-    std::move(cudf::strings::detail::create_string_vector_from_column(
+    std::move(cudf::strings::create_string_vector_from_column(
                 cudf::strings_column_view(input), stream, rmm::mr::get_current_device_resource())
                 .release()));
 }

From fb12d980342833a9d7092a19717eedad22328e6a Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Fri, 28 Jun 2024 12:14:58 -0400
Subject: [PATCH 29/29] Installed cudf header use cudf::allocate_like (#16087)

Remove usage of non public cudf::allocate_like from implementations in headers we install

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/16087
---
 cpp/include/cudf/detail/copy_if.cuh       |  6 +++---
 cpp/include/cudf/detail/gather.cuh        | 13 ++++++-------
 cpp/src/copying/sample.cu                 |  1 +
 cpp/src/lists/copying/segmented_gather.cu |  1 +
 4 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh
index c98057d077a..b6310e6cd2f 100644
--- a/cpp/include/cudf/detail/copy_if.cuh
+++ b/cpp/include/cudf/detail/copy_if.cuh
@@ -18,7 +18,7 @@
 
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
-#include <cudf/detail/copy.hpp>
+#include <cudf/copying.hpp>
 #include <cudf/detail/gather.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
@@ -242,8 +242,8 @@ struct scatter_gather_functor {
                                            rmm::cuda_stream_view stream,
                                            rmm::device_async_resource_ref mr)
   {
-    auto output_column = cudf::detail::allocate_like(
-      input, output_size, cudf::mask_allocation_policy::RETAIN, stream, mr);
+    auto output_column =
+      cudf::allocate_like(input, output_size, cudf::mask_allocation_policy::RETAIN, stream, mr);
     auto output = output_column->mutable_view();
 
     bool has_valid = input.nullable();
diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh
index c9d350ce983..5977c7341c1 100644
--- a/cpp/include/cudf/detail/gather.cuh
+++ b/cpp/include/cudf/detail/gather.cuh
@@ -15,7 +15,7 @@
  */
 #pragma once
 
-#include <cudf/detail/copy.hpp>
+#include <cudf/copying.hpp>
 #include <cudf/detail/indexalator.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/utilities/assert.cuh>
@@ -217,10 +217,9 @@ struct column_gatherer_impl<Element, std::enable_if_t<is_rep_layout_compatible<E
                                      rmm::cuda_stream_view stream,
                                      rmm::device_async_resource_ref mr)
   {
-    auto const num_rows = cudf::distance(gather_map_begin, gather_map_end);
-    auto const policy   = cudf::mask_allocation_policy::NEVER;
-    auto destination_column =
-      cudf::detail::allocate_like(source_column, num_rows, policy, stream, mr);
+    auto const num_rows     = cudf::distance(gather_map_begin, gather_map_end);
+    auto const policy       = cudf::mask_allocation_policy::NEVER;
+    auto destination_column = cudf::allocate_like(source_column, num_rows, policy, stream, mr);
 
     gather_helper(source_column.data<Element>(),
                   source_column.size(),
@@ -413,8 +412,8 @@ struct column_gatherer_impl<dictionary32> {
     auto keys_copy = std::make_unique<column>(dictionary.keys(), stream, mr);
     // Perform gather on just the indices
     column_view indices = dictionary.get_indices_annotated();
-    auto new_indices    = cudf::detail::allocate_like(
-      indices, output_count, cudf::mask_allocation_policy::NEVER, stream, mr);
+    auto new_indices =
+      cudf::allocate_like(indices, output_count, cudf::mask_allocation_policy::NEVER, stream, mr);
     gather_helper(
       cudf::detail::indexalator_factory::make_input_iterator(indices),
       indices.size(),
diff --git a/cpp/src/copying/sample.cu b/cpp/src/copying/sample.cu
index f8e3a9a83e3..ba00527f6b6 100644
--- a/cpp/src/copying/sample.cu
+++ b/cpp/src/copying/sample.cu
@@ -16,6 +16,7 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_factories.hpp>
+#include <cudf/detail/copy.hpp>
 #include <cudf/detail/gather.cuh>
 #include <cudf/detail/gather.hpp>
 #include <cudf/detail/iterator.cuh>
diff --git a/cpp/src/lists/copying/segmented_gather.cu b/cpp/src/lists/copying/segmented_gather.cu
index 89b1a126fc5..779eca438db 100644
--- a/cpp/src/lists/copying/segmented_gather.cu
+++ b/cpp/src/lists/copying/segmented_gather.cu
@@ -13,6 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#include <cudf/detail/copy.hpp>
 #include <cudf/detail/copy_range.cuh>
 #include <cudf/detail/gather.cuh>
 #include <cudf/detail/indexalator.cuh>