From a021d78ec455917adcad5af87faa61cbfeb7b8d4 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Fri, 28 Jun 2024 09:57:50 -0700 Subject: [PATCH] Fix merge conflicts --- .github/workflows/pr.yaml | 12 + CONTRIBUTING.md | 13 +- ci/test_cudf_polars.sh | 68 ++ cpp/CMakeLists.txt | 1 + .../io/parquet/parquet_reader_input.cpp | 50 +- .../developer_guide/DEVELOPER_GUIDE.md | 60 +- cpp/include/cudf/binaryop.hpp | 50 + cpp/include/cudf/detail/copy_if.cuh | 6 +- cpp/include/cudf/detail/gather.cuh | 13 +- cpp/include/cudf/detail/stream_compaction.hpp | 2 - .../cudf/detail/utilities/cuda_memcpy.hpp | 53 ++ cpp/include/cudf/lists/lists_column_view.hpp | 3 +- cpp/include/cudf/stream_compaction.hpp | 2 + .../cudf/strings/detail/strings_children.cuh | 7 +- cpp/include/cudf/strings/utilities.hpp | 62 ++ cpp/include/cudf/utilities/pinned_memory.hpp | 16 + cpp/include/cudf_test/column_wrapper.hpp | 7 +- cpp/src/binaryop/compiled/binary_ops.cuh | 14 +- cpp/src/binaryop/compiled/util.cpp | 12 +- cpp/src/copying/sample.cu | 1 + cpp/src/io/parquet/decode_fixed.cu | 896 ++++++++++-------- cpp/src/io/parquet/page_hdr.cu | 16 +- cpp/src/io/parquet/parquet_gpu.hpp | 46 +- cpp/src/io/parquet/reader_impl.cpp | 57 +- cpp/src/io/parquet/reader_impl_chunking.cu | 13 +- cpp/src/io/parquet/reader_impl_preprocess.cu | 3 +- cpp/src/io/utilities/hostdevice_vector.hpp | 13 +- cpp/src/join/conditional_join.cu | 13 +- cpp/src/lists/copying/segmented_gather.cu | 1 + cpp/src/stream_compaction/distinct.cu | 146 ++- cpp/src/stream_compaction/distinct_count.cu | 3 +- cpp/src/stream_compaction/distinct_helpers.cu | 189 ++-- .../stream_compaction/distinct_helpers.hpp | 58 +- cpp/src/stream_compaction/stable_distinct.cu | 4 +- .../stream_compaction_common.cuh | 5 +- .../stream_compaction_common.hpp | 35 - cpp/src/stream_compaction/unique.cu | 1 - cpp/src/strings/utilities.cu | 22 +- cpp/src/utilities/cuda_memcpy.cu | 71 ++ cpp/src/utilities/pinned_memory.cpp | 14 + cpp/tests/CMakeLists.txt | 1 + .../binop-compiled-fixed_point-test.cpp | 58 ++ cpp/tests/column/factories_test.cpp | 4 +- cpp/tests/copying/concatenate_tests.cpp | 8 +- cpp/tests/io/parquet_writer_test.cpp | 97 +- cpp/tests/join/conditional_join_tests.cu | 92 +- cpp/tests/streams/stream_compaction_test.cpp | 235 +++++ cpp/tests/strings/array_tests.cpp | 4 +- cpp/tests/strings/repeat_strings_tests.cpp | 4 +- .../_static/cudf-pandas-line-profile.png | Bin 0 -> 15125 bytes docs/cudf/source/cudf_pandas/faq.md | 16 + docs/cudf/source/cudf_pandas/usage.md | 34 +- .../cudf/source/user_guide/api_docs/index.rst | 3 + .../api_docs/performance_tracking.rst | 12 + .../user_guide/api_docs/pylibcudf/index.rst | 1 + .../user_guide/api_docs/pylibcudf/interop.rst | 6 + .../api_docs/pylibcudf/strings/index.rst | 1 + .../api_docs/pylibcudf/strings/slice.rst | 6 + docs/cudf/source/user_guide/index.md | 1 + .../source/user_guide/memory-profiling.md | 44 + python/cudf/cudf/_lib/lists.pyx | 72 +- python/cudf/cudf/_lib/parquet.pyx | 36 +- python/cudf/cudf/_lib/pylibcudf/column.pxd | 4 + python/cudf/cudf/_lib/pylibcudf/column.pyx | 9 + python/cudf/cudf/_lib/pylibcudf/join.pxd | 2 + python/cudf/cudf/_lib/pylibcudf/join.pyx | 30 +- .../cudf/cudf/_lib/pylibcudf/libcudf/join.pxd | 5 + .../_lib/pylibcudf/libcudf/lists/contains.pxd | 29 +- .../libcudf/lists/lists_column_view.pxd | 1 + .../libcudf/scalar/scalar_factories.pxd | 1 + python/cudf/cudf/_lib/pylibcudf/lists.pxd | 10 + python/cudf/cudf/_lib/pylibcudf/lists.pyx | 150 ++- .../_lib/pylibcudf/strings/CMakeLists.txt | 2 +- 
.../cudf/_lib/pylibcudf/strings/__init__.pxd | 1 + .../cudf/_lib/pylibcudf/strings/__init__.py | 1 + .../cudf/_lib/pylibcudf/strings/slice.pxd | 15 + .../cudf/_lib/pylibcudf/strings/slice.pyx | 102 ++ python/cudf/cudf/_lib/pylibcudf/table.pyx | 2 + python/cudf/cudf/_lib/strings/substring.pyx | 88 +- python/cudf/cudf/core/_base_index.py | 6 +- python/cudf/cudf/core/algorithms.py | 4 +- python/cudf/cudf/core/buffer/spill_manager.py | 4 +- .../cudf/cudf/core/buffer/spillable_buffer.py | 7 +- python/cudf/cudf/core/column/column.py | 4 +- python/cudf/cudf/core/column_accessor.py | 4 +- python/cudf/cudf/core/cut.py | 2 +- python/cudf/cudf/core/dataframe.py | 257 ++--- python/cudf/cudf/core/frame.py | 110 +-- python/cudf/cudf/core/groupby/groupby.py | 60 +- python/cudf/cudf/core/index.py | 241 ++--- python/cudf/cudf/core/indexed_frame.py | 155 ++- python/cudf/cudf/core/multiindex.py | 221 ++--- python/cudf/cudf/core/reshape.py | 62 +- python/cudf/cudf/core/series.py | 244 ++--- python/cudf/cudf/core/single_column_frame.py | 42 +- python/cudf/cudf/core/udf/groupby_utils.py | 4 +- python/cudf/cudf/core/udf/utils.py | 6 +- python/cudf/cudf/io/csv.py | 6 +- python/cudf/cudf/io/parquet.py | 48 +- python/cudf/cudf/io/text.py | 6 +- python/cudf/cudf/options.py | 14 + python/cudf/cudf/pandas/_wrappers/numpy.py | 13 + python/cudf/cudf/pandas/_wrappers/pandas.py | 16 + .../cudf/pandas/scripts/run-pandas-tests.sh | 3 +- python/cudf/cudf/pylibcudf_tests/test_join.py | 29 + .../cudf/cudf/pylibcudf_tests/test_lists.py | 107 ++- .../cudf/pylibcudf_tests/test_string_slice.py | 116 +++ .../cudf/cudf/pylibcudf_tests/test_table.py | 22 + python/cudf/cudf/tests/test_dataframe.py | 32 + python/cudf/cudf/tests/test_monotonic.py | 19 + python/cudf/cudf/tests/test_parquet.py | 11 + .../cudf/tests/test_performance_tracking.py | 41 + python/cudf/cudf/tests/test_repr.py | 10 +- python/cudf/cudf/utils/nvtx_annotation.py | 30 - .../cudf/cudf/utils/performance_tracking.py | 82 ++ python/cudf/cudf/utils/utils.py | 5 +- .../cudf_pandas_tests/test_cudf_pandas.py | 33 + .../strings/src/strings/udf/udf_apis.cu | 4 +- python/cudf_polars/cudf_polars/dsl/expr.py | 96 +- python/cudf_polars/cudf_polars/dsl/ir.py | 91 +- .../cudf_polars/cudf_polars/dsl/translate.py | 20 + .../cudf_polars/typing/__init__.py | 74 +- .../cudf_polars/tests/expressions/test_agg.py | 4 +- .../tests/expressions/test_booleanfunction.py | 58 +- .../tests/expressions/test_rolling.py | 12 +- .../tests/expressions/test_stringfunction.py | 16 +- .../tests/expressions/test_when_then.py | 27 + python/cudf_polars/tests/test_groupby.py | 11 +- python/cudf_polars/tests/test_join.py | 40 +- python/cudf_polars/tests/test_mapfunction.py | 32 +- python/cudf_polars/tests/test_python_scan.py | 7 +- python/cudf_polars/tests/test_union.py | 12 +- python/dask_cudf/dask_cudf/backends.py | 40 +- python/dask_cudf/dask_cudf/core.py | 62 +- python/dask_cudf/dask_cudf/groupby.py | 72 +- python/dask_cudf/dask_cudf/sorting.py | 16 +- 136 files changed, 4000 insertions(+), 2015 deletions(-) create mode 100755 ci/test_cudf_polars.sh create mode 100644 cpp/include/cudf/detail/utilities/cuda_memcpy.hpp create mode 100644 cpp/include/cudf/strings/utilities.hpp delete mode 100644 cpp/src/stream_compaction/stream_compaction_common.hpp create mode 100644 cpp/src/utilities/cuda_memcpy.cu create mode 100644 cpp/tests/streams/stream_compaction_test.cpp create mode 100644 docs/cudf/source/_static/cudf-pandas-line-profile.png create mode 100644 
docs/cudf/source/user_guide/api_docs/performance_tracking.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/interop.rst create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/slice.rst create mode 100644 docs/cudf/source/user_guide/memory-profiling.md create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/slice.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx create mode 100644 python/cudf/cudf/pylibcudf_tests/test_join.py create mode 100644 python/cudf/cudf/pylibcudf_tests/test_string_slice.py create mode 100644 python/cudf/cudf/pylibcudf_tests/test_table.py create mode 100644 python/cudf/cudf/tests/test_performance_tracking.py delete mode 100644 python/cudf/cudf/utils/nvtx_annotation.py create mode 100644 python/cudf/cudf/utils/performance_tracking.py create mode 100644 python/cudf_polars/tests/expressions/test_when_then.py diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index cb582df21e0..a35802f2ab0 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -25,6 +25,7 @@ jobs: - docs-build - wheel-build-cudf - wheel-tests-cudf + - test-cudf-polars - wheel-build-dask-cudf - wheel-tests-dask-cudf - devcontainer @@ -132,6 +133,17 @@ jobs: with: build_type: pull-request script: ci/test_wheel_cudf.sh + test-cudf-polars: + needs: wheel-build-cudf + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 + with: + # This selects "ARCH=amd64 + the latest supported Python + CUDA". + matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) + build_type: pull-request + # This always runs, but only fails if this PR touches code in + # pylibcudf or cudf_polars + script: "ci/test_cudf_polars.sh" wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 98c2ec0a22e..4fbc28fa6e1 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -71,15 +71,14 @@ for a minimal build of libcudf without using conda are also listed below. Compilers: -* `gcc` version 9.3+ -* `nvcc` version 11.5+ -* `cmake` version 3.26.4+ +* `gcc` version 11.4+ +* `nvcc` version 11.8+ +* `cmake` version 3.29.6+ -CUDA/GPU: +CUDA/GPU Runtime: -* CUDA 11.5+ -* NVIDIA driver 450.80.02+ -* Volta architecture or better (Compute Capability >=7.0) +* CUDA 11.4+ +* Volta architecture or better ([Compute Capability](https://docs.nvidia.com/deploy/cuda-compatibility/) >=7.0) You can obtain CUDA from [https://developer.nvidia.com/cuda-downloads](https://developer.nvidia.com/cuda-downloads). diff --git a/ci/test_cudf_polars.sh b/ci/test_cudf_polars.sh new file mode 100755 index 00000000000..669e049ab26 --- /dev/null +++ b/ci/test_cudf_polars.sh @@ -0,0 +1,68 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -eou pipefail + +# We will only fail these tests if the PR touches code in pylibcudf +# or cudf_polars itself. +# Note, the three dots mean we are doing diff between the merge-base +# of upstream and HEAD. So this is asking, "does _this branch_ touch +# files in cudf_polars/pylibcudf", rather than "are there changes +# between upstream and this branch which touch cudf_polars/pylibcudf" +# TODO: is the target branch exposed anywhere in an environment variable? 
+if [ -n "$(git diff --name-only origin/branch-24.08...HEAD -- python/cudf_polars/ python/cudf/cudf/_lib/pylibcudf/)" ]; +then + HAS_CHANGES=1 +else + HAS_CHANGES=0 +fi + +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist + +RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"} +RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/ +mkdir -p "${RAPIDS_TESTS_DIR}" + +rapids-logger "Install cudf wheel" +# echo to expand wildcard before adding `[extra]` requires for pip +python -m pip install $(echo ./dist/cudf*.whl)[test] + +rapids-logger "Install polars (allow pre-release versions)" +python -m pip install 'polars>=1.0.0a0' + +rapids-logger "Install cudf_polars" +python -m pip install --no-deps python/cudf_polars + +rapids-logger "Run cudf_polars tests" + +function set_exitcode() +{ + EXITCODE=$? +} +EXITCODE=0 +trap set_exitcode ERR +set +e + +python -m pytest \ + --cache-clear \ + --cov cudf_polars \ + --cov-fail-under=100 \ + --cov-config=python/cudf_polars/pyproject.toml \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf_polars.xml" \ + python/cudf_polars/tests + +trap ERR +set -e + +if [ ${EXITCODE} != 0 ]; then + rapids-logger "Testing FAILED: exitcode ${EXITCODE}" +else + rapids-logger "Testing PASSED" +fi + +if [ ${HAS_CHANGES} == 1 ]; then + exit ${EXITCODE} +else + exit 0 +fi diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5fd68bfb26c..35cf90411f2 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -662,6 +662,7 @@ add_library( src/unary/math_ops.cu src/unary/nan_ops.cu src/unary/null_ops.cu + src/utilities/cuda_memcpy.cu src/utilities/default_stream.cpp src/utilities/linked_column.cpp src/utilities/logger.cpp diff --git a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp index 019e0f30fe9..7563c823454 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -59,20 +59,18 @@ void parquet_read_common(cudf::size_type num_rows_to_read, } template -void BM_parquet_read_data(nvbench::state& state, nvbench::type_list>) +void BM_parquet_read_data_common(nvbench::state& state, + data_profile const& profile, + nvbench::type_list>) { auto const d_type = get_type_or_group(static_cast(DataType)); - auto const cardinality = static_cast(state.get_int64("cardinality")); - auto const run_length = static_cast(state.get_int64("run_length")); auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); auto const compression = cudf::io::compression_type::SNAPPY; cuio_source_sink_pair source_sink(source_type); auto const num_rows_written = [&]() { - auto const tbl = create_random_table( - cycle_dtypes(d_type, num_cols), - table_size_bytes{data_size}, - data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + auto const tbl = + create_random_table(cycle_dtypes(d_type, num_cols), table_size_bytes{data_size}, profile); auto const view = tbl->view(); cudf::io::parquet_writer_options write_opts = @@ -85,6 +83,32 @@ void BM_parquet_read_data(nvbench::state& state, nvbench::type_list +void BM_parquet_read_data(nvbench::state& state, + nvbench::type_list> type_list) +{ + auto const cardinality = static_cast(state.get_int64("cardinality")); + auto const run_length = static_cast(state.get_int64("run_length")); + BM_parquet_read_data_common( + state, data_profile_builder().cardinality(cardinality).avg_run_length(run_length), type_list); +} + +template +void BM_parquet_read_fixed_width_struct(nvbench::state& state, + nvbench::type_list> type_list) +{ + auto const cardinality = static_cast(state.get_int64("cardinality")); + auto const run_length = static_cast(state.get_int64("run_length")); + std::vector s_types{ + cudf::type_id::INT32, cudf::type_id::FLOAT32, cudf::type_id::INT64}; + BM_parquet_read_data_common(state, + data_profile_builder() + .cardinality(cardinality) + .avg_run_length(run_length) + .struct_types(s_types), + type_list); +} + void BM_parquet_read_io_compression(nvbench::state& state) { auto const d_type = get_type_or_group({static_cast(data_type::INTEGRAL), @@ -247,3 +271,13 @@ NVBENCH_BENCH(BM_parquet_read_io_small_mixed) .add_int64_axis("cardinality", {0, 1000}) .add_int64_axis("run_length", {1, 32}) .add_int64_axis("num_string_cols", {1, 2, 3}); + +// a benchmark for structs that only contain fixed-width types +using d_type_list_struct_only = nvbench::enum_type_list; +NVBENCH_BENCH_TYPES(BM_parquet_read_fixed_width_struct, NVBENCH_TYPE_AXES(d_type_list_struct_only)) + .set_name("parquet_read_fixed_width_struct") + .set_type_axes_names({"data_type"}) + .add_string_axis("io_type", {"DEVICE_BUFFER"}) + .set_min_samples(4) + .add_int64_axis("cardinality", {0, 1000}) + .add_int64_axis("run_length", {1, 32}); diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md index ff80c2daab8..0d097541692 100644 --- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md +++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md @@ -1,4 +1,4 @@ -# libcudf C++ Developer Guide +# libcudf C++ Developer Guide {#DEVELOPER_GUIDE} This document serves as a guide for contributors to libcudf C++ code. Developers should also refer to these additional files for further documentation of libcudf best practices. 
@@ -469,7 +469,7 @@ libcudf throws under different circumstances, see the [section on error handling # libcudf API and Implementation -## Streams +## Streams {#streams} libcudf is in the process of adding support for asynchronous execution using CUDA streams. In order to facilitate the usage of streams, all new libcudf APIs @@ -486,33 +486,37 @@ use only asynchronous versions of CUDA APIs with the stream parameter. In order to make the `detail` API callable from other libcudf functions, it should be exposed in a header placed in the `cudf/cpp/include/detail/` directory. +The declaration is not necessary if no other libcudf functions call the `detail` function. For example: ```c++ // cpp/include/cudf/header.hpp -void external_function(...); +void external_function(..., + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); // cpp/include/cudf/detail/header.hpp namespace detail{ -void external_function(..., rmm::cuda_stream_view stream) +void external_function(..., rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) } // namespace detail // cudf/src/implementation.cpp namespace detail{ - // Use the stream parameter in the detail implementation. - void external_function(..., rmm::cuda_stream_view stream){ - // Implementation uses the stream with async APIs. - rmm::device_buffer buff(...,stream); - CUDF_CUDA_TRY(cudaMemcpyAsync(...,stream.value())); - kernel<<<..., stream>>>(...); - thrust::algorithm(rmm::exec_policy(stream), ...); - } +// Use the stream parameter in the detail implementation. +void external_function(..., rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr){ + // Implementation uses the stream with async APIs. + rmm::device_buffer buff(..., stream, mr); + CUDF_CUDA_TRY(cudaMemcpyAsync(...,stream.value())); + kernel<<<..., stream>>>(...); + thrust::algorithm(rmm::exec_policy(stream), ...); +} } // namespace detail -void external_function(...){ - CUDF_FUNC_RANGE(); // Generates an NVTX range for the lifetime of this function. - detail::external_function(..., cudf::get_default_stream()); +void external_function(..., rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); // Generates an NVTX range for the lifetime of this function. + detail::external_function(..., stream, mr); } ``` @@ -703,28 +707,28 @@ The preferred style for how inputs are passed in and outputs are returned is the - `column_view const&` - Tables: - `table_view const&` - - Scalar: - - `scalar const&` - - Everything else: - - Trivial or inexpensively copied types - - Pass by value - - Non-trivial or expensive to copy types - - Pass by `const&` + - Scalar: + - `scalar const&` + - Everything else: + - Trivial or inexpensively copied types + - Pass by value + - Non-trivial or expensive to copy types + - Pass by `const&` - In/Outs - Columns: - `mutable_column_view&` - Tables: - `mutable_table_view&` - - Everything else: - - Pass by via raw pointer + - Everything else: + - Pass by via raw pointer - Outputs - Outputs should be *returned*, i.e., no output parameters - Columns: - `std::unique_ptr` - Tables: - `std::unique_ptr` - - Scalars: - - `std::unique_ptr` + - Scalars: + - `std::unique_ptr` ### Multiple Return Values @@ -908,6 +912,10 @@ functions that are specific to columns of Strings. These functions reside in the namespace. Similarly, functionality used exclusively for unit testing is in the `cudf::test::` namespace. 
+The public function is expected to contain a call to `CUDF_FUNC_RANGE()` followed by a call to +a `detail` function with same name and parameters as the public function. +See the [Streams](#streams) section for an example of this pattern. + ### Internal Many functions are not meant for public use, so place them in either the `detail` or an *anonymous* diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp index 5e41a871f32..22dad11e109 100644 --- a/cpp/include/cudf/binaryop.hpp +++ b/cpp/include/cudf/binaryop.hpp @@ -91,6 +91,56 @@ enum class binary_operator : int32_t { ///< (null, false) is null, and (valid, valid) == LOGICAL_OR(valid, valid) INVALID_BINARY ///< invalid operation }; + +/// Binary operation common type default +template +struct binary_op_common_type {}; + +/// Binary operation common type specialization +template +struct binary_op_common_type>> { + /// The common type of the template parameters + using type = std::common_type_t; +}; + +/// Binary operation common type specialization +template +struct binary_op_common_type< + L, + R, + std::enable_if_t() && cuda::std::is_floating_point_v>> { + /// The common type of the template parameters + using type = L; +}; + +/// Binary operation common type specialization +template +struct binary_op_common_type< + L, + R, + std::enable_if_t() && cuda::std::is_floating_point_v>> { + /// The common type of the template parameters + using type = R; +}; + +/// Binary operation common type helper +template +using binary_op_common_type_t = typename binary_op_common_type::type; + +namespace detail { +template +struct binary_op_has_common_type_impl : std::false_type {}; + +template +struct binary_op_has_common_type_impl>, L, R> + : std::true_type {}; +} // namespace detail + +/// Checks if binary operation types have a common type +template +constexpr inline bool binary_op_has_common_type_v = + detail::binary_op_has_common_type_impl::value; + /** * @brief Performs a binary operation between a scalar and a column. 
* diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index c98057d077a..b6310e6cd2f 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include #include @@ -242,8 +242,8 @@ struct scatter_gather_functor { rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto output_column = cudf::detail::allocate_like( - input, output_size, cudf::mask_allocation_policy::RETAIN, stream, mr); + auto output_column = + cudf::allocate_like(input, output_size, cudf::mask_allocation_policy::RETAIN, stream, mr); auto output = output_column->mutable_view(); bool has_valid = input.nullable(); diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index c9d350ce983..5977c7341c1 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -15,7 +15,7 @@ */ #pragma once -#include +#include #include #include #include @@ -217,10 +217,9 @@ struct column_gatherer_impl(), source_column.size(), @@ -413,8 +412,8 @@ struct column_gatherer_impl { auto keys_copy = std::make_unique(dictionary.keys(), stream, mr); // Perform gather on just the indices column_view indices = dictionary.get_indices_annotated(); - auto new_indices = cudf::detail::allocate_like( - indices, output_count, cudf::mask_allocation_policy::NEVER, stream, mr); + auto new_indices = + cudf::allocate_like(indices, output_count, cudf::mask_allocation_policy::NEVER, stream, mr); gather_helper( cudf::detail::indexalator_factory::make_input_iterator(indices), indices.size(), diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index e2974789ea1..e3ef4190fd2 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -88,8 +88,6 @@ std::unique_ptr
distinct(table_view const& input, /** * @copydoc cudf::stable_distinct - * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr
stable_distinct(table_view const& input, std::vector const& keys, diff --git a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp new file mode 100644 index 00000000000..b66c461ab12 --- /dev/null +++ b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cudf::detail { + +enum class host_memory_kind : uint8_t { PINNED, PAGEABLE }; + +/** + * @brief Asynchronously copies data between the host and device. + * + * Implementation may use different strategies depending on the size and type of host data. + * + * @param dst Destination memory address + * @param src Source memory address + * @param size Number of bytes to copy + * @param kind Type of host memory + * @param stream CUDA stream used for the copy + */ +void cuda_memcpy_async( + void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream); + +/** + * @brief Synchronously copies data between the host and device. + * + * Implementation may use different strategies depending on the size and type of host data. + * + * @param dst Destination memory address + * @param src Source memory address + * @param size Number of bytes to copy + * @param kind Type of host memory + * @param stream CUDA stream used for the copy + */ +void cuda_memcpy( + void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream); + +} // namespace cudf::detail diff --git a/cpp/include/cudf/lists/lists_column_view.hpp b/cpp/include/cudf/lists/lists_column_view.hpp index 57a4f724c2d..3397cb0ca1d 100644 --- a/cpp/include/cudf/lists/lists_column_view.hpp +++ b/cpp/include/cudf/lists/lists_column_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,6 +38,7 @@ namespace cudf { */ class lists_column_view : private column_view { public: + lists_column_view() = default; /** * @brief Construct a new lists column view object from a column view. * diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index c386b3a22b4..181af11adb8 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -320,6 +320,7 @@ std::unique_ptr distinct_indices( * @param keep Copy any, first, last, or none of the found duplicates * @param nulls_equal Flag to specify whether null elements should be considered as equal * @param nans_equal Flag to specify whether NaN elements should be considered as equal + * @param stream CUDA stream used for device memory operations and kernel launches. 
* @param mr Device memory resource used to allocate the returned table * @return Table with distinct rows, preserving input order */ @@ -329,6 +330,7 @@ std::unique_ptr
stable_distinct( duplicate_keep_option keep = duplicate_keep_option::KEEP_ANY, null_equality nulls_equal = null_equality::EQUAL, nan_equality nans_equal = nan_equality::ALL_EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh index f105a6dc546..f5f3982a5d6 100644 --- a/cpp/include/cudf/strings/detail/strings_children.cuh +++ b/cpp/include/cudf/strings/detail/strings_children.cuh @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -81,11 +82,11 @@ std::pair, int64_t> make_offsets_child_column( auto const total_bytes = cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets, stream); - auto const threshold = get_offset64_threshold(); - CUDF_EXPECTS(is_large_strings_enabled() || (total_bytes < threshold), + auto const threshold = cudf::strings::get_offset64_threshold(); + CUDF_EXPECTS(cudf::strings::is_large_strings_enabled() || (total_bytes < threshold), "Size of output exceeds the column size limit", std::overflow_error); - if (total_bytes >= get_offset64_threshold()) { + if (total_bytes >= cudf::strings::get_offset64_threshold()) { // recompute as int64 offsets when above the threshold offsets_column = make_numeric_column( data_type{type_id::INT64}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); diff --git a/cpp/include/cudf/strings/utilities.hpp b/cpp/include/cudf/strings/utilities.hpp new file mode 100644 index 00000000000..ae445282382 --- /dev/null +++ b/cpp/include/cudf/strings/utilities.hpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include +#include + +namespace CUDF_EXPORT cudf { +namespace strings { + +/** + * @brief Creates a string_view vector from a strings column. + * + * @param strings Strings column instance. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned vector's device memory. + * @return Device vector of string_views + */ +rmm::device_uvector create_string_vector_from_column( + cudf::strings_column_view const strings, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Return the threshold size for a strings column to use int64 offsets + * + * A computed size above this threshold should use int64 offsets, otherwise + * int32 offsets. By default this function will return std::numeric_limits::max(). + * This value can be overridden at runtime using the environment variable + * LIBCUDF_LARGE_STRINGS_THRESHOLD.
+ * + * @return size in bytes + */ +int64_t get_offset64_threshold(); + +/** + * @brief Checks if large strings are enabled + * + * This checks the setting in the environment variable LIBCUDF_LARGE_STRINGS_ENABLED. + * + * @return true if large strings are supported + */ +bool is_large_strings_enabled(); + +} // namespace strings +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/utilities/pinned_memory.hpp b/cpp/include/cudf/utilities/pinned_memory.hpp index b423eab6d38..3e2fa43cb50 100644 --- a/cpp/include/cudf/utilities/pinned_memory.hpp +++ b/cpp/include/cudf/utilities/pinned_memory.hpp @@ -55,4 +55,20 @@ struct pinned_mr_options { */ bool config_default_pinned_memory_resource(pinned_mr_options const& opts); +/** + * @brief Set the threshold size for using kernels for pinned memory copies. + * + * @param threshold The threshold size in bytes. If the size of the copy is less than this + * threshold, the copy will be done using kernels. If the size is greater than or equal to this + * threshold, the copy will be done using cudaMemcpyAsync. + */ +void set_kernel_pinned_copy_threshold(size_t threshold); + +/** + * @brief Get the threshold size for using kernels for pinned memory copies. + * + * @return The threshold size in bytes. + */ +size_t get_kernel_pinned_copy_threshold(); + } // namespace cudf diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index 47d17988775..7363f965af8 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -314,7 +314,12 @@ auto make_chars_and_offsets(StringsIterator begin, StringsIterator end, Validity for (auto str = begin; str < end; ++str) { std::string tmp = (*v++) ? std::string(*str) : std::string{}; chars.insert(chars.end(), std::cbegin(tmp), std::cend(tmp)); - offsets.push_back(offsets.back() + tmp.length()); + auto const last_offset = static_cast(offsets.back()); + auto const next_offset = last_offset + tmp.length(); + CUDF_EXPECTS( + next_offset < static_cast(std::numeric_limits::max()), + "Cannot use strings_column_wrapper to build a large strings column"); + offsets.push_back(static_cast(next_offset)); } return std::pair(std::move(chars), std::move(offsets)); }; diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh index 5177e7d4bda..c6af0c3c58a 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cuh +++ b/cpp/src/binaryop/compiled/binary_ops.cuh @@ -49,9 +49,16 @@ struct type_casted_accessor { column_device_view const& col, bool is_scalar) const { - if constexpr (column_device_view::has_element_accessor() and - std::is_convertible_v) - return static_cast(col.element(is_scalar ? 0 : i)); + if constexpr (column_device_view::has_element_accessor()) { + auto const element = col.element(is_scalar ? 0 : i); + if constexpr (std::is_convertible_v) { + return static_cast(element); + } else if constexpr (is_fixed_point() && cuda::std::is_floating_point_v) { + return convert_fixed_to_floating(element); + } else if constexpr (is_fixed_point() && cuda::std::is_floating_point_v) { + return convert_floating_to_fixed(element, numeric::scale_type{0}); + } + } return {}; } }; @@ -159,6 +166,7 @@ struct ops2_wrapper { TypeRhs y = rhs.element(is_rhs_scalar ?
0 : i); auto result = [&]() { if constexpr (std::is_same_v or + std::is_same_v or std::is_same_v or std::is_same_v or std::is_same_v or diff --git a/cpp/src/binaryop/compiled/util.cpp b/cpp/src/binaryop/compiled/util.cpp index 2b6a4f58895..b62c5f1f4e1 100644 --- a/cpp/src/binaryop/compiled/util.cpp +++ b/cpp/src/binaryop/compiled/util.cpp @@ -31,8 +31,8 @@ struct common_type_functor { template std::optional operator()() const { - if constexpr (cudf::has_common_type_v) { - using TypeCommon = std::common_type_t; + if constexpr (binary_op_has_common_type_v) { + using TypeCommon = binary_op_common_type_t; return data_type{type_to_id()}; } @@ -85,8 +85,8 @@ struct is_binary_operation_supported { { if constexpr (column_device_view::has_element_accessor() and column_device_view::has_element_accessor()) { - if constexpr (has_common_type_v) { - using common_t = std::common_type_t; + if constexpr (binary_op_has_common_type_v) { + using common_t = binary_op_common_type_t; return std::is_invocable_v; } else { return std::is_invocable_v; @@ -102,8 +102,8 @@ struct is_binary_operation_supported { if constexpr (column_device_view::has_element_accessor() and column_device_view::has_element_accessor()) { if (has_mutable_element_accessor(out_type) or is_fixed_point(out_type)) { - if constexpr (has_common_type_v) { - using common_t = std::common_type_t; + if constexpr (binary_op_has_common_type_v) { + using common_t = binary_op_common_type_t; if constexpr (std::is_invocable_v) { using ReturnType = std::invoke_result_t; return is_constructible(out_type) or diff --git a/cpp/src/copying/sample.cu b/cpp/src/copying/sample.cu index f8e3a9a83e3..ba00527f6b6 100644 --- a/cpp/src/copying/sample.cu +++ b/cpp/src/copying/sample.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index bfd89200786..ea80ae73c2f 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -24,136 +24,11 @@ namespace cudf::io::parquet::detail { namespace { -constexpr int decode_block_size = 128; -constexpr int rolling_buf_size = decode_block_size * 2; -// the required number of runs in shared memory we will need to provide the -// rle_stream object -constexpr int rle_run_buffer_size = rle_stream_required_run_buffer_size(); - -template -static __device__ int gpuUpdateValidityOffsetsAndRowIndicesFlat( - int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, int t) -{ - constexpr int num_warps = decode_block_size / cudf::detail::warp_size; - constexpr int max_batch_size = num_warps * cudf::detail::warp_size; - - auto& ni = s->nesting_info[0]; - - // how many (input) values we've processed in the page so far - int value_count = s->input_value_count; - int valid_count = ni.valid_count; - - // cap by last row so that we don't process any rows past what we want to output. - int const first_row = s->first_row; - int const last_row = first_row + s->num_rows; - int const capped_target_value_count = min(target_value_count, last_row); - - int const valid_map_offset = ni.valid_map_offset; - int const row_index_lower_bound = s->row_index_lower_bound; - - __syncthreads(); - - while (value_count < capped_target_value_count) { - int const batch_size = min(max_batch_size, capped_target_value_count - value_count); - - // definition level. only need to process for nullable columns - int d = 0; - if constexpr (nullable) { - d = t < batch_size - ? 
static_cast(def[rolling_index(value_count + t)]) - : -1; - } - - int const thread_value_count = t + 1; - int const block_value_count = batch_size; - - // compute our row index, whether we're in row bounds, and validity - int const row_index = (thread_value_count + value_count) - 1; - int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); - int is_valid; - if constexpr (nullable) { - is_valid = ((d > 0) && in_row_bounds) ? 1 : 0; - } else { - is_valid = in_row_bounds; - } - - // thread and block validity count - int thread_valid_count, block_valid_count; - if constexpr (nullable) { - using block_scan = cub::BlockScan; - __shared__ typename block_scan::TempStorage scan_storage; - block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count); - __syncthreads(); - - // validity is processed per-warp - // - // nested schemas always read and write to the same bounds (that is, read and write - // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading - // at the first value, even if that is before first_row, because we cannot trivially jump to - // the correct position to start reading. since we are about to write the validity vector - // here we need to adjust our computed mask to take into account the write row bounds. - int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); - int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store - int warp_null_count = 0; - if (write_start >= 0) { - uint32_t const warp_validity_mask = ballot(is_valid); - // lane 0 from each warp writes out validity - if ((t % cudf::detail::warp_size) == 0) { - int const vindex = (value_count + thread_value_count) - 1; // absolute input value index - int const bit_offset = (valid_map_offset + vindex + write_start) - - first_row; // absolute bit offset into the output validity map - int const write_end = - cudf::detail::warp_size - __clz(in_write_row_bounds); // last bit in the warp to store - int const bit_count = write_end - write_start; - warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); - - store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); - } - } - - // sum null counts. we have to do it this way instead of just incrementing by (value_count - - // valid_count) because valid_count also includes rows that potentially start before our row - // bounds. if we could come up with a way to clean that up, we could remove this and just - // compute it directly at the end of the kernel. 
- size_type const block_null_count = - cudf::detail::single_lane_block_sum_reduce(warp_null_count); - if (t == 0) { ni.null_count += block_null_count; } - } - // trivial for non-nullable columns - else { - thread_valid_count = thread_value_count; - block_valid_count = block_value_count; - } - - // output offset - if (is_valid) { - int const dst_pos = (value_count + thread_value_count) - 1; - int const src_pos = (valid_count + thread_valid_count) - 1; - sb->nz_idx[rolling_index(src_pos)] = dst_pos; - } - - // update stuff - value_count += block_value_count; - valid_count += block_valid_count; - } - - if (t == 0) { - // update valid value count for decoding and total # of values we've processed - ni.valid_count = valid_count; - ni.value_count = value_count; - s->nz_count = valid_count; - s->input_value_count = value_count; - s->input_row_count = value_count; - } - - return valid_count; -} - -template -__device__ inline void gpuDecodeValues( +template +__device__ inline void gpuDecodeFixedWidthValues( page_state_s* s, state_buf* const sb, int start, int end, int t) { - constexpr int num_warps = decode_block_size / cudf::detail::warp_size; + constexpr int num_warps = block_size / cudf::detail::warp_size; constexpr int max_batch_size = num_warps * cudf::detail::warp_size; PageNestingDecodeInfo* nesting_info_base = s->nesting_info; @@ -217,18 +92,22 @@ __device__ inline void gpuDecodeValues( } } -template -__device__ inline void gpuDecodeSplitValues(page_state_s* s, - state_buf* const sb, - int start, - int end) +template +struct decode_fixed_width_values_func { + __device__ inline void operator()(page_state_s* s, state_buf* const sb, int start, int end, int t) + { + gpuDecodeFixedWidthValues(s, sb, start, end, t); + } +}; + +template +__device__ inline void gpuDecodeFixedWidthSplitValues( + page_state_s* s, state_buf* const sb, int start, int end, int t) { using cudf::detail::warp_size; - constexpr int num_warps = decode_block_size / warp_size; + constexpr int num_warps = block_size / warp_size; constexpr int max_batch_size = num_warps * warp_size; - auto const t = threadIdx.x; - PageNestingDecodeInfo* nesting_info_base = s->nesting_info; int const dtype = s->col.physical_type; auto const data_len = thrust::distance(s->data_start, s->data_end); @@ -307,266 +186,293 @@ __device__ inline void gpuDecodeSplitValues(page_state_s* s, } } -// is the page marked nullable or not -__device__ inline bool is_nullable(page_state_s* s) -{ - auto const lvl = level_type::DEFINITION; - auto const max_def_level = s->col.max_level[lvl]; - return max_def_level > 0; -} +template +struct decode_fixed_width_split_values_func { + __device__ inline void operator()(page_state_s* s, state_buf* const sb, int start, int end, int t) + { + gpuDecodeFixedWidthSplitValues(s, sb, start, end, t); + } +}; -// for a nullable page, check to see if it could have nulls -__device__ inline bool has_nulls(page_state_s* s) +template +static __device__ int gpuUpdateValidityAndRowIndicesNested( + int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, int t) { - auto const lvl = level_type::DEFINITION; - auto const init_run = s->initial_rle_run[lvl]; - // literal runs, lets assume they could hold nulls - if (is_literal_run(init_run)) { return true; } - - // repeated run with number of items in the run not equal - // to the rows in the page, assume that means we could have nulls - if (s->page.num_input_values != (init_run >> 1)) { return true; } - - auto const lvl_bits = s->col.level_bits[lvl]; - auto const 
run_val = lvl_bits == 0 ? 0 : s->initial_rle_value[lvl]; - - // the encoded repeated value isn't valid, we have (all) nulls - return run_val != s->col.max_level[lvl]; -} + constexpr int num_warps = decode_block_size / cudf::detail::warp_size; + constexpr int max_batch_size = num_warps * cudf::detail::warp_size; -/** - * @brief Kernel for computing fixed width non dictionary column data stored in the pages - * - * This function will write the page data and the page data's validity to the - * output specified in the page's column chunk. If necessary, additional - * conversion will be performed to translate from the Parquet datatype to - * desired output datatype. - * - * @param pages List of pages - * @param chunks List of column chunks - * @param min_row Row index to start reading at - * @param num_rows Maximum number of rows to read - * @param error_code Error code to set if an error is encountered - */ -template -CUDF_KERNEL void __launch_bounds__(decode_block_size) - gpuDecodePageDataFixed(PageInfo* pages, - device_span chunks, - size_t min_row, - size_t num_rows, - kernel_error::pointer error_code) -{ - __shared__ __align__(16) page_state_s state_g; - __shared__ __align__(16) page_state_buffers_s // unused in this kernel - state_buffers; + // how many (input) values we've processed in the page so far + int value_count = s->input_value_count; - page_state_s* const s = &state_g; - auto* const sb = &state_buffers; - int const page_idx = blockIdx.x; - int const t = threadIdx.x; - PageInfo* pp = &pages[page_idx]; + // cap by last row so that we don't process any rows past what we want to output. + int const first_row = s->first_row; + int const last_row = first_row + s->num_rows; + int const capped_target_value_count = min(target_value_count, last_row); - if (!(BitAnd(pages[page_idx].kernel_mask, decode_kernel_mask::FIXED_WIDTH_NO_DICT))) { return; } + int const row_index_lower_bound = s->row_index_lower_bound; - // must come after the kernel mask check - [[maybe_unused]] null_count_back_copier _{s, t}; + int const max_depth = s->col.max_nesting_depth - 1; + __syncthreads(); - if (!setupLocalPageInfo(s, - pp, - chunks, - min_row, - num_rows, - mask_filter{decode_kernel_mask::FIXED_WIDTH_NO_DICT}, - page_processing_stage::DECODE)) { - return; - } + while (value_count < capped_target_value_count) { + int const batch_size = min(max_batch_size, capped_target_value_count - value_count); - // the level stream decoders - __shared__ rle_run def_runs[rle_run_buffer_size]; - rle_stream def_decoder{def_runs}; + // definition level. only need to process for nullable columns + int d = 0; + if constexpr (nullable) { + if (def) { + d = t < batch_size + ? static_cast(def[rolling_index(value_count + t)]) + : -1; + } else { + d = t < batch_size ? 1 : -1; + } + } - // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. 
- if (s->num_rows == 0) { return; } + int const thread_value_count = t + 1; + int const block_value_count = batch_size; - bool const nullable = is_nullable(s); - bool const nullable_with_nulls = nullable && has_nulls(s); + // compute our row index, whether we're in row bounds, and validity + int const row_index = (thread_value_count + value_count) - 1; + int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); + int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); + int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store + + // iterate by depth + for (int d_idx = 0; d_idx <= max_depth; d_idx++) { + auto& ni = s->nesting_info[d_idx]; + + int is_valid; + if constexpr (nullable) { + is_valid = ((d >= ni.max_def_level) && in_row_bounds) ? 1 : 0; + } else { + is_valid = in_row_bounds; + } - // initialize the stream decoders (requires values computed in setupLocalPageInfo) - level_t* const def = reinterpret_cast(pp->lvl_decode_buf[level_type::DEFINITION]); - if (nullable_with_nulls) { - def_decoder.init(s->col.level_bits[level_type::DEFINITION], - s->abs_lvl_start[level_type::DEFINITION], - s->abs_lvl_end[level_type::DEFINITION], - def, - s->page.num_input_values); - } - __syncthreads(); + // thread and block validity count + int thread_valid_count, block_valid_count; + if constexpr (nullable) { + using block_scan = cub::BlockScan; + __shared__ typename block_scan::TempStorage scan_storage; + block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count); + __syncthreads(); + + // validity is processed per-warp + // + // nested schemas always read and write to the same bounds (that is, read and write + // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading + // at the first value, even if that is before first_row, because we cannot trivially jump to + // the correct position to start reading. since we are about to write the validity vector + // here we need to adjust our computed mask to take into account the write row bounds. + int warp_null_count = 0; + if (write_start >= 0 && ni.valid_map != nullptr) { + int const valid_map_offset = ni.valid_map_offset; + uint32_t const warp_validity_mask = ballot(is_valid); + // lane 0 from each warp writes out validity + if ((t % cudf::detail::warp_size) == 0) { + int const vindex = + (value_count + thread_value_count) - 1; // absolute input value index + int const bit_offset = (valid_map_offset + vindex + write_start) - + first_row; // absolute bit offset into the output validity map + int const write_end = cudf::detail::warp_size - + __clz(in_write_row_bounds); // last bit in the warp to store + int const bit_count = write_end - write_start; + warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); + + store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); + } + } - // We use two counters in the loop below: processed_count and valid_count. - // - processed_count: number of rows out of num_input_values that we have decoded so far. - // the definition stream returns the number of total rows it has processed in each call - // to decode_next and we accumulate in process_count. - // - valid_count: number of non-null rows we have decoded so far. In each iteration of the - // loop below, we look at the number of valid items (which could be all for non-nullable), - // and valid_count is that running count. 
- int processed_count = 0; - int valid_count = 0; - // the core loop. decode batches of level stream data using rle_stream objects - // and pass the results to gpuDecodeValues - while (s->error == 0 && processed_count < s->page.num_input_values) { - int next_valid_count; + // sum null counts. we have to do it this way instead of just incrementing by (value_count - + // valid_count) because valid_count also includes rows that potentially start before our row + // bounds. if we could come up with a way to clean that up, we could remove this and just + // compute it directly at the end of the kernel. + size_type const block_null_count = + cudf::detail::single_lane_block_sum_reduce(warp_null_count); + if (t == 0) { ni.null_count += block_null_count; } + } + // trivial for non-nullable columns + else { + thread_valid_count = thread_value_count; + block_valid_count = block_value_count; + } - // only need to process definition levels if the column has nulls - if (nullable_with_nulls) { - processed_count += def_decoder.decode_next(t); - __syncthreads(); + // if this is valid and we're at the leaf, output dst_pos + __syncthreads(); // handle modification of ni.value_count from below + if (is_valid && d_idx == max_depth) { + // for non-list types, the value count is always the same across + int const dst_pos = (value_count + thread_value_count) - 1; + int const src_pos = (ni.valid_count + thread_valid_count) - 1; + sb->nz_idx[rolling_index(src_pos)] = dst_pos; + } + __syncthreads(); // handle modification of ni.value_count from below - next_valid_count = - gpuUpdateValidityOffsetsAndRowIndicesFlat(processed_count, s, sb, def, t); + // update stuff + if (t == 0) { ni.valid_count += block_valid_count; } } - // if we wanted to split off the skip_rows/num_rows case into a separate kernel, we could skip - // this function call entirely since all it will ever generate is a mapping of (i -> i) for - // nz_idx. gpuDecodeValues would be the only work that happens. - else { - processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count); - next_valid_count = gpuUpdateValidityOffsetsAndRowIndicesFlat( - processed_count, s, sb, nullptr, t); - } - __syncthreads(); - // decode the values themselves - gpuDecodeValues(s, sb, valid_count, next_valid_count, t); - __syncthreads(); + value_count += block_value_count; + } - valid_count = next_valid_count; + if (t == 0) { + // update valid value count for decoding and total # of values we've processed + s->nz_count = s->nesting_info[max_depth].valid_count; + s->input_value_count = value_count; + s->input_row_count = value_count; } - if (t == 0 and s->error != 0) { set_error(s->error, error_code); } + + __syncthreads(); + return s->nesting_info[max_depth].valid_count; } -/** - * @brief Kernel for computing fixed width dictionary column data stored in the pages - * - * This function will write the page data and the page data's validity to the - * output specified in the page's column chunk. If necessary, additional - * conversion will be performed to translate from the Parquet datatype to - * desired output datatype. 
- * - * @param pages List of pages - * @param chunks List of column chunks - * @param min_row Row index to start reading at - * @param num_rows Maximum number of rows to read - * @param error_code Error code to set if an error is encountered - */ -template -CUDF_KERNEL void __launch_bounds__(decode_block_size) - gpuDecodePageDataFixedDict(PageInfo* pages, - device_span chunks, - size_t min_row, - size_t num_rows, - kernel_error::pointer error_code) +template +static __device__ int gpuUpdateValidityAndRowIndicesFlat( + int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, int t) { - __shared__ __align__(16) page_state_s state_g; - __shared__ __align__(16) page_state_buffers_s // unused in this kernel - state_buffers; - - page_state_s* const s = &state_g; - auto* const sb = &state_buffers; - int const page_idx = blockIdx.x; - int const t = threadIdx.x; - PageInfo* pp = &pages[page_idx]; + constexpr int num_warps = decode_block_size / cudf::detail::warp_size; + constexpr int max_batch_size = num_warps * cudf::detail::warp_size; - if (!(BitAnd(pages[page_idx].kernel_mask, decode_kernel_mask::FIXED_WIDTH_DICT))) { return; } + auto& ni = s->nesting_info[0]; - // must come after the kernel mask check - [[maybe_unused]] null_count_back_copier _{s, t}; + // how many (input) values we've processed in the page so far + int value_count = s->input_value_count; + int valid_count = ni.valid_count; - if (!setupLocalPageInfo(s, - pp, - chunks, - min_row, - num_rows, - mask_filter{decode_kernel_mask::FIXED_WIDTH_DICT}, - page_processing_stage::DECODE)) { - return; - } + // cap by last row so that we don't process any rows past what we want to output. + int const first_row = s->first_row; + int const last_row = first_row + s->num_rows; + int const capped_target_value_count = min(target_value_count, last_row); - __shared__ rle_run def_runs[rle_run_buffer_size]; - rle_stream def_decoder{def_runs}; + int const valid_map_offset = ni.valid_map_offset; + int const row_index_lower_bound = s->row_index_lower_bound; - __shared__ rle_run dict_runs[rle_run_buffer_size]; - rle_stream dict_stream{dict_runs}; + __syncthreads(); - // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. - if (s->num_rows == 0) { return; } + while (value_count < capped_target_value_count) { + int const batch_size = min(max_batch_size, capped_target_value_count - value_count); - bool const nullable = is_nullable(s); - bool const nullable_with_nulls = nullable && has_nulls(s); + // definition level. only need to process for nullable columns + int d = 0; + if constexpr (nullable) { + if (def) { + d = t < batch_size + ? static_cast(def[rolling_index(value_count + t)]) + : -1; + } else { + d = t < batch_size ? 
1 : -1; + } + } - // initialize the stream decoders (requires values computed in setupLocalPageInfo) - level_t* const def = reinterpret_cast(pp->lvl_decode_buf[level_type::DEFINITION]); - if (nullable_with_nulls) { - def_decoder.init(s->col.level_bits[level_type::DEFINITION], - s->abs_lvl_start[level_type::DEFINITION], - s->abs_lvl_end[level_type::DEFINITION], - def, - s->page.num_input_values); - } + int const thread_value_count = t + 1; + int const block_value_count = batch_size; - dict_stream.init( - s->dict_bits, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values); - __syncthreads(); + // compute our row index, whether we're in row bounds, and validity + int const row_index = (thread_value_count + value_count) - 1; + int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); + int is_valid; + if constexpr (nullable) { + is_valid = ((d > 0) && in_row_bounds) ? 1 : 0; + } else { + is_valid = in_row_bounds; + } - // We use two counters in the loop below: processed_count and valid_count. - // - processed_count: number of rows out of num_input_values that we have decoded so far. - // the definition stream returns the number of total rows it has processed in each call - // to decode_next and we accumulate in process_count. - // - valid_count: number of non-null rows we have decoded so far. In each iteration of the - // loop below, we look at the number of valid items (which could be all for non-nullable), - // and valid_count is that running count. - int processed_count = 0; - int valid_count = 0; + // thread and block validity count + int thread_valid_count, block_valid_count; + if constexpr (nullable) { + using block_scan = cub::BlockScan; + __shared__ typename block_scan::TempStorage scan_storage; + block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count); + __syncthreads(); - // the core loop. decode batches of level stream data using rle_stream objects - // and pass the results to gpuDecodeValues - while (s->error == 0 && processed_count < s->page.num_input_values) { - int next_valid_count; + // validity is processed per-warp + // + // nested schemas always read and write to the same bounds (that is, read and write + // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading + // at the first value, even if that is before first_row, because we cannot trivially jump to + // the correct position to start reading. since we are about to write the validity vector + // here we need to adjust our computed mask to take into account the write row bounds. 
+ int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); + int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store + int warp_null_count = 0; + if (write_start >= 0) { + uint32_t const warp_validity_mask = ballot(is_valid); + // lane 0 from each warp writes out validity + if ((t % cudf::detail::warp_size) == 0) { + int const vindex = (value_count + thread_value_count) - 1; // absolute input value index + int const bit_offset = (valid_map_offset + vindex + write_start) - + first_row; // absolute bit offset into the output validity map + int const write_end = + cudf::detail::warp_size - __clz(in_write_row_bounds); // last bit in the warp to store + int const bit_count = write_end - write_start; + warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); - // only need to process definition levels if the column has nulls - if (nullable_with_nulls) { - processed_count += def_decoder.decode_next(t); - __syncthreads(); + store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); + } + } - // count of valid items in this batch - next_valid_count = - gpuUpdateValidityOffsetsAndRowIndicesFlat(processed_count, s, sb, def, t); + // sum null counts. we have to do it this way instead of just incrementing by (value_count - + // valid_count) because valid_count also includes rows that potentially start before our row + // bounds. if we could come up with a way to clean that up, we could remove this and just + // compute it directly at the end of the kernel. + size_type const block_null_count = + cudf::detail::single_lane_block_sum_reduce(warp_null_count); + if (t == 0) { ni.null_count += block_null_count; } } - // if we wanted to split off the skip_rows/num_rows case into a separate kernel, we could skip - // this function call entirely since all it will ever generate is a mapping of (i -> i) for - // nz_idx. gpuDecodeValues would be the only work that happens. + // trivial for non-nullable columns else { - processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count); - next_valid_count = gpuUpdateValidityOffsetsAndRowIndicesFlat( - processed_count, s, sb, nullptr, t); + thread_valid_count = thread_value_count; + block_valid_count = block_value_count; } - __syncthreads(); - // We want to limit the number of dictionary items we decode, that correspond to - // the rows we have processed in this iteration that are valid. - // We know the number of valid rows to process with: next_valid_count - valid_count. - dict_stream.decode_next(t, next_valid_count - valid_count); - __syncthreads(); + // output offset + if (is_valid) { + int const dst_pos = (value_count + thread_value_count) - 1; + int const src_pos = (valid_count + thread_valid_count) - 1; + sb->nz_idx[rolling_index(src_pos)] = dst_pos; + } - // decode the values themselves - gpuDecodeValues(s, sb, valid_count, next_valid_count, t); - __syncthreads(); + // update stuff + value_count += block_value_count; + valid_count += block_valid_count; + } - valid_count = next_valid_count; + if (t == 0) { + // update valid value count for decoding and total # of values we've processed + ni.valid_count = valid_count; + ni.value_count = value_count; // TODO: remove? 
this is unused in the non-list path + s->nz_count = valid_count; + s->input_value_count = value_count; + s->input_row_count = value_count; } - if (t == 0 and s->error != 0) { set_error(s->error, error_code); } + + return valid_count; +} + +// is the page marked nullable or not +__device__ inline bool is_nullable(page_state_s* s) +{ + auto const lvl = level_type::DEFINITION; + auto const max_def_level = s->col.max_level[lvl]; + return max_def_level > 0; +} + +// for a nullable page, check to see if it could have nulls +__device__ inline bool maybe_has_nulls(page_state_s* s) +{ + auto const lvl = level_type::DEFINITION; + auto const init_run = s->initial_rle_run[lvl]; + // literal runs, lets assume they could hold nulls + if (is_literal_run(init_run)) { return true; } + + // repeated run with number of items in the run not equal + // to the rows in the page, assume that means we could have nulls + if (s->page.num_input_values != (init_run >> 1)) { return true; } + + auto const lvl_bits = s->col.level_bits[lvl]; + auto const run_val = lvl_bits == 0 ? 0 : s->initial_rle_value[lvl]; + + // the encoded repeated value isn't valid, we have (all) nulls + return run_val != s->col.max_level[lvl]; } /** @@ -583,19 +489,28 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) * @param num_rows Maximum number of rows to read * @param error_code Error code to set if an error is encountered */ -template -CUDF_KERNEL void __launch_bounds__(decode_block_size) - gpuDecodeSplitPageDataFlat(PageInfo* pages, - device_span chunks, - size_t min_row, - size_t num_rows, - kernel_error::pointer error_code) +template + typename DecodeValuesFunc> +CUDF_KERNEL void __launch_bounds__(decode_block_size_t) + gpuDecodePageDataGeneric(PageInfo* pages, + device_span chunks, + size_t min_row, + size_t num_rows, + kernel_error::pointer error_code) { + constexpr int rolling_buf_size = decode_block_size_t * 2; + constexpr int rle_run_buffer_size = rle_stream_required_run_buffer_size(); + __shared__ __align__(16) page_state_s state_g; - __shared__ __align__(16) page_state_buffers_s // unused in this kernel - state_buffers; + using state_buf_t = page_state_buffers_s; + __shared__ __align__(16) state_buf_t state_buffers; page_state_s* const s = &state_g; auto* const sb = &state_buffers; @@ -603,9 +518,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) int const t = threadIdx.x; PageInfo* pp = &pages[page_idx]; - if (!(BitAnd(pages[page_idx].kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FLAT))) { - return; - } + if (!(BitAnd(pages[page_idx].kernel_mask, kernel_mask_t))) { return; } // must come after the kernel mask check [[maybe_unused]] null_count_back_copier _{s, t}; @@ -615,30 +528,70 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) chunks, min_row, num_rows, - mask_filter{decode_kernel_mask::BYTE_STREAM_SPLIT_FLAT}, + mask_filter{kernel_mask_t}, page_processing_stage::DECODE)) { return; } - // the level stream decoders - __shared__ rle_run def_runs[rle_run_buffer_size]; - rle_stream def_decoder{def_runs}; - // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. if (s->num_rows == 0) { return; } - bool const nullable = is_nullable(s); - bool const nullable_with_nulls = nullable && has_nulls(s); + DecodeValuesFunc decode_values; + + bool const nullable = is_nullable(s); + bool const should_process_nulls = nullable && maybe_has_nulls(s); + + // shared buffer. all shared memory is suballocated out of here + // constexpr int shared_rep_size = has_lists_t ? 
cudf::util::round_up_unsafe(rle_run_buffer_size * + // sizeof(rle_run), size_t{16}) : 0; + constexpr int shared_dict_size = + has_dict_t + ? cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}) + : 0; + constexpr int shared_def_size = + cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run), size_t{16}); + constexpr int shared_buf_size = /*shared_rep_size +*/ shared_dict_size + shared_def_size; + __shared__ __align__(16) uint8_t shared_buf[shared_buf_size]; + + // setup all shared memory buffers + int shared_offset = 0; + /* + rle_run *rep_runs = reinterpret_cast*>(shared_buf + shared_offset); + if constexpr (has_lists_t){ + shared_offset += shared_rep_size; + } + */ + rle_run* dict_runs = reinterpret_cast*>(shared_buf + shared_offset); + if constexpr (has_dict_t) { shared_offset += shared_dict_size; } + rle_run* def_runs = reinterpret_cast*>(shared_buf + shared_offset); // initialize the stream decoders (requires values computed in setupLocalPageInfo) + rle_stream def_decoder{def_runs}; level_t* const def = reinterpret_cast(pp->lvl_decode_buf[level_type::DEFINITION]); - if (nullable_with_nulls) { + if (should_process_nulls) { def_decoder.init(s->col.level_bits[level_type::DEFINITION], s->abs_lvl_start[level_type::DEFINITION], s->abs_lvl_end[level_type::DEFINITION], def, s->page.num_input_values); } + /* + rle_stream rep_decoder{rep_runs}; + level_t* const rep = reinterpret_cast(pp->lvl_decode_buf[level_type::REPETITION]); + if constexpr(has_lists_t){ + rep_decoder.init(s->col.level_bits[level_type::REPETITION], + s->abs_lvl_start[level_type::REPETITION], + s->abs_lvl_end[level_type::REPETITION], + rep, + s->page.num_input_values); + } + */ + + rle_stream dict_stream{dict_runs}; + if constexpr (has_dict_t) { + dict_stream.init( + s->dict_bits, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values); + } __syncthreads(); // We use two counters in the loop below: processed_count and valid_count. @@ -655,26 +608,47 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) while (s->error == 0 && processed_count < s->page.num_input_values) { int next_valid_count; - // only need to process definition levels if the column has nulls - if (nullable_with_nulls) { + // only need to process definition levels if this is a nullable column + if (should_process_nulls) { processed_count += def_decoder.decode_next(t); __syncthreads(); - next_valid_count = - gpuUpdateValidityOffsetsAndRowIndicesFlat(processed_count, s, sb, def, t); + if constexpr (has_nesting_t) { + next_valid_count = gpuUpdateValidityAndRowIndicesNested( + processed_count, s, sb, def, t); + } else { + next_valid_count = gpuUpdateValidityAndRowIndicesFlat( + processed_count, s, sb, def, t); + } } // if we wanted to split off the skip_rows/num_rows case into a separate kernel, we could skip // this function call entirely since all it will ever generate is a mapping of (i -> i) for - // nz_idx. gpuDecodeValues would be the only work that happens. + // nz_idx. gpuDecodeFixedWidthValues would be the only work that happens. 
else { processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count); - next_valid_count = gpuUpdateValidityOffsetsAndRowIndicesFlat( - processed_count, s, sb, nullptr, t); + + if constexpr (has_nesting_t) { + next_valid_count = + gpuUpdateValidityAndRowIndicesNested( + processed_count, s, sb, nullptr, t); + } else { + next_valid_count = gpuUpdateValidityAndRowIndicesFlat( + processed_count, s, sb, nullptr, t); + } } __syncthreads(); + // if we have dictionary data + if constexpr (has_dict_t) { + // We want to limit the number of dictionary items we decode, that correspond to + // the rows we have processed in this iteration that are valid. + // We know the number of valid rows to process with: next_valid_count - valid_count. + dict_stream.decode_next(t, next_valid_count - valid_count); + __syncthreads(); + } + // decode the values themselves - gpuDecodeSplitValues(s, sb, valid_count, next_valid_count); + decode_values(s, sb, valid_count, next_valid_count, t); __syncthreads(); valid_count = next_valid_count; @@ -689,18 +663,55 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span pages, size_t num_rows, size_t min_row, int level_type_size, + bool has_nesting, kernel_error::pointer error_code, rmm::cuda_stream_view stream) { + constexpr int decode_block_size = 128; + dim3 dim_block(decode_block_size, 1); dim3 dim_grid(pages.size(), 1); // 1 threadblock per page if (level_type_size == 1) { - gpuDecodePageDataFixed<<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); + if (has_nesting) { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } else { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } } else { - gpuDecodePageDataFixed<<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); + if (has_nesting) { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } else { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } } } @@ -709,40 +720,113 @@ void __host__ DecodePageDataFixedDict(cudf::detail::hostdevice_span pa size_t num_rows, size_t min_row, int level_type_size, + bool has_nesting, kernel_error::pointer error_code, rmm::cuda_stream_view stream) { - // dim3 dim_block(decode_block_size, 1); // decode_block_size = 128 threads per block - // 1 full warp, and 1 warp of 1 thread + constexpr int decode_block_size = 128; + dim3 dim_block(decode_block_size, 1); // decode_block_size = 128 threads per block dim3 dim_grid(pages.size(), 1); // 1 thread block per page => # blocks if (level_type_size == 1) { - gpuDecodePageDataFixedDict<<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); + if (has_nesting) { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } else { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } } else { - gpuDecodePageDataFixedDict<<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); + if (has_nesting) { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } else { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } } } -void __host__ DecodeSplitPageDataFlat(cudf::detail::hostdevice_span pages, - cudf::detail::hostdevice_span chunks, - size_t num_rows, - size_t min_row, - int level_type_size, - 
kernel_error::pointer error_code, - rmm::cuda_stream_view stream) +void __host__ +DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, + cudf::detail::hostdevice_span chunks, + size_t num_rows, + size_t min_row, + int level_type_size, + bool has_nesting, + kernel_error::pointer error_code, + rmm::cuda_stream_view stream) { + constexpr int decode_block_size = 128; + dim3 dim_block(decode_block_size, 1); // decode_block_size = 128 threads per block dim3 dim_grid(pages.size(), 1); // 1 thread block per page => # blocks if (level_type_size == 1) { - gpuDecodeSplitPageDataFlat<<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); + if (has_nesting) { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } else { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } } else { - gpuDecodeSplitPageDataFlat<<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); + if (has_nesting) { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } else { + gpuDecodePageDataGeneric + <<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } } } diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index cf0dd85e490..d604642be54 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -145,6 +145,11 @@ __device__ inline bool is_nested(ColumnChunkDesc const& chunk) return chunk.max_nesting_depth > 1; } +__device__ inline bool is_list(ColumnChunkDesc const& chunk) +{ + return chunk.max_level[level_type::REPETITION] > 0; +} + __device__ inline bool is_byte_array(ColumnChunkDesc const& chunk) { return chunk.physical_type == BYTE_ARRAY; @@ -178,14 +183,17 @@ __device__ decode_kernel_mask kernel_mask_for_page(PageInfo const& page, return decode_kernel_mask::STRING; } - if (!is_nested(chunk) && !is_byte_array(chunk) && !is_boolean(chunk)) { + if (!is_list(chunk) && !is_byte_array(chunk) && !is_boolean(chunk)) { if (page.encoding == Encoding::PLAIN) { - return decode_kernel_mask::FIXED_WIDTH_NO_DICT; + return is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED + : decode_kernel_mask::FIXED_WIDTH_NO_DICT; } else if (page.encoding == Encoding::PLAIN_DICTIONARY || page.encoding == Encoding::RLE_DICTIONARY) { - return decode_kernel_mask::FIXED_WIDTH_DICT; + return is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_DICT_NESTED + : decode_kernel_mask::FIXED_WIDTH_DICT; } else if (page.encoding == Encoding::BYTE_STREAM_SPLIT) { - return decode_kernel_mask::BYTE_STREAM_SPLIT_FLAT; + return is_nested(chunk) ? decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED + : decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT; } } diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index d82c6f0de59..efc1f5ebab1 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -207,16 +207,20 @@ enum level_type { * Used to control which decode kernels to run. 
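+ * Values are distinct bit flags so that the masks of all pages in a subpass can be
+ * OR'd together, and each decode kernel is launched only when its bit is set in the
+ * combined mask (tested with BitAnd).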
*/
enum class decode_kernel_mask {
-  NONE                   = 0,
-  GENERAL                = (1 << 0),  // Run catch-all decode kernel
-  STRING                 = (1 << 1),  // Run decode kernel for string data
-  DELTA_BINARY           = (1 << 2),  // Run decode kernel for DELTA_BINARY_PACKED data
-  DELTA_BYTE_ARRAY       = (1 << 3),  // Run decode kernel for DELTA_BYTE_ARRAY encoded data
-  DELTA_LENGTH_BA        = (1 << 4),  // Run decode kernel for DELTA_LENGTH_BYTE_ARRAY encoded data
-  FIXED_WIDTH_NO_DICT    = (1 << 5),  // Run decode kernel for fixed width non-dictionary pages
-  FIXED_WIDTH_DICT       = (1 << 6),  // Run decode kernel for fixed width dictionary pages
-  BYTE_STREAM_SPLIT      = (1 << 7),  // Run decode kernel for BYTE_STREAM_SPLIT encoded data
-  BYTE_STREAM_SPLIT_FLAT = (1 << 8),  // Same as above but with a flat schema
+  NONE                = 0,
+  GENERAL             = (1 << 0),  // Run catch-all decode kernel
+  STRING              = (1 << 1),  // Run decode kernel for string data
+  DELTA_BINARY        = (1 << 2),  // Run decode kernel for DELTA_BINARY_PACKED data
+  DELTA_BYTE_ARRAY    = (1 << 3),  // Run decode kernel for DELTA_BYTE_ARRAY encoded data
+  DELTA_LENGTH_BA     = (1 << 4),  // Run decode kernel for DELTA_LENGTH_BYTE_ARRAY encoded data
+  FIXED_WIDTH_NO_DICT = (1 << 5),  // Run decode kernel for fixed width non-dictionary pages
+  FIXED_WIDTH_DICT    = (1 << 6),  // Run decode kernel for fixed width dictionary pages
+  BYTE_STREAM_SPLIT   = (1 << 7),  // Run decode kernel for BYTE_STREAM_SPLIT encoded data
+  BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT = (1 << 8),  // Same as above but for flat, fixed-width data
+  BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED =
+    (1 << 9),  // Same as above but for nested, fixed-width data
+  FIXED_WIDTH_NO_DICT_NESTED = (1 << 10),  // Run decode kernel for nested fixed width non-dictionary pages
+  FIXED_WIDTH_DICT_NESTED    = (1 << 11),  // Run decode kernel for nested fixed width dictionary pages
};

// mask representing all the ways in which a string can be encoded
@@ -888,6 +892,7 @@ void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span pages,
 * @param[in] num_rows Total number of rows to read
 * @param[in] min_row Minimum number of rows to read
 * @param[in] level_type_size Size in bytes of the type for level decoding
+ * @param[in] has_nesting Whether or not the data contains nested (but not list) data.
 * @param[out] error_code Error code for kernel failures
 * @param[in] stream CUDA stream to use
 */
@@ -896,6 +901,7 @@ void DecodePageDataFixed(cudf::detail::hostdevice_span pages,
                         std::size_t num_rows,
                         size_t min_row,
                         int level_type_size,
+                        bool has_nesting,
                         kernel_error::pointer error_code,
                         rmm::cuda_stream_view stream);

@@ -910,6 +916,7 @@ void DecodePageDataFixed(cudf::detail::hostdevice_span pages,
 * @param[in] num_rows Total number of rows to read
 * @param[in] min_row Minimum number of rows to read
 * @param[in] level_type_size Size in bytes of the type for level decoding
+ * @param[in] has_nesting Whether or not the data contains nested (but not list) data.
 * @param[out] error_code Error code for kernel failures
 * @param[in] stream CUDA stream to use
 */
@@ -918,11 +925,12 @@ void DecodePageDataFixedDict(cudf::detail::hostdevice_span pages,
                             std::size_t num_rows,
                             size_t min_row,
                             int level_type_size,
+                            bool has_nesting,
                             kernel_error::pointer error_code,
                             rmm::cuda_stream_view stream);

/**
- * @brief Launches kernel for reading dictionary fixed width column data stored in the pages
+ * @brief Launches kernel for reading fixed width column data stored in the pages
 *
 * The page data will be written to the output pointed to in the page's
 * associated column chunk.
@@ -932,16 +940,18 @@ void DecodePageDataFixedDict(cudf::detail::hostdevice_span pages, * @param[in] num_rows Total number of rows to read * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding + * @param[in] has_nesting Whether or not the data contains nested (but not list) data. * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ -void DecodeSplitPageDataFlat(cudf::detail::hostdevice_span pages, - cudf::detail::hostdevice_span chunks, - std::size_t num_rows, - size_t min_row, - int level_type_size, - kernel_error::pointer error_code, - rmm::cuda_stream_view stream); +void DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, + cudf::detail::hostdevice_span chunks, + std::size_t num_rows, + size_t min_row, + int level_type_size, + bool has_nesting, + kernel_error::pointer error_code, + rmm::cuda_stream_view stream); /** * @brief Launches kernel for initializing encoder row group fragments diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 1bd2fae281c..f705f6626e7 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -267,14 +267,27 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num } // launch byte stream split decoder - if (BitAnd(kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FLAT) != 0) { - DecodeSplitPageDataFlat(subpass.pages, - pass.chunks, - num_rows, - skip_rows, - level_type_size, - error_code.data(), - streams[s_idx++]); + if (BitAnd(kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT) != 0) { + DecodeSplitPageFixedWidthData(subpass.pages, + pass.chunks, + num_rows, + skip_rows, + level_type_size, + false, + error_code.data(), + streams[s_idx++]); + } + + // launch byte stream split decoder, for nested columns + if (BitAnd(kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED) != 0) { + DecodeSplitPageFixedWidthData(subpass.pages, + pass.chunks, + num_rows, + skip_rows, + level_type_size, + true, + error_code.data(), + streams[s_idx++]); } // launch byte stream split decoder @@ -288,22 +301,50 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num streams[s_idx++]); } + // launch fixed width type decoder if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_NO_DICT) != 0) { DecodePageDataFixed(subpass.pages, pass.chunks, num_rows, skip_rows, level_type_size, + false, + error_code.data(), + streams[s_idx++]); + } + + // launch fixed width type decoder, for nested columns + if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED) != 0) { + DecodePageDataFixed(subpass.pages, + pass.chunks, + num_rows, + skip_rows, + level_type_size, + true, error_code.data(), streams[s_idx++]); } + // launch fixed width type decoder with dictionaries if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_DICT) != 0) { DecodePageDataFixedDict(subpass.pages, pass.chunks, num_rows, skip_rows, level_type_size, + false, + error_code.data(), + streams[s_idx++]); + } + + // launch fixed width type decoder with dictionaries, for nested columns + if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_DICT_NESTED) != 0) { + DecodePageDataFixedDict(subpass.pages, + pass.chunks, + num_rows, + skip_rows, + level_type_size, + true, error_code.data(), streams[s_idx++]); } diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 
9ad5a2d6e8d..d371ef5de93 100644
--- a/cpp/src/io/parquet/reader_impl_chunking.cu
+++ b/cpp/src/io/parquet/reader_impl_chunking.cu
@@ -337,7 +337,8 @@ int64_t find_next_split(int64_t cur_pos,
                        size_t cur_row_index,
                        size_t cur_cumulative_size,
                        cudf::host_span sizes,
-                       size_t size_limit)
+                       size_t size_limit,
+                       size_t min_row_count)
 {
   auto const start = thrust::make_transform_iterator(
     sizes.begin(),
@@ -357,7 +358,7 @@ int64_t find_next_split(int64_t cur_pos,
   // this guarantees that even if we cannot fit the set of rows represented by where our cur_pos
   // is, we will still move forward instead of failing.
   while (split_pos < (static_cast(sizes.size()) - 1) &&
-         (sizes[split_pos].end_row_index == cur_row_index)) {
+         (sizes[split_pos].end_row_index - cur_row_index < min_row_count)) {
     split_pos++;
   }

@@ -657,8 +658,10 @@ std::tuple, size_t, size_t> compute_next_subpass(
   auto const start_index = find_start_index(h_aggregated_info, start_row);
   auto const cumulative_size =
     start_row == 0 || start_index == 0 ? 0 : h_aggregated_info[start_index - 1].size_bytes;
+  // when choosing subpasses, we need to guarantee at least 2 rows in the included pages so that all
+  // list columns have a clear start and end.
   auto const end_index =
-    find_next_split(start_index, start_row, cumulative_size, h_aggregated_info, size_limit);
+    find_next_split(start_index, start_row, cumulative_size, h_aggregated_info, size_limit, 2);
   auto const end_row = h_aggregated_info[end_index].end_row_index;

   // for each column, collect the set of pages that spans start_row / end_row
@@ -703,8 +706,8 @@ std::vector compute_page_splits_by_row(device_span 0;
     if (is_list && max_col_row < last_pass_row) {
-      size_t const min_col_row = static_cast(chunk.start_row + last_page.chunk_row);
+      auto const& first_page   = subpass.pages[page_index];
+      size_t const min_col_row = static_cast(chunk.start_row + first_page.chunk_row);
       CUDF_EXPECTS((max_col_row - min_col_row) > 1, "Unexpected short subpass");
       max_col_row--;
     }
diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp
index 9acd6a1e3a9..aed745c42dd 100644
--- a/cpp/src/io/utilities/hostdevice_vector.hpp
+++ b/cpp/src/io/utilities/hostdevice_vector.hpp
@@ -18,6 +18,7 @@

 #include "hostdevice_span.hpp"

+#include 
 #include 
 #include 
 #include 
@@ -124,26 +125,22 @@ class hostdevice_vector {

   void host_to_device_async(rmm::cuda_stream_view stream)
   {
-    CUDF_CUDA_TRY(
-      cudaMemcpyAsync(device_ptr(), host_ptr(), size_bytes(), cudaMemcpyDefault, stream.value()));
+    cuda_memcpy_async(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
   }

   void host_to_device_sync(rmm::cuda_stream_view stream)
   {
-    host_to_device_async(stream);
-    stream.synchronize();
+    cuda_memcpy(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
   }

   void device_to_host_async(rmm::cuda_stream_view stream)
   {
-    CUDF_CUDA_TRY(
-      cudaMemcpyAsync(host_ptr(), device_ptr(), size_bytes(), cudaMemcpyDefault, stream.value()));
+    cuda_memcpy_async(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
   }

   void device_to_host_sync(rmm::cuda_stream_view stream)
   {
-    device_to_host_async(stream);
-    stream.synchronize();
+    cuda_memcpy(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
   }

   /**
diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu
index f02dee5f7f5..97a06d5a923 100644
--- a/cpp/src/join/conditional_join.cu
+++ b/cpp/src/join/conditional_join.cu
@@ -48,8 +48,7 @@ std::unique_ptr>
conditional_join_anti_semi( { if (right.num_rows() == 0) { switch (join_type) { - case join_kind::LEFT_ANTI_JOIN: - return std::make_unique>(left.num_rows(), stream, mr); + case join_kind::LEFT_ANTI_JOIN: return get_trivial_left_join_indices(left, stream, mr).first; case join_kind::LEFT_SEMI_JOIN: return std::make_unique>(0, stream, mr); default: CUDF_FAIL("Invalid join kind."); break; @@ -96,10 +95,6 @@ std::unique_ptr> conditional_join_anti_semi( join_size = size.value(stream); } - if (left.num_rows() == 0) { - return std::make_unique>(0, stream, mr); - } - rmm::device_scalar write_index(0, stream); auto left_indices = std::make_unique>(join_size, stream, mr); @@ -149,8 +144,7 @@ conditional_join(table_view const& left, // with a corresponding NULL from the right. case join_kind::LEFT_JOIN: case join_kind::LEFT_ANTI_JOIN: - case join_kind::FULL_JOIN: - return get_trivial_left_join_indices(left, stream, rmm::mr::get_current_device_resource()); + case join_kind::FULL_JOIN: return get_trivial_left_join_indices(left, stream, mr); // Inner and left semi joins return empty output because no matches can exist. case join_kind::INNER_JOIN: case join_kind::LEFT_SEMI_JOIN: @@ -169,8 +163,7 @@ conditional_join(table_view const& left, std::make_unique>(0, stream, mr)); // Full joins need to return the trivial complement. case join_kind::FULL_JOIN: { - auto ret_flipped = - get_trivial_left_join_indices(right, stream, rmm::mr::get_current_device_resource()); + auto ret_flipped = get_trivial_left_join_indices(right, stream, mr); return std::pair(std::move(ret_flipped.second), std::move(ret_flipped.first)); } default: CUDF_FAIL("Invalid join kind."); break; diff --git a/cpp/src/lists/copying/segmented_gather.cu b/cpp/src/lists/copying/segmented_gather.cu index 89b1a126fc5..779eca438db 100644 --- a/cpp/src/lists/copying/segmented_gather.cu +++ b/cpp/src/lists/copying/segmented_gather.cu @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/
+#include 
 #include 
 #include 
 #include 
diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu
index a6f15cc49ec..e5cf29f3ebf 100644
--- a/cpp/src/stream_compaction/distinct.cu
+++ b/cpp/src/stream_compaction/distinct.cu
@@ -17,28 +17,62 @@
 #include "distinct_helpers.hpp"

 #include 
+#include 
 #include 
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 #include 
-#include 
-#include 
-#include 
-#include 
-#include 
-
 #include 
 #include 

 namespace cudf {
 namespace detail {
+namespace {
+/**
+ * @brief Invokes the given `func` with the desired row equality
+ *
+ * @tparam HasNested Flag indicating whether there are nested columns in the input
+ * @tparam Func Type of the helper function doing the `distinct` check
+ *
+ * @param compare_nulls Control whether nulls should be compared as equal or not
+ * @param compare_nans Control whether floating-point NaN values should be compared as equal or not
+ * @param has_nulls Flag indicating whether the input has nulls or not
+ * @param row_equal Self table comparator
+ * @param func The input functor to invoke
+ */
+template <bool HasNested, typename Func>
+rmm::device_uvector<cudf::size_type> dispatch_row_equal(
+  null_equality compare_nulls,
+  nan_equality compare_nans,
+  bool has_nulls,
+  cudf::experimental::row::equality::self_comparator row_equal,
+  Func&& func)
+{
+  if (compare_nans == nan_equality::ALL_EQUAL) {
+    auto const d_equal = row_equal.equal_to(
+      nullate::DYNAMIC{has_nulls},
+      compare_nulls,
+      cudf::experimental::row::equality::nan_equal_physical_equality_comparator{});
+    return func(d_equal);
+  } else {
+    auto const d_equal = row_equal.equal_to(
+      nullate::DYNAMIC{has_nulls},
+      compare_nulls,
+      cudf::experimental::row::equality::physical_equality_comparator{});
+    return func(d_equal);
+  }
+}
+}  // namespace

 rmm::device_uvector distinct_indices(table_view const& input,
                                      duplicate_keep_option keep,
@@ -47,97 +81,39 @@ rmm::device_uvector distinct_indices(table_view const& input,
                                      rmm::cuda_stream_view stream,
                                      rmm::device_async_resource_ref mr)
 {
-  if (input.num_rows() == 0 or input.num_columns() == 0) {
+  auto const num_rows = input.num_rows();
+
+  if (num_rows == 0 or input.num_columns() == 0) {
     return rmm::device_uvector(0, stream, mr);
   }

-  auto map = hash_map_type{compute_hash_table_size(input.num_rows()),
-                           cuco::empty_key{-1},
-                           cuco::empty_value{std::numeric_limits::min()},
-                           cudf::detail::cuco_allocator{stream},
-                           stream.value()};
-
   auto const preprocessed_input =
     cudf::experimental::row::hash::preprocessed_table::create(input, stream);
   auto const has_nulls          = nullate::DYNAMIC{cudf::has_nested_nulls(input)};
   auto const has_nested_columns = cudf::detail::has_nested_columns(input);

-  auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input);
-  auto const key_hasher = row_hasher.device_hasher(has_nulls);
-
-  auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input);
-
-  auto const pair_iter = cudf::detail::make_counting_transform_iterator(
-    size_type{0},
-    cuda::proclaim_return_type>(
-      [] __device__(size_type const i) { return cuco::make_pair(i, i); }));
-
-  auto const insert_keys = [&](auto const value_comp) {
-    if (has_nested_columns) {
-      auto const key_equal = row_comp.equal_to(has_nulls, nulls_equal, value_comp);
-      map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value());
-    } else {
-      auto const key_equal = row_comp.equal_to(has_nulls, nulls_equal, value_comp);
-      map.insert(pair_iter, pair_iter + input.num_rows(),
key_hasher, key_equal, stream.value());
-    }
+  auto const row_hash  = cudf::experimental::row::hash::row_hasher(preprocessed_input);
+  auto const row_equal = cudf::experimental::row::equality::self_comparator(preprocessed_input);
+
+  auto const helper_func = [&](auto const& d_equal) {
+    using RowHasher = std::decay_t<decltype(d_equal)>;
+    auto set        = hash_set_type<RowHasher>{num_rows,
+                                               0.5,  // desired load factor
+                                               cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
+                                               d_equal,
+                                               {row_hash.device_hasher(has_nulls)},
+                                               {},
+                                               {},
+                                               cudf::detail::cuco_allocator{stream},
+                                               stream.value()};
+    return detail::reduce_by_row(set, num_rows, keep, stream, mr);
   };

-  if (nans_equal == nan_equality::ALL_EQUAL) {
-    using nan_equal_comparator =
-      cudf::experimental::row::equality::nan_equal_physical_equality_comparator;
-    insert_keys(nan_equal_comparator{});
+  if (has_nested_columns) {
+    return dispatch_row_equal<true>(nulls_equal, nans_equal, has_nulls, row_equal, helper_func);
   } else {
-    using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator;
-    insert_keys(nan_unequal_comparator{});
+    return dispatch_row_equal<false>(nulls_equal, nans_equal, has_nulls, row_equal, helper_func);
   }
-
-  auto output_indices = rmm::device_uvector(map.get_size(), stream, mr);
-
-  // If we don't care about order, just gather indices of distinct keys taken from map.
-  if (keep == duplicate_keep_option::KEEP_ANY) {
-    map.retrieve_all(output_indices.begin(), thrust::make_discard_iterator(), stream.value());
-    return output_indices;
-  }
-
-  // For other keep options, reduce by row on rows that compare equal.
-  auto const reduction_results = reduce_by_row(map,
-                                               std::move(preprocessed_input),
-                                               input.num_rows(),
-                                               has_nulls,
-                                               has_nested_columns,
-                                               keep,
-                                               nulls_equal,
-                                               nans_equal,
-                                               stream,
-                                               rmm::mr::get_current_device_resource());
-
-  // Extract the desired output indices from reduction results.
-  auto const map_end = [&] {
-    if (keep == duplicate_keep_option::KEEP_NONE) {
-      // Reduction results with `KEEP_NONE` are either group sizes of equal rows, or `0`.
-      // Thus, we only output index of the rows in the groups having group size of `1`.
-      return thrust::copy_if(rmm::exec_policy(stream),
-                             thrust::make_counting_iterator(0),
-                             thrust::make_counting_iterator(input.num_rows()),
-                             output_indices.begin(),
-                             [reduction_results = reduction_results.begin()] __device__(
-                               auto const idx) { return reduction_results[idx] == size_type{1}; });
-    }
-
-    // Reduction results with `KEEP_FIRST` and `KEEP_LAST` are row indices of the first/last row in
-    // each group of equal rows (which are the desired output indices), or the value given by
-    // `reduction_init_value()`.
-    return thrust::copy_if(rmm::exec_policy(stream),
-                           reduction_results.begin(),
-                           reduction_results.end(),
-                           output_indices.begin(),
-                           [init_value = reduction_init_value(keep)] __device__(auto const idx) {
-                             return idx != init_value;
-                           });
-  }();
-
-  output_indices.resize(thrust::distance(output_indices.begin(), map_end), stream);
-  return output_indices;
 }

 std::unique_ptr<table>
distinct(table_view const& input, diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index 99ca89cc021..9843bb889f4 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -15,16 +15,17 @@ */ #include "stream_compaction_common.cuh" -#include "stream_compaction_common.hpp" #include #include #include +#include #include #include #include #include #include +#include #include #include #include diff --git a/cpp/src/stream_compaction/distinct_helpers.cu b/cpp/src/stream_compaction/distinct_helpers.cu index 13e89b15bb7..c3a004b7f28 100644 --- a/cpp/src/stream_compaction/distinct_helpers.cu +++ b/cpp/src/stream_compaction/distinct_helpers.cu @@ -16,96 +16,127 @@ #include "distinct_helpers.hpp" -#include - -#include +#include +#include namespace cudf::detail { -namespace { -/** - * @brief The functor to find the first/last/all duplicate row for rows that compared equal. - */ -template -struct reduce_fn : reduce_by_row_fn_base { - duplicate_keep_option const keep; - - reduce_fn(MapView const& d_map, - KeyHasher const& d_hasher, - KeyEqual const& d_equal, - duplicate_keep_option const keep, - size_type* const d_output) - : reduce_by_row_fn_base{d_map, - d_hasher, - d_equal, - d_output}, - keep{keep} - { +template +rmm::device_uvector reduce_by_row(hash_set_type& set, + size_type num_rows, + duplicate_keep_option keep, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto output_indices = rmm::device_uvector(num_rows, stream, mr); + + // If we don't care about order, just gather indices of distinct keys taken from set. + if (keep == duplicate_keep_option::KEEP_ANY) { + auto const iter = thrust::counting_iterator{0}; + set.insert_async(iter, iter + num_rows, stream.value()); + auto const output_end = set.retrieve_all(output_indices.begin(), stream.value()); + output_indices.resize(thrust::distance(output_indices.begin(), output_end), stream); + return output_indices; } - __device__ void operator()(size_type const idx) const - { - auto const out_ptr = this->get_output_ptr(idx); - - if (keep == duplicate_keep_option::KEEP_FIRST) { - // Store the smallest index of all rows that are equal. - atomicMin(out_ptr, idx); - } else if (keep == duplicate_keep_option::KEEP_LAST) { - // Store the greatest index of all rows that are equal. - atomicMax(out_ptr, idx); - } else { - // Count the number of rows in each group of rows that are compared equal. - atomicAdd(out_ptr, size_type{1}); + auto reduction_results = rmm::device_uvector(num_rows, stream, mr); + thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), + reduction_results.begin(), + reduction_results.end(), + reduction_init_value(keep)); + + auto set_ref = set.ref(cuco::op::insert_and_find); + + thrust::for_each(rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_rows), + [set_ref, keep, reduction_results = reduction_results.begin()] __device__( + size_type const idx) mutable { + auto const [inserted_idx_ptr, _] = set_ref.insert_and_find(idx); + + auto ref = cuda::atomic_ref{ + reduction_results[*inserted_idx_ptr]}; + if (keep == duplicate_keep_option::KEEP_FIRST) { + // Store the smallest index of all rows that are equal. + ref.fetch_min(idx, cuda::memory_order_relaxed); + } else if (keep == duplicate_keep_option::KEEP_LAST) { + // Store the greatest index of all rows that are equal. 
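+                     // (memory_order_relaxed is sufficient here: each result slot is only
+                     // ever combined through these commutative atomic RMWs and is first
+                     // read by later kernels ordered on the same stream.)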
+ ref.fetch_max(idx, cuda::memory_order_relaxed); + } else { + // Count the number of rows in each group of rows that are compared equal. + ref.fetch_add(size_type{1}, cuda::memory_order_relaxed); + } + }); + + auto const map_end = [&] { + if (keep == duplicate_keep_option::KEEP_NONE) { + // Reduction results with `KEEP_NONE` are either group sizes of equal rows, or `0`. + // Thus, we only output index of the rows in the groups having group size of `1`. + return thrust::copy_if( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_rows), + output_indices.begin(), + cuda::proclaim_return_type( + [reduction_results = reduction_results.begin()] __device__(auto const idx) { + return reduction_results[idx] == size_type{1}; + })); } - } -}; -/** - * @brief The builder to construct an instance of `reduce_fn` functor base on the given - * value of the `duplicate_keep_option` member variable. - */ -struct reduce_func_builder { - duplicate_keep_option const keep; - - template - auto build(MapView const& d_map, - KeyHasher const& d_hasher, - KeyEqual const& d_equal, - size_type* const d_output) - { - return reduce_fn{d_map, d_hasher, d_equal, keep, d_output}; - } -}; + // Reduction results with `KEEP_FIRST` and `KEEP_LAST` are row indices of the first/last row in + // each group of equal rows (which are the desired output indices), or the value given by + // `reduction_init_value()`. + return thrust::copy_if( + rmm::exec_policy(stream), + reduction_results.begin(), + reduction_results.end(), + output_indices.begin(), + cuda::proclaim_return_type([init_value = reduction_init_value(keep)] __device__( + auto const idx) { return idx != init_value; })); + }(); -} // namespace + output_indices.resize(thrust::distance(output_indices.begin(), map_end), stream); + return output_indices; +} -// This function is split from `distinct.cu` to improve compile time. 
-rmm::device_uvector reduce_by_row( - hash_map_type const& map, - std::shared_ptr const preprocessed_input, +template rmm::device_uvector reduce_by_row( + hash_set_type>& set, size_type num_rows, - cudf::nullate::DYNAMIC has_nulls, - bool has_nested_columns, duplicate_keep_option keep, - null_equality nulls_equal, - nan_equality nans_equal, rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_EXPECTS(keep != duplicate_keep_option::KEEP_ANY, - "This function should not be called with KEEP_ANY"); - - return hash_reduce_by_row(map, - preprocessed_input, - num_rows, - has_nulls, - has_nested_columns, - nulls_equal, - nans_equal, - reduce_func_builder{keep}, - reduction_init_value(keep), - stream, - mr); -} + rmm::device_async_resource_ref mr); + +template rmm::device_uvector reduce_by_row( + hash_set_type>& set, + size_type num_rows, + duplicate_keep_option keep, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +template rmm::device_uvector reduce_by_row( + hash_set_type>& set, + size_type num_rows, + duplicate_keep_option keep, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +template rmm::device_uvector reduce_by_row( + hash_set_type>& set, + size_type num_rows, + duplicate_keep_option keep, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); } // namespace cudf::detail diff --git a/cpp/src/stream_compaction/distinct_helpers.hpp b/cpp/src/stream_compaction/distinct_helpers.hpp index 40f97e00ce5..fca67c98873 100644 --- a/cpp/src/stream_compaction/distinct_helpers.hpp +++ b/cpp/src/stream_compaction/distinct_helpers.hpp @@ -14,8 +14,7 @@ * limitations under the License. */ -#include "stream_compaction_common.hpp" - +#include #include #include #include @@ -24,6 +23,12 @@ #include #include +#include +#include +#include +#include +#include + namespace cudf::detail { /** @@ -42,13 +47,28 @@ auto constexpr reduction_init_value(duplicate_keep_option keep) } } +template +using hash_set_type = + cuco::static_set, + cuda::thread_scope_device, + RowHasher, + cuco::linear_probing<1, + cudf::experimental::row::hash::device_row_hasher< + cudf::hashing::detail::default_hash, + cudf::nullate::DYNAMIC>>, + cudf::detail::cuco_allocator, + cuco::storage<1>>; + /** - * @brief Perform a reduction on groups of rows that are compared equal. + * @brief Perform a reduction on groups of rows that are compared equal and returns output indices + * of the occurrences of the distinct elements based on `keep` parameter. * * This is essentially a reduce-by-key operation with keys are non-contiguous rows and are compared - * equal. A hash table is used to find groups of equal rows. + * equal. A hash set is used to find groups of equal rows. * * Depending on the `keep` parameter, the reduction operation for each row group is: + * - If `keep == KEEP_ANY` : order does not matter. * - If `keep == KEEP_FIRST`: min of row indices in the group. * - If `keep == KEEP_LAST`: max of row indices in the group. * - If `keep == KEEP_NONE`: count of equivalent rows (group size). @@ -59,30 +79,18 @@ auto constexpr reduction_init_value(duplicate_keep_option keep) * the `reduction_init_value()` function. Then, the reduction result for each row group is written * into the output array at the index of an unspecified row in the group. 
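+ * For example (with made-up row numbers): if rows 2, 5 and 7 compare equal, KEEP_FIRST
+ * stores 2, KEEP_LAST stores 7, and KEEP_NONE stores the group size 3 into the slot
+ * owned by that group.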
 *
- * @param map The auxiliary map to perform reduction
- * @param preprocessed_input The preprocessed of the input rows for computing row hashing and row
- * comparisons
+ * @param set The auxiliary set used to perform the reduction
 * @param num_rows The number of all input rows
- * @param has_nulls Indicate whether the input rows has any nulls at any nested levels
- * @param has_nested_columns Indicates whether the input table has any nested columns
 * @param keep The parameter to determine what type of reduction to perform
- * @param nulls_equal Flag to specify whether null elements should be considered as equal
- * @param nans_equal Flag to specify whether NaN values in floating point column should be
- * considered equal.
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned vector
- * @return A device_uvector containing the reduction results
+ * @return A device_uvector containing the output indices
 */
-rmm::device_uvector reduce_by_row(
-  hash_map_type const& map,
-  std::shared_ptr const preprocessed_input,
-  size_type num_rows,
-  cudf::nullate::DYNAMIC has_nulls,
-  bool has_nested_columns,
-  duplicate_keep_option keep,
-  null_equality nulls_equal,
-  nan_equality nans_equal,
-  rmm::cuda_stream_view stream,
-  rmm::device_async_resource_ref mr);
-
+template 
+rmm::device_uvector reduce_by_row(hash_set_type& set,
+                                  size_type num_rows,
+                                  duplicate_keep_option keep,
+                                  rmm::cuda_stream_view stream,
+                                  rmm::device_async_resource_ref mr);
 }  // namespace cudf::detail
diff --git a/cpp/src/stream_compaction/stable_distinct.cu b/cpp/src/stream_compaction/stable_distinct.cu
index 27b5a92ab69..074d4fd7d1a 100644
--- a/cpp/src/stream_compaction/stable_distinct.cu
+++ b/cpp/src/stream_compaction/stable_distinct.cu
@@ -79,11 +79,11 @@ std::unique_ptr<table>
stable_distinct(table_view const& input, duplicate_keep_option keep, null_equality nulls_equal, nan_equality nans_equal, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::stable_distinct( - input, keys, keep, nulls_equal, nans_equal, cudf::get_default_stream(), mr); + return detail::stable_distinct(input, keys, keep, nulls_equal, nans_equal, stream, mr); } } // namespace cudf diff --git a/cpp/src/stream_compaction/stream_compaction_common.cuh b/cpp/src/stream_compaction/stream_compaction_common.cuh index 839672d6a56..0f9bc18e258 100644 --- a/cpp/src/stream_compaction/stream_compaction_common.cuh +++ b/cpp/src/stream_compaction/stream_compaction_common.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,9 +15,8 @@ */ #pragma once -#include "stream_compaction_common.hpp" - #include +#include #include #include diff --git a/cpp/src/stream_compaction/stream_compaction_common.hpp b/cpp/src/stream_compaction/stream_compaction_common.hpp deleted file mode 100644 index 13795f49781..00000000000 --- a/cpp/src/stream_compaction/stream_compaction_common.hpp +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include - -#include -#include - -#include - -namespace cudf { -namespace detail { - -using hash_map_type = cuco::legacy:: - static_map; - -} // namespace detail -} // namespace cudf diff --git a/cpp/src/stream_compaction/unique.cu b/cpp/src/stream_compaction/unique.cu index c1f8b17938c..edb47984d13 100644 --- a/cpp/src/stream_compaction/unique.cu +++ b/cpp/src/stream_compaction/unique.cu @@ -15,7 +15,6 @@ */ #include "stream_compaction_common.cuh" -#include "stream_compaction_common.hpp" #include #include diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index 18e726a6d7d..101004a5d06 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -13,16 +13,17 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - #include "strings/char_types/char_cases.h" #include "strings/char_types/char_flags.h" #include #include #include +#include #include #include #include +#include #include #include @@ -36,8 +37,7 @@ #include #include -namespace cudf { -namespace strings { +namespace cudf::strings { namespace detail { /** @@ -175,5 +175,17 @@ int64_t get_offset_value(cudf::column_view const& offsets, } } // namespace detail -} // namespace strings -} // namespace cudf + +rmm::device_uvector create_string_vector_from_column( + cudf::strings_column_view const strings, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::create_string_vector_from_column(strings, stream, mr); +} + +int64_t get_offset64_threshold() { return detail::get_offset64_threshold(); } +bool is_large_strings_enabled() { return detail::is_large_strings_enabled(); } + +} // namespace cudf::strings diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu new file mode 100644 index 00000000000..3d0822d8545 --- /dev/null +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include + +#include + +namespace cudf::detail { + +namespace { + +void copy_pinned(void* dst, void const* src, std::size_t size, rmm::cuda_stream_view stream) +{ + if (size == 0) return; + + if (size < get_kernel_pinned_copy_threshold()) { + thrust::copy_n(rmm::exec_policy_nosync(stream), + static_cast(src), + size, + static_cast(dst)); + } else { + CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, stream)); + } +} + +void copy_pageable(void* dst, void const* src, std::size_t size, rmm::cuda_stream_view stream) +{ + if (size == 0) return; + + CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, stream)); +} + +}; // namespace + +void cuda_memcpy_async( + void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream) +{ + if (kind == host_memory_kind::PINNED) { + copy_pinned(dst, src, size, stream); + } else if (kind == host_memory_kind::PAGEABLE) { + copy_pageable(dst, src, size, stream); + } else { + CUDF_FAIL("Unsupported host memory kind"); + } +} + +void cuda_memcpy( + void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream) +{ + cuda_memcpy_async(dst, src, size, kind, stream); + stream.synchronize(); +} + +} // namespace cudf::detail diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/pinned_memory.cpp index e90b7969b4d..3ea4293fc60 100644 --- a/cpp/src/utilities/pinned_memory.cpp +++ b/cpp/src/utilities/pinned_memory.cpp @@ -211,4 +211,18 @@ bool config_default_pinned_memory_resource(pinned_mr_options const& opts) return did_configure; } +CUDF_EXPORT auto& kernel_pinned_copy_threshold() +{ + // use cudaMemcpyAsync for all pinned copies + static std::atomic threshold = 0; + return threshold; +} + +void 
set_kernel_pinned_copy_threshold(size_t threshold) +{ + kernel_pinned_copy_threshold() = threshold; +} + +size_t get_kernel_pinned_copy_threshold() { return kernel_pinned_copy_threshold(); } + } // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 9f14455f42d..eef09954647 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -700,6 +700,7 @@ ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_ROLLING_TEST streams/rolling_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_STREAM_COMPACTION_TEST streams/stream_compaction_test.cpp STREAM_MODE testing) ConfigureTest( STREAM_STRINGS_TEST streams/strings/case_test.cpp diff --git a/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp b/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp index 6d097b2ff12..89824eb6511 100644 --- a/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp +++ b/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp @@ -843,3 +843,61 @@ TYPED_TEST(FixedPointTest_64_128_Reps, FixedPoint_64_128_ComparisonTests) CUDF_TEST_EXPECT_COLUMNS_EQUAL(h->view(), falses); } } + +template +void test_fixed_floating(cudf::binary_operator op, + double floating_value, + int decimal_value, + int decimal_scale, + ResultType expected) +{ + auto const scale = numeric::scale_type{decimal_scale}; + auto const result_type = cudf::data_type(cudf::type_to_id()); + auto const nullable = + (op == cudf::binary_operator::NULL_EQUALS || op == cudf::binary_operator::NULL_NOT_EQUALS || + op == cudf::binary_operator::NULL_MIN || op == cudf::binary_operator::NULL_MAX); + + cudf::test::fixed_width_column_wrapper floating_col({floating_value}); + cudf::test::fixed_point_column_wrapper decimal_col({decimal_value}, scale); + + auto result = binary_operation(floating_col, decimal_col, op, result_type); + + if constexpr (cudf::is_fixed_point()) { + using wrapper_type = cudf::test::fixed_point_column_wrapper; + auto const expected_col = nullable ? wrapper_type({expected.value()}, {true}, expected.scale()) + : wrapper_type({expected.value()}, expected.scale()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col, *result.get()); + } else { + using wrapper_type = cudf::test::fixed_width_column_wrapper; + auto const expected_col = + nullable ? 
wrapper_type({expected}, {true}) : wrapper_type({expected});
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col, *result.get());
+  }
+}
+
+TYPED_TEST(FixedPointCompiledTest, FixedPointWithFloating)
+{
+  using namespace numeric;
+
+  // BOOLEAN
+  test_fixed_floating(cudf::binary_operator::EQUAL, 1.0, 10, -1, true);
+  test_fixed_floating(cudf::binary_operator::NOT_EQUAL, 1.0, 10, -1, false);
+  test_fixed_floating(cudf::binary_operator::LESS, 2.0, 10, -1, false);
+  test_fixed_floating(cudf::binary_operator::GREATER, 2.0, 10, -1, true);
+  test_fixed_floating(cudf::binary_operator::LESS_EQUAL, 2.0, 20, -1, true);
+  test_fixed_floating(cudf::binary_operator::GREATER_EQUAL, 2.0, 30, -1, false);
+  test_fixed_floating(cudf::binary_operator::NULL_EQUALS, 1.0, 10, -1, true);
+  test_fixed_floating(cudf::binary_operator::NULL_NOT_EQUALS, 1.0, 10, -1, false);
+
+  // PRIMARY ARITHMETIC
+  auto const decimal_result = numeric::decimal32(4, numeric::scale_type{0});
+  test_fixed_floating(cudf::binary_operator::ADD, 1.0, 30, -1, decimal_result);
+  test_fixed_floating(cudf::binary_operator::SUB, 6.0, 20, -1, decimal_result);
+  test_fixed_floating(cudf::binary_operator::MUL, 2.0, 20, -1, decimal_result);
+  test_fixed_floating(cudf::binary_operator::DIV, 8.0, 2, 0, decimal_result);
+  test_fixed_floating(cudf::binary_operator::MOD, 9.0, 50, -1, decimal_result);
+
+  // OTHER ARITHMETIC
+  test_fixed_floating(cudf::binary_operator::NULL_MAX, 4.0, 20, -1, decimal_result);
+  test_fixed_floating(cudf::binary_operator::NULL_MIN, 4.0, 200, -1, decimal_result);
+}
diff --git a/cpp/tests/column/factories_test.cpp b/cpp/tests/column/factories_test.cpp
index dca36eaa4e7..603187f0330 100644
--- a/cpp/tests/column/factories_test.cpp
+++ b/cpp/tests/column/factories_test.cpp
@@ -24,7 +24,7 @@
 #include
 #include
 #include
-#include <cudf/strings/detail/utilities.hpp>
+#include <cudf/strings/utilities.hpp>
 #include
 #include
 #include
@@ -762,7 +762,7 @@ TEST_F(ColumnFactoryTest, FromStructScalarNull) { struct_from_scalar(false); }
 
 TEST_F(ColumnFactoryTest, FromScalarErrors)
 {
-  if (cudf::strings::detail::is_large_strings_enabled()) { return; }
+  if (cudf::strings::is_large_strings_enabled()) { return; }
   cudf::string_scalar ss("hello world");
   EXPECT_THROW(cudf::make_column_from_scalar(ss, 214748365), std::overflow_error);
diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp
index 078e0ef9bae..054441788d0 100644
--- a/cpp/tests/copying/concatenate_tests.cpp
+++ b/cpp/tests/copying/concatenate_tests.cpp
@@ -29,7 +29,7 @@
 #include
 #include
 #include
-#include <cudf/strings/detail/utilities.hpp>
+#include <cudf/strings/utilities.hpp>
 #include
 #include
 #include
@@ -189,7 +189,7 @@ TEST_F(StringColumnTest, ConcatenateManyColumns)
 
 TEST_F(StringColumnTest, ConcatenateTooLarge)
 {
-  if (cudf::strings::detail::is_large_strings_enabled()) { return; }
+  if (cudf::strings::is_large_strings_enabled()) { return; }
   std::string big_str(1000000, 'a');  // 1 million bytes x 5 = 5 million bytes
   cudf::test::strings_column_wrapper input{big_str, big_str, big_str, big_str, big_str};
@@ -379,7 +379,7 @@ TEST_F(OverflowTest, OverflowTest)
   }
 
   // string column, overflow on chars
-  if (!cudf::strings::detail::is_large_strings_enabled()) {
+  if (!cudf::strings::is_large_strings_enabled()) {
     constexpr auto size = static_cast(static_cast(1024) * 1024 * 1024);
 
     // try and concatenate 6 string columns, each with 1 billion chars
@@ -502,7 +502,7 @@ TEST_F(OverflowTest, Presliced)
   }
 
   // strings, overflow on chars
-  if (!cudf::strings::detail::is_large_strings_enabled()) {
+  if (!cudf::strings::is_large_strings_enabled()) {
     constexpr cudf::size_type total_chars_size = 1024 * 1024 * 1024;
     constexpr cudf::size_type string_size      = 64;
     constexpr cudf::size_type num_rows         = total_chars_size / string_size;
diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp
index 84ab83e33d0..a1f4c7b81d8 100644
--- a/cpp/tests/io/parquet_writer_test.cpp
+++ b/cpp/tests/io/parquet_writer_test.cpp
@@ -1785,7 +1785,8 @@ TEST_F(ParquetWriterTest, DeltaBinaryStartsWithNulls)
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
 }
 
-TEST_F(ParquetWriterTest, ByteStreamSplit)
+std::pair<std::unique_ptr<cudf::table>, cudf::io::table_input_metadata>
+make_byte_stream_split_table(bool as_struct)
 {
   constexpr auto num_rows = 100;
   std::mt19937 engine{31337};
@@ -1802,24 +1803,73 @@
   // throw in a list to make sure both decoders are working
   auto col4 = make_parquet_list_col(engine, num_rows, 5, true);
 
-  auto expected = table_view{{col0, col1, col2, col3, *col4}};
+  std::vector<std::unique_ptr<cudf::column>> columns;
+  columns.reserve(5);
+  columns.push_back(col0.release());
+  columns.push_back(col1.release());
+  columns.push_back(col2.release());
+  columns.push_back(col3.release());
+  columns.push_back(std::move(col4));
+
+  return [&]() -> std::pair<std::unique_ptr<cudf::table>, cudf::io::table_input_metadata> {
+    auto const encoding = cudf::io::column_encoding::BYTE_STREAM_SPLIT;
+
+    // make as a nested struct
+    if (as_struct) {
+      auto valids =
+        cudf::detail::make_counting_transform_iterator(0, [](int i) { return i % 2 == 0; });
+      auto [null_mask, null_count] = cudf::test::detail::make_null_mask(valids, valids + num_rows);
+
+      std::vector<std::unique_ptr<cudf::column>> table_cols;
+      table_cols.push_back(
+        cudf::make_structs_column(num_rows, std::move(columns), null_count, std::move(null_mask)));
+
+      auto tbl      = std::make_unique<cudf::table>(std::move(table_cols));
+      auto expected = table_view{*tbl};
+
+      cudf::io::table_input_metadata expected_metadata(expected);
+      expected_metadata.column_metadata[0].set_name("struct");
+      expected_metadata.column_metadata[0].set_encoding(encoding);
+
+      expected_metadata.column_metadata[0].child(0).set_name("int32s");
+      expected_metadata.column_metadata[0].child(1).set_name("int64s");
+      expected_metadata.column_metadata[0].child(2).set_name("floats");
+      expected_metadata.column_metadata[0].child(3).set_name("doubles");
+      expected_metadata.column_metadata[0].child(4).set_name("int32list");
+      for (int idx = 0; idx <= 3; idx++) {
+        expected_metadata.column_metadata[0].child(idx).set_encoding(encoding);
+      }
+      expected_metadata.column_metadata[0].child(4).child(1).set_encoding(encoding);
 
-  cudf::io::table_input_metadata expected_metadata(expected);
-  expected_metadata.column_metadata[0].set_name("int32s");
-  expected_metadata.column_metadata[1].set_name("int64s");
-  expected_metadata.column_metadata[2].set_name("floats");
-  expected_metadata.column_metadata[3].set_name("doubles");
-  expected_metadata.column_metadata[4].set_name("int32list");
-  auto const encoding = cudf::io::column_encoding::BYTE_STREAM_SPLIT;
-  for (int i = 0; i <= 3; i++) {
-    expected_metadata.column_metadata[i].set_encoding(encoding);
-  }
+      return {std::move(tbl), expected_metadata};
+    }
+
+    // make flat
+    auto tbl      = std::make_unique<cudf::table>(std::move(columns));
+    auto expected = table_view{*tbl};
 
-  expected_metadata.column_metadata[4].child(1).set_encoding(encoding);
+    cudf::io::table_input_metadata expected_metadata(expected);
+    expected_metadata.column_metadata[0].set_name("int32s");
+    expected_metadata.column_metadata[1].set_name("int64s");
+    expected_metadata.column_metadata[2].set_name("floats");
+    expected_metadata.column_metadata[3].set_name("doubles");
+    expected_metadata.column_metadata[4].set_name("int32list");
+    for (int idx = 0; idx <= 3; idx++) {
+      expected_metadata.column_metadata[idx].set_encoding(encoding);
+    }
+
+    expected_metadata.column_metadata[4].child(1).set_encoding(encoding);
+    return {std::move(tbl), expected_metadata};
+  }();
+}
+
+TEST_F(ParquetWriterTest, ByteStreamSplit)
+{
+  auto [expected, expected_metadata] = make_byte_stream_split_table(false);
 
   auto const filepath = temp_env->get_temp_filepath("ByteStreamSplit.parquet");
   cudf::io::parquet_writer_options out_opts =
-    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected)
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, *expected)
       .metadata(expected_metadata);
   cudf::io::write_parquet(out_opts);
 
@@ -1827,7 +1877,24 @@
     cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath});
   auto result = cudf::io::read_parquet(in_opts);
 
-  CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
+  CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, result.tbl->view());
+}
+
+TEST_F(ParquetWriterTest, ByteStreamSplitStruct)
+{
+  auto [expected, expected_metadata] = make_byte_stream_split_table(true);
+
+  auto const filepath = temp_env->get_temp_filepath("ByteStreamSplitStruct.parquet");
+  cudf::io::parquet_writer_options out_opts =
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, *expected)
+      .metadata(expected_metadata);
+  cudf::io::write_parquet(out_opts);
+
+  cudf::io::parquet_reader_options in_opts =
+    cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath});
+  auto result = cudf::io::read_parquet(in_opts);
+
+  CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, result.tbl->view());
 }
 
 TEST_F(ParquetWriterTest, DecimalByteStreamSplit)
diff --git a/cpp/tests/join/conditional_join_tests.cu b/cpp/tests/join/conditional_join_tests.cu
index 79968bcd7f4..7ab4a2ea465 100644
--- a/cpp/tests/join/conditional_join_tests.cu
+++ b/cpp/tests/join/conditional_join_tests.cu
@@ -20,6 +20,7 @@
 #include
 #include
+#include <cudf/detail/utilities/vector_factories.hpp>
 #include
 #include
 #include
@@ -222,21 +223,25 @@ struct ConditionalJoinPairReturnTest : public ConditionalJoinTest {
     std::vector<std::pair<cudf::size_type, cudf::size_type>> expected_outputs)
   {
     auto result_size = this->join_size(left, right, predicate);
-    EXPECT_TRUE(result_size == expected_outputs.size());
-
-    auto result = this->join(left, right, predicate);
-    std::vector<std::pair<cudf::size_type, cudf::size_type>> result_pairs;
-    for (size_t i = 0; i < result.first->size(); ++i) {
-      // Note: Not trying to be terribly efficient here since these tests are
-      // small, otherwise a batch copy to host before constructing the tuples
-      // would be important.
-      result_pairs.push_back({result.first->element(i, cudf::get_default_stream()),
-                              result.second->element(i, cudf::get_default_stream())});
-    }
+    EXPECT_EQ(result_size, expected_outputs.size());
+
+    auto result     = this->join(left, right, predicate);
+    auto lhs_result = cudf::detail::make_std_vector_sync(*result.first, cudf::get_default_stream());
+    auto rhs_result =
+      cudf::detail::make_std_vector_sync(*result.second, cudf::get_default_stream());
+    std::vector<std::pair<cudf::size_type, cudf::size_type>> result_pairs(lhs_result.size());
+    std::transform(lhs_result.begin(),
+                   lhs_result.end(),
+                   rhs_result.begin(),
+                   result_pairs.begin(),
+                   [](cudf::size_type lhs, cudf::size_type rhs) {
+                     return std::pair{lhs, rhs};
+                   });
     std::sort(result_pairs.begin(), result_pairs.end());
     std::sort(expected_outputs.begin(), expected_outputs.end());
 
-    EXPECT_TRUE(std::equal(expected_outputs.begin(), expected_outputs.end(), result_pairs.begin()));
+    EXPECT_TRUE(std::equal(
+      expected_outputs.begin(), expected_outputs.end(), result_pairs.begin(), result_pairs.end()));
   }
 
   /*
@@ -411,6 +416,11 @@ TYPED_TEST(ConditionalInnerJoinTest, TestOneColumnLeftEmpty)
   this->test({{}}, {{3, 4, 5}}, left_zero_eq_right_zero, {});
 };
 
+TYPED_TEST(ConditionalInnerJoinTest, TestOneColumnRightEmpty)
+{
+  this->test({{3, 4, 5}}, {{}}, left_zero_eq_right_zero, {});
+};
+
 TYPED_TEST(ConditionalInnerJoinTest, TestOneColumnTwoRowAllEqual)
 {
   this->test({{0, 1}}, {{0, 0}}, left_zero_eq_right_zero, {{0, 0}, {0, 1}});
@@ -600,6 +610,14 @@ TYPED_TEST(ConditionalLeftJoinTest, TestOneColumnLeftEmpty)
   this->test({{}}, {{3, 4, 5}}, left_zero_eq_right_zero, {});
 };
 
+TYPED_TEST(ConditionalLeftJoinTest, TestOneColumnRightEmpty)
+{
+  this->test({{3, 4, 5}},
+             {{}},
+             left_zero_eq_right_zero,
+             {{0, JoinNoneValue}, {1, JoinNoneValue}, {2, JoinNoneValue}});
+};
+
 TYPED_TEST(ConditionalLeftJoinTest, TestCompareRandomToHash)
 {
   auto [left, right] = gen_random_repeated_columns();
@@ -666,6 +684,14 @@ TYPED_TEST(ConditionalFullJoinTest, TestOneColumnLeftEmpty)
   {{JoinNoneValue, 0}, {JoinNoneValue, 1}, {JoinNoneValue, 2}});
 };
 
+TYPED_TEST(ConditionalFullJoinTest, TestOneColumnRightEmpty)
+{
+  this->test({{3, 4, 5}},
+             {{}},
+             left_zero_eq_right_zero,
+             {{0, JoinNoneValue}, {1, JoinNoneValue}, {2, JoinNoneValue}});
+};
+
 TYPED_TEST(ConditionalFullJoinTest, TestTwoColumnThreeRowSomeEqual)
 {
   this->test({{0, 1, 2}, {10, 20, 30}},
@@ -705,20 +731,16 @@ struct ConditionalJoinSingleReturnTest : public ConditionalJoinTest {
     auto [left_wrappers, right_wrappers, left_columns, right_columns, left, right] =
       this->parse_input(left_data, right_data);
     auto result_size = this->join_size(left, right, predicate);
-    EXPECT_TRUE(result_size == expected_outputs.size());
-
-    auto result = this->join(left, right, predicate);
-    std::vector<cudf::size_type> resulting_indices;
-    for (size_t i = 0; i < result->size(); ++i) {
-      // Note: Not trying to be terribly efficient here since these tests are
-      // small, otherwise a batch copy to host before constructing the tuples
-      // would be important.
-      resulting_indices.push_back(result->element(i, cudf::get_default_stream()));
-    }
-    std::sort(resulting_indices.begin(), resulting_indices.end());
+    EXPECT_EQ(result_size, expected_outputs.size());
+
+    auto result         = this->join(left, right, predicate);
+    auto result_indices = cudf::detail::make_std_vector_sync(*result, cudf::get_default_stream());
+    std::sort(result_indices.begin(), result_indices.end());
     std::sort(expected_outputs.begin(), expected_outputs.end());
-    EXPECT_TRUE(
-      std::equal(resulting_indices.begin(), resulting_indices.end(), expected_outputs.begin()));
+    EXPECT_TRUE(std::equal(result_indices.begin(),
+                           result_indices.end(),
+                           expected_outputs.begin(),
+                           expected_outputs.end()));
   }
 
   void _compare_to_hash_join(std::unique_ptr<rmm::device_uvector<cudf::size_type>> const& result,
@@ -826,6 +848,16 @@ struct ConditionalLeftSemiJoinTest : public ConditionalJoinSingleReturnTest {
 
 TYPED_TEST_SUITE(ConditionalLeftSemiJoinTest, cudf::test::IntegralTypesNotBool);
 
+TYPED_TEST(ConditionalLeftSemiJoinTest, TestOneColumnLeftEmpty)
+{
+  this->test({{}}, {{3, 4, 5}}, left_zero_eq_right_zero, {});
+};
+
+TYPED_TEST(ConditionalLeftSemiJoinTest, TestOneColumnRightEmpty)
+{
+  this->test({{3, 4, 5}}, {{}}, left_zero_eq_right_zero, {});
+};
+
 TYPED_TEST(ConditionalLeftSemiJoinTest, TestTwoColumnThreeRowSomeEqual)
 {
   this->test({{0, 1, 2}, {10, 20, 30}}, {{0, 1, 3}, {30, 40, 50}}, left_zero_eq_right_zero, {0, 1});
@@ -873,6 +905,16 @@ struct ConditionalLeftAntiJoinTest : public ConditionalJoinSingleReturnTest {
 
 TYPED_TEST_SUITE(ConditionalLeftAntiJoinTest, cudf::test::IntegralTypesNotBool);
 
+TYPED_TEST(ConditionalLeftAntiJoinTest, TestOneColumnLeftEmpty)
+{
+  this->test({{}}, {{3, 4, 5}}, left_zero_eq_right_zero, {});
+};
+
+TYPED_TEST(ConditionalLeftAntiJoinTest, TestOneColumnRightEmpty)
+{
+  this->test({{3, 4, 5}}, {{}}, left_zero_eq_right_zero, {0, 1, 2});
+};
+
 TYPED_TEST(ConditionalLeftAntiJoinTest, TestTwoColumnThreeRowSomeEqual)
 {
   this->test({{0, 1, 2}, {10, 20, 30}}, {{0, 1, 3}, {30, 40, 50}}, left_zero_eq_right_zero, {2});
diff --git a/cpp/tests/streams/stream_compaction_test.cpp b/cpp/tests/streams/stream_compaction_test.cpp
new file mode 100644
index 00000000000..56443870602
--- /dev/null
+++ b/cpp/tests/streams/stream_compaction_test.cpp
@@ -0,0 +1,235 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+auto constexpr null{0};  // null at current level
+auto constexpr XXX{0};   // null pushed down from parent level
+auto constexpr NaN          = std::numeric_limits<double>::quiet_NaN();
+auto constexpr KEEP_ANY     = cudf::duplicate_keep_option::KEEP_ANY;
+auto constexpr KEEP_FIRST   = cudf::duplicate_keep_option::KEEP_FIRST;
+auto constexpr KEEP_LAST    = cudf::duplicate_keep_option::KEEP_LAST;
+auto constexpr KEEP_NONE    = cudf::duplicate_keep_option::KEEP_NONE;
+auto constexpr NULL_EQUAL   = cudf::null_equality::EQUAL;
+auto constexpr NULL_UNEQUAL = cudf::null_equality::UNEQUAL;
+auto constexpr NAN_EQUAL    = cudf::nan_equality::ALL_EQUAL;
+auto constexpr NAN_UNEQUAL  = cudf::nan_equality::UNEQUAL;
+
+using int32s_col = cudf::test::fixed_width_column_wrapper<int32_t>;
+using floats_col = cudf::test::fixed_width_column_wrapper<float>;
+
+using cudf::nan_policy;
+using cudf::null_equality;
+using cudf::null_policy;
+using cudf::test::iterators::no_nulls;
+using cudf::test::iterators::null_at;
+using cudf::test::iterators::nulls_at;
+
+struct StableDistinctKeepAny : public cudf::test::BaseFixture {};
+
+struct StableDistinctKeepFirstLastNone : public cudf::test::BaseFixture {};
+
+TEST_F(StableDistinctKeepAny, NoNullsTableWithNaNs)
+{
+  // Column(s) used to test KEEP_ANY need to have the same rows in contiguous
+  // groups for equivalent keys because KEEP_ANY is nondeterministic.
+  auto const col1  = int32s_col{6, 6, 6, 1, 1, 1, 3, 5, 8, 5};
+  auto const col2  = floats_col{6, 6, 6, 1, 1, 1, 3, 4, 9, 4};
+  auto const keys1 = int32s_col{20, 20, 20, 15, 15, 15, 20, 19, 21, 9};
+  auto const keys2 = floats_col{19., 19., 19., NaN, NaN, NaN, 20., 20., 9., 21.};
+
+  auto const input   = cudf::table_view{{col1, col2, keys1, keys2}};
+  auto const key_idx = std::vector<cudf::size_type>{2, 3};
+
+  // NaNs are unequal.
+  {
+    auto const exp_col1  = int32s_col{6, 1, 1, 1, 3, 5, 8, 5};
+    auto const exp_col2  = floats_col{6, 1, 1, 1, 3, 4, 9, 4};
+    auto const exp_keys1 = int32s_col{20, 15, 15, 15, 20, 19, 21, 9};
+    auto const exp_keys2 = floats_col{19., NaN, NaN, NaN, 20., 20., 9., 21.};
+    auto const expected  = cudf::table_view{{exp_col1, exp_col2, exp_keys1, exp_keys2}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_ANY, NULL_EQUAL, NAN_UNEQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+
+  // NaNs are equal.
+  {
+    auto const exp_col1  = int32s_col{6, 1, 3, 5, 8, 5};
+    auto const exp_col2  = floats_col{6, 1, 3, 4, 9, 4};
+    auto const exp_keys1 = int32s_col{20, 15, 20, 19, 21, 9};
+    auto const exp_keys2 = floats_col{19., NaN, 20., 20., 9., 21.};
+    auto const expected  = cudf::table_view{{exp_col1, exp_col2, exp_keys1, exp_keys2}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_ANY, NULL_EQUAL, NAN_EQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+}
+
+TEST_F(StableDistinctKeepAny, InputWithNullsAndNaNs)
+{
+  auto constexpr null{0.0};  // shadow the global `null` variable of type int
+
+  // Column(s) used to test KEEP_ANY need to have the same rows in contiguous
+  // groups for equivalent keys because KEEP_ANY is nondeterministic.
+  auto const col     = int32s_col{5, 4, 4, 1, 1, 1, 8, 8, 1};
+  auto const keys    = floats_col{{20., null, null, NaN, NaN, NaN, 19., 19., 21.}, nulls_at({1, 2})};
+  auto const input   = cudf::table_view{{col, keys}};
+  auto const key_idx = std::vector<cudf::size_type>{1};
+
+  // Nulls are equal, NaNs are unequal.
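+  // (With NULL_EQUAL the three null keys collapse into a single output row,
+  //  while NAN_UNEQUAL keeps every NaN key row.)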
+  {
+    auto const exp_col  = int32s_col{5, 4, 1, 1, 1, 8, 1};
+    auto const exp_keys = floats_col{{20., null, NaN, NaN, NaN, 19., 21.}, null_at(1)};
+    auto const expected = cudf::table_view{{exp_col, exp_keys}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_ANY, NULL_EQUAL, NAN_UNEQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+
+  // Nulls are equal, NaNs are equal.
+  {
+    auto const exp_col  = int32s_col{5, 4, 1, 8, 1};
+    auto const exp_keys = floats_col{{20., null, NaN, 19., 21.}, null_at(1)};
+    auto const expected = cudf::table_view{{exp_col, exp_keys}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_ANY, NULL_EQUAL, NAN_EQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+
+  // Nulls are unequal, NaNs are unequal.
+  {
+    auto const exp_col  = int32s_col{5, 4, 4, 1, 1, 1, 8, 1};
+    auto const exp_keys = floats_col{{20., null, null, NaN, NaN, NaN, 19., 21.}, nulls_at({1, 2})};
+    auto const expected = cudf::table_view{{exp_col, exp_keys}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_ANY, NULL_UNEQUAL, NAN_UNEQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+
+  // Nulls are unequal, NaNs are equal.
+  {
+    auto const exp_col  = int32s_col{5, 4, 4, 1, 8, 1};
+    auto const exp_keys = floats_col{{20., null, null, NaN, 19., 21.}, nulls_at({1, 2})};
+    auto const expected = cudf::table_view{{exp_col, exp_keys}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_ANY, NULL_UNEQUAL, NAN_EQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+}
+
+TEST_F(StableDistinctKeepFirstLastNone, InputWithNaNsEqual)
+{
+  // Column(s) used to test need to have different rows for the same keys.
+  auto const col     = int32s_col{0, 1, 2, 3, 4, 5, 6};
+  auto const keys    = floats_col{20., NaN, NaN, 19., 21., 19., 22.};
+  auto const input   = cudf::table_view{{col, keys}};
+  auto const key_idx = std::vector<cudf::size_type>{1};
+
+  // KEEP_FIRST
+  {
+    auto const exp_col  = int32s_col{0, 1, 3, 4, 6};
+    auto const exp_keys = floats_col{20., NaN, 19., 21., 22.};
+    auto const expected = cudf::table_view{{exp_col, exp_keys}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_FIRST, NULL_EQUAL, NAN_EQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+
+  // KEEP_LAST
+  {
+    auto const exp_col  = int32s_col{0, 2, 4, 5, 6};
+    auto const exp_keys = floats_col{20., NaN, 21., 19., 22.};
+    auto const expected = cudf::table_view{{exp_col, exp_keys}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_LAST, NULL_EQUAL, NAN_EQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+
+  // KEEP_NONE
+  {
+    auto const exp_col  = int32s_col{0, 4, 6};
+    auto const exp_keys = floats_col{20., 21., 22.};
+    auto const expected = cudf::table_view{{exp_col, exp_keys}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_NONE, NULL_EQUAL, NAN_EQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+}
+
+TEST_F(StableDistinctKeepFirstLastNone, InputWithNaNsUnequal)
+{
+  // Column(s) used to test need to have different rows for the same keys.
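+  // (Otherwise the KEEP_FIRST, KEEP_LAST, and KEEP_NONE results could not be
+  //  distinguished by the payload column.)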
+  auto const col     = int32s_col{0, 1, 2, 3, 4, 5, 6, 7};
+  auto const keys    = floats_col{20., NaN, NaN, 19., 21., 19., 22., 20.};
+  auto const input   = cudf::table_view{{col, keys}};
+  auto const key_idx = std::vector<cudf::size_type>{1};
+
+  // KEEP_FIRST
+  {
+    auto const exp_col  = int32s_col{0, 1, 2, 3, 4, 6};
+    auto const exp_keys = floats_col{20., NaN, NaN, 19., 21., 22.};
+    auto const expected = cudf::table_view{{exp_col, exp_keys}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_FIRST, NULL_UNEQUAL, NAN_UNEQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+
+  // KEEP_LAST
+  {
+    auto const exp_col  = int32s_col{1, 2, 4, 5, 6, 7};
+    auto const exp_keys = floats_col{NaN, NaN, 21., 19., 22., 20.};
+    auto const expected = cudf::table_view{{exp_col, exp_keys}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_LAST, NULL_UNEQUAL, NAN_UNEQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+
+  // KEEP_NONE
+  {
+    auto const exp_col  = int32s_col{1, 2, 4, 6};
+    auto const exp_keys = floats_col{NaN, NaN, 21., 22.};
+    auto const expected = cudf::table_view{{exp_col, exp_keys}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_NONE, NULL_UNEQUAL, NAN_UNEQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+}
diff --git a/cpp/tests/strings/array_tests.cpp b/cpp/tests/strings/array_tests.cpp
index a1bb87a43fb..9c0ecaa52c0 100644
--- a/cpp/tests/strings/array_tests.cpp
+++ b/cpp/tests/strings/array_tests.cpp
@@ -23,8 +23,8 @@
 #include
 #include
 #include
-#include <cudf/strings/detail/utilities.hpp>
 #include
+#include <cudf/strings/utilities.hpp>
 #include
 #include
 
@@ -153,7 +153,7 @@ TEST_F(StringsColumnTest, GatherZeroSizeStringsColumn)
 
 TEST_F(StringsColumnTest, GatherTooBig)
 {
-  if (cudf::strings::detail::is_large_strings_enabled()) { return; }
+  if (cudf::strings::is_large_strings_enabled()) { return; }
   std::vector<char> h_chars(3000000);
   cudf::test::fixed_width_column_wrapper<char> chars(h_chars.begin(), h_chars.end());
diff --git a/cpp/tests/strings/repeat_strings_tests.cpp b/cpp/tests/strings/repeat_strings_tests.cpp
index 0539895c5f4..aa4d9320d7c 100644
--- a/cpp/tests/strings/repeat_strings_tests.cpp
+++ b/cpp/tests/strings/repeat_strings_tests.cpp
@@ -20,9 +20,9 @@
 #include
 #include
-#include <cudf/strings/detail/utilities.hpp>
 #include
 #include
+#include <cudf/strings/utilities.hpp>
 
 using namespace cudf::test::iterators;
 
@@ -221,7 +221,7 @@ TEST_F(RepeatStringsTest, StringsColumnWithColumnRepeatTimesInvalidInput)
 
 TEST_F(RepeatStringsTest, StringsColumnWithColumnRepeatTimesOverflowOutput)
 {
-  if (cudf::strings::detail::is_large_strings_enabled()) { return; }
+  if (cudf::strings::is_large_strings_enabled()) { return; }
   auto const strs    = strs_col{"1", "12", "123", "1234", "12345", "123456", "1234567"};
   auto const strs_cv = cudf::strings_column_view(strs);
diff --git a/docs/cudf/source/_static/cudf-pandas-line-profile.png b/docs/cudf/source/_static/cudf-pandas-line-profile.png
new file mode 100644
index 0000000000000000000000000000000000000000..1d5a07c72eb06ab9270ab13e06aa31d55d4981ca
GIT binary patch
literal 15125
[... base85-encoded PNG image data omitted ...]
z#D|n0y#<0>l&(IV8W{R(X=ScFjkL4Z*q>biwx(b2@N0n%A+flJm-2^xik}t2R4xzcSv#OlM1zE9+%8c;0j>kcdaQTL z_N7FEq~TpHVW*t8oY0JL&NzB;wR3myq8-vy&lV$vT%?zu5wA+O3b3I%+@pj)l<;4T zo{l8W7a~y&)A*73O4)v2es_ubMz8vnMJB|HEwZGHX!d|~3ex>qN+*9K2UQ@Gde}XQ z4Zo%5DDz@FTQ=YWxA>ShXEraS%hfXeto^~4b-%LjkzL}diw3@Ra!1Ik%)bi8Dvv{F zX|*XQO*Er0cH3i^EA4^PeCOc$K>qHP>>XaYBDM5yca6Uys=Ql4OW{tRnl8=rv}F4I zFel!IP#hbhKyJ-kk^8wNj8`-l-T7pF-UD#dm#AGPS6ThkdKx~JHjb|zlSZ7B!biBF zr~3MAG8=wJ%~7%O6Xr&evS6h$4$~wXQ6R9-<6n+VS3mHpJlxysMm9p^Nm(nQh)0HB za_+6({PJsT29Z)OWTE|LjbX1K@y~*j^48bPE&E?d-d%I~a^`8CJ-7scyU`AjHl6(e76Pcu%`Hku6O}6%SmComUPw!*b z=*Ez0Iwg1@XMd4}DCka}xoH>j9z|8F?EbqGNgjRY#a^GHJZ8R*J+g!MAtjDNYq9}@ zaVr6E1iM8-5XY88e17GK!@B|citxhTTcJiRd(U8Am(&c|Im~I#0&n*4ssu4rgf;)`CljcQ=CE5-qq1BU{A3r`UqT$i}B2S@H8v)poJd6L}HdDWD*cwV0QT#?a!tHi_g;dUG z4D8%{yNz zy9?BB6c->MlCB(w$+gh>)<{^Mu8;ec96HbUF-;Uc^~Z1R@N(SwF6?(j^&Nf}z*$M+ zPsJePW7ie`9Bill+^%1iX5J`YSx>&{AYIYe-z69r}F84?%+7%foOb!)=7Zu zu?J~VVqSA8GJ))mN+wuc)Sjd09{r)^{=J98F`M`wJ@@~11BV=#@-R~NGysmHNa;V8 ztMTZSI+>sxGJtS5(0XC98x)&U<+X0d&Sf4v{Gl~|$EyST?t}gmq;!Aet$mjD86bcI zo34FN4no)7eaCO1%gV=qzvcAjQv87X;|YSybS{gZ3z-*ixa@Giav(r7$&2c62V(#; zN}F+Z9~iNO@5lR;`Yp_Kn|QicM8ue%XTS)35SnxpYeO?C1y?8#qWpH^8gX$fmMKtK&@vr(*P2D(9breFjB?zr1`tM0wrrudn}>chS~>$7J7 z_gu*u*mssW49VxOPT}<_dctb;&;zf(<*rt+HE=LoOI?hZB?H(PvQsH;WT8^ z4=3usF_=G-AP{g83eh*RZHer2fV;Qr3?I!cqxWStv!*ArE1Sc?rAx=-lccQ`_D#Pa zi6-#rtsUWkFf)@jn5K*R2Z+_+2Y;LcQ|fo;*$?*sE`!eLD2x&}bkg9!pA^=(o@(gJ zxFrlW3OBeP3mY@2)cOWLWfAuAYir8!*i(Y#Hx|}S&*Ov144y9Cwx$kW`LN(O*gE&Q z%k4bVFHC3EZ@hTB0h|XNg=V@8>-1Dsn`a|rz2@;sX(U=^dtW`nb9*qEHULm(XRy9s za#=UXldaylsH6es>b6n>&Z39f7P9#~U>g+S99yIP>HXe?yn7+OQx9$^uBQafMXELR z8jF9G+053b6Q0&onbrV`uM}3F7_~JP+Ay~#zQRS5wIs;YBG*$lC{WXv=U2>c zx<$to(+P0ffw}#IpLWJ5_F5CB*=0IH>a|!$;#pAJ2RH%u7Yd6t%yBRGbXD}J^ccKz zs#+dECN_bK=&ZJGV7)S_;pmW0K{H`?YJ`YrDOzTnPos8vzI7%&4nQTT# zNCz2i!F@*EeP8~`or%u=j7ibBFaWJZzqt^~TKu?`PeR#$Z;+W}GC~fl5Nl{hw) zwx^A6xuDUEU01F%ygj3j$p{q&rl{ErYWz%RRzMIzI(#@-p4#n?Mf2xJDxxp*;7Sjfmq(-ue(dQ2f z^;1>n1%4{irY;t=i@UzCwva-}Pfdwg(m-9udNbfR3*m)FzSv(c_+Kpd{j8++8rOVb zZvCS>2Z8{+hC;RJ^FfYeOO;w7?9?v_2In`bbY%^9l zQ$mC>tfkVEXHHG$7vC=cumva)PP# zXss&pNic!@$nB;I}ZNHkP;J3@xLM#d>r$xp1~1+p#99fP3cMwWcB znvKZ4CiRrvYBf7yO_#fPzc6zjbe|D(rNZPK+iAm%fcqyw!Bi_pZj8YFHV;k@lBNrc!;LM^EH5V?q1Lv7Mz&Md z&+8pz-VnF#`Ca)?_H+WNq4ZduutYS%rAS zTUC-igOMqC5gL7uFj_kA0isv_sX}*1mY1$BuJM5w+GifjyVZ{%cO54G%9;N6hDrS9 zPdRY3NBfxEM)fJo_?=V=XbQMDrqM5A$;ZAKR6+Nz3iBV8c6=&;7zo2ialj@=z}n_M zmjVwY1A2RBzJ>+~G2n0d&n>I?8thpWj_A*7GzQe;aUu0vB+Wk+cfz4CJYoNRC=6fT z`QN*s{r{V-0AC4v!AVFgZ(vAYXc9_X%q&4SV+m=;f0a0V`;T*fe ``` +## Which functions will run on the GPU? + +Generally, `cudf.pandas` will accelerate all the features in the +{ref}`cuDF API ` on the GPU. There are some exceptions. For +example, some functions are GPU-accelerated by cuDF but do not support +every combination of keyword arguments. In cases like unsupported +keyword arguments, cuDF is not able to provide GPU acceleration and +`cudf.pandas` will fall back to the CPU. + +The most accurate way to assess which functions run on the GPU is to try +running the code while using the `cudf.pandas` profiling features. The +profiler will indicate which functions ran on GPU / CPU. To improve +performance, try to use only functionality that can run entirely on GPU. +This helps reduce the number of memory transfers needed to fallback to +CPU. + ## Does it work with third-party libraries? `cudf.pandas` is tested with numerous popular third-party libraries. 
diff --git a/docs/cudf/source/cudf_pandas/usage.md b/docs/cudf/source/cudf_pandas/usage.md
index 376784439aa..0398a8d7086 100644
--- a/docs/cudf/source/cudf_pandas/usage.md
+++ b/docs/cudf/source/cudf_pandas/usage.md
@@ -63,16 +63,22 @@ back to CPU for certain operations.
 Running your code with the `cudf.pandas.profile` magic generates a report
 showing which operations used the GPU and which used the CPU. This can
 help you identify parts of your code that could be rewritten to be more
-GPU-friendly:
+GPU-friendly.
+
+### Using the Function Profiler
+
+First, enable `cudf.pandas`:
 
 ```python
 %load_ext cudf.pandas
 import pandas as pd
 ```
 
+Next, use the IPython/Jupyter magic `cudf.pandas.profile`:
+
 ```python
 %%cudf.pandas.profile
-df = pd.DataFrame({'a': [0, 1, 2], 'b': [3,4,3]})
+df = pd.DataFrame({'a': [0, 1, 2], 'b': [3, 4, 3]})
 
 df.min(axis=1)
 out = df.groupby('a').filter(
@@ -80,13 +86,35 @@ out = df.groupby('a').filter(
 )
 ```
 
+After the cell runs, the profiler prints a report like the one shown below.
+
 ![cudf-pandas-profile](../_static/cudf-pandas-profile.png)
 
 When an operation falls back to using the CPU, it's typically because
 that operation isn't implemented by cuDF. The profiler generates a
 handy link to report the missing functionality to the cuDF team.
 
-To profile a script being run from the command-line, pass the
+### Using the Line Profiler
+
+A line profiler can be activated with the IPython/Jupyter magic `cudf.pandas.line_profile`:
+
+```python
+%%cudf.pandas.line_profile
+df = pd.DataFrame({'a': [0, 1, 2], 'b': [3, 4, 3]})
+
+df.min(axis=1)
+out = df.groupby('a').filter(
+    lambda group: len(group) > 1
+)
+```
+
+The output of the line profiler shows the source code along with how much
+time each line spent executing on the GPU and the CPU.
+
+![cudf-pandas-line-profile](../_static/cudf-pandas-line-profile.png)
+
+### Profiling from the command line
+
+To profile a script being run from the command line, pass the
 `--profile` argument:
 
 ```bash
diff --git a/docs/cudf/source/user_guide/api_docs/index.rst b/docs/cudf/source/user_guide/api_docs/index.rst
index b3442908531..d05501f4a4a 100644
--- a/docs/cudf/source/user_guide/api_docs/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/index.rst
@@ -1,3 +1,5 @@
+.. _cudf-api:
+
 =============
 API reference
 =============
@@ -24,3 +26,4 @@ This page provides a list of all publicly accessible modules, methods and classes
    options
    extension_dtypes
    pylibcudf/index.rst
+   performance_tracking
diff --git a/docs/cudf/source/user_guide/api_docs/performance_tracking.rst b/docs/cudf/source/user_guide/api_docs/performance_tracking.rst
new file mode 100644
index 00000000000..9da79e69fb2
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/performance_tracking.rst
@@ -0,0 +1,12 @@
+.. _api.performance_tracking:
+
+====================
+Performance Tracking
+====================
+
+.. currentmodule:: cudf.utils.performance_tracking
+.. autosummary::
+   :toctree: api/
+
+   get_memory_records
+   print_memory_report
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
index f98298ff052..e9dad705cbf 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
@@ -19,6 +19,7 @@ This page provides API documentation for pylibcudf.
    gpumemoryview
    groupby
    io/index.rst
+   interop
    join
    lists
    merge
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/interop.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/interop.rst
new file mode 100644
index 00000000000..881ab8d7be4
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/interop.rst
@@ -0,0 +1,6 @@
+=======
+interop
+=======
+
+.. automodule:: cudf._lib.pylibcudf.interop
+   :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
index bfaef732555..cecf1ccc9bb 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
@@ -6,3 +6,4 @@ strings
 
    contains
    replace
+   slice
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/slice.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/slice.rst
new file mode 100644
index 00000000000..0ee5af71c03
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/slice.rst
@@ -0,0 +1,6 @@
+=====
+slice
+=====
+
+.. automodule:: cudf._lib.pylibcudf.strings.slice
+   :members:
diff --git a/docs/cudf/source/user_guide/index.md b/docs/cudf/source/user_guide/index.md
index 486368c3b8b..df4e4795a08 100644
--- a/docs/cudf/source/user_guide/index.md
+++ b/docs/cudf/source/user_guide/index.md
@@ -16,5 +16,6 @@ options
 performance-comparisons/index
 PandasCompat
 copy-on-write
+memory-profiling
 pandas-2.0-breaking-changes
 ```
diff --git a/docs/cudf/source/user_guide/memory-profiling.md b/docs/cudf/source/user_guide/memory-profiling.md
new file mode 100644
index 00000000000..ab5433685e6
--- /dev/null
+++ b/docs/cudf/source/user_guide/memory-profiling.md
@@ -0,0 +1,44 @@
+(memory-profiling-user-doc)=
+
+# Memory Profiling
+
+Peak memory usage is a common concern in GPU programming because GPU memory is typically smaller than available CPU memory. To easily identify memory hotspots, cuDF provides a memory profiler. Profiling adds overhead, so avoid using it in performance-sensitive code.
+
+## Enabling Memory Profiling
+
+First, enable memory profiling in RMM by calling {py:func}`rmm.statistics.enable_statistics()`. This adds a statistics resource adaptor to the current RMM memory resource, which enables cuDF to access memory profiling information. See the [RMM documentation](https://docs.rapids.ai/api/rmm/stable/guide/#memory-statistics-and-profiling) for more details.
+
+Second, enable memory profiling in cuDF by setting the `memory_profiling` option to `True`. Use {py:func}`cudf.set_option` or set the environment variable ``CUDF_MEMORY_PROFILING=1`` prior to the launch of the Python interpreter.
+
+To get the result of the profiling, use {py:func}`cudf.utils.performance_tracking.print_memory_report` or access the raw profiling data using {py:func}`cudf.utils.performance_tracking.get_memory_records`.
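+
+For example, a minimal whole-script invocation using the environment variable (a sketch assuming a hypothetical `my_script.py` that itself calls {py:func}`rmm.statistics.enable_statistics()`): `CUDF_MEMORY_PROFILING=1 python my_script.py`.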
+ +### Example +In the following, we enable profiling, do some work, and then print the profiling results: + +```python +>>> import cudf +>>> from cudf.utils.performance_tracking import print_memory_report +>>> from rmm.statistics import enable_statistics +>>> enable_statistics() +>>> cudf.set_option("memory_profiling", True) +>>> cudf.DataFrame({"a": [1, 2, 3]}) # Some work + a +0 1 +1 2 +2 3 +>>> print_memory_report() # Pretty print the result of the profiling +Memory Profiling +================ + +Legends: +ncalls - number of times the function or code block was called +memory_peak - peak memory allocated in function or code block (in bytes) +memory_total - total memory allocated in function or code block (in bytes) + +Ordered by: memory_peak + +ncalls memory_peak memory_total filename:lineno(function) + 1 32 32 cudf/core/dataframe.py:690(DataFrame.__init__) + 2 0 0 cudf/core/index.py:214(RangeIndex.__init__) + 6 0 0 cudf/core/index.py:424(RangeIndex.__len__) +``` diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 413eef1d762..047ed27329a 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -9,10 +9,6 @@ from libcpp.utility cimport move from cudf._lib.column cimport Column from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.lists.contains cimport ( - contains, - index_of as cpp_index_of, -) from cudf._lib.pylibcudf.libcudf.lists.extract cimport extract_list_element from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( lists_column_view, @@ -23,7 +19,6 @@ from cudf._lib.pylibcudf.libcudf.lists.sorting cimport ( from cudf._lib.pylibcudf.libcudf.lists.stream_compaction cimport ( distinct as cpp_distinct, ) -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar from cudf._lib.pylibcudf.libcudf.types cimport ( nan_equality, null_equality, @@ -31,11 +26,12 @@ from cudf._lib.pylibcudf.libcudf.types cimport ( order, size_type, ) -from cudf._lib.scalar cimport DeviceScalar from cudf._lib.utils cimport columns_from_pylibcudf_table from cudf._lib import pylibcudf +from cudf._lib.pylibcudf cimport Scalar + @acquire_spill_lock() def count_elements(Column col): @@ -141,64 +137,36 @@ def extract_element_column(Column col, Column index): @acquire_spill_lock() -def contains_scalar(Column col, object py_search_key): - - cdef DeviceScalar search_key = py_search_key.device_value - - cdef shared_ptr[lists_column_view] list_view = ( - make_shared[lists_column_view](col.view()) +def contains_scalar(Column col, py_search_key): + return Column.from_pylibcudf( + pylibcudf.lists.contains( + col.to_pylibcudf(mode="read"), + py_search_key.device_value.c_value, + ) ) - cdef const scalar* search_key_value = search_key.get_raw_ptr() - - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(contains( - list_view.get()[0], - search_key_value[0], - )) - result = Column.from_unique_ptr(move(c_result)) - return result @acquire_spill_lock() def index_of_scalar(Column col, object py_search_key): - - cdef DeviceScalar search_key = py_search_key.device_value - - cdef shared_ptr[lists_column_view] list_view = ( - make_shared[lists_column_view](col.view()) + return Column.from_pylibcudf( + pylibcudf.lists.index_of( + col.to_pylibcudf(mode="read"), + py_search_key.device_value.c_value, + True, + ) ) - cdef const scalar* search_key_value = search_key.get_raw_ptr() - - cdef unique_ptr[column] c_result - - with nogil: - 
c_result = move(cpp_index_of( - list_view.get()[0], - search_key_value[0], - )) - return Column.from_unique_ptr(move(c_result)) @acquire_spill_lock() def index_of_column(Column col, Column search_keys): - - cdef column_view keys_view = search_keys.view() - - cdef shared_ptr[lists_column_view] list_view = ( - make_shared[lists_column_view](col.view()) + return Column.from_pylibcudf( + pylibcudf.lists.index_of( + col.to_pylibcudf(mode="read"), + search_keys.to_pylibcudf(mode="read"), + True, + ) ) - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_index_of( - list_view.get()[0], - keys_view, - )) - return Column.from_unique_ptr(move(c_result)) - @acquire_spill_lock() def concatenate_rows(list source_columns): diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 7914ed7e9d9..d1ec5be9e62 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -20,6 +20,7 @@ from cudf.api.types import is_list_like from cudf._lib.utils cimport data_from_unique_ptr +from cudf._lib import pylibcudf from cudf._lib.utils import _index_level_name, generate_pandas_metadata from libc.stdint cimport uint8_t @@ -70,8 +71,11 @@ from cudf._lib.utils cimport table_view_from_table from pyarrow.lib import NativeFile +from cudf._lib.concat import concat_columns from cudf.utils.ioutils import _ROW_GROUP_SIZE_BYTES_DEFAULT +from cudf._lib.utils cimport data_from_pylibcudf_table + cdef class BufferArrayFromVector: cdef Py_ssize_t length @@ -878,14 +882,32 @@ cdef class ParquetReader: return df def read(self): - dfs = [] + dfs = self._read_chunk() + column_names = dfs._column_names + concatenated_columns = list(dfs._columns) + del dfs while self._has_next(): - dfs.append(self._read_chunk()) - df = cudf.concat(dfs) - df = _process_metadata(df, self.result_meta, self.names, self.row_groups, - self.filepaths_or_buffers, self.pa_buffers, - self.allow_range_index, self.cpp_use_pandas_metadata) - return df + new_chunk = list(self._read_chunk()._columns) + for i in range(len(column_names)): + concatenated_columns[i] = concat_columns( + [concatenated_columns[i], new_chunk[i]] + ) + # Must drop any residual GPU columns to save memory + new_chunk[i] = None + + dfs = cudf.DataFrame._from_data( + *data_from_pylibcudf_table( + pylibcudf.Table( + [col.to_pylibcudf(mode="read") for col in concatenated_columns] + ), + column_names=column_names, + index_names=None + ) + ) + + return _process_metadata(dfs, self.result_meta, self.names, self.row_groups, + self.filepaths_or_buffers, self.pa_buffers, + self.allow_range_index, self.cpp_use_pandas_metadata) cpdef merge_filemetadata(object filemetadata_list): """ diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pxd b/python/cudf/cudf/_lib/pylibcudf/column.pxd index e121e856865..d13791d95cf 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/column.pxd @@ -8,6 +8,9 @@ from cudf._lib.pylibcudf.libcudf.column.column_view cimport ( column_view, mutable_column_view, ) +from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( + lists_column_view, +) from cudf._lib.pylibcudf.libcudf.types cimport bitmask_type, size_type from .gpumemoryview cimport gpumemoryview @@ -56,3 +59,4 @@ cdef class ListColumnView: cdef Column _column cpdef child(self) cpdef offsets(self) + cdef lists_column_view view(self) nogil diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx index e726eca154f..e0cf8b7ee32 100644 --- 
a/python/cudf/cudf/_lib/pylibcudf/column.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/column.pyx
@@ -348,6 +348,15 @@ cdef class ListColumnView:
         """The offsets column of the underlying list column."""
         return self._column.child(1)
 
+    cdef lists_column_view view(self) nogil:
+        """Generate a libcudf lists_column_view to pass to libcudf algorithms.
+
+        This method is for pylibcudf's functions to use to generate inputs when
+        calling libcudf algorithms, and should generally not be needed by users
+        (even direct pylibcudf Cython users).
+        """
+        return lists_column_view(self._column.view())
+
 
 @functools.cache
 def _datatype_from_dtype_desc(desc):
diff --git a/python/cudf/cudf/_lib/pylibcudf/join.pxd b/python/cudf/cudf/_lib/pylibcudf/join.pxd
index f560eeef06d..83b4776c16e 100644
--- a/python/cudf/cudf/_lib/pylibcudf/join.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/join.pxd
@@ -35,3 +35,5 @@ cpdef Column left_anti_join(
     Table right_keys,
     null_equality nulls_equal
 )
+
+cpdef Table cross_join(Table left, Table right)
diff --git a/python/cudf/cudf/_lib/pylibcudf/join.pyx b/python/cudf/cudf/_lib/pylibcudf/join.pyx
index cf2a6a8187f..308b1b39291 100644
--- a/python/cudf/cudf/_lib/pylibcudf/join.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/join.pyx
@@ -2,13 +2,14 @@
 
 from cython.operator import dereference
 
-from libcpp.memory cimport make_unique
+from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.utility cimport move
 
 from rmm._lib.device_buffer cimport device_buffer
 
 from cudf._lib.pylibcudf.libcudf cimport join as cpp_join
 from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
 from cudf._lib.pylibcudf.libcudf.types cimport (
     data_type,
     null_equality,
@@ -88,7 +89,6 @@ cpdef tuple left_join(
     nulls_equal : NullEquality
         Should nulls compare equal?
 
-
     Returns
     -------
     Tuple[Column, Column]
@@ -122,7 +122,6 @@ cpdef tuple full_join(
     nulls_equal : NullEquality
         Should nulls compare equal?
 
-
     Returns
     -------
     Tuple[Column, Column]
@@ -156,7 +155,6 @@ cpdef Column left_semi_join(
     nulls_equal : NullEquality
         Should nulls compare equal?
 
-
     Returns
     -------
     Column
@@ -190,7 +188,6 @@ cpdef Column left_anti_join(
     nulls_equal : NullEquality
         Should nulls compare equal?
 
-
     Returns
     -------
     Column
@@ -204,3 +201,26 @@ cpdef Column left_anti_join(
         nulls_equal
     )
     return _column_from_gather_map(move(c_result))
+
+
+cpdef Table cross_join(Table left, Table right):
+    """Perform a cross join on two tables.
+
+    For details see :cpp:func:`cross_join`.
+
+    Parameters
+    ----------
+    left : Table
+        The left table to join.
+    right : Table
+        The right table to join.
+
+    Returns
+    -------
+    Table
+        The result of cross joining the two inputs.
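+
+    Examples
+    --------
+    A minimal sketch; assumes ``left`` and ``right`` are existing pylibcudf
+    ``Table`` objects built elsewhere:
+
+    >>> joined = cross_join(left, right)
+    >>> # every row of ``left`` is paired with every row of ``right``
+    >>> assert joined.num_rows() == left.num_rows() * right.num_rows()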
+ """ + cdef unique_ptr[table] result + with nogil: + result = move(cpp_join.cross_join(left.view(), right.view())) + return Table.from_libcudf(move(result)) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/join.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/join.pxd index 89a30f0f255..32cd17f7c11 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/join.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/join.pxd @@ -70,3 +70,8 @@ cdef extern from "cudf/join.hpp" namespace "cudf" nogil: const table_view right_keys, null_equality nulls_equal, ) except + + + cdef unique_ptr[table] cross_join( + const table_view left, + const table_view right, + ) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd index 721679f35c7..82aed7d70a0 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. +from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr from cudf._lib.exception_handler cimport cudf_exception_handler @@ -12,17 +13,33 @@ from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar cdef extern from "cudf/lists/contains.hpp" namespace "cudf::lists" nogil: + + cpdef enum class duplicate_find_option(int32_t): + FIND_FIRST + FIND_LAST + cdef unique_ptr[column] contains( - lists_column_view lists, - scalar search_key, + const lists_column_view& lists, + const scalar& search_key, + ) except +cudf_exception_handler + + cdef unique_ptr[column] contains( + const lists_column_view& lists, + const column_view& search_keys, + ) except +cudf_exception_handler + + cdef unique_ptr[column] contains_nulls( + const lists_column_view& lists, ) except +cudf_exception_handler cdef unique_ptr[column] index_of( - lists_column_view lists, - scalar search_key, + const lists_column_view& lists, + const scalar& search_key, + duplicate_find_option find_option, ) except +cudf_exception_handler cdef unique_ptr[column] index_of( - lists_column_view lists, - column_view search_keys, + const lists_column_view& lists, + const column_view& search_keys, + duplicate_find_option find_option, ) except +cudf_exception_handler diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd index dbafc415e45..fd21e7b334b 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd @@ -9,6 +9,7 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/lists/lists_column_view.hpp" namespace "cudf" nogil: cdef cppclass lists_column_view(column_view): + lists_column_view() except + lists_column_view(const column_view& lists_column) except + column_view parent() except + column_view offsets() except + diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd index 5c4e5bf346f..c8220df8938 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd @@ -8,3 +8,4 @@ from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar cdef extern from "cudf/scalar/scalar_factories.hpp" namespace "cudf" nogil: cdef unique_ptr[scalar] make_string_scalar(const string & _string) except + + cdef unique_ptr[scalar] 
make_fixed_width_scalar[T](T value) except +
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd
index 74fdc6e2b2f..2ea83cc4c41 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd
@@ -5,8 +5,12 @@ from libcpp cimport bool
 from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 from .column cimport Column
+from .scalar cimport Scalar
 from .table cimport Table
 
+ctypedef fused ColumnOrScalar:
+    Column
+    Scalar
 
 cpdef Table explode_outer(Table, size_type explode_column_idx)
 
@@ -14,4 +18,10 @@ cpdef Column concatenate_rows(Table)
 
 cpdef Column concatenate_list_elements(Column, bool dropna)
 
+cpdef Column contains(Column, ColumnOrScalar)
+
+cpdef Column contains_nulls(Column)
+
+cpdef Column index_of(Column, ColumnOrScalar, bool)
+
 cpdef Column count_elements(Column)
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx
index 598bbda0a4a..92fc0d6aa33 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx
@@ -1,11 +1,15 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
+from cython.operator cimport dereference
 from libcpp cimport bool
-from libcpp.memory cimport make_shared, shared_ptr, unique_ptr
+from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
 from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.lists cimport explode as cpp_explode
+from cudf._lib.pylibcudf.libcudf.lists cimport (
+    contains as cpp_contains,
+    explode as cpp_explode,
+)
 from cudf._lib.pylibcudf.libcudf.lists.combine cimport (
     concatenate_list_elements as cpp_concatenate_list_elements,
     concatenate_null_policy,
@@ -14,13 +18,12 @@ from cudf._lib.pylibcudf.libcudf.lists.combine cimport (
 from cudf._lib.pylibcudf.libcudf.lists.count_elements cimport (
     count_elements as cpp_count_elements,
 )
-from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
-    lists_column_view,
-)
 from cudf._lib.pylibcudf.libcudf.table.table cimport table
 from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from cudf._lib.pylibcudf.lists cimport ColumnOrScalar
 
-from .column cimport Column
+from .column cimport Column, ListColumnView
+from .scalar cimport Scalar
 from .table cimport Table
 
 
@@ -77,15 +80,15 @@ cpdef Column concatenate_list_elements(Column input, bool dropna):
     ----------
     input : Column
         The input column
+    dropna : bool
+        If true, null list elements will be ignored during
+        concatenation. Otherwise any input null values will result in
+        the corresponding output row being set to null.
 
     Returns
     -------
     Column
         A new Column of concatenated list elements
-    dropna : bool
-        If true, null list elements will be ignored
-        from concatenation. Otherwise any input null values will result in
-        the corresponding output row being set to null.
     """
     cdef concatenate_null_policy null_policy = (
         concatenate_null_policy.IGNORE if dropna
@@ -102,15 +105,130 @@ cpdef Column concatenate_list_elements(Column input, bool dropna):
     return Column.from_libcudf(move(c_result))
 
 
-cpdef Column count_elements(Column input):
-    # shared_ptr required because lists_column_view has no default
-    # ctor
-    cdef shared_ptr[lists_column_view] list_view = (
-        make_shared[lists_column_view](col.view())
+cpdef Column contains(Column input, ColumnOrScalar search_key):
+    """Create a column of bool values indicating whether
+    the search_key is contained in the input.
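+    If ``search_key`` is a Column, each of its rows is searched for in the
+    corresponding list row of ``input``.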
+
+    ``search_key`` may be a
+    :py:class:`~cudf._lib.pylibcudf.column.Column` or a
+    :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`.
+
+    For details, see :cpp:func:`contains`.
+
+    Parameters
+    ----------
+    input : Column
+        The input column.
+    search_key : Union[Column, Scalar]
+        The search key.
+
+    Returns
+    -------
+    Column
+        A new Column of bools indicating if the search_key was
+        found in the list column.
+    """
+    cdef unique_ptr[column] c_result
+    cdef ListColumnView list_view = input.list_view()
+
+    if not isinstance(search_key, (Column, Scalar)):
+        raise TypeError("Must pass a Column or Scalar")
+
+    with nogil:
+        c_result = move(cpp_contains.contains(
+            list_view.view(),
+            search_key.view() if ColumnOrScalar is Column else dereference(
+                search_key.get()
+            ),
+        ))
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column contains_nulls(Column input):
+    """Create a column of bool values indicating whether
+    each row in the lists column contains a null value.
+
+    Parameters
+    ----------
+    input : Column
+        The input column.
+
+    Returns
+    -------
+    Column
+        A new Column of bools indicating if the list column
+        contains a null value.
+    """
+    cdef unique_ptr[column] c_result
+    cdef ListColumnView list_view = input.list_view()
+    with nogil:
+        c_result = move(cpp_contains.contains_nulls(list_view.view()))
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_option):
+    """Create a column of index values indicating the position of a search
+    key row within the corresponding list row in the lists column.
+
+    ``search_key`` may be a
+    :py:class:`~cudf._lib.pylibcudf.column.Column` or a
+    :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`.
+
+    For details, see :cpp:func:`index_of`.
+
+    Parameters
+    ----------
+    input : Column
+        The input column.
+    search_key : Union[Column, Scalar]
+        The search key.
+    find_first_option : bool
+        If true, index_of returns the first match.
+        Otherwise the last match is returned.
+
+    Returns
+    -------
+    Column
+        A new Column of index values that indicate where in the
+        list column the search_key was found. An index value
+        of -1 indicates that the search_key was not found.
+    """
+    cdef unique_ptr[column] c_result
+    cdef ListColumnView list_view = input.list_view()
+    cdef cpp_contains.duplicate_find_option find_option = (
+        cpp_contains.duplicate_find_option.FIND_FIRST if find_first_option
+        else cpp_contains.duplicate_find_option.FIND_LAST
+    )
+
+    with nogil:
+        c_result = move(cpp_contains.index_of(
+            list_view.view(),
+            search_key.view() if ColumnOrScalar is Column else dereference(
+                search_key.get()
+            ),
+            find_option,
+        ))
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column count_elements(Column input):
+    """Count the number of rows in each
+    list element in the given lists column.
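+
+    For details, see :cpp:func:`count_elements`.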
+ + Parameters + ---------- + input : Column + The input column + + Returns + ------- + Column + A new Column of the lengths of each list element + """ + cdef ListColumnView list_view = input.list_view() cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_count_elements(list_view.get()[0])) + c_result = move(cpp_count_elements(list_view.view())) return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt index cb7f71b1912..b499a127541 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt @@ -13,7 +13,7 @@ # ============================================================================= set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx find.pyx regex_flags.pyx - regex_program.pyx replace.pyx + regex_program.pyx replace.pyx slice.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd index 959aa94737d..d1f632d6d8e 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd @@ -9,4 +9,5 @@ from . cimport ( regex_flags, regex_program, replace, + slice, ) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py index b7384913286..ef102aff2af 100644 --- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py @@ -9,4 +9,5 @@ regex_flags, regex_program, replace, + slice, ) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/slice.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/slice.pxd new file mode 100644 index 00000000000..7d8d0006ef4 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/slice.pxd @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cudf._lib.pylibcudf.column cimport Column +from cudf._lib.pylibcudf.scalar cimport Scalar + +ctypedef fused ColumnOrScalar: + Column + Scalar + +cpdef Column slice_strings( + Column input, + ColumnOrScalar start=*, + ColumnOrScalar stop=*, + Scalar step=* +) diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx new file mode 100644 index 00000000000..df75134fb71 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx @@ -0,0 +1,102 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.pylibcudf.column cimport Column +from cudf._lib.pylibcudf.libcudf.column.column cimport column +from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport numeric_scalar +from cudf._lib.pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_fixed_width_scalar as cpp_make_fixed_width_scalar, +) +from cudf._lib.pylibcudf.libcudf.strings cimport substring as cpp_slice +from cudf._lib.pylibcudf.libcudf.types cimport size_type +from cudf._lib.pylibcudf.scalar cimport Scalar + +from cython.operator import dereference + + +cpdef Column slice_strings( + Column input, + ColumnOrScalar start=None, + ColumnOrScalar stop=None, + Scalar step=None +): + """Perform a slice operation on a strings column. + + ``start`` and ``stop`` may be a + :py:class:`~cudf._lib.pylibcudf.column.Column` or a + :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`. But ``step`` must be a + :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`. 
+ + For details, see :cpp:func:`cudf::strings::slice_strings`. + + Parameters + ---------- + input : Column + Strings column for this operation. + start : Union[Column, Scalar] + The start character position or positions. + stop : Union[Column, Scalar] + The end character position or positions. + step : Scalar + Distance between input characters retrieved. + + Returns + ------- + pylibcudf.Column + The result of the slice operation. + """ + cdef unique_ptr[column] c_result + cdef numeric_scalar[size_type]* cpp_start + cdef numeric_scalar[size_type]* cpp_stop + cdef numeric_scalar[size_type]* cpp_step + + if input is None: + raise ValueError("input cannot be None") + + if ColumnOrScalar is Column: + if step is not None: + raise ValueError("Column-wise slice does not support step") + + if start is None or stop is None: + raise ValueError( + "start and stop must be provided for Column-wise slice" + ) + + with nogil: + c_result = cpp_slice.slice_strings( + input.view(), + start.view(), + stop.view() + ) + + elif ColumnOrScalar is Scalar: + if start is None: + start = Scalar.from_libcudf( + cpp_make_fixed_width_scalar(0) + ) + if stop is None: + stop = Scalar.from_libcudf( + cpp_make_fixed_width_scalar(0) + ) + if step is None: + step = Scalar.from_libcudf( + cpp_make_fixed_width_scalar(1) + ) + + cpp_start = start.c_obj.get() + cpp_stop = stop.c_obj.get() + cpp_step = step.c_obj.get() + + with nogil: + c_result = cpp_slice.slice_strings( + input.view(), + dereference(cpp_start), + dereference(cpp_stop), + dereference(cpp_step) + ) + else: + raise ValueError("start, stop, and step must be either Column or Scalar") + + return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pyx b/python/cudf/cudf/_lib/pylibcudf/table.pyx index d93ac78721b..d91fa0474b0 100644 --- a/python/cudf/cudf/_lib/pylibcudf/table.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/table.pyx @@ -83,6 +83,8 @@ cdef class Table: cpdef int num_rows(self): """The number of rows in this table.""" + if self.num_columns() == 0: + return 0 return self._columns[0].size() cpdef list columns(self): diff --git a/python/cudf/cudf/_lib/strings/substring.pyx b/python/cudf/cudf/_lib/strings/substring.pyx index 170c1016b89..706c21c0634 100644 --- a/python/cudf/cudf/_lib/strings/substring.pyx +++ b/python/cudf/cudf/_lib/strings/substring.pyx @@ -2,24 +2,16 @@ import numpy as np -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.strings.substring cimport ( - slice_strings as cpp_slice_strings, -) -from cudf._lib.pylibcudf.libcudf.types cimport size_type from cudf._lib.scalar import as_device_scalar -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport numeric_scalar from cudf._lib.scalar cimport DeviceScalar +import cudf._lib.pylibcudf as plc + @acquire_spill_lock() def slice_strings(Column source_strings, @@ -32,30 +24,18 @@ def slice_strings(Column source_strings, performed in steps by skipping `step` number of characters in a string.
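A short sketch of the scalar path through the new slice.pyx above (assumes the pylibcudf interop helpers; the scalars must be int32 to match libcudf's size_type, as the `numeric_scalar[size_type]` pointers above require):

    import pyarrow as pa
    import cudf._lib.pylibcudf as plc

    s = plc.interop.from_arrow(pa.array(["hello", "world"]))
    start = plc.interop.from_arrow(pa.scalar(1, type=pa.int32()))
    stop = plc.interop.from_arrow(pa.scalar(4, type=pa.int32()))
    step = plc.interop.from_arrow(pa.scalar(1, type=pa.int32()))

    plc.strings.slice.slice_strings(s, start, stop, step)  # ["ell", "orl"]

    # The Column overload takes per-row start/stop columns and no step:
    starts = plc.interop.from_arrow(pa.array([0, 1], type=pa.int32()))
    stops = plc.interop.from_arrow(pa.array([2, 3], type=pa.int32()))
    plc.strings.slice.slice_strings(s, starts, stops)      # ["he", "or"]

The table.pyx hunk above separately guards Table.num_rows() so that a zero-column table reports 0 rows instead of indexing into an empty column list.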
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef DeviceScalar start_scalar = as_device_scalar(start, np.int32) cdef DeviceScalar end_scalar = as_device_scalar(end, np.int32) cdef DeviceScalar step_scalar = as_device_scalar(step, np.int32) - cdef numeric_scalar[size_type]* start_numeric_scalar = \ - ( - start_scalar.get_raw_ptr()) - cdef numeric_scalar[size_type]* end_numeric_scalar = \ - (end_scalar.get_raw_ptr()) - cdef numeric_scalar[size_type]* step_numeric_scalar = \ - (step_scalar.get_raw_ptr()) - - with nogil: - c_result = move(cpp_slice_strings( - source_view, - start_numeric_scalar[0], - end_numeric_scalar[0], - step_numeric_scalar[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.strings.slice.slice_strings( + source_strings.to_pylibcudf(mode="read"), + start_scalar.c_value, + end_scalar.c_value, + step_scalar.c_value + ) + ) @acquire_spill_lock() @@ -67,19 +47,13 @@ def slice_from(Column source_strings, at given starts and stops positions. `starts` and `stops` here are positions per element in the string-column. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef column_view starts_view = starts.view() - cdef column_view stops_view = stops.view() - - with nogil: - c_result = move(cpp_slice_strings( - source_view, - starts_view, - stops_view - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.strings.slice.slice_strings( + source_strings.to_pylibcudf(mode="read"), + starts.to_pylibcudf(mode="read"), + stops.to_pylibcudf(mode="read") + ) + ) @acquire_spill_lock() @@ -90,8 +64,7 @@ def get(Column source_strings, character from each input string. The index of characters required can be controlled by passing `index`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() + if index < 0: next_index = index - 1 step = -1 @@ -102,20 +75,11 @@ def get(Column source_strings, cdef DeviceScalar end_scalar = as_device_scalar(next_index, np.int32) cdef DeviceScalar step_scalar = as_device_scalar(step, np.int32) - cdef numeric_scalar[size_type]* start_numeric_scalar = \ - ( - start_scalar.get_raw_ptr()) - cdef numeric_scalar[size_type]* end_numeric_scalar = \ - (end_scalar.get_raw_ptr()) - cdef numeric_scalar[size_type]* step_numeric_scalar = \ - (step_scalar.get_raw_ptr()) - - with nogil: - c_result = move(cpp_slice_strings( - source_view, - start_numeric_scalar[0], - end_numeric_scalar[0], - step_numeric_scalar[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.strings.slice.slice_strings( + source_strings.to_pylibcudf(mode="read"), + start_scalar.c_value, + end_scalar.c_value, + step_scalar.c_value + ) + ) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index caf07b286cd..e160fa697ee 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1104,7 +1104,11 @@ def difference(self, other, sort=None): f"of [None, False, True]; {sort} was passed." 
) - other = cudf.Index(other, name=getattr(other, "name", self.name)) + if not isinstance(other, BaseIndex): + other = cudf.Index( + other, + name=getattr(other, "name", self.name), + ) if not len(other): res = self._get_reconciled_name_object(other).unique() diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 51a32e29886..e8b82ff60c2 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -6,7 +6,7 @@ from cudf.core.column import as_column from cudf.core.copy_types import BooleanMask -from cudf.core.index import Index, RangeIndex +from cudf.core.index import RangeIndex, ensure_index from cudf.core.indexed_frame import IndexedFrame from cudf.core.scalar import Scalar from cudf.options import get_option @@ -107,7 +107,7 @@ def factorize(values, sort=False, use_na_sentinel=True, size_hint=None): dtype="int64" if get_option("mode.pandas_compatible") else None, ).values - return labels, cats.values if return_cupy_array else Index(cats) + return labels, cats.values if return_cupy_array else ensure_index(cats) def _linear_interpolation(column, index=None): diff --git a/python/cudf/cudf/core/buffer/spill_manager.py b/python/cudf/cudf/core/buffer/spill_manager.py index 762cd7f9e86..ed351a6b107 100644 --- a/python/cudf/cudf/core/buffer/spill_manager.py +++ b/python/cudf/cudf/core/buffer/spill_manager.py @@ -18,14 +18,14 @@ import rmm.mr from cudf.options import get_option -from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.string import format_bytes if TYPE_CHECKING: from cudf.core.buffer.spillable_buffer import SpillableBufferOwner _spill_cudf_nvtx_annotate = partial( - _cudf_nvtx_annotate, domain="cudf_python-spill" + _performance_tracking, domain="cudf_python-spill" ) diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index eb57a371965..4c9e524ee05 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -10,6 +10,7 @@ from typing import TYPE_CHECKING, Any, Literal import numpy +import nvtx from typing_extensions import Self import rmm @@ -21,7 +22,7 @@ host_memory_allocation, ) from cudf.core.buffer.exposure_tracked_buffer import ExposureTrackedBuffer -from cudf.utils.nvtx_annotation import _get_color_for_nvtx, annotate +from cudf.utils.performance_tracking import _get_color_for_nvtx from cudf.utils.string import format_bytes if TYPE_CHECKING: @@ -200,7 +201,7 @@ def spill(self, target: str = "cpu") -> None: ) if (ptr_type, target) == ("gpu", "cpu"): - with annotate( + with nvtx.annotate( message="SpillDtoH", color=_get_color_for_nvtx("SpillDtoH"), domain="cudf_python-spill", @@ -218,7 +219,7 @@ def spill(self, target: str = "cpu") -> None: # trigger a new call to this buffer's `spill()`. # Therefore, it is important that spilling-on-demand doesn't # try to unspill an already locked buffer! 
- with annotate( + with nvtx.annotate( message="SpillHtoD", color=_get_color_for_nvtx("SpillHtoD"), domain="cudf_python-spill", diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index dfcdfbb9d91..5db6fd904a9 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -927,13 +927,13 @@ def is_unique(self) -> bool: @property def is_monotonic_increasing(self) -> bool: - return not self.has_nulls() and libcudf.sort.is_sorted( + return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted( [self], [True], None ) @property def is_monotonic_decreasing(self) -> bool: - return not self.has_nulls() and libcudf.sort.is_sorted( + return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted( [self], [False], None ) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 1bf9a393566..f30a557efb0 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -472,6 +472,7 @@ def swaplevel(self, i=-2, j=-1): new_keys[n][i], new_keys[n][j] = row[j], row[i] new_dict.update({row: tuple(new_keys[n])}) + # TODO: Change to deep=False when copy-on-write is default new_data = {new_dict[k]: v.copy(deep=True) for k, v in self.items()} # swap level_names for i and j @@ -669,10 +670,11 @@ def rename_column(x): raise ValueError("Duplicate column names are not allowed") data = dict(zip(new_col_names, self.values())) - return self.__class__( + return type(self)( data=data, level_names=self.level_names, multiindex=self.multiindex, + label_dtype=self.label_dtype, verify=False, ) diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index 54c5e829e8a..d9f62f51f92 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -292,7 +292,7 @@ def cut( ) # we return a categorical index, as we don't have a Categorical method - categorical_index = cudf.Index(col) + categorical_index = cudf.CategoricalIndex._from_data({None: col}) if isinstance(orig_x, (pd.Series, cudf.Series)): # if we have a series input we return a series output diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index f0d8157011d..4dfeb68b7ba 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -58,7 +58,12 @@ from cudf.core.column_accessor import ColumnAccessor from cudf.core.copy_types import BooleanMask from cudf.core.groupby.groupby import DataFrameGroupBy, groupby_doc_template -from cudf.core.index import BaseIndex, RangeIndex, _index_from_data, as_index +from cudf.core.index import ( + BaseIndex, + RangeIndex, + _index_from_data, + ensure_index, +) from cudf.core.indexed_frame import ( IndexedFrame, _FrameIndexer, @@ -83,7 +88,7 @@ min_scalar_type, numeric_normalize_types, ) -from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import GetAttrGetItemMixin, _external_only_api if TYPE_CHECKING: @@ -145,7 +150,7 @@ def __setitem__(self, key, value): key = (key, slice(None)) return self._setitem_tuple_arg(key, value) - @_cudf_nvtx_annotate + @_performance_tracking def _can_downcast_to_series(self, df, arg): """ This method encapsulates the logic used @@ -188,7 +193,7 @@ def _can_downcast_to_series(self, df, arg): return True return False - @_cudf_nvtx_annotate + @_performance_tracking def _downcast_to_series(self, df, arg): """ "Downcast" from a DataFrame to a Series 
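A small illustration of the `is_monotonic_*` change above: with `has_nulls(include_nan=True)`, a NaN kept as a value (rather than converted to a null) now disqualifies the column from being reported as sorted. A sketch, assuming construction with `nan_as_null=False`:

    import cudf

    s = cudf.Series([1.0, float("nan"), 3.0], nan_as_null=False)
    s.is_monotonic_increasing  # False: the NaN now counts as missing for this check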
@@ -233,11 +238,11 @@ class _DataFrameLocIndexer(_DataFrameIndexer): For selection by label. """ - @_cudf_nvtx_annotate + @_performance_tracking def _getitem_scalar(self, arg): return self._frame[arg[1]].loc[arg[0]] - @_cudf_nvtx_annotate + @_performance_tracking def _getitem_tuple_arg(self, arg): from uuid import uuid4 @@ -338,7 +343,7 @@ def _getitem_tuple_arg(self, arg): range(len(tmp_arg[0])) ) }, - index=as_index(tmp_arg[0]), + index=cudf.Index(tmp_arg[0]), ) columns_df[cantor_name] = column.as_column( range(len(columns_df)) @@ -363,7 +368,7 @@ def _getitem_tuple_arg(self, arg): return self._downcast_to_series(df, arg) return df - @_cudf_nvtx_annotate + @_performance_tracking def _setitem_tuple_arg(self, key, value): if ( isinstance(self._frame.index, MultiIndex) @@ -532,7 +537,7 @@ def __getitem__(self, arg): return frame._empty_like(keep_index=True) assert_never(row_spec) - @_cudf_nvtx_annotate + @_performance_tracking def _setitem_tuple_arg(self, key, value): columns_df = self._frame._from_data( self._frame._data.select_by_index(key[1]), self._frame.index @@ -677,7 +682,7 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): _groupby = DataFrameGroupBy _resampler = DataFrameResampler - @_cudf_nvtx_annotate + @_performance_tracking def __init__( self, data=None, @@ -702,7 +707,7 @@ def __init__( data = data.reindex(index) index = data.index else: - index = cudf.Index(index) + index = ensure_index(index) else: index = data.index @@ -751,7 +756,7 @@ def __init__( if index is None: self._index = RangeIndex(0) else: - self._index = cudf.Index(index) + self._index = ensure_index(index) if columns is not None: rangeindex = isinstance( columns, (range, pd.RangeIndex, cudf.RangeIndex) @@ -859,7 +864,7 @@ def __init__( columns, pd.MultiIndex ) - @_cudf_nvtx_annotate + @_performance_tracking def _init_from_series_list(self, data, columns, index): if index is None: # When `index` is `None`, the final index of @@ -909,7 +914,7 @@ def _init_from_series_list(self, data, columns, index): f"not match length of index ({index_length})" ) - final_index = cudf.Index(index) + final_index = ensure_index(index) series_lengths = list(map(len, data)) data = numeric_normalize_types(*data) @@ -972,14 +977,14 @@ def _init_from_series_list(self, data, columns, index): else: self._data.rangeindex = True - @_cudf_nvtx_annotate + @_performance_tracking def _init_from_list_like(self, data, index=None, columns=None): if index is None: index = RangeIndex(start=0, stop=len(data)) else: - index = cudf.Index(index) + index = ensure_index(index) - self._index = cudf.Index(index) + self._index = index # list-of-dicts case if len(data) > 0 and isinstance(data[0], dict): data = DataFrame.from_pandas(pd.DataFrame(data)) @@ -1030,7 +1035,7 @@ def _init_from_list_like(self, data, index=None, columns=None): ) self._data.label_dtype = getattr(columns, "dtype", None) - @_cudf_nvtx_annotate + @_performance_tracking def _init_from_dict_like( self, data, index=None, columns=None, nan_as_null=None ): @@ -1085,7 +1090,7 @@ def _init_from_dict_like( self._index = RangeIndex(0, num_rows) else: - self._index = cudf.Index(index) + self._index = ensure_index(index) if len(data): self._data.multiindex = True @@ -1119,10 +1124,8 @@ def _from_data( return out @staticmethod - @_cudf_nvtx_annotate + @_performance_tracking def _align_input_series_indices(data, index): - data = data.copy() - input_series = [ Series(val) for val in data.values() @@ -1142,6 +1145,7 @@ def _align_input_series_indices(data, index): ) index = 
aligned_input_series[0].index + data = data.copy() for name, val in data.items(): if isinstance(val, (pd.Series, Series, dict)): data[name] = aligned_input_series.pop(0) @@ -1188,7 +1192,7 @@ def deserialize(cls, header, frames): return obj @property - @_cudf_nvtx_annotate + @_performance_tracking def shape(self): """Returns a tuple representing the dimensionality of the DataFrame.""" return self._num_rows, self._num_columns @@ -1271,7 +1275,7 @@ def __setattr__(self, key, col): else: super().__setattr__(key, col) - @_cudf_nvtx_annotate + @_performance_tracking def __getitem__(self, arg): """ If *arg* is a ``str`` or ``int`` type, return the column Series. @@ -1365,7 +1369,7 @@ def __getitem__(self, arg): f"__getitem__ on type {type(arg)} is not supported" ) - @_cudf_nvtx_annotate + @_performance_tracking def __setitem__(self, arg, value): """Add/set column by *arg or DataFrame*""" if isinstance(arg, DataFrame): @@ -1483,7 +1487,7 @@ def __setitem__(self, arg, value): def __delitem__(self, name): self._drop_column(name) - @_cudf_nvtx_annotate + @_performance_tracking def memory_usage(self, index=True, deep=False): mem_usage = [col.memory_usage for col in self._data.columns] names = [str(name) for name in self._data.names] @@ -1492,10 +1496,10 @@ def memory_usage(self, index=True, deep=False): names.append("Index") return Series._from_data( data={None: as_column(mem_usage)}, - index=as_index(names), + index=cudf.Index(names), ) - @_cudf_nvtx_annotate + @_performance_tracking def __array_function__(self, func, types, args, kwargs): if "out" in kwargs or not all( issubclass(t, (Series, DataFrame)) for t in types @@ -1529,7 +1533,7 @@ def __array_function__(self, func, types, args, kwargs): return NotImplemented # The _get_numeric_data method is necessary for dask compatibility. - @_cudf_nvtx_annotate + @_performance_tracking def _get_numeric_data(self): """Return a dataframe with only numeric data types""" columns = [ @@ -1539,7 +1543,7 @@ def _get_numeric_data(self): ] return self[columns] - @_cudf_nvtx_annotate + @_performance_tracking def assign(self, **kwargs: Callable[[Self], Any] | Any): """ Assign columns to DataFrame from keyword arguments. @@ -1572,7 +1576,7 @@ def assign(self, **kwargs: Callable[[Self], Any] | Any): return new_df @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def _concat( cls, objs, axis=0, join="outer", ignore_index=False, sort=False ): @@ -1964,12 +1968,12 @@ def _get_renderable_dataframe(self): return output - @_cudf_nvtx_annotate + @_performance_tracking def __repr__(self): output = self._get_renderable_dataframe() return self._clean_renderable_dataframe(output) - @_cudf_nvtx_annotate + @_performance_tracking def _repr_html_(self): lines = ( self._get_renderable_dataframe() @@ -1985,7 +1989,7 @@ def _repr_html_(self): lines.append("") return "\n".join(lines) - @_cudf_nvtx_annotate + @_performance_tracking def _repr_latex_(self): return self._get_renderable_dataframe().to_pandas()._repr_latex_() @@ -2099,7 +2103,7 @@ def _make_operands_and_index_for_binop( return operands, index, can_use_self_column_name @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def from_dict( cls, data: dict, @@ -2234,7 +2238,7 @@ def from_dict( f"parameter. 
Got '{orient}' instead" ) - @_cudf_nvtx_annotate + @_performance_tracking def to_dict( self, orient: str = "dict", @@ -2355,7 +2359,7 @@ def to_dict( return self.to_pandas().to_dict(orient=orient, into=into) - @_cudf_nvtx_annotate + @_performance_tracking def scatter_by_map( self, map_index, map_size=None, keep_index=True, debug: bool = False ): @@ -2448,7 +2452,7 @@ def scatter_by_map( return result - @_cudf_nvtx_annotate + @_performance_tracking def update( self, other, @@ -2543,23 +2547,23 @@ def update( self._mimic_inplace(source_df, inplace=True) - @_cudf_nvtx_annotate + @_performance_tracking def __iter__(self): return iter(self._column_names) - @_cudf_nvtx_annotate + @_performance_tracking def __contains__(self, item): # This must check against containment in the pandas Index and not # self._column_names to handle NA, None, nan, etc. correctly. return item in self._data.to_pandas_index() - @_cudf_nvtx_annotate + @_performance_tracking def items(self): """Iterate over column names and series pairs""" for k in self: yield (k, self[k]) - @_cudf_nvtx_annotate + @_performance_tracking def equals(self, other) -> bool: ret = super().equals(other) # If all other checks matched, validate names. @@ -2592,13 +2596,13 @@ def at(self): "index is absolutely necessary. For checking if the columns are a " "MultiIndex, use _data.multiindex." ) - @_cudf_nvtx_annotate + @_performance_tracking def columns(self): """Returns a tuple of columns""" return self._data.to_pandas_index() @columns.setter # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def columns(self, columns): multiindex = False rangeindex = False @@ -2666,7 +2670,7 @@ def _set_columns_like(self, other: ColumnAccessor) -> None: verify=False, ) - @_cudf_nvtx_annotate + @_performance_tracking def reindex( self, labels=None, @@ -2814,7 +2818,7 @@ def reindex( fill_value=fill_value, ) - @_cudf_nvtx_annotate + @_performance_tracking def set_index( self, keys, @@ -2969,6 +2973,7 @@ def set_index( idx = MultiIndex._from_data(dict(enumerate(data_to_add))) idx.names = names + # TODO: Change to deep=False when copy-on-write is default df = self if inplace else self.copy(deep=True) if verify_integrity and not idx.is_unique: @@ -2980,7 +2985,7 @@ def set_index( df.index = idx return df if not inplace else None - @_cudf_nvtx_annotate + @_performance_tracking def fillna( self, value=None, method=None, axis=None, inplace=False, limit=None ): # noqa: D102 @@ -3006,7 +3011,7 @@ def fillna( value=value, method=method, axis=axis, inplace=inplace, limit=limit ) - @_cudf_nvtx_annotate + @_performance_tracking def where(self, cond, other=None, inplace=False): from cudf.core._internals.where import ( _check_and_cast_columns_with_other, @@ -3163,7 +3168,7 @@ def reset_index( inplace=inplace, ) - @_cudf_nvtx_annotate + @_performance_tracking def insert(self, loc, name, value, nan_as_null=no_default): """Add a column to DataFrame at the index specified by loc. @@ -3189,7 +3194,7 @@ def insert(self, loc, name, value, nan_as_null=no_default): ignore_index=False, ) - @_cudf_nvtx_annotate + @_performance_tracking def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): """ Same as `insert`, with additional `ignore_index` param. @@ -3271,7 +3276,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): self._data.insert(name, value, loc=loc) @property # type:ignore - @_cudf_nvtx_annotate + @_performance_tracking def axes(self): """ Return a list representing the axes of the DataFrame. 
@@ -3363,7 +3368,7 @@ def diff(self, periods=1, axis=0): return self - self.shift(periods=periods) - @_cudf_nvtx_annotate + @_performance_tracking def drop_duplicates( self, subset=None, @@ -3451,14 +3456,14 @@ def drop_duplicates( return self._mimic_inplace(outdf, inplace=inplace) - @_cudf_nvtx_annotate + @_performance_tracking def pop(self, item): """Return a column and drop it from the DataFrame.""" popped = self[item] del self[item] return popped - @_cudf_nvtx_annotate + @_performance_tracking def rename( self, mapper=None, @@ -3565,6 +3570,9 @@ def rename( mapper if columns is None and axis in (1, "columns") else columns ) + result = self if inplace else self.copy(deep=copy) + + out_index = None if index: if ( any(isinstance(item, str) for item in index.values()) @@ -3586,51 +3594,52 @@ def rename( ) out_index._data[level] = column.as_column(level_values) out_index._compute_levels_and_codes() - out = DataFrame(index=out_index) else: to_replace = list(index.keys()) vals = list(index.values()) is_all_na = vals.count(None) == len(vals) try: - index_data = { - name: col.find_and_replace(to_replace, vals, is_all_na) - for name, col in self.index._data.items() - } + out_index = _index_from_data( + { + name: col.find_and_replace( + to_replace, vals, is_all_na + ) + for name, col in self.index._data.items() + } + ) except OverflowError: - index_data = self.index._data.copy(deep=True) + pass - out = DataFrame(index=_index_from_data(index_data)) - else: - out = DataFrame(index=self.index) + if out_index is not None: + result.index = out_index if columns: - out._data = self._data.rename_levels(mapper=columns, level=level) - else: - out._data = self._data.copy(deep=copy) + result._data = result._data.rename_levels( + mapper=columns, level=level + ) - if inplace: - self._data = out._data - else: - return out.copy(deep=copy) + return result - @_cudf_nvtx_annotate + @_performance_tracking def add_prefix(self, prefix): + # TODO: Change to deep=False when copy-on-write is default out = self.copy(deep=True) out.columns = [ prefix + col_name for col_name in list(self._data.keys()) ] return out - @_cudf_nvtx_annotate + @_performance_tracking def add_suffix(self, suffix): + # TODO: Change to deep=False when copy-on-write is default out = self.copy(deep=True) out.columns = [ col_name + suffix for col_name in list(self._data.keys()) ] return out - @_cudf_nvtx_annotate + @_performance_tracking def agg(self, aggs, axis=None): """ Aggregate using one or more operations over the specified axis. @@ -3766,7 +3775,7 @@ def agg(self, aggs, axis=None): else: raise ValueError("argument must be a string, list or dict") - @_cudf_nvtx_annotate + @_performance_tracking def nlargest(self, n, columns, keep="first"): """Return the first *n* rows ordered by *columns* in descending order. @@ -3906,7 +3915,7 @@ def nsmallest(self, n, columns, keep="first"): """ return self._n_largest_or_smallest(False, n, columns, keep) - @_cudf_nvtx_annotate + @_performance_tracking def swaplevel(self, i=-2, j=-1, axis=0): """ Swap level i with level j. @@ -3956,7 +3965,8 @@ def swaplevel(self, i=-2, j=-1, axis=0): weight 1.0 0.8 length 0.3 0.2 """ - result = self.copy() + # TODO: Change to deep=False when copy-on-write is default + result = self.copy(deep=True) # To get axis number axis = self._get_axis_from_axis_arg(axis) @@ -3972,7 +3982,7 @@ def swaplevel(self, i=-2, j=-1, axis=0): return result - @_cudf_nvtx_annotate + @_performance_tracking def transpose(self): """Transpose index and columns. 
@@ -4027,8 +4037,8 @@ def transpose(self): # Set the old column names as the new index result = self.__class__._from_data( - {i: col for i, col in enumerate(result_columns)}, - index=as_index(index), + ColumnAccessor(dict(enumerate(result_columns)), verify=False), + index=cudf.Index(index), ) # Set the old index as the new column names result.columns = columns @@ -4036,7 +4046,7 @@ def transpose(self): T = property(transpose, doc=transpose.__doc__) - @_cudf_nvtx_annotate + @_performance_tracking def melt(self, **kwargs): """Unpivots a DataFrame from wide format to long format, optionally leaving identifier variables set. @@ -4066,7 +4076,7 @@ def melt(self, **kwargs): return melt(self, **kwargs) - @_cudf_nvtx_annotate + @_performance_tracking def merge( self, right, @@ -4219,7 +4229,7 @@ def merge( suffixes=suffixes, ).perform_merge() - @_cudf_nvtx_annotate + @_performance_tracking def join( self, other, @@ -4268,7 +4278,7 @@ def join( ) return df - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( groupby_doc_template.format( ret=textwrap.dedent( @@ -4402,7 +4412,7 @@ def query(self, expr, local_dict=None): BooleanMask.from_column_unchecked(boolmask) ) - @_cudf_nvtx_annotate + @_performance_tracking def apply( self, func, axis=1, raw=False, result_type=None, args=(), **kwargs ): @@ -4686,7 +4696,7 @@ def _func(x): # pragma: no cover return DataFrame._from_data(result, index=self.index) - @_cudf_nvtx_annotate + @_performance_tracking @applyutils.doc_apply() def apply_rows( self, @@ -4765,7 +4775,7 @@ def apply_rows( cache_key=cache_key, ) - @_cudf_nvtx_annotate + @_performance_tracking @applyutils.doc_applychunks() def apply_chunks( self, @@ -4832,7 +4842,7 @@ def apply_chunks( tpb=tpb, ) - @_cudf_nvtx_annotate + @_performance_tracking def partition_by_hash(self, columns, nparts, keep_index=True): """Partition the dataframe by the hashed value of data in *columns*. @@ -5176,7 +5186,7 @@ def _sizeof_fmt(num, size_qualifier): cudf.utils.ioutils.buffer_write_lines(buf, lines) - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_describe() def describe( self, @@ -5238,7 +5248,7 @@ def describe( ) return res - @_cudf_nvtx_annotate + @_performance_tracking def to_pandas( self, *, nullable: bool = False, arrow_type: bool = False ) -> pd.DataFrame: @@ -5328,7 +5338,7 @@ def to_pandas( return out_df @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def from_pandas(cls, dataframe, nan_as_null=no_default): """ Convert from a Pandas DataFrame. @@ -5401,7 +5411,7 @@ def from_pandas(cls, dataframe, nan_as_null=no_default): ) @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def from_arrow(cls, table): """ Convert from PyArrow Table to DataFrame. @@ -5487,7 +5497,7 @@ def from_arrow(cls, table): return out - @_cudf_nvtx_annotate + @_performance_tracking def to_arrow(self, preserve_index=None): """ Convert to a PyArrow Table. 
@@ -5528,7 +5538,7 @@ def to_arrow(self, preserve_index=None): b: [[4,5,6]] """ - data = self.copy(deep=False) + data = self index_descr = [] write_index = preserve_index is not False keep_range_index = write_index and preserve_index is None @@ -5556,6 +5566,7 @@ def to_arrow(self, preserve_index=None): index_descr = ( index.names if index.name is not None else ("index",) ) + data = data.copy(deep=False) for gen_name, col_name in zip(index_descr, index._data.names): data._insert( data.shape[1], @@ -5576,7 +5587,7 @@ def to_arrow(self, preserve_index=None): return out.replace_schema_metadata(metadata) - @_cudf_nvtx_annotate + @_performance_tracking def to_records(self, index=True): """Convert to a numpy recarray @@ -5600,7 +5611,7 @@ def to_records(self, index=True): return ret @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def from_records(cls, data, index=None, columns=None, nan_as_null=False): """ Convert structured or record ndarray to DataFrame. @@ -5651,7 +5662,7 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): } if not is_scalar(index): - new_index = cudf.Index(index) + new_index = ensure_index(index) else: new_index = None @@ -5679,7 +5690,7 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): return df @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): """Convert a numpy/cupy array to DataFrame. @@ -5735,7 +5746,7 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): } if index is not None: - index = cudf.Index(index) + index = ensure_index(index) if isinstance(columns, (pd.Index, cudf.Index)): level_names = tuple(columns.names) @@ -5757,7 +5768,7 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): index=index, ) - @_cudf_nvtx_annotate + @_performance_tracking def interpolate( self, method="linear", @@ -5787,7 +5798,7 @@ def interpolate( **kwargs, ) - @_cudf_nvtx_annotate + @_performance_tracking def quantile( self, q=0.5, @@ -5930,7 +5941,7 @@ def quantile( result.index = cudf.Index(list(map(float, qs)), dtype="float64") return result - @_cudf_nvtx_annotate + @_performance_tracking def isin(self, values): """ Whether each element in the DataFrame is contained in values. @@ -6074,7 +6085,7 @@ def make_false_column_like_self(): # # Stats # - @_cudf_nvtx_annotate + @_performance_tracking def _prepare_for_rowwise_op(self, method, skipna, numeric_only): """Prepare a DataFrame for CuPy-based row-wise operations.""" @@ -6126,7 +6137,7 @@ def _prepare_for_rowwise_op(self, method, skipna, numeric_only): coerced = coerced.astype("int64", copy=False) return coerced, mask, common_dtype - @_cudf_nvtx_annotate + @_performance_tracking def count(self, axis=0, numeric_only=False): """ Count ``non-NA`` cells for each column or row. @@ -6178,7 +6189,7 @@ def count(self, axis=0, numeric_only=False): "columns": 1, } - @_cudf_nvtx_annotate + @_performance_tracking def _reduce( self, op, @@ -6302,7 +6313,7 @@ def _reduce( else: raise ValueError(f"Invalid value of {axis=} received for {op}") - @_cudf_nvtx_annotate + @_performance_tracking def _scan( self, op, @@ -6319,7 +6330,7 @@ def _scan( elif axis == 1: return self._apply_cupy_method_axis_1(op, **kwargs) - @_cudf_nvtx_annotate + @_performance_tracking def mode(self, axis=0, numeric_only=False, dropna=True): """ Get the mode(s) of each element along the selected axis. 
@@ -6426,17 +6437,17 @@ def mode(self, axis=0, numeric_only=False, dropna=True): return df - @_cudf_nvtx_annotate + @_performance_tracking def all(self, axis=0, bool_only=None, skipna=True, **kwargs): obj = self.select_dtypes(include="bool") if bool_only else self return super(DataFrame, obj).all(axis, skipna, **kwargs) - @_cudf_nvtx_annotate + @_performance_tracking def any(self, axis=0, bool_only=None, skipna=True, **kwargs): obj = self.select_dtypes(include="bool") if bool_only else self return super(DataFrame, obj).any(axis, skipna, **kwargs) - @_cudf_nvtx_annotate + @_performance_tracking def _apply_cupy_method_axis_1(self, method, *args, **kwargs): # This method uses cupy to perform scans and reductions along rows of a # DataFrame. Since cuDF is designed around columnar storage and @@ -6536,7 +6547,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): result_df._set_columns_like(prepared._data) return result_df - @_cudf_nvtx_annotate + @_performance_tracking def _columns_view(self, columns): """ Return a subset of the DataFrame's columns as a view. @@ -6545,7 +6556,7 @@ def _columns_view(self, columns): {col: self._data[col] for col in columns}, index=self.index ) - @_cudf_nvtx_annotate + @_performance_tracking def select_dtypes(self, include=None, exclude=None): """Return a subset of the DataFrame's columns based on the column dtypes. @@ -6810,7 +6821,7 @@ def to_orc( index=index, ) - @_cudf_nvtx_annotate + @_performance_tracking def stack(self, level=-1, dropna=no_default, future_stack=False): """Stack the prescribed level(s) from columns to index @@ -7155,7 +7166,7 @@ def unnamed_group_generator(): else: return result - @_cudf_nvtx_annotate + @_performance_tracking def cov(self, **kwargs): """Compute the covariance matrix of a DataFrame. @@ -7210,7 +7221,7 @@ def corr(self, method="pearson", min_periods=None): df._set_columns_like(self._data) return df - @_cudf_nvtx_annotate + @_performance_tracking def to_struct(self, name=None): """ Return a struct Series composed of the columns of the DataFrame. @@ -7244,7 +7255,7 @@ def to_struct(self, name=None): name=name, ) - @_cudf_nvtx_annotate + @_performance_tracking def keys(self): """ Get the columns. @@ -7304,14 +7315,14 @@ def iterrows(self): "if you wish to iterate over each row." ) - @_cudf_nvtx_annotate + @_performance_tracking @copy_docstring(reshape.pivot) def pivot(self, *, columns, index=no_default, values=no_default): return cudf.core.reshape.pivot( self, index=index, columns=columns, values=values ) - @_cudf_nvtx_annotate + @_performance_tracking @copy_docstring(reshape.pivot_table) def pivot_table( self, @@ -7340,14 +7351,14 @@ def pivot_table( sort=sort, ) - @_cudf_nvtx_annotate + @_performance_tracking @copy_docstring(reshape.unstack) def unstack(self, level=-1, fill_value=None): return cudf.core.reshape.unstack( self, level=level, fill_value=fill_value ) - @_cudf_nvtx_annotate + @_performance_tracking def explode(self, column, ignore_index=False): """ Transform each element of a list-like to a row, replicating index @@ -7543,7 +7554,7 @@ def _from_columns_like_self( result._set_columns_like(self._data) return result - @_cudf_nvtx_annotate + @_performance_tracking def interleave_columns(self): """ Interleave Series columns of a table into a single column. 
@@ -7591,7 +7602,7 @@ def interleave_columns(self): {None: libcudf.reshape.interleave_columns([*self._columns])} ) - @_cudf_nvtx_annotate + @_performance_tracking def eval(self, expr: str, inplace: bool = False, **kwargs): """Evaluate a string describing operations on DataFrame columns. @@ -7947,7 +7958,7 @@ def func(left, right, output): ) -@_cudf_nvtx_annotate +@_performance_tracking def from_pandas(obj, nan_as_null=no_default): """ Convert certain Pandas objects into the cudf equivalent. @@ -8074,7 +8085,7 @@ def from_pandas(obj, nan_as_null=no_default): ) -@_cudf_nvtx_annotate +@_performance_tracking def merge(left, right, *args, **kwargs): if isinstance(left, Series): left = left.to_frame() diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 8ca71180c00..9bac75dc6ac 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -32,7 +32,7 @@ from cudf.core.mixins import BinaryOperand, Scannable from cudf.utils import ioutils from cudf.utils.dtypes import find_common_type -from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import _array_ufunc, _warn_no_dask_cudf if TYPE_CHECKING: @@ -86,7 +86,7 @@ def _dtypes(self) -> abc.Iterable: def ndim(self) -> int: raise NotImplementedError() - @_cudf_nvtx_annotate + @_performance_tracking def serialize(self): # TODO: See if self._data can be serialized outright header = { @@ -101,7 +101,7 @@ def serialize(self): return header, frames @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def deserialize(cls, header, frames): cls_deserialize = pickle.loads(header["type-serialized"]) column_names = pickle.loads(header["column_names"]) @@ -122,7 +122,7 @@ def deserialize(cls, header, frames): return cls_deserialize._from_data(col_accessor) @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def _from_data(cls, data: MutableMapping) -> Self: """ Construct cls from a ColumnAccessor-like mapping. @@ -131,7 +131,7 @@ def _from_data(cls, data: MutableMapping) -> Self: Frame.__init__(obj, data) return obj - @_cudf_nvtx_annotate + @_performance_tracking def _from_data_like_self(self, data: MutableMapping) -> Self: """ Return type(self) from a ColumnAccessor-like mapping but @@ -139,7 +139,7 @@ def _from_data_like_self(self, data: MutableMapping) -> Self: """ return self._from_data(data) - @_cudf_nvtx_annotate + @_performance_tracking def _from_columns_like_self( self, columns: list[ColumnBase], @@ -155,7 +155,7 @@ def _from_columns_like_self( frame = self.__class__._from_data(data) return frame._copy_type_metadata(self) - @_cudf_nvtx_annotate + @_performance_tracking def _mimic_inplace( self, result: Self, inplace: bool = False ) -> Self | None: @@ -171,7 +171,7 @@ def _mimic_inplace( return result @property - @_cudf_nvtx_annotate + @_performance_tracking def size(self) -> int: """ Return the number of elements in the underlying data. 
@@ -263,11 +263,11 @@ def memory_usage(self, deep=False): """ raise NotImplementedError - @_cudf_nvtx_annotate + @_performance_tracking def __len__(self) -> int: return self._num_rows - @_cudf_nvtx_annotate + @_performance_tracking def astype(self, dtype: dict[Any, Dtype], copy: bool = False) -> Self: casted = ( col.astype(dtype.get(col_name, col.dtype), copy=copy) @@ -276,7 +276,7 @@ def astype(self, dtype: dict[Any, Dtype], copy: bool = False) -> Self: ca = self._data._from_columns_like_self(casted, verify=False) return self._from_data_like_self(ca) - @_cudf_nvtx_annotate + @_performance_tracking def equals(self, other) -> bool: """ Test whether two objects contain the same elements. @@ -347,7 +347,7 @@ def equals(self, other) -> bool: ) ) - @_cudf_nvtx_annotate + @_performance_tracking def _get_columns_by_label(self, labels) -> Self: """ Returns columns of the Frame specified by `labels`. @@ -357,7 +357,7 @@ def _get_columns_by_label(self, labels) -> Self: return self._from_data_like_self(self._data.select_by_label(labels)) @property - @_cudf_nvtx_annotate + @_performance_tracking def values(self) -> cupy.ndarray: """ Return a CuPy representation of the DataFrame. @@ -373,7 +373,7 @@ def values(self) -> cupy.ndarray: return self.to_cupy() @property - @_cudf_nvtx_annotate + @_performance_tracking def values_host(self) -> np.ndarray: """ Return a NumPy representation of the data. @@ -388,7 +388,7 @@ def values_host(self) -> np.ndarray: """ return self.to_numpy() - @_cudf_nvtx_annotate + @_performance_tracking def __array__(self, dtype=None): raise TypeError( "Implicit conversion to a host NumPy array via __array__ is not " @@ -397,14 +397,14 @@ def __array__(self, dtype=None): "using .to_numpy()." ) - @_cudf_nvtx_annotate + @_performance_tracking def __arrow_array__(self, type=None): raise TypeError( "Implicit conversion to a host PyArrow object via __arrow_array__ " "is not allowed. Consider using .to_arrow()" ) - @_cudf_nvtx_annotate + @_performance_tracking def _to_array( self, get_array: Callable, @@ -468,7 +468,7 @@ def to_array( # particular, we need to benchmark how much of the overhead is coming from # (potentially unavoidable) local copies in to_cupy and how much comes from # inefficiencies in the implementation. - @_cudf_nvtx_annotate + @_performance_tracking def to_cupy( self, dtype: Dtype | None = None, @@ -502,7 +502,7 @@ def to_cupy( na_value, ) - @_cudf_nvtx_annotate + @_performance_tracking def to_numpy( self, dtype: Dtype | None = None, @@ -537,7 +537,7 @@ def to_numpy( lambda col: col.values_host, numpy, copy, dtype, na_value ) - @_cudf_nvtx_annotate + @_performance_tracking def where(self, cond, other=None, inplace: bool = False) -> Self | None: """ Replace values where the condition is False. 
@@ -610,7 +610,7 @@ def where(self, cond, other=None, inplace: bool = False) -> Self | None: """ raise NotImplementedError - @_cudf_nvtx_annotate + @_performance_tracking def fillna( self, value: None | ScalarLike | cudf.Series = None, @@ -767,14 +767,14 @@ def fillna( inplace=inplace, ) - @_cudf_nvtx_annotate + @_performance_tracking def _drop_column(self, name): """Drop a column by *name*""" if name not in self._data: raise KeyError(f"column '{name}' does not exist") del self._data[name] - @_cudf_nvtx_annotate + @_performance_tracking def _quantile_table( self, q: float, @@ -808,7 +808,7 @@ def _quantile_table( ) @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def from_arrow(cls, data: pa.Table) -> Self: """Convert from PyArrow Table to Frame @@ -968,7 +968,7 @@ def from_arrow(cls, data: pa.Table) -> Self: return cls._from_data({name: result[name] for name in column_names}) - @_cudf_nvtx_annotate + @_performance_tracking def to_arrow(self): """ Convert to arrow Table @@ -992,7 +992,7 @@ def to_arrow(self): {str(name): col.to_arrow() for name, col in self._data.items()} ) - @_cudf_nvtx_annotate + @_performance_tracking def _positions_from_column_names(self, column_names) -> list[int]: """Map each column name into their positions in the frame. @@ -1005,7 +1005,7 @@ def _positions_from_column_names(self, column_names) -> list[int]: if name in set(column_names) ] - @_cudf_nvtx_annotate + @_performance_tracking def _copy_type_metadata(self: Self, other: Self) -> Self: """ Copy type metadata from each column of `other` to the corresponding @@ -1020,7 +1020,7 @@ def _copy_type_metadata(self: Self, other: Self) -> Self: return self - @_cudf_nvtx_annotate + @_performance_tracking def isna(self): """ Identify missing values. @@ -1101,7 +1101,7 @@ def isna(self): # Alias for isna isnull = isna - @_cudf_nvtx_annotate + @_performance_tracking def notna(self): """ Identify non-missing values. @@ -1182,7 +1182,7 @@ def notna(self): # Alias for notna notnull = notna - @_cudf_nvtx_annotate + @_performance_tracking def searchsorted( self, values, @@ -1296,7 +1296,7 @@ def searchsorted( else: return result - @_cudf_nvtx_annotate + @_performance_tracking def argsort( self, by=None, @@ -1383,7 +1383,7 @@ def argsort( by=by, ascending=ascending, na_position=na_position ).values - @_cudf_nvtx_annotate + @_performance_tracking def _get_sorted_inds( self, by=None, @@ -1411,7 +1411,7 @@ def _get_sorted_inds( stable=True, ) - @_cudf_nvtx_annotate + @_performance_tracking def _split(self, splits): """Split a frame with split points in ``splits``. Returns a list of Frames of length `len(splits) + 1`. 
@@ -1426,13 +1426,13 @@ def _split(self, splits): for split_idx in range(len(splits) + 1) ] - @_cudf_nvtx_annotate + @_performance_tracking def _encode(self): columns, indices = libcudf.transform.table_encode([*self._columns]) keys = self._from_columns_like_self(columns) return keys, indices - @_cudf_nvtx_annotate + @_performance_tracking def _unaryop(self, op): data_columns = (col.unary_operator(op) for col in self._columns) return self._from_data_like_self( @@ -1440,7 +1440,7 @@ def _unaryop(self, op): ) @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def _colwise_binop( cls, operands: dict[str | None, tuple[ColumnBase, Any, bool, Any]], @@ -1519,11 +1519,11 @@ def _colwise_binop( return output - @_cudf_nvtx_annotate + @_performance_tracking def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): return _array_ufunc(self, ufunc, method, inputs, kwargs) - @_cudf_nvtx_annotate + @_performance_tracking @acquire_spill_lock() def _apply_cupy_ufunc_to_operands( self, ufunc, cupy_func, operands, **kwargs @@ -1565,7 +1565,7 @@ def _apply_cupy_ufunc_to_operands( return data # Unary logical operators - @_cudf_nvtx_annotate + @_performance_tracking def __neg__(self): """Negate for integral dtypes, logical NOT for bools.""" return self._from_data_like_self( @@ -1579,30 +1579,30 @@ def __neg__(self): ) ) - @_cudf_nvtx_annotate + @_performance_tracking def __pos__(self): return self.copy(deep=True) - @_cudf_nvtx_annotate + @_performance_tracking def __abs__(self): return self._unaryop("abs") # Reductions @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def _get_axis_from_axis_arg(cls, axis): try: return cls._SUPPORT_AXIS_LOOKUP[axis] except KeyError: raise ValueError(f"No axis named {axis} for object type {cls}") - @_cudf_nvtx_annotate + @_performance_tracking def _reduce(self, *args, **kwargs): raise NotImplementedError( f"Reductions are not supported for objects of type {type(self)}." ) - @_cudf_nvtx_annotate + @_performance_tracking def min( self, axis=0, @@ -1653,7 +1653,7 @@ def min( **kwargs, ) - @_cudf_nvtx_annotate + @_performance_tracking def max( self, axis=0, @@ -1701,7 +1701,7 @@ def max( **kwargs, ) - @_cudf_nvtx_annotate + @_performance_tracking def all(self, axis=0, skipna=True, **kwargs): """ Return whether all elements are True in DataFrame. @@ -1754,7 +1754,7 @@ def all(self, axis=0, skipna=True, **kwargs): **kwargs, ) - @_cudf_nvtx_annotate + @_performance_tracking def any(self, axis=0, skipna=True, **kwargs): """ Return whether any elements is True in DataFrame. 
@@ -1807,26 +1807,26 @@ def any(self, axis=0, skipna=True, **kwargs): **kwargs, ) - @_cudf_nvtx_annotate + @_performance_tracking @ioutils.doc_to_dlpack() def to_dlpack(self): """{docstring}""" return cudf.io.dlpack.to_dlpack(self) - @_cudf_nvtx_annotate + @_performance_tracking def __str__(self): return repr(self) - @_cudf_nvtx_annotate + @_performance_tracking def __deepcopy__(self, memo): return self.copy(deep=True) - @_cudf_nvtx_annotate + @_performance_tracking def __copy__(self): return self.copy(deep=False) - @_cudf_nvtx_annotate + @_performance_tracking def __invert__(self): """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" return self._from_data_like_self( @@ -1835,7 +1835,7 @@ def __invert__(self): ) ) - @_cudf_nvtx_annotate + @_performance_tracking def nunique(self, dropna: bool = True): """ Returns a per column mapping with counts of unique values for @@ -1856,7 +1856,7 @@ def nunique(self, dropna: bool = True): ) @staticmethod - @_cudf_nvtx_annotate + @_performance_tracking def _repeat( columns: list[ColumnBase], repeats, axis=None ) -> list[ColumnBase]: @@ -1870,7 +1870,7 @@ def _repeat( return libcudf.filling.repeat(columns, repeats) - @_cudf_nvtx_annotate + @_performance_tracking @_warn_no_dask_cudf def __dask_tokenize__(self): from dask.base import normalize_token diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 77b54a583d3..eccb3acabf6 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -31,7 +31,7 @@ from cudf.core.mixins import Reducible, Scannable from cudf.core.multiindex import MultiIndex from cudf.core.udf.groupby_utils import _can_be_jitted, jit_groupby_apply -from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import GetAttrGetItemMixin if TYPE_CHECKING: @@ -392,7 +392,7 @@ def indices(self): zip(index.to_pandas(), cp.split(indices.values, offsets[1:-1])) ) - @_cudf_nvtx_annotate + @_performance_tracking def get_group(self, name, obj=None): """ Construct DataFrame from group with provided name. @@ -436,7 +436,7 @@ def get_group(self, name, obj=None): ) return obj.iloc[self.indices[name]] - @_cudf_nvtx_annotate + @_performance_tracking def size(self): """ Return the size of each group. @@ -451,7 +451,7 @@ def size(self): .agg("size") ) - @_cudf_nvtx_annotate + @_performance_tracking def cumcount(self): """ Return the cumulative count of keys in each group. @@ -467,7 +467,7 @@ def cumcount(self): .agg("cumcount") ) - @_cudf_nvtx_annotate + @_performance_tracking def rank( self, method="average", @@ -521,7 +521,7 @@ def _groupby(self): [*self.grouping.keys._columns], dropna=self._dropna ) - @_cudf_nvtx_annotate + @_performance_tracking def agg(self, func): """ Apply aggregation(s) to the groups. 
@@ -821,7 +821,7 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool): else: return result - @_cudf_nvtx_annotate + @_performance_tracking def head(self, n: int = 5, *, preserve_order: bool = True): """Return first n rows of each group @@ -874,7 +874,7 @@ def head(self, n: int = 5, *, preserve_order: bool = True): n, take_head=True, preserve_order=preserve_order ) - @_cudf_nvtx_annotate + @_performance_tracking def tail(self, n: int = 5, *, preserve_order: bool = True): """Return last n rows of each group @@ -928,7 +928,7 @@ def tail(self, n: int = 5, *, preserve_order: bool = True): n, take_head=False, preserve_order=preserve_order ) - @_cudf_nvtx_annotate + @_performance_tracking def nth(self, n): """ Return the nth row from each group. @@ -949,7 +949,7 @@ def nth(self, n): del self.obj._data["__groupbynth_order__"] return result - @_cudf_nvtx_annotate + @_performance_tracking def ngroup(self, ascending=True): """ Number each group from 0 to the number of groups - 1. @@ -1261,7 +1261,7 @@ def _normalize_aggs( ] return column_names, columns, normalized_aggs - @_cudf_nvtx_annotate + @_performance_tracking def pipe(self, func, *args, **kwargs): """ Apply a function `func` with arguments to this GroupBy @@ -1316,7 +1316,7 @@ def pipe(self, func, *args, **kwargs): """ return cudf.core.common.pipe(self, func, *args, **kwargs) - @_cudf_nvtx_annotate + @_performance_tracking def _jit_groupby_apply( self, function, group_names, offsets, group_keys, grouped_values, *args ): @@ -1327,7 +1327,7 @@ def _jit_groupby_apply( chunk_results, group_names, group_keys, grouped_values ) - @_cudf_nvtx_annotate + @_performance_tracking def _iterative_groupby_apply( self, function, group_names, offsets, group_keys, grouped_values, *args ): @@ -1415,7 +1415,7 @@ def _post_process_chunk_results( result.index = cudf.MultiIndex._from_data(index_data) return result - @_cudf_nvtx_annotate + @_performance_tracking def apply( self, function, *args, engine="auto", include_groups: bool = True ): @@ -1573,7 +1573,7 @@ def mult(df): result = result.reset_index() return result - @_cudf_nvtx_annotate + @_performance_tracking def apply_grouped(self, function, **kwargs): """Apply a transformation function over the grouped chunk. @@ -1712,7 +1712,7 @@ def rolling_avg(val, avg): kwargs.update({"chunks": offsets}) return grouped_values.apply_chunks(function, **kwargs) - @_cudf_nvtx_annotate + @_performance_tracking def _broadcast(self, values): """ Broadcast the results of an aggregation to the group @@ -1736,7 +1736,7 @@ def _broadcast(self, values): values.index = self.obj.index return values - @_cudf_nvtx_annotate + @_performance_tracking def transform(self, function): """Apply an aggregation, then broadcast the result to the group size. @@ -1801,7 +1801,7 @@ def rolling(self, *args, **kwargs): """ return cudf.core.window.rolling.RollingGroupby(self, *args, **kwargs) - @_cudf_nvtx_annotate + @_performance_tracking def count(self, dropna=True): """Compute the number of values in each column. @@ -1816,7 +1816,7 @@ def func(x): return self.agg(func) - @_cudf_nvtx_annotate + @_performance_tracking def describe(self, include=None, exclude=None): """ Generate descriptive statistics that summarizes the central tendency, @@ -1888,7 +1888,7 @@ def describe(self, include=None, exclude=None): ) return res - @_cudf_nvtx_annotate + @_performance_tracking def corr(self, method="pearson", min_periods=1): """ Compute pairwise correlation of columns, excluding NA/null values. 
@@ -1950,7 +1950,7 @@ def corr(self, method="pearson", min_periods=1): lambda x: x.corr(method, min_periods), "Correlation" ) - @_cudf_nvtx_annotate + @_performance_tracking def cov(self, min_periods=0, ddof=1): """ Compute the pairwise covariance among the columns of a DataFrame, @@ -2129,7 +2129,7 @@ def _cov_or_corr(self, func, method_name): return res - @_cudf_nvtx_annotate + @_performance_tracking def var(self, ddof=1): """Compute the column-wise variance of the values in each group. @@ -2145,7 +2145,7 @@ def func(x): return self.agg(func) - @_cudf_nvtx_annotate + @_performance_tracking def std(self, ddof=1): """Compute the column-wise std of the values in each group. @@ -2161,7 +2161,7 @@ def func(x): return self.agg(func) - @_cudf_nvtx_annotate + @_performance_tracking def quantile(self, q=0.5, interpolation="linear"): """Compute the column-wise quantiles of the values in each group. @@ -2179,18 +2179,18 @@ def func(x): return self.agg(func) - @_cudf_nvtx_annotate + @_performance_tracking def collect(self): """Get a list of all the values for each column in each group.""" _deprecate_collect() return self.agg(list) - @_cudf_nvtx_annotate + @_performance_tracking def unique(self): """Get a list of the unique values for each column in each group.""" return self.agg("unique") - @_cudf_nvtx_annotate + @_performance_tracking def diff(self, periods=1, axis=0): """Get the difference between the values in each group. @@ -2258,7 +2258,7 @@ def bfill(self, limit=None): return self._scan_fill("bfill", limit) - @_cudf_nvtx_annotate + @_performance_tracking def fillna( self, value=None, @@ -2325,7 +2325,7 @@ def fillna( value=value, inplace=inplace, axis=axis, limit=limit ) - @_cudf_nvtx_annotate + @_performance_tracking def shift(self, periods=1, freq=None, axis=0, fill_value=None): """ Shift each group by ``periods`` positions. @@ -2388,7 +2388,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): result = self._mimic_pandas_order(result) return result._copy_type_metadata(values) - @_cudf_nvtx_annotate + @_performance_tracking def pct_change( self, periods=1, diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 71658695b80..b398ee2343e 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -58,13 +58,24 @@ is_mixed_with_object_dtype, numeric_normalize_types, ) -from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import _warn_no_dask_cudf, search_range if TYPE_CHECKING: from collections.abc import Generator, Iterable +def ensure_index(index_like: Any) -> BaseIndex: + """ + Ensure an Index is returned. + + Avoids a shallow copy compared to calling cudf.Index(...) 
+ """ + if not isinstance(index_like, BaseIndex): + return cudf.Index(index_like) + return index_like + + class IndexMeta(type): """Custom metaclass for Index that overrides instance/subclass tests.""" @@ -204,7 +215,7 @@ class RangeIndex(BaseIndex, BinaryOperand): _range: range - @_cudf_nvtx_annotate + @_performance_tracking def __init__( self, start, stop=None, step=1, dtype=None, copy=False, name=None ): @@ -259,17 +270,17 @@ def factorize(self, sort: bool = False, use_na_sentinel: bool = True): return codes, uniques @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def name(self): return self._name @name.setter # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def name(self, value): self._name = value @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def start(self) -> int: """ The value of the `start` parameter (0 if this was not supplied). @@ -277,7 +288,7 @@ def start(self) -> int: return self._range.start @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def stop(self) -> int: """ The value of the stop parameter. @@ -285,7 +296,7 @@ def stop(self) -> int: return self._range.stop @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def step(self) -> int: """ The value of the step parameter. @@ -293,12 +304,12 @@ def step(self) -> int: return self._range.step @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def _num_rows(self) -> int: return len(self) @cached_property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def _values(self): if len(self) > 0: return column.as_column(self._range, dtype=self.dtype) @@ -330,18 +341,18 @@ def _is_interval(self) -> bool: return False @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def hasnans(self) -> bool: return False @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def _data(self): return cudf.core.column_accessor.ColumnAccessor( {self.name: self._values} ) - @_cudf_nvtx_annotate + @_performance_tracking def __contains__(self, item): hash(item) if isinstance(item, bool) or not isinstance( @@ -357,7 +368,7 @@ def __contains__(self, item): except (ValueError, OverflowError): return False - @_cudf_nvtx_annotate + @_performance_tracking def copy(self, name=None, deep=False): """ Make a copy of this object. 
@@ -377,7 +388,7 @@ def copy(self, name=None, deep=False): return RangeIndex(self._range, name=name) - @_cudf_nvtx_annotate + @_performance_tracking def astype(self, dtype, copy: bool = True): if is_dtype_equal(dtype, self.dtype): return self @@ -386,15 +397,15 @@ def astype(self, dtype, copy: bool = True): def fillna(self, value, downcast=None): return self.copy() - @_cudf_nvtx_annotate + @_performance_tracking def drop_duplicates(self, keep="first"): return self - @_cudf_nvtx_annotate + @_performance_tracking def duplicated(self, keep="first") -> cupy.ndarray: return cupy.zeros(len(self), dtype=bool) - @_cudf_nvtx_annotate + @_performance_tracking def __repr__(self): return ( f"{self.__class__.__name__}(start={self.start}, stop={self.stop}" @@ -408,15 +419,15 @@ def __repr__(self): ) @property - @_cudf_nvtx_annotate + @_performance_tracking def size(self) -> int: return len(self) - @_cudf_nvtx_annotate + @_performance_tracking def __len__(self): return len(self._range) - @_cudf_nvtx_annotate + @_performance_tracking def __getitem__(self, index): if isinstance(index, slice): sl_start, sl_stop, sl_step = index.indices(len(self)) @@ -435,13 +446,13 @@ def __getitem__(self, index): return self.start + index * self.step return self._as_int_index()[index] - @_cudf_nvtx_annotate + @_performance_tracking def equals(self, other) -> bool: if isinstance(other, RangeIndex): return self._range == other._range return self._as_int_index().equals(other) - @_cudf_nvtx_annotate + @_performance_tracking def serialize(self): header = {} header["index_column"] = {} @@ -462,7 +473,7 @@ def serialize(self): return header, frames @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def deserialize(cls, header, frames): h = header["index_column"] name = pickle.loads(header["name"]) @@ -472,7 +483,7 @@ def deserialize(cls, header, frames): return RangeIndex(start=start, stop=stop, step=step, name=name) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def dtype(self): """ `dtype` of the range of values in RangeIndex. @@ -487,7 +498,7 @@ def dtype(self): def _dtypes(self) -> Iterable: return [(self.name, self.dtype)] - @_cudf_nvtx_annotate + @_performance_tracking def to_pandas( self, *, nullable: bool = False, arrow_type: bool = False ) -> pd.RangeIndex: @@ -508,16 +519,16 @@ def is_unique(self) -> bool: return True @cached_property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_monotonic_increasing(self) -> bool: return self.step > 0 or len(self) <= 1 @cached_property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_monotonic_decreasing(self): return self.step < 0 or len(self) <= 1 - @_cudf_nvtx_annotate + @_performance_tracking def memory_usage(self, deep: bool = False) -> int: if deep: warnings.warn( @@ -530,7 +541,7 @@ def unique(self) -> Self: # RangeIndex always has unique values return self.copy() - @_cudf_nvtx_annotate + @_performance_tracking def __mul__(self, other): # Multiplication by raw ints must return a RangeIndex to match pandas. if isinstance(other, cudf.Scalar) and other.dtype.kind in "iu": @@ -547,24 +558,24 @@ def __mul__(self, other): ) return self._as_int_index().__mul__(other) - @_cudf_nvtx_annotate + @_performance_tracking def __rmul__(self, other): # Multiplication is commutative. return self.__mul__(other) - @_cudf_nvtx_annotate + @_performance_tracking def _as_int_index(self): # Convert self to an integer index. This method is used to perform ops # that are not defined directly on RangeIndex. 
return cudf.Index._from_data(self._data) - @_cudf_nvtx_annotate + @_performance_tracking def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): return self._as_int_index().__array_ufunc__( ufunc, method, *inputs, **kwargs ) - @_cudf_nvtx_annotate + @_performance_tracking def get_indexer(self, target, limit=None, method=None, tolerance=None): target_col = cudf.core.column.as_column(target) if method is not None or not isinstance( @@ -594,7 +605,7 @@ def get_indexer(self, target, limit=None, method=None, tolerance=None): locs[valid] = len(self) - 1 - locs[valid] return locs - @_cudf_nvtx_annotate + @_performance_tracking def get_loc(self, key): if not is_scalar(key): raise TypeError("Should be a scalar-like") @@ -608,7 +619,7 @@ def get_loc(self, key): raise KeyError(key) return idx_int - @_cudf_nvtx_annotate + @_performance_tracking def _union(self, other, sort=None): if isinstance(other, RangeIndex): # Variable suffixes are of the @@ -685,7 +696,7 @@ def _union(self, other, sort=None): self._as_int_index()._union(other, sort=sort) ) - @_cudf_nvtx_annotate + @_performance_tracking def _intersection(self, other, sort=None): if not isinstance(other, RangeIndex): return self._try_reconstruct_range_index( @@ -733,7 +744,7 @@ def _intersection(self, other, sort=None): return self._try_reconstruct_range_index(new_index) - @_cudf_nvtx_annotate + @_performance_tracking def difference(self, other, sort=None): if isinstance(other, RangeIndex) and self.equals(other): return self[:0]._get_reconciled_name_object(other) @@ -785,14 +796,14 @@ def sort_values( else: return sorted_index - @_cudf_nvtx_annotate + @_performance_tracking def _gather(self, gather_map, nullify=False, check_bounds=True): gather_map = cudf.core.column.as_column(gather_map) return cudf.Index._from_data( {self.name: self._values.take(gather_map, nullify, check_bounds)} ) - @_cudf_nvtx_annotate + @_performance_tracking def _apply_boolean_mask(self, boolean_mask): return cudf.Index._from_data( {self.name: self._values.apply_boolean_mask(boolean_mask)} @@ -838,21 +849,21 @@ def join( ) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def _column(self): return self._as_int_index()._column @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def _columns(self): return self._as_int_index()._columns @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def values_host(self) -> np.ndarray: return np.arange(start=self.start, stop=self.stop, step=self.step) - @_cudf_nvtx_annotate + @_performance_tracking def argsort( self, ascending=True, @@ -865,19 +876,19 @@ def argsort( else: return cupy.arange(len(self)) - @_cudf_nvtx_annotate + @_performance_tracking def where(self, cond, other=None, inplace=False): return self._as_int_index().where(cond, other, inplace) - @_cudf_nvtx_annotate + @_performance_tracking def to_numpy(self) -> np.ndarray: return self.values_host - @_cudf_nvtx_annotate + @_performance_tracking def to_cupy(self) -> cupy.ndarray: return self.values - @_cudf_nvtx_annotate + @_performance_tracking def to_arrow(self) -> pa.Array: return pa.array(self._range, type=pa.from_numpy_dtype(self.dtype)) @@ -889,23 +900,23 @@ def __array__(self, dtype=None): "using .to_numpy()." 
) - @_cudf_nvtx_annotate + @_performance_tracking def nunique(self, dropna: bool = True) -> int: return len(self) - @_cudf_nvtx_annotate + @_performance_tracking def isna(self) -> cupy.ndarray: return cupy.zeros(len(self), dtype=bool) isnull = isna - @_cudf_nvtx_annotate + @_performance_tracking def notna(self) -> cupy.ndarray: return cupy.ones(len(self), dtype=bool) notnull = isna - @_cudf_nvtx_annotate + @_performance_tracking def _minmax(self, meth: str): no_steps = len(self) - 1 if no_steps == -1: @@ -1004,12 +1015,12 @@ class Index(SingleColumnFrame, BaseIndex, metaclass=IndexMeta): Column's, the data Column will be cloned to adopt this name. """ - @_cudf_nvtx_annotate + @_performance_tracking def __init__(self, data, **kwargs): name = _getdefault_name(data, name=kwargs.get("name")) super().__init__({name: data}) - @_cudf_nvtx_annotate + @_performance_tracking def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): ret = super().__array_ufunc__(ufunc, method, *inputs, **kwargs) @@ -1046,7 +1057,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): return NotImplemented @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def _from_data(cls, data: MutableMapping, name: Any = no_default) -> Self: out = super()._from_data(data=data) if name is not no_default: @@ -1054,7 +1065,7 @@ def _from_data(cls, data: MutableMapping, name: Any = no_default) -> Self: return out @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def _from_data_like_self( cls, data: MutableMapping, name: Any = no_default ) -> Self: @@ -1064,7 +1075,7 @@ def _from_data_like_self( return out @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def from_arrow(cls, obj): try: return cls(ColumnBase.from_arrow(obj)) @@ -1118,12 +1129,12 @@ def _binaryop( return ret @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def _values(self): return self._column @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def _concat(cls, objs): non_empties = [index for index in objs if len(index)] if len(objs) != len(non_empties): @@ -1166,16 +1177,16 @@ def _concat(cls, objs): result.name = name return result - @_cudf_nvtx_annotate + @_performance_tracking def memory_usage(self, deep=False): return self._column.memory_usage @cached_property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_unique(self): return self._column.is_unique - @_cudf_nvtx_annotate + @_performance_tracking def equals(self, other) -> bool: if not isinstance(other, BaseIndex) or len(self) != len(other): return False @@ -1198,7 +1209,7 @@ def equals(self, other) -> bool: except TypeError: return False - @_cudf_nvtx_annotate + @_performance_tracking def copy(self, name=None, deep=False): """ Make a copy of this object. 
@@ -1221,11 +1232,11 @@ def copy(self, name=None, deep=False): {name: self._values.copy(True) if deep else self._values} ) - @_cudf_nvtx_annotate + @_performance_tracking def astype(self, dtype, copy: bool = True): return super().astype({self.name: dtype}, copy) - @_cudf_nvtx_annotate + @_performance_tracking def get_indexer(self, target, method=None, limit=None, tolerance=None): if is_scalar(target): raise TypeError("Should be a sequence") @@ -1297,7 +1308,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): return _return_get_indexer_result(result_series.to_cupy()) - @_cudf_nvtx_annotate + @_performance_tracking def get_loc(self, key): if not is_scalar(key): raise TypeError("Should be a scalar-like") @@ -1333,7 +1344,7 @@ def get_loc(self, key): mask[true_inds] = True return mask - @_cudf_nvtx_annotate + @_performance_tracking def __repr__(self): max_seq_items = pd.get_option("max_seq_items") or len(self) mr = 0 @@ -1419,7 +1430,7 @@ def __repr__(self): lines.append(f"{prior_to_dtype} {keywords})") return "\n".join(lines) - @_cudf_nvtx_annotate + @_performance_tracking def __getitem__(self, index): res = self._get_elements_from_column(index) if isinstance(res, ColumnBase): @@ -1427,20 +1438,20 @@ def __getitem__(self, index): return res @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def dtype(self): """ `dtype` of the underlying values in Index. """ return self._values.dtype - @_cudf_nvtx_annotate + @_performance_tracking def isna(self): return self._column.isnull().values isnull = isna - @_cudf_nvtx_annotate + @_performance_tracking def notna(self): return self._column.notnull().values @@ -1470,11 +1481,11 @@ def _is_interval(self): return False @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def hasnans(self): return self._column.has_nulls(include_nan=True) - @_cudf_nvtx_annotate + @_performance_tracking def argsort( self, axis=0, @@ -1518,7 +1529,7 @@ def repeat(self, repeats, axis=None): Frame._repeat([*self._columns], repeats, axis), self._column_names ) - @_cudf_nvtx_annotate + @_performance_tracking def where(self, cond, other=None, inplace=False): result_col = super().where(cond, other, inplace) return self._mimic_inplace( @@ -1569,7 +1580,7 @@ def append(self, other): to_concat.append(obj) else: this = self - other = cudf.Index(other) + other = ensure_index(other) if len(this) == 0 or len(other) == 0: # we'll filter out empties later in ._concat @@ -1615,7 +1626,7 @@ def _indices_of(self, value): @copy_docstring(StringMethods) # type: ignore @property - @_cudf_nvtx_annotate + @_performance_tracking def str(self): if is_string_dtype(self.dtype): return StringMethods(parent=self) @@ -1698,7 +1709,7 @@ class DatetimeIndex(Index): dtype='datetime64[ns]', name='a') """ - @_cudf_nvtx_annotate + @_performance_tracking def __init__( self, data=None, @@ -1761,7 +1772,7 @@ def __init__( ): raise ValueError("No unique frequency found") - @_cudf_nvtx_annotate + @_performance_tracking def _copy_type_metadata(self: Self, other: Self) -> Self: super()._copy_type_metadata(other) self._freq = _validate_freq(other._freq) @@ -1783,7 +1794,7 @@ def __getitem__(self, index): return pd.Timestamp(value) return value - @_cudf_nvtx_annotate + @_performance_tracking def copy(self, name=None, deep=False): idx_copy = super().copy(name=name, deep=deep) return idx_copy._copy_type_metadata(self) @@ -1801,7 +1812,7 @@ def searchsorted( ) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def year(self): """ The 
year of the datetime. @@ -1820,7 +1831,7 @@ def year(self): return self._get_dt_field("year") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def month(self): """ The month as January=1, December=12. @@ -1839,7 +1850,7 @@ def month(self): return self._get_dt_field("month") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def day(self): """ The day of the datetime. @@ -1858,7 +1869,7 @@ def day(self): return self._get_dt_field("day") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def hour(self): """ The hours of the datetime. @@ -1879,7 +1890,7 @@ def hour(self): return self._get_dt_field("hour") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def minute(self): """ The minutes of the datetime. @@ -1900,7 +1911,7 @@ def minute(self): return self._get_dt_field("minute") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def second(self): """ The seconds of the datetime. @@ -1921,7 +1932,7 @@ def second(self): return self._get_dt_field("second") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def microsecond(self): """ The microseconds of the datetime. @@ -1952,7 +1963,7 @@ def microsecond(self): ) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def nanosecond(self): """ The nanoseconds of the datetime. @@ -1974,7 +1985,7 @@ def nanosecond(self): return self._get_dt_field("nanosecond") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def weekday(self): """ The day of the week with Monday=0, Sunday=6. @@ -1996,7 +2007,7 @@ def weekday(self): return self._get_dt_field("weekday") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def dayofweek(self): """ The day of the week with Monday=0, Sunday=6. @@ -2018,7 +2029,7 @@ def dayofweek(self): return self._get_dt_field("weekday") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def dayofyear(self): """ The day of the year, from 1-365 in non-leap years and @@ -2041,7 +2052,7 @@ def dayofyear(self): return self._get_dt_field("day_of_year") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def day_of_year(self): """ The day of the year, from 1-365 in non-leap years and @@ -2064,7 +2075,7 @@ def day_of_year(self): return self._get_dt_field("day_of_year") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_leap_year(self): """ Boolean indicator if the date belongs to a leap year. @@ -2083,7 +2094,7 @@ def is_leap_year(self): return cupy.asarray(res) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def quarter(self): """ Integer indicator for which quarter of the year the date belongs in. @@ -2108,7 +2119,7 @@ def quarter(self): res = extract_quarter(self._values) return Index(res, dtype="int8") - @_cudf_nvtx_annotate + @_performance_tracking def day_name(self, locale: str | None = None) -> Index: """ Return the day names. Currently supports English locale only. @@ -2128,7 +2139,7 @@ def day_name(self, locale: str | None = None) -> Index: day_names = self._column.get_day_names(locale) return Index._from_data({self.name: day_names}) - @_cudf_nvtx_annotate + @_performance_tracking def month_name(self, locale: str | None = None) -> Index: """ Return the month names. Currently supports English locale only. 
@@ -2147,7 +2158,7 @@ def month_name(self, locale: str | None = None) -> Index: month_names = self._column.get_month_names(locale) return Index._from_data({self.name: month_names}) - @_cudf_nvtx_annotate + @_performance_tracking def isocalendar(self) -> cudf.DataFrame: """ Returns a DataFrame with the year, week, and day @@ -2172,7 +2183,7 @@ def isocalendar(self) -> cudf.DataFrame: ) return cudf.DataFrame._from_data(ca, index=self) - @_cudf_nvtx_annotate + @_performance_tracking def to_pandas( self, *, nullable: bool = False, arrow_type: bool = False ) -> pd.DatetimeIndex: @@ -2181,7 +2192,7 @@ def to_pandas( result.freq = self._freq._maybe_as_fast_pandas_offset() return result - @_cudf_nvtx_annotate + @_performance_tracking def _get_dt_field(self, field): out_column = self._values.get_dt_field(field) # column.column_empty_like always returns a Column object @@ -2198,7 +2209,7 @@ def _get_dt_field(self, field): def _is_boolean(self): return False - @_cudf_nvtx_annotate + @_performance_tracking def ceil(self, freq): """ Perform ceil operation on the data to the specified freq. @@ -2231,7 +2242,7 @@ def ceil(self, freq): return self.__class__._from_data({self.name: out_column}) - @_cudf_nvtx_annotate + @_performance_tracking def floor(self, freq): """ Perform floor operation on the data to the specified freq. @@ -2264,7 +2275,7 @@ def floor(self, freq): return self.__class__._from_data({self.name: out_column}) - @_cudf_nvtx_annotate + @_performance_tracking def round(self, freq): """ Perform round operation on the data to the specified freq. @@ -2452,7 +2463,7 @@ class TimedeltaIndex(Index): dtype='timedelta64[s]', name='delta-index') """ - @_cudf_nvtx_annotate + @_performance_tracking def __init__( self, data=None, @@ -2500,7 +2511,7 @@ def __getitem__(self, index): return value @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def days(self): """ Number of days for each element. @@ -2509,7 +2520,7 @@ def days(self): return Index(self._values.days, name=self.name, dtype="int64") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def seconds(self): """ Number of seconds (>= 0 and less than 1 day) for each element. @@ -2517,7 +2528,7 @@ def seconds(self): return Index(self._values.seconds, name=self.name, dtype="int32") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def microseconds(self): """ Number of microseconds (>= 0 and less than 1 second) for each element. @@ -2525,7 +2536,7 @@ def microseconds(self): return Index(self._values.microseconds, name=self.name, dtype="int32") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def nanoseconds(self): """ Number of nanoseconds (>= 0 and less than 1 microsecond) for each @@ -2534,7 +2545,7 @@ def nanoseconds(self): return Index(self._values.nanoseconds, name=self.name, dtype="int32") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def components(self): """ Return a dataframe of the components (days, hours, minutes, @@ -2612,7 +2623,7 @@ class CategoricalIndex(Index): CategoricalIndex([1, 2, 3, ], categories=[1, 2, 3], ordered=False, dtype='category', name='a') """ # noqa: E501 - @_cudf_nvtx_annotate + @_performance_tracking def __init__( self, data=None, @@ -2667,7 +2678,7 @@ def __init__( super().__init__(data, name=name) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def codes(self): """ The category codes of this categorical. 
@@ -2675,7 +2686,7 @@ def codes(self): return Index(self._values.codes) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def categories(self): """ The categories of this categorical. @@ -2689,7 +2700,7 @@ def _is_categorical(self): return True -@_cudf_nvtx_annotate +@_performance_tracking def interval_range( start=None, end=None, @@ -2841,7 +2852,7 @@ class IntervalIndex(Index): IntervalIndex """ - @_cudf_nvtx_annotate + @_performance_tracking def __init__( self, data, @@ -2900,7 +2911,7 @@ def closed(self): return self.dtype.closed @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def from_breaks( cls, breaks, @@ -2975,7 +2986,7 @@ def _clean_nulls_from_index(self): return self -@_cudf_nvtx_annotate +@_performance_tracking def as_index( arbitrary, nan_as_null=no_default, copy=False, name=no_default, dtype=None ) -> BaseIndex: @@ -3090,7 +3101,7 @@ def _getdefault_name(values, name): return name -@_cudf_nvtx_annotate +@_performance_tracking def _concat_range_index(indexes: list[RangeIndex]) -> BaseIndex: """ An internal Utility function to concat RangeIndex objects. @@ -3131,7 +3142,7 @@ def _concat_range_index(indexes: list[RangeIndex]) -> BaseIndex: return RangeIndex(start, stop, step) -@_cudf_nvtx_annotate +@_performance_tracking def _extended_gcd(a: int, b: int) -> tuple[int, int, int]: """ Extended Euclidean algorithms to solve Bezout's identity: diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 280a6e92eab..ff10051c52d 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -33,7 +33,6 @@ is_list_like, is_scalar, ) -from cudf.core._base_index import BaseIndex from cudf.core._compat import PANDAS_LT_300 from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ColumnBase, as_column @@ -42,7 +41,7 @@ from cudf.core.dtypes import ListDtype from cudf.core.frame import Frame from cudf.core.groupby.groupby import GroupBy -from cudf.core.index import Index, RangeIndex, _index_from_data +from cudf.core.index import RangeIndex, _index_from_data, ensure_index from cudf.core.missing import NA from cudf.core.multiindex import MultiIndex from cudf.core.resample import _Resampler @@ -56,7 +55,7 @@ from cudf.utils import docutils, ioutils from cudf.utils._numba import _CUDFNumbaConfig from cudf.utils.docutils import copy_docstring -from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import _warn_no_dask_cudf if TYPE_CHECKING: @@ -66,6 +65,8 @@ Dtype, NotImplementedType, ) + from cudf.core._base_index import BaseIndex + doc_reset_index_template = """ Reset the index of the {klass}, or a level of it. @@ -301,13 +302,13 @@ def _from_data( out._index = RangeIndex(out._data.nrows) if index is None else index return out - @_cudf_nvtx_annotate + @_performance_tracking def _from_data_like_self(self, data: MutableMapping): out = super()._from_data_like_self(data) out.index = self.index return out - @_cudf_nvtx_annotate + @_performance_tracking def _from_columns_like_self( self, columns: list[ColumnBase], @@ -363,7 +364,7 @@ def _mimic_inplace( self._index = result.index return super()._mimic_inplace(result, inplace) - @_cudf_nvtx_annotate + @_performance_tracking def _scan(self, op, axis=None, skipna=True): """ Return {op_name} of the {cls}. 
@@ -439,7 +440,7 @@ def _check_data_index_length_match(self) -> None: ) @property - @_cudf_nvtx_annotate + @_performance_tracking def empty(self): """ Indicator whether DataFrame or Series is empty. @@ -501,7 +502,7 @@ def empty(self): """ return self.size == 0 - @_cudf_nvtx_annotate + @_performance_tracking @ioutils.doc_to_json() def to_json(self, path_or_buf=None, *args, **kwargs): """{docstring}""" @@ -510,14 +511,14 @@ def to_json(self, path_or_buf=None, *args, **kwargs): self, path_or_buf=path_or_buf, *args, **kwargs ) - @_cudf_nvtx_annotate + @_performance_tracking @ioutils.doc_to_hdf() def to_hdf(self, path_or_buf, key, *args, **kwargs): """{docstring}""" cudf.io.hdf.to_hdf(path_or_buf, key, self, *args, **kwargs) - @_cudf_nvtx_annotate + @_performance_tracking def to_string(self): r""" Convert to string @@ -606,7 +607,7 @@ def copy(self, deep: bool = True) -> Self: self.index.copy(deep=False), ) - @_cudf_nvtx_annotate + @_performance_tracking def equals(self, other) -> bool: # noqa: D102 return super().equals(other) and self.index.equals(other.index) @@ -627,12 +628,10 @@ def index(self, value): f"new values have {len(value)} elements" ) # avoid unnecessary cast to Index - if not isinstance(value, BaseIndex): - value = Index(value) - + value = ensure_index(value) self._index = value - @_cudf_nvtx_annotate + @_performance_tracking def replace( self, to_replace=None, @@ -900,7 +899,7 @@ def replace( return self._mimic_inplace(result, inplace=inplace) - @_cudf_nvtx_annotate + @_performance_tracking def clip(self, lower=None, upper=None, inplace=False, axis=1): """ Trim values at input threshold(s). @@ -1026,7 +1025,7 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1): ) return self._mimic_inplace(output, inplace=inplace) - @_cudf_nvtx_annotate + @_performance_tracking def abs(self): """ Return a Series/DataFrame with absolute numeric value of each element. @@ -1052,7 +1051,7 @@ def abs(self): """ return self._unaryop("abs") - @_cudf_nvtx_annotate + @_performance_tracking def dot(self, other, reflect=False): """ Get dot product of frame and other, (binary operator `dot`). @@ -1159,15 +1158,15 @@ def dot(self, other, reflect=False): ) return result.item() - @_cudf_nvtx_annotate + @_performance_tracking def __matmul__(self, other): return self.dot(other) - @_cudf_nvtx_annotate + @_performance_tracking def __rmatmul__(self, other): return self.dot(other, reflect=True) - @_cudf_nvtx_annotate + @_performance_tracking def head(self, n=5): """ Return the first `n` rows. @@ -1246,7 +1245,7 @@ def head(self, n=5): """ return self.iloc[:n] - @_cudf_nvtx_annotate + @_performance_tracking def tail(self, n=5): """ Returns the last n rows as a new DataFrame or Series @@ -1277,7 +1276,7 @@ def tail(self, n=5): return self.iloc[-n:] - @_cudf_nvtx_annotate + @_performance_tracking def pipe(self, func, *args, **kwargs): """ Apply ``func(self, *args, **kwargs)``. @@ -1324,7 +1323,7 @@ def pipe(self, func, *args, **kwargs): """ return cudf.core.common.pipe(self, func, *args, **kwargs) - @_cudf_nvtx_annotate + @_performance_tracking def sum( self, axis=no_default, @@ -1385,7 +1384,7 @@ def sum( **kwargs, ) - @_cudf_nvtx_annotate + @_performance_tracking def product( self, axis=no_default, @@ -1452,7 +1451,7 @@ def product( # Alias for pandas compatibility. prod = product - @_cudf_nvtx_annotate + @_performance_tracking def mean(self, axis=0, skipna=True, numeric_only=False, **kwargs): """ Return the mean of the values for the requested axis. 
@@ -1541,7 +1540,7 @@ def median( **kwargs, ) - @_cudf_nvtx_annotate + @_performance_tracking def std( self, axis=no_default, @@ -1600,7 +1599,7 @@ def std( **kwargs, ) - @_cudf_nvtx_annotate + @_performance_tracking def var( self, axis=no_default, @@ -1658,7 +1657,7 @@ def var( **kwargs, ) - @_cudf_nvtx_annotate + @_performance_tracking def kurtosis(self, axis=0, skipna=True, numeric_only=False, **kwargs): """ Return Fisher's unbiased kurtosis of a sample. @@ -1718,7 +1717,7 @@ def kurtosis(self, axis=0, skipna=True, numeric_only=False, **kwargs): # Alias for kurtosis. kurt = kurtosis - @_cudf_nvtx_annotate + @_performance_tracking def skew(self, axis=0, skipna=True, numeric_only=False, **kwargs): """ Return unbiased Fisher-Pearson skew of a sample. @@ -1777,7 +1776,7 @@ def skew(self, axis=0, skipna=True, numeric_only=False, **kwargs): **kwargs, ) - @_cudf_nvtx_annotate + @_performance_tracking def mask(self, cond, other=None, inplace: bool = False) -> Self | None: """ Replace values where the condition is True. @@ -1839,7 +1838,7 @@ def mask(self, cond, other=None, inplace: bool = False) -> Self | None: return self.where(cond=~cond, other=other, inplace=inplace) - @_cudf_nvtx_annotate + @_performance_tracking @copy_docstring(Rolling) def rolling( self, window, min_periods=None, center=False, axis=0, win_type=None @@ -1879,7 +1878,7 @@ def ewm( times=times, ) - @_cudf_nvtx_annotate + @_performance_tracking def nans_to_nulls(self): """ Convert nans (if any) to nulls @@ -1935,7 +1934,7 @@ def nans_to_nulls(self): self._data._from_columns_like_self(result) ) - @_cudf_nvtx_annotate + @_performance_tracking def interpolate( self, method="linear", @@ -2034,7 +2033,7 @@ def interpolate( ) ) - @_cudf_nvtx_annotate + @_performance_tracking def shift(self, periods=1, freq=None, axis=0, fill_value=None): """Shift values by `periods` positions.""" axis = self._get_axis_from_axis_arg(axis) @@ -2050,7 +2049,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): self._data._from_columns_like_self(data_columns) ) - @_cudf_nvtx_annotate + @_performance_tracking def truncate(self, before=None, after=None, axis=0, copy=True): """ Truncate a Series or DataFrame before and after some index value. @@ -2398,7 +2397,7 @@ def iloc(self): return self._iloc_indexer_type(self) @property # type:ignore - @_cudf_nvtx_annotate + @_performance_tracking def axes(self): """ Return a list representing the axes of the Series. @@ -2530,7 +2529,7 @@ def squeeze(self, axis: Literal["index", "columns", 0, 1, None] = None): ) return self.iloc[indexer] - @_cudf_nvtx_annotate + @_performance_tracking def scale(self): """ Scale values to [0, 1] in float64 @@ -2565,7 +2564,7 @@ def scale(self): scaled.index = self.index.copy(deep=False) return scaled - @_cudf_nvtx_annotate + @_performance_tracking def sort_index( self, axis=0, @@ -3070,7 +3069,7 @@ def drop_duplicates( self.index.names if not ignore_index else None, ) - @_cudf_nvtx_annotate + @_performance_tracking def duplicated(self, subset=None, keep="first"): """ Return boolean Series denoting duplicate rows. 
@@ -3180,7 +3179,7 @@ def duplicated(self, subset=None, keep="first"): ) return cudf.Series(result, index=self.index) - @_cudf_nvtx_annotate + @_performance_tracking def _empty_like(self, keep_index=True) -> Self: result = self._from_columns_like_self( libcudf.copying.columns_empty_like( @@ -3217,7 +3216,7 @@ def _split(self, splits, keep_index=True): for i in range(len(splits) + 1) ] - @_cudf_nvtx_annotate + @_performance_tracking def bfill(self, value=None, axis=None, inplace=None, limit=None): """ Synonym for :meth:`Series.fillna` with ``method='bfill'``. @@ -3236,7 +3235,7 @@ def bfill(self, value=None, axis=None, inplace=None, limit=None): limit=limit, ) - @_cudf_nvtx_annotate + @_performance_tracking def backfill(self, value=None, axis=None, inplace=None, limit=None): """ Synonym for :meth:`Series.fillna` with ``method='bfill'``. @@ -3256,7 +3255,7 @@ def backfill(self, value=None, axis=None, inplace=None, limit=None): ) return self.bfill(value=value, axis=axis, inplace=inplace, limit=limit) - @_cudf_nvtx_annotate + @_performance_tracking def ffill(self, value=None, axis=None, inplace=None, limit=None): """ Synonym for :meth:`Series.fillna` with ``method='ffill'``. @@ -3275,7 +3274,7 @@ def ffill(self, value=None, axis=None, inplace=None, limit=None): limit=limit, ) - @_cudf_nvtx_annotate + @_performance_tracking def pad(self, value=None, axis=None, inplace=None, limit=None): """ Synonym for :meth:`Series.fillna` with ``method='ffill'``. @@ -3415,7 +3414,7 @@ def add_suffix(self, suffix): raise NotImplementedError @acquire_spill_lock() - @_cudf_nvtx_annotate + @_performance_tracking def _apply(self, func, kernel_getter, *args, **kwargs): """Apply `func` across the rows of the frame.""" if kwargs: @@ -3595,7 +3594,7 @@ def _align_to_index( sort: bool = True, allow_non_unique: bool = False, ) -> Self: - index = cudf.Index(index) + index = ensure_index(index) if self.index.equals(index): return self @@ -3626,7 +3625,7 @@ def _align_to_index( out.index.names = self.index.names return out - @_cudf_nvtx_annotate + @_performance_tracking def _reindex( self, column_names, @@ -4154,7 +4153,7 @@ def dropna( return self._mimic_inplace(result, inplace=inplace) - @_cudf_nvtx_annotate + @_performance_tracking def _drop_na_columns(self, how="any", subset=None, thresh=None): """ Drop columns containing nulls @@ -4471,7 +4470,7 @@ def last(self, offset): slice_func=lambda i: self.iloc[i:], ) - @_cudf_nvtx_annotate + @_performance_tracking def sample( self, n=None, @@ -4751,7 +4750,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): return NotImplemented - @_cudf_nvtx_annotate + @_performance_tracking def repeat(self, repeats, axis=None): """Repeats elements consecutively. @@ -4949,7 +4948,7 @@ def astype( raise e return self - @_cudf_nvtx_annotate + @_performance_tracking def drop( self, labels=None, @@ -5161,7 +5160,7 @@ def drop( if not inplace: return out - @_cudf_nvtx_annotate + @_performance_tracking def _explode(self, explode_column: Any, ignore_index: bool): # Helper function for `explode` in `Series` and `Dataframe`, explodes a # specified nested column. Other columns' corresponding rows are @@ -5200,7 +5199,7 @@ def _explode(self, explode_column: Any, ignore_index: bool): self.index.names if not ignore_index else None, ) - @_cudf_nvtx_annotate + @_performance_tracking def tile(self, count): """Repeats the rows `count` times to form a new Frame. 
@@ -5233,7 +5232,7 @@ def tile(self, count): index_names=self._index_names, ) - @_cudf_nvtx_annotate + @_performance_tracking def groupby( self, by=None, @@ -5283,7 +5282,7 @@ def groupby( ) ) - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Addition", @@ -5324,7 +5323,7 @@ def add(self, other, axis, level=None, fill_value=None): # noqa: D102 return self._binaryop(other, "__add__", fill_value) - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Addition", @@ -5365,7 +5364,7 @@ def radd(self, other, axis, level=None, fill_value=None): # noqa: D102 return self._binaryop(other, "__radd__", fill_value) - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Subtraction", @@ -5408,7 +5407,7 @@ def subtract(self, other, axis, level=None, fill_value=None): # noqa: D102 sub = subtract - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Subtraction", @@ -5449,7 +5448,7 @@ def rsub(self, other, axis, level=None, fill_value=None): # noqa: D102 return self._binaryop(other, "__rsub__", fill_value) - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Multiplication", @@ -5492,7 +5491,7 @@ def multiply(self, other, axis, level=None, fill_value=None): # noqa: D102 mul = multiply - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Multiplication", @@ -5533,7 +5532,7 @@ def rmul(self, other, axis, level=None, fill_value=None): # noqa: D102 return self._binaryop(other, "__rmul__", fill_value) - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Modulo", @@ -5574,7 +5573,7 @@ def mod(self, other, axis, level=None, fill_value=None): # noqa: D102 return self._binaryop(other, "__mod__", fill_value) - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Modulo", @@ -5615,7 +5614,7 @@ def rmod(self, other, axis, level=None, fill_value=None): # noqa: D102 return self._binaryop(other, "__rmod__", fill_value) - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Exponential", @@ -5656,7 +5655,7 @@ def pow(self, other, axis, level=None, fill_value=None): # noqa: D102 return self._binaryop(other, "__pow__", fill_value) - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Exponential", @@ -5697,7 +5696,7 @@ def rpow(self, other, axis, level=None, fill_value=None): # noqa: D102 return self._binaryop(other, "__rpow__", fill_value) - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Integer division", @@ -5738,7 +5737,7 @@ def floordiv(self, other, axis, level=None, fill_value=None): # noqa: D102 return self._binaryop(other, "__floordiv__", fill_value) - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Integer division", @@ -5779,7 +5778,7 @@ def rfloordiv(self, other, axis, level=None, fill_value=None): # noqa: D102 return self._binaryop(other, "__rfloordiv__", fill_value) - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Floating division", @@ -5824,7 +5823,7 @@ def truediv(self, other, axis, level=None, fill_value=None): # 
noqa: D102 div = truediv divide = truediv - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Floating division", @@ -5868,7 +5867,7 @@ def rtruediv(self, other, axis, level=None, fill_value=None): # noqa: D102 # Alias for rtruediv rdiv = rtruediv - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Equal to", @@ -5908,7 +5907,7 @@ def eq(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 other=other, op="__eq__", fill_value=fill_value, can_reindex=True ) - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Not equal to", @@ -5948,7 +5947,7 @@ def ne(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 other=other, op="__ne__", fill_value=fill_value, can_reindex=True ) - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Less than", @@ -5988,7 +5987,7 @@ def lt(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 other=other, op="__lt__", fill_value=fill_value, can_reindex=True ) - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Less than or equal to", @@ -6028,7 +6027,7 @@ def le(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 other=other, op="__le__", fill_value=fill_value, can_reindex=True ) - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Greater than", @@ -6068,7 +6067,7 @@ def gt(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 other=other, op="__gt__", fill_value=fill_value, can_reindex=True ) - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_binop_template.format( operation="Greater than or equal to", @@ -6123,7 +6122,7 @@ def _preprocess_subset(self, subset): raise KeyError(f"columns {diff} do not exist") return subset - @_cudf_nvtx_annotate + @_performance_tracking def rank( self, axis=0, @@ -6291,7 +6290,7 @@ def _check_duplicate_level_names(specified, level_names): ) -@_cudf_nvtx_annotate +@_performance_tracking def _get_replacement_values_for_columns( to_replace: Any, value: Any, columns_dtype_map: dict[Any, Any] ) -> tuple[dict[Any, bool], dict[Any, Any], dict[Any, Any]]: @@ -6458,7 +6457,7 @@ def _is_series(obj): return isinstance(obj, Frame) and obj.ndim == 1 and obj.index is not None -@_cudf_nvtx_annotate +@_performance_tracking def _drop_rows_by_labels( obj: DataFrameOrSeries, labels: ColumnLike | abc.Iterable | str, diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index a01242d957d..9cbe863142b 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -23,15 +23,17 @@ from cudf.api.types import is_integer, is_list_like, is_object_dtype from cudf.core import column from cudf.core._base_index import _return_get_indexer_result +from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import Frame from cudf.core.index import ( BaseIndex, _get_indexer_basic, _lexsorted_equal_range, + ensure_index, ) from cudf.core.join._join_helpers import _match_join_keys from cudf.utils.dtypes import is_column_like -from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name if TYPE_CHECKING: @@ -125,7 +127,7 
@@ class MultiIndex(Frame, BaseIndex, NotIterable): ) """ - @_cudf_nvtx_annotate + @_performance_tracking def __init__( self, levels=None, @@ -172,7 +174,7 @@ def __init__( "codes and is inconsistent!" ) - levels = [cudf.Index(level) for level in levels] + levels = [ensure_index(level) for level in levels] if len(levels) != len(codes._data): raise ValueError( @@ -210,12 +212,12 @@ def __init__( self.names = names @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def names(self): return self._names @names.setter # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def names(self, value): if value is None: value = [None] * self.nlevels @@ -241,13 +243,13 @@ def names(self, value): ) self._names = pd.core.indexes.frozen.FrozenList(value) - @_cudf_nvtx_annotate + @_performance_tracking def to_series(self, index=None, name=None): raise NotImplementedError( "MultiIndex.to_series isn't implemented yet." ) - @_cudf_nvtx_annotate + @_performance_tracking def astype(self, dtype, copy: bool = True): if not is_object_dtype(dtype): raise TypeError( @@ -256,7 +258,7 @@ def astype(self, dtype, copy: bool = True): ) return self - @_cudf_nvtx_annotate + @_performance_tracking def rename(self, names, inplace=False): """ Alter MultiIndex level names @@ -303,7 +305,7 @@ def rename(self, names, inplace=False): """ return self.set_names(names, level=None, inplace=inplace) - @_cudf_nvtx_annotate + @_performance_tracking def set_names(self, names, level=None, inplace=False): names_is_list_like = is_list_like(names) level_is_list_like = is_list_like(level) @@ -341,7 +343,7 @@ def set_names(self, names, level=None, inplace=False): return self._set_names(names=names, inplace=inplace) @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def _from_data( cls, data: MutableMapping, @@ -353,16 +355,16 @@ def _from_data( return obj @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def name(self): return self._name @name.setter # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def name(self, value): self._name = value - @_cudf_nvtx_annotate + @_performance_tracking def copy( self, names=None, @@ -431,7 +433,7 @@ def copy( return mi - @_cudf_nvtx_annotate + @_performance_tracking def __repr__(self): max_seq_items = pd.get_option("display.max_seq_items") or len(self) @@ -446,45 +448,26 @@ def __repr__(self): ) preprocess = self.take(indices) else: - preprocess = self.copy(deep=False) - - if any(col.has_nulls() for col in preprocess._data.columns): - preprocess_df = preprocess.to_frame(index=False) - for name, col in preprocess._data.items(): - if isinstance( - col, - ( - column.datetime.DatetimeColumn, - column.timedelta.TimeDeltaColumn, - ), - ): - preprocess_df[name] = col.astype("str").fillna( - str(cudf.NaT) - ) + preprocess = self - tuples_list = list( - zip( - *list( - map(lambda val: pd.NA if val is None else val, col) - for col in preprocess_df.to_arrow() - .to_pydict() - .values() - ) - ) - ) + arrays = [] + for name, col in zip(self.names, preprocess._columns): + try: + pd_idx = col.to_pandas(nullable=True) + except NotImplementedError: + pd_idx = col.to_pandas(nullable=False) + pd_idx.name = name + arrays.append(pd_idx) - preprocess = preprocess.to_pandas(nullable=True) - preprocess.values[:] = tuples_list - else: - preprocess = preprocess.to_pandas(nullable=True) + preprocess_pd = pd.MultiIndex.from_arrays(arrays) - output = repr(preprocess) + output = repr(preprocess_pd) output_prefix = self.__class__.__name__ + "(" output = 
output.lstrip(output_prefix) lines = output.split("\n") if len(lines) > 1: - if "length=" in lines[-1] and len(self) != len(preprocess): + if "length=" in lines[-1] and len(self) != len(preprocess_pd): last_line = lines[-1] length_index = last_line.index("length=") last_line = last_line[:length_index] + f"length={len(self)})" @@ -502,7 +485,7 @@ def _codes_frame(self): @property # type: ignore @_external_only_api("Use ._codes_frame instead") - @_cudf_nvtx_annotate + @_performance_tracking def codes(self): """ Returns the codes of the underlying MultiIndex. @@ -528,13 +511,13 @@ def get_slice_bound(self, label, side, kind=None): raise NotImplementedError() @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def nlevels(self): """Integer number of levels in this MultiIndex.""" return self._num_columns @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def levels(self): """ Returns list of levels in the MultiIndex @@ -566,12 +549,12 @@ def levels(self): return self._levels @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def ndim(self) -> int: """Dimension of the data. For MultiIndex ndim is always 2.""" return 2 - @_cudf_nvtx_annotate + @_performance_tracking def _get_level_label(self, level): """Get name of the level. @@ -588,7 +571,7 @@ def _get_level_label(self, level): else: return self._data.names[level] - @_cudf_nvtx_annotate + @_performance_tracking def isin(self, values, level=None): """Return a boolean array where the index values are in values. @@ -687,7 +670,7 @@ def where(self, cond, other=None, inplace=False): ".where is not supported for MultiIndex operations" ) - @_cudf_nvtx_annotate + @_performance_tracking def _compute_levels_and_codes(self): levels = [] @@ -701,7 +684,7 @@ def _compute_levels_and_codes(self): self._levels = levels self._codes = cudf.DataFrame._from_data(codes) - @_cudf_nvtx_annotate + @_performance_tracking def _compute_validity_mask(self, index, row_tuple, max_length): """Computes the valid set of indices of values in the lookup""" lookup = cudf.DataFrame() @@ -749,7 +732,7 @@ def _compute_validity_mask(self, index, row_tuple, max_length): raise KeyError(row) return result - @_cudf_nvtx_annotate + @_performance_tracking def _get_valid_indices_by_tuple(self, index, row_tuple, max_length): # Instructions for Slicing # if tuple, get first and last elements of tuple @@ -779,7 +762,7 @@ def _get_valid_indices_by_tuple(self, index, row_tuple, max_length): return row_tuple return self._compute_validity_mask(index, row_tuple, max_length) - @_cudf_nvtx_annotate + @_performance_tracking def _index_and_downcast(self, result, index, index_key): if isinstance(index_key, (numbers.Number, slice)): index_key = [index_key] @@ -847,7 +830,7 @@ def _index_and_downcast(self, result, index, index_key): result.index = index return result - @_cudf_nvtx_annotate + @_performance_tracking def _get_row_major( self, df: DataFrameOrSeries, @@ -874,7 +857,7 @@ def _get_row_major( final = self._index_and_downcast(result, result.index, row_tuple) return final - @_cudf_nvtx_annotate + @_performance_tracking def _validate_indexer( self, indexer: numbers.Number @@ -902,7 +885,7 @@ def _validate_indexer( for i in indexer: self._validate_indexer(i) - @_cudf_nvtx_annotate + @_performance_tracking def __eq__(self, other): if isinstance(other, MultiIndex): return np.array( @@ -916,12 +899,12 @@ def __eq__(self, other): return NotImplemented @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def size(self): # 
The size of a MultiIndex is only dependent on the number of rows. return self._num_rows - @_cudf_nvtx_annotate + @_performance_tracking def take(self, indices): if isinstance(indices, cudf.Series) and indices.has_nulls: raise ValueError("Column must have no nulls.") @@ -929,7 +912,7 @@ def take(self, indices): obj.names = self.names return obj - @_cudf_nvtx_annotate + @_performance_tracking def serialize(self): header, frames = super().serialize() # Overwrite the names in _data with the true names. @@ -937,7 +920,7 @@ def serialize(self): return header, frames @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def deserialize(cls, header, frames): # Spoof the column names to construct the frame, then set manually. column_names = pickle.loads(header["column_names"]) @@ -945,7 +928,7 @@ def deserialize(cls, header, frames): obj = super().deserialize(header, frames) return obj._set_names(column_names) - @_cudf_nvtx_annotate + @_performance_tracking def __getitem__(self, index): flatten = isinstance(index, int) @@ -972,7 +955,7 @@ def __getitem__(self, index): result._levels = self._levels return result - @_cudf_nvtx_annotate + @_performance_tracking def to_frame(self, index=True, name=no_default, allow_duplicates=False): """ Create a DataFrame with the levels of the MultiIndex as columns. @@ -1022,44 +1005,34 @@ def to_frame(self, index=True, name=no_default, allow_duplicates=False): a c a c b d b d """ - # TODO: Currently this function makes a shallow copy, which is - # incorrect. We want to make a deep copy, otherwise further - # modifications of the resulting DataFrame will affect the MultiIndex. if name is no_default: column_names = [ level if name is None else name for level, name in enumerate(self.names) ] + elif not is_list_like(name): + raise TypeError( + "'name' must be a list / sequence of column names." + ) + elif len(name) != len(self.levels): + raise ValueError( + "'name' should have the same length as " + "number of levels on index." + ) else: - if not is_list_like(name): - raise TypeError( - "'name' must be a list / sequence of column names." - ) - if len(name) != len(self.levels): - raise ValueError( - "'name' should have the same length as " - "number of levels on index." - ) column_names = name - all_none_names = None - if not ( - all_none_names := all(x is None for x in column_names) - ) and len(column_names) != len(set(column_names)): + if len(column_names) != len(set(column_names)): raise ValueError("Duplicate column names are not allowed") - df = cudf.DataFrame._from_data( - data=self._data, - columns=column_names - if name is not no_default and not all_none_names - else None, + ca = ColumnAccessor( + dict(zip(column_names, (col.copy() for col in self._columns))), + verify=False, + ) + return cudf.DataFrame._from_data( + data=ca, index=self if index else None ) - if index: - df = df.set_index(self) - - return df - - @_cudf_nvtx_annotate + @_performance_tracking def get_level_values(self, level): """ Return the values at the requested level @@ -1115,7 +1088,7 @@ def _is_interval(self): return False @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def _concat(cls, objs): source_data = [o.to_frame(index=False) for o in objs] @@ -1135,7 +1108,7 @@ def _concat(cls, objs): return cudf.MultiIndex.from_frame(source_data, names=names) @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def from_tuples(cls, tuples, names=None): """ Convert list of tuples to MultiIndex. 
@@ -1173,12 +1146,12 @@ def from_tuples(cls, tuples, names=None): pdi = pd.MultiIndex.from_tuples(tuples, names=names) return cls.from_pandas(pdi) - @_cudf_nvtx_annotate + @_performance_tracking def to_numpy(self): return self.values_host @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def values_host(self): """ Return a numpy representation of the MultiIndex. @@ -1206,7 +1179,7 @@ def values_host(self): return self.to_pandas().values @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def values(self): """ Return a CuPy representation of the MultiIndex. @@ -1242,8 +1215,8 @@ def values(self): return self.to_frame(index=False).values @classmethod - @_cudf_nvtx_annotate - def from_frame(cls, df, names=None): + @_performance_tracking + def from_frame(cls, df: pd.DataFrame | cudf.DataFrame, names=None): """ Make a MultiIndex from a DataFrame. @@ -1317,7 +1290,7 @@ def from_frame(cls, df, names=None): return obj @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def from_product(cls, arrays, names=None): """ Make a MultiIndex from the cartesian product of multiple iterables. @@ -1359,7 +1332,7 @@ def from_product(cls, arrays, names=None): return cls.from_pandas(pdi) @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def from_arrays( cls, arrays, @@ -1418,7 +1391,7 @@ def from_arrays( codes=codes, levels=levels, sortorder=sortorder, names=names ) - @_cudf_nvtx_annotate + @_performance_tracking def _poplevels(self, level): """ Remove and return the specified levels from self. @@ -1469,7 +1442,7 @@ def _poplevels(self, level): return popped - @_cudf_nvtx_annotate + @_performance_tracking def swaplevel(self, i=-2, j=-1): """ Swap level i with level j. @@ -1520,7 +1493,7 @@ def swaplevel(self, i=-2, j=-1): midx = midx.set_names(self.names) return midx - @_cudf_nvtx_annotate + @_performance_tracking def droplevel(self, level=-1): """ Removes the specified levels from the MultiIndex. 
@@ -1583,7 +1556,7 @@ def droplevel(self, level=-1): else: return mi - @_cudf_nvtx_annotate + @_performance_tracking def to_pandas( self, *, nullable: bool = False, arrow_type: bool = False ) -> pd.MultiIndex: @@ -1600,7 +1573,7 @@ def to_pandas( ) @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def from_pandas(cls, multiindex: pd.MultiIndex, nan_as_null=no_default): """ Convert from a Pandas MultiIndex @@ -1635,7 +1608,7 @@ def from_pandas(cls, multiindex: pd.MultiIndex, nan_as_null=no_default): ) @cached_property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_unique(self): return len(self) == len(self.unique()) @@ -1643,7 +1616,7 @@ def is_unique(self): def dtype(self): return np.dtype("O") - @_cudf_nvtx_annotate + @_performance_tracking def _is_sorted(self, ascending=None, null_position=None) -> bool: """ Returns a boolean indicating whether the data of the MultiIndex are sorted @@ -1689,7 +1662,7 @@ def _is_sorted(self, ascending=None, null_position=None) -> bool: ) @cached_property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_monotonic_increasing(self) -> bool: """ Return if the index is monotonic increasing @@ -1698,7 +1671,7 @@ def is_monotonic_increasing(self) -> bool: return self._is_sorted(ascending=None, null_position=None) @cached_property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_monotonic_decreasing(self) -> bool: """ Return if the index is monotonic decreasing @@ -1708,7 +1681,7 @@ def is_monotonic_decreasing(self) -> bool: ascending=[False] * len(self.levels), null_position=None ) - @_cudf_nvtx_annotate + @_performance_tracking def fillna(self, value): """ Fill null values with the specified value. @@ -1749,11 +1722,11 @@ def fillna(self, value): return super().fillna(value=value) - @_cudf_nvtx_annotate + @_performance_tracking def unique(self): return self.drop_duplicates(keep="first") - @_cudf_nvtx_annotate + @_performance_tracking def nunique(self, dropna: bool = True) -> int: mi = self.dropna(how="all") if dropna else self return len(mi.unique()) @@ -1768,7 +1741,7 @@ def _clean_nulls_from_index(self): index_df._clean_nulls_from_dataframe(index_df), names=self.names ) - @_cudf_nvtx_annotate + @_performance_tracking def memory_usage(self, deep=False): usage = sum(col.memory_usage for col in self._data.columns) if self.levels: @@ -1779,13 +1752,13 @@ def memory_usage(self, deep=False): usage += col.memory_usage return usage - @_cudf_nvtx_annotate + @_performance_tracking def difference(self, other, sort=None): if hasattr(other, "to_pandas"): other = other.to_pandas() return cudf.from_pandas(self.to_pandas().difference(other, sort)) - @_cudf_nvtx_annotate + @_performance_tracking def append(self, other): """ Append a collection of MultiIndex objects together @@ -1848,7 +1821,7 @@ def append(self, other): return MultiIndex._concat(to_concat) - @_cudf_nvtx_annotate + @_performance_tracking def __array_function__(self, func, types, args, kwargs): cudf_df_module = MultiIndex @@ -1895,7 +1868,7 @@ def _level_index_from_level(self, level): ) from None return level - @_cudf_nvtx_annotate + @_performance_tracking def get_indexer(self, target, method=None, limit=None, tolerance=None): if tolerance is not None: raise NotImplementedError( @@ -1954,7 +1927,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): return _return_get_indexer_result(result_series.to_cupy()) - @_cudf_nvtx_annotate + @_performance_tracking def get_loc(self, key): is_sorted = ( 
self.is_monotonic_increasing or self.is_monotonic_decreasing @@ -2028,7 +2001,7 @@ def _maybe_match_names(self, other): for self_name, other_name in zip(self.names, other.names) ] - @_cudf_nvtx_annotate + @_performance_tracking def union(self, other, sort=None): if not isinstance(other, MultiIndex): msg = "other must be a MultiIndex or a list of tuples" @@ -2052,7 +2025,7 @@ def union(self, other, sort=None): return self._union(other, sort=sort) - @_cudf_nvtx_annotate + @_performance_tracking def _union(self, other, sort=None): # TODO: When to_frame is refactored to return a # deep copy in future, we should push most of the common @@ -2078,7 +2051,7 @@ def _union(self, other, sort=None): return midx.sort_values() return midx - @_cudf_nvtx_annotate + @_performance_tracking def _intersection(self, other, sort=None): if self.names != other.names: deep = True @@ -2101,14 +2074,14 @@ def _intersection(self, other, sort=None): return midx.sort_values() return midx - @_cudf_nvtx_annotate + @_performance_tracking def _copy_type_metadata(self: Self, other: Self) -> Self: res = super()._copy_type_metadata(other) if isinstance(other, MultiIndex): res._names = other._names return res - @_cudf_nvtx_annotate + @_performance_tracking def _split_columns_by_levels( self, levels: tuple, *, in_levels: bool ) -> Generator[tuple[Any, column.ColumnBase], None, None]: @@ -2127,7 +2100,7 @@ def _split_columns_by_levels( elif not in_levels and i not in level_indices: yield name, col - @_cudf_nvtx_annotate + @_performance_tracking def _new_index_for_reset_index( self, levels: tuple | None, name ) -> None | BaseIndex: diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 903c4fe7df5..1120642947b 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -300,51 +300,31 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): obj = objs[0] if ignore_index: if axis == 1: - result = cudf.DataFrame._from_data( - data=obj._data.copy(deep=True), - index=obj.index.copy(deep=True), - ) - # The DataFrame constructor for dict-like data (such as the - # ColumnAccessor given by obj._data here) will drop any columns - # in the data that are not in `columns`, so we have to rename - # after construction. - result.columns = pd.RangeIndex(len(obj._data.names)) - else: if isinstance(obj, cudf.Series): - result = cudf.Series._from_data( - data=obj._data.copy(deep=True), - index=cudf.RangeIndex(len(obj)), - ) - elif isinstance(obj, pd.Series): - result = cudf.Series( - data=obj, - index=cudf.RangeIndex(len(obj)), - ) + result = obj.to_frame() else: - result = cudf.DataFrame._from_data( - data=obj._data.copy(deep=True), - index=cudf.RangeIndex(len(obj)), - ) + result = obj.copy(deep=True) + result.columns = pd.RangeIndex(len(result._data)) + else: + result = type(obj)._from_data( + data=obj._data.copy(deep=True), + index=cudf.RangeIndex(len(obj)), + ) + elif axis == 0: + result = obj.copy(deep=True) else: - if axis == 0: - result = obj.copy() + if isinstance(obj, cudf.Series): + result = obj.to_frame() else: - data = obj._data.copy(deep=True) - if isinstance(obj, cudf.Series) and obj.name is None: - # If the Series has no name, pandas renames it to 0. 
- data[0] = data.pop(None) - result = cudf.DataFrame._from_data( - data, index=obj.index.copy(deep=True) + result = obj.copy(deep=True) + if keys is not None and isinstance(result, cudf.DataFrame): + k = keys[0] + result.columns = cudf.MultiIndex.from_tuples( + [ + (k, *c) if isinstance(c, tuple) else (k, c) + for c in result._column_names + ] ) - if keys is not None: - if isinstance(result, cudf.DataFrame): - k = keys[0] - result.columns = cudf.MultiIndex.from_tuples( - [ - (k, *c) if isinstance(c, tuple) else (k, c) - for c in result._column_names - ] - ) if isinstance(result, cudf.Series) and axis == 0: # sort has no effect for series concatted along axis 0 @@ -1179,7 +1159,6 @@ def unstack(df, level, fill_value=None): if pd.api.types.is_list_like(level): if not level: return df - df = df.copy(deep=False) if not isinstance(df.index, cudf.MultiIndex): dtype = df._columns[0].dtype for col in df._columns: @@ -1195,6 +1174,7 @@ def unstack(df, level, fill_value=None): ) return res else: + df = df.copy(deep=False) columns = df.index._poplevels(level) index = df.index result = _pivot(df, index, columns) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 15ad0813601..97b6bbec2d4 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -48,7 +48,7 @@ from cudf.core.column.struct import StructMethods from cudf.core.column_accessor import ColumnAccessor from cudf.core.groupby.groupby import SeriesGroupBy, groupby_doc_template -from cudf.core.index import BaseIndex, DatetimeIndex, RangeIndex, as_index +from cudf.core.index import BaseIndex, DatetimeIndex, RangeIndex, ensure_index from cudf.core.indexed_frame import ( IndexedFrame, _FrameIndexer, @@ -68,7 +68,7 @@ is_mixed_with_object_dtype, to_cudf_compatible_scalar, ) -from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.performance_tracking import _performance_tracking if TYPE_CHECKING: from cudf._typing import ( @@ -179,7 +179,7 @@ class _SeriesIlocIndexer(_FrameIndexer): _frame: cudf.Series - @_cudf_nvtx_annotate + @_performance_tracking def __getitem__(self, arg): indexing_spec = indexing_utils.parse_row_iloc_indexer( indexing_utils.destructure_series_iloc_indexer(arg, self._frame), @@ -187,7 +187,7 @@ def __getitem__(self, arg): ) return self._frame._getitem_preprocessed(indexing_spec) - @_cudf_nvtx_annotate + @_performance_tracking def __setitem__(self, key, value): if isinstance(key, tuple): key = list(key) @@ -274,7 +274,7 @@ class _SeriesLocIndexer(_FrameIndexer): Label-based selection """ - @_cudf_nvtx_annotate + @_performance_tracking def __getitem__(self, arg: Any) -> ScalarLike | DataFrameOrSeries: if isinstance(arg, pd.MultiIndex): arg = cudf.from_pandas(arg) @@ -301,7 +301,7 @@ def __getitem__(self, arg: Any) -> ScalarLike | DataFrameOrSeries: return self._frame.iloc[arg] - @_cudf_nvtx_annotate + @_performance_tracking def __setitem__(self, key, value): try: key = self._loc_to_iloc(key) @@ -476,7 +476,7 @@ def _constructor_expanddim(self): return cudf.DataFrame @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def from_categorical(cls, categorical, codes=None): """Creates from a pandas.Categorical @@ -517,7 +517,7 @@ def from_categorical(cls, categorical, codes=None): return Series(data=col) @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def from_masked_array(cls, data, mask, null_count=None): """Create a Series with null-mask. 
This is equivalent to: @@ -566,7 +566,7 @@ def from_masked_array(cls, data, mask, null_count=None): col = as_column(data).set_mask(mask) return cls(data=col) - @_cudf_nvtx_annotate + @_performance_tracking def __init__( self, data=None, @@ -584,14 +584,12 @@ def __init__( data = {} if isinstance(data, (pd.Series, pd.Index, BaseIndex, Series)): - if copy: + if copy and not isinstance(data, (pd.Series, pd.Index)): data = data.copy(deep=True) name_from_data = data.name column = as_column(data, nan_as_null=nan_as_null, dtype=dtype) - if isinstance(data, pd.Series): - index_from_data = cudf.Index(data.index) - elif isinstance(data, Series): - index_from_data = data.index + if isinstance(data, (pd.Series, Series)): + index_from_data = ensure_index(data.index) elif isinstance(data, ColumnAccessor): raise TypeError( "Use cudf.Series._from_data for constructing a Series from " @@ -642,7 +640,7 @@ def __init__( name = name_from_data if index is not None: - index = cudf.Index(index) + index = ensure_index(index) if index_from_data is not None: first_index = index_from_data @@ -663,7 +661,7 @@ def __init__( self._check_data_index_length_match() @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def _from_data( cls, data: MutableMapping, @@ -675,18 +673,18 @@ def _from_data( out.name = name return out - @_cudf_nvtx_annotate + @_performance_tracking def _from_data_like_self(self, data: MutableMapping): out = super()._from_data_like_self(data) out.name = self.name return out - @_cudf_nvtx_annotate + @_performance_tracking def __contains__(self, item): return item in self.index @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def from_pandas(cls, s: pd.Series, nan_as_null=no_default): """ Convert from a Pandas Series. @@ -735,7 +733,7 @@ def from_pandas(cls, s: pd.Series, nan_as_null=no_default): return result @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_unique(self): """Return boolean if values in the object are unique. @@ -746,7 +744,7 @@ def is_unique(self): return self._column.is_unique @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def dt(self): """ Accessor object for datetime-like properties of the Series values. @@ -788,7 +786,7 @@ def dt(self): ) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def hasnans(self): """ Return True if there are any NaNs or nulls. @@ -829,7 +827,7 @@ def hasnans(self): """ return self._column.has_nulls(include_nan=True) - @_cudf_nvtx_annotate + @_performance_tracking def serialize(self): header, frames = super().serialize() @@ -842,7 +840,7 @@ def serialize(self): return header, frames @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def deserialize(cls, header, frames): index_nframes = header["index_frame_count"] obj = super().deserialize( @@ -855,7 +853,7 @@ def deserialize(cls, header, frames): return obj - @_cudf_nvtx_annotate + @_performance_tracking def drop( self, labels=None, @@ -884,7 +882,7 @@ def tolist(self): # noqa: D102 to_list = tolist - @_cudf_nvtx_annotate + @_performance_tracking def to_dict(self, into: type[dict] = dict) -> dict: """ Convert Series to {label -> value} dict or dict-like object. @@ -923,7 +921,7 @@ def to_dict(self, into: type[dict] = dict) -> dict: """ return self.to_pandas().to_dict(into=into) - @_cudf_nvtx_annotate + @_performance_tracking def reindex(self, *args, **kwargs): """ Conform Series to new index. 
@@ -996,7 +994,7 @@ def reindex(self, *args, **kwargs): series.name = self.name return series - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( doc_reset_index_template.format( klass="Series", @@ -1081,7 +1079,7 @@ def reset_index( inplace=inplace, ) - @_cudf_nvtx_annotate + @_performance_tracking def to_frame(self, name=None): """Convert Series into a DataFrame @@ -1124,13 +1122,13 @@ def to_frame(self, name=None): return cudf.DataFrame({col: self._column}, index=self.index) - @_cudf_nvtx_annotate + @_performance_tracking def memory_usage(self, index=True, deep=False): return self._column.memory_usage + ( self.index.memory_usage() if index else 0 ) - @_cudf_nvtx_annotate + @_performance_tracking def __array_function__(self, func, types, args, kwargs): if "out" in kwargs or not all(issubclass(t, Series) for t in types): return NotImplemented @@ -1191,7 +1189,7 @@ def __array_function__(self, func, types, args, kwargs): return NotImplemented - @_cudf_nvtx_annotate + @_performance_tracking def map(self, arg, na_action=None) -> "Series": """ Map values of Series according to input correspondence. @@ -1333,7 +1331,7 @@ def _getitem_preprocessed( return self._empty_like(keep_index=True) assert_never(spec) - @_cudf_nvtx_annotate + @_performance_tracking def __getitem__(self, arg): if isinstance(arg, slice): return self.iloc[arg] @@ -1344,7 +1342,7 @@ def __getitem__(self, arg): items = SingleColumnFrame.__iter__ - @_cudf_nvtx_annotate + @_performance_tracking def __setitem__(self, key, value): if isinstance(key, slice): self.iloc[key] = value @@ -1495,36 +1493,36 @@ def _make_operands_and_index_for_binop( @copy_docstring(CategoricalAccessor) # type: ignore @property - @_cudf_nvtx_annotate + @_performance_tracking def cat(self): return CategoricalAccessor(parent=self) @copy_docstring(StringMethods) # type: ignore @property - @_cudf_nvtx_annotate + @_performance_tracking def str(self): return StringMethods(parent=self) @copy_docstring(ListMethods) # type: ignore @property - @_cudf_nvtx_annotate + @_performance_tracking def list(self): return ListMethods(parent=self) @copy_docstring(StructMethods) # type: ignore @property - @_cudf_nvtx_annotate + @_performance_tracking def struct(self): return StructMethods(parent=self) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def dtype(self): """The dtype of the Series.""" return self._column.dtype @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def _concat(cls, objs, axis=0, index=True): # Concatenate index if not provided if index is True: @@ -1590,25 +1588,25 @@ def _concat(cls, objs, axis=0, index=True): return cls(data=col, index=index, name=name) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def valid_count(self): """Number of non-null values""" return len(self) - self._column.null_count @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def null_count(self): """Number of null values""" return self._column.null_count @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def nullable(self): """A boolean indicating whether a null-mask is needed""" return self._column.nullable @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def has_nulls(self): """ Indicator whether Series contains null values. 
@@ -1637,7 +1635,7 @@ def has_nulls(self): """ return self._column.has_nulls() - @_cudf_nvtx_annotate + @_performance_tracking def dropna(self, axis=0, inplace=False, how=None): """ Return a Series with null values removed. @@ -1717,7 +1715,7 @@ def dropna(self, axis=0, inplace=False, how=None): return self._mimic_inplace(result, inplace=inplace) - @_cudf_nvtx_annotate + @_performance_tracking def drop_duplicates(self, keep="first", inplace=False, ignore_index=False): """ Return Series with duplicate values removed. @@ -1791,7 +1789,7 @@ def drop_duplicates(self, keep="first", inplace=False, ignore_index=False): return self._mimic_inplace(result, inplace=inplace) - @_cudf_nvtx_annotate + @_performance_tracking def fillna( self, value=None, method=None, axis=None, inplace=False, limit=None ): @@ -1896,7 +1894,7 @@ def between(self, left, right, inclusive="both") -> Series: ) return self._from_data({self.name: lmask & rmask}, self.index) - @_cudf_nvtx_annotate + @_performance_tracking def all(self, axis=0, bool_only=None, skipna=True, **kwargs): if bool_only not in (None, True): raise NotImplementedError( @@ -1904,7 +1902,7 @@ def all(self, axis=0, bool_only=None, skipna=True, **kwargs): ) return super().all(axis, skipna, **kwargs) - @_cudf_nvtx_annotate + @_performance_tracking def any(self, axis=0, bool_only=None, skipna=True, **kwargs): if bool_only not in (None, True): raise NotImplementedError( @@ -1912,7 +1910,7 @@ def any(self, axis=0, bool_only=None, skipna=True, **kwargs): ) return super().any(axis, skipna, **kwargs) - @_cudf_nvtx_annotate + @_performance_tracking def to_pandas( self, *, @@ -2004,7 +2002,7 @@ def to_pandas( ) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def data(self): """The gpu buffer for the data @@ -2029,12 +2027,12 @@ def data(self): return self._column.data @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def nullmask(self): """The gpu buffer for the null-mask""" return cudf.Series(self._column.nullmask) - @_cudf_nvtx_annotate + @_performance_tracking def astype( self, dtype, @@ -2051,13 +2049,13 @@ def astype( dtype = {self.name: dtype} return super().astype(dtype, copy, errors) - @_cudf_nvtx_annotate + @_performance_tracking def sort_index(self, axis=0, *args, **kwargs): if axis not in (0, "index"): raise ValueError("Only axis=0 is valid for Series.") return super().sort_index(axis=axis, *args, **kwargs) - @_cudf_nvtx_annotate + @_performance_tracking def sort_values( self, axis=0, @@ -2112,7 +2110,7 @@ def sort_values( ignore_index=ignore_index, ) - @_cudf_nvtx_annotate + @_performance_tracking def nlargest(self, n=5, keep="first"): """Returns a new Series of the *n* largest element. @@ -2175,7 +2173,7 @@ def nlargest(self, n=5, keep="first"): """ return self._n_largest_or_smallest(True, n, [self.name], keep) - @_cudf_nvtx_annotate + @_performance_tracking def nsmallest(self, n=5, keep="first"): """ Returns a new Series of the *n* smallest element. 
@@ -2251,7 +2249,7 @@ def nsmallest(self, n=5, keep="first"): """ return self._n_largest_or_smallest(False, n, [self.name], keep) - @_cudf_nvtx_annotate + @_performance_tracking def argsort( self, axis=0, @@ -2274,7 +2272,7 @@ def argsort( obj.name = self.name return obj - @_cudf_nvtx_annotate + @_performance_tracking def replace(self, to_replace=None, value=no_default, *args, **kwargs): if is_dict_like(to_replace) and value not in {None, no_default}: raise ValueError( @@ -2284,7 +2282,7 @@ def replace(self, to_replace=None, value=no_default, *args, **kwargs): return super().replace(to_replace, value, *args, **kwargs) - @_cudf_nvtx_annotate + @_performance_tracking def update(self, other): """ Modify Series in place using values from passed Series. @@ -2390,7 +2388,7 @@ def update(self, other): self.mask(mask, other, inplace=True) # UDF related - @_cudf_nvtx_annotate + @_performance_tracking def apply(self, func, convert_dtype=True, args=(), **kwargs): """ Apply a scalar function to the values of a Series. @@ -2535,7 +2533,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs): # # Stats # - @_cudf_nvtx_annotate + @_performance_tracking def count(self): """ Return number of non-NA/null observations in the Series @@ -2559,7 +2557,7 @@ def count(self): """ return self.valid_count - @_cudf_nvtx_annotate + @_performance_tracking def mode(self, dropna=True): """ Return the mode(s) of the dataset. @@ -2630,7 +2628,7 @@ def mode(self, dropna=True): {self.name: val_counts.index.sort_values()}, name=self.name ) - @_cudf_nvtx_annotate + @_performance_tracking def round(self, decimals=0, how="half_even"): if not is_integer(decimals): raise ValueError( @@ -2639,7 +2637,7 @@ def round(self, decimals=0, how="half_even"): decimals = int(decimals) return super().round(decimals, how) - @_cudf_nvtx_annotate + @_performance_tracking def cov(self, other, min_periods=None): """ Compute covariance with Series, excluding missing values. @@ -2690,7 +2688,7 @@ def cov(self, other, min_periods=None): f"{other.dtype}" ) - @_cudf_nvtx_annotate + @_performance_tracking def transpose(self): """Return the transpose, which is by definition self.""" @@ -2698,7 +2696,7 @@ def transpose(self): T = property(transpose, doc=transpose.__doc__) - @_cudf_nvtx_annotate + @_performance_tracking def duplicated(self, keep="first"): """ Indicate duplicate Series values. @@ -2778,7 +2776,7 @@ def duplicated(self, keep="first"): """ return super().duplicated(keep=keep) - @_cudf_nvtx_annotate + @_performance_tracking def corr(self, other, method="pearson", min_periods=None): """Calculates the sample correlation between two Series, excluding missing values. @@ -2830,7 +2828,7 @@ def corr(self, other, method="pearson", min_periods=None): f"cannot perform corr with types {self.dtype}, {other.dtype}" ) - @_cudf_nvtx_annotate + @_performance_tracking def autocorr(self, lag=1): """Compute the lag-N autocorrelation. This method computes the Pearson correlation between the Series and its shifted self. @@ -2856,7 +2854,7 @@ def autocorr(self, lag=1): """ return self.corr(self.shift(lag)) - @_cudf_nvtx_annotate + @_performance_tracking def isin(self, values): """Check whether values are contained in Series. @@ -2926,7 +2924,7 @@ def isin(self, values): {self.name: self._column.isin(values)}, index=self.index ) - @_cudf_nvtx_annotate + @_performance_tracking def unique(self): """ Returns unique values of this Series. 
@@ -2961,7 +2959,7 @@ def unique(self): return res.values return Series(res, name=self.name) - @_cudf_nvtx_annotate + @_performance_tracking def value_counts( self, normalize=False, @@ -3116,7 +3114,7 @@ def value_counts( res.name = result_name return res - @_cudf_nvtx_annotate + @_performance_tracking def quantile( self, q=0.5, interpolation="linear", exact=True, quant_index=True ): @@ -3191,11 +3189,11 @@ def quantile( return Series._from_data( data={self.name: result}, - index=as_index(np_array_q) if quant_index else None, + index=cudf.Index(np_array_q) if quant_index else None, ) @docutils.doc_describe() - @_cudf_nvtx_annotate + @_performance_tracking def describe( self, percentiles=None, @@ -3240,7 +3238,7 @@ def describe( name=self.name, ) - @_cudf_nvtx_annotate + @_performance_tracking def digitize(self, bins, right=False): """Return the indices of the bins to which each value belongs. @@ -3276,7 +3274,7 @@ def digitize(self, bins, right=False): cudf.core.column.numerical.digitize(self._column, bins, right) ) - @_cudf_nvtx_annotate + @_performance_tracking def diff(self, periods=1): """First discrete difference of element. @@ -3347,7 +3345,7 @@ def diff(self, periods=1): return self - self.shift(periods=periods) - @_cudf_nvtx_annotate + @_performance_tracking @docutils.doc_apply( groupby_doc_template.format( ret=textwrap.dedent( @@ -3385,7 +3383,7 @@ def groupby( dropna, ) - @_cudf_nvtx_annotate + @_performance_tracking def rename(self, index=None, copy=True): """ Alter Series name @@ -3431,21 +3429,23 @@ def rename(self, index=None, copy=True): out_data = self._data.copy(deep=copy) return Series._from_data(out_data, self.index, name=index) - @_cudf_nvtx_annotate + @_performance_tracking def add_prefix(self, prefix): return Series._from_data( + # TODO: Change to deep=False when copy-on-write is default data=self._data.copy(deep=True), index=prefix + self.index.astype(str), ) - @_cudf_nvtx_annotate + @_performance_tracking def add_suffix(self, suffix): return Series._from_data( + # TODO: Change to deep=False when copy-on-write is default data=self._data.copy(deep=True), index=self.index.astype(str) + suffix, ) - @_cudf_nvtx_annotate + @_performance_tracking def keys(self): """ Return alias for index. @@ -3489,7 +3489,7 @@ def keys(self): """ return self.index - @_cudf_nvtx_annotate + @_performance_tracking def explode(self, ignore_index=False): """ Transform each element of a list-like to a row, replicating index @@ -3526,7 +3526,7 @@ def explode(self, ignore_index=False): """ return super()._explode(self.name, ignore_index) - @_cudf_nvtx_annotate + @_performance_tracking def pct_change( self, periods=1, fill_method=no_default, limit=no_default, freq=None ): @@ -3600,7 +3600,7 @@ def pct_change( change = diff / data.shift(periods=periods, freq=freq) return change - @_cudf_nvtx_annotate + @_performance_tracking def where(self, cond, other=None, inplace=False): result_col = super().where(cond, other, inplace) return self._mimic_inplace( @@ -3734,7 +3734,7 @@ class DatetimeProperties(BaseDatelikeProperties): """ @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def year(self) -> Series: """ The year of the datetime. @@ -3759,7 +3759,7 @@ def year(self) -> Series: return self._get_dt_field("year") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def month(self) -> Series: """ The month as January=1, December=12. 
@@ -3784,7 +3784,7 @@ def month(self) -> Series: return self._get_dt_field("month") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def day(self) -> Series: """ The day of the datetime. @@ -3809,7 +3809,7 @@ def day(self) -> Series: return self._get_dt_field("day") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def hour(self) -> Series: """ The hours of the datetime. @@ -3834,7 +3834,7 @@ def hour(self) -> Series: return self._get_dt_field("hour") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def minute(self) -> Series: """ The minutes of the datetime. @@ -3859,7 +3859,7 @@ def minute(self) -> Series: return self._get_dt_field("minute") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def second(self) -> Series: """ The seconds of the datetime. @@ -3884,7 +3884,7 @@ def second(self) -> Series: return self._get_dt_field("second") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def microsecond(self) -> Series: """ The microseconds of the datetime. @@ -3916,7 +3916,7 @@ def microsecond(self) -> Series: return self._return_result_like_self(micro + extra) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def nanosecond(self) -> Series: """ The nanoseconds of the datetime. @@ -3941,7 +3941,7 @@ def nanosecond(self) -> Series: return self._get_dt_field("nanosecond") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def weekday(self) -> Series: """ The day of the week with Monday=0, Sunday=6. @@ -3978,7 +3978,7 @@ def weekday(self) -> Series: return self._get_dt_field("weekday") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def dayofweek(self) -> Series: """ The day of the week with Monday=0, Sunday=6. @@ -4015,7 +4015,7 @@ def dayofweek(self) -> Series: return self._get_dt_field("weekday") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def dayofyear(self) -> Series: """ The day of the year, from 1-365 in non-leap years and @@ -4053,7 +4053,7 @@ def dayofyear(self) -> Series: return self._get_dt_field("day_of_year") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def day_of_year(self) -> Series: """ The day of the year, from 1-365 in non-leap years and @@ -4091,7 +4091,7 @@ def day_of_year(self) -> Series: return self._get_dt_field("day_of_year") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_leap_year(self) -> Series: """ Boolean indicator if the date belongs to a leap year. @@ -4146,7 +4146,7 @@ def is_leap_year(self) -> Series: return self._return_result_like_self(res) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def quarter(self) -> Series: """ Integer indicator for which quarter of the year the date belongs in. @@ -4175,7 +4175,7 @@ def quarter(self) -> Series: ) return self._return_result_like_self(res) - @_cudf_nvtx_annotate + @_performance_tracking def day_name(self, locale: str | None = None) -> Series: """ Return the day names. Currently supports English locale only. @@ -4211,7 +4211,7 @@ def day_name(self, locale: str | None = None) -> Series: self.series._column.get_day_names(locale) ) - @_cudf_nvtx_annotate + @_performance_tracking def month_name(self, locale: str | None = None) -> Series: """ Return the month names. Currently supports English locale only. 
@@ -4241,7 +4241,7 @@ def month_name(self, locale: str | None = None) -> Series: self.series._column.get_month_names(locale) ) - @_cudf_nvtx_annotate + @_performance_tracking def isocalendar(self) -> cudf.DataFrame: """ Returns a DataFrame with the year, week, and day @@ -4289,7 +4289,7 @@ def isocalendar(self) -> cudf.DataFrame: ) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_month_start(self) -> Series: """ Booleans indicating if dates are the first day of the month. @@ -4297,7 +4297,7 @@ def is_month_start(self) -> Series: return (self.day == 1).fillna(False) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def days_in_month(self) -> Series: """ Get the total number of days in the month that the date falls on. @@ -4346,7 +4346,7 @@ def days_in_month(self) -> Series: ) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_month_end(self) -> Series: """ Boolean indicator if the date is the last day of the month. @@ -4389,7 +4389,7 @@ def is_month_end(self) -> Series: return (self.day == last_day.dt.day).fillna(False) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_quarter_start(self) -> Series: """ Boolean indicator if the date is the first day of a quarter. @@ -4434,7 +4434,7 @@ def is_quarter_start(self) -> Series: return self._return_result_like_self(result) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_quarter_end(self) -> Series: """ Boolean indicator if the date is the last day of a quarter. @@ -4481,7 +4481,7 @@ def is_quarter_end(self) -> Series: return self._return_result_like_self(result) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_year_start(self) -> Series: """ Boolean indicator if the date is the first day of the year. @@ -4512,7 +4512,7 @@ def is_year_start(self) -> Series: return self._return_result_like_self(outcol.fillna(False)) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_year_end(self) -> Series: """ Boolean indicator if the date is the last day of the year. @@ -4545,13 +4545,13 @@ def is_year_end(self) -> Series: result = cudf._lib.copying.copy_if_else(leap, non_leap, leap_dates) return self._return_result_like_self(result.fillna(False)) - @_cudf_nvtx_annotate + @_performance_tracking def _get_dt_field(self, field: str) -> Series: return self._return_result_like_self( self.series._column.get_dt_field(field) ) - @_cudf_nvtx_annotate + @_performance_tracking def ceil(self, freq: str) -> Series: """ Perform ceil operation on the data to the specified freq. @@ -4584,7 +4584,7 @@ def ceil(self, freq: str) -> Series: """ return self._return_result_like_self(self.series._column.ceil(freq)) - @_cudf_nvtx_annotate + @_performance_tracking def floor(self, freq: str) -> Series: """ Perform floor operation on the data to the specified freq. @@ -4617,7 +4617,7 @@ def floor(self, freq: str) -> Series: """ return self._return_result_like_self(self.series._column.floor(freq)) - @_cudf_nvtx_annotate + @_performance_tracking def round(self, freq: str) -> Series: """ Perform round operation on the data to the specified freq. @@ -4653,7 +4653,7 @@ def round(self, freq: str) -> Series: """ return self._return_result_like_self(self.series._column.round(freq)) - @_cudf_nvtx_annotate + @_performance_tracking def strftime(self, date_format: str, *args, **kwargs) -> Series: """ Convert to Series using specified ``date_format``. 
@@ -4830,7 +4830,7 @@ class TimedeltaProperties(BaseDatelikeProperties): """ @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def days(self) -> Series: """ Number of days. @@ -4862,7 +4862,7 @@ def days(self) -> Series: return self._get_td_field("days") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def seconds(self) -> Series: """ Number of seconds (>= 0 and less than 1 day). @@ -4901,7 +4901,7 @@ def seconds(self) -> Series: return self._get_td_field("seconds") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def microseconds(self) -> Series: """ Number of microseconds (>= 0 and less than 1 second). @@ -4933,7 +4933,7 @@ def microseconds(self) -> Series: return self._get_td_field("microseconds") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def nanoseconds(self) -> Series: """ Return the number of nanoseconds (n), where 0 <= n < 1 microsecond. @@ -4965,7 +4965,7 @@ def nanoseconds(self) -> Series: return self._get_td_field("nanoseconds") @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def components(self) -> cudf.DataFrame: """ Return a Dataframe of the components of the Timedeltas. @@ -4997,14 +4997,14 @@ def components(self) -> cudf.DataFrame: ca, index=self.series.index ) - @_cudf_nvtx_annotate + @_performance_tracking def _get_td_field(self, field: str) -> Series: return self._return_result_like_self( getattr(self.series._column, field) ) -@_cudf_nvtx_annotate +@_performance_tracking def _align_indices(series_list, how="outer", allow_non_unique=False): """ Internal util to align the indices of a list of Series objects @@ -5067,7 +5067,7 @@ def _align_indices(series_list, how="outer", allow_non_unique=False): @acquire_spill_lock() -@_cudf_nvtx_annotate +@_performance_tracking def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): r"""Returns a boolean array where two arrays are equal within a tolerance. 
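The `isclose` hunk above is truncated at the start of its docstring. As a hedged usage sketch (not part of the patch), assuming `cudf.isclose` follows `numpy.isclose` semantics, i.e. elementwise `|a - b| <= atol + rtol * |b|`, with `equal_nan` controlling whether NaN pairs compare equal:

import cudf

# Illustrative only; output values assume numpy.isclose-style semantics.
a = cudf.Series([1.0, 1.0001, float("nan")])
b = cudf.Series([1.0, 1.0, float("nan")])

# |1.0001 - 1.0| = 1e-4, which is within 1e-8 + 1e-3 * 1.0
print(cudf.isclose(a, b, rtol=1e-3))                  # [True, True, False]
print(cudf.isclose(a, b, rtol=1e-3, equal_nan=True))  # [True, True, True]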
diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 23a2c828a04..f9555aee6a2 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -18,7 +18,7 @@ ) from cudf.core.column import ColumnBase, as_column from cudf.core.frame import Frame -from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import NotIterable if TYPE_CHECKING: @@ -41,7 +41,7 @@ class SingleColumnFrame(Frame, NotIterable): "index": 0, } - @_cudf_nvtx_annotate + @_performance_tracking def _reduce( self, op, @@ -62,7 +62,7 @@ def _reduce( except AttributeError: raise TypeError(f"cannot perform {op} with type {self.dtype}") - @_cudf_nvtx_annotate + @_performance_tracking def _scan(self, op, axis=None, *args, **kwargs): if axis not in (None, 0): raise NotImplementedError("axis parameter is not implemented yet") @@ -70,24 +70,24 @@ def _scan(self, op, axis=None, *args, **kwargs): return super()._scan(op, axis=axis, *args, **kwargs) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def name(self): """Get the name of this object.""" return next(iter(self._column_names)) @name.setter # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def name(self, value): self._data[value] = self._data.pop(self.name) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def ndim(self) -> int: # noqa: D401 """Number of dimensions of the underlying data, by definition 1.""" return 1 @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def shape(self) -> tuple[int]: """Get a tuple representing the dimensionality of the Index.""" return (len(self),) @@ -99,27 +99,27 @@ def __bool__(self): ) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def _num_columns(self) -> int: return 1 @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def _column(self) -> ColumnBase: return next(iter(self._columns)) @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def values(self) -> cupy.ndarray: # noqa: D102 return self._column.values @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def values_host(self) -> numpy.ndarray: # noqa: D102 return self._column.values_host @classmethod - @_cudf_nvtx_annotate + @_performance_tracking def from_arrow(cls, array) -> Self: """Create from PyArrow Array/ChunkedArray. @@ -150,7 +150,7 @@ def from_arrow(cls, array) -> Self: """ return cls(ColumnBase.from_arrow(array)) - @_cudf_nvtx_annotate + @_performance_tracking def to_arrow(self) -> pa.Array: """ Convert to a PyArrow Array. @@ -182,7 +182,7 @@ def to_arrow(self) -> pa.Array: return self._column.to_arrow() @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_unique(self) -> bool: """Return boolean if values in the object are unique. @@ -193,7 +193,7 @@ def is_unique(self) -> bool: return self._column.is_unique @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_monotonic_increasing(self) -> bool: """Return boolean if values in the object are monotonically increasing. @@ -204,7 +204,7 @@ def is_monotonic_increasing(self) -> bool: return self._column.is_monotonic_increasing @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def is_monotonic_decreasing(self) -> bool: """Return boolean if values in the object are monotonically decreasing. 
@@ -215,7 +215,7 @@ def is_monotonic_decreasing(self) -> bool: return self._column.is_monotonic_decreasing @property # type: ignore - @_cudf_nvtx_annotate + @_performance_tracking def __cuda_array_interface__(self): # While the parent column class has a `__cuda_array_interface__` method # defined, it is not implemented for all column types. When it is not @@ -229,7 +229,7 @@ def __cuda_array_interface__(self): "'__cuda_array_interface__'" ) - @_cudf_nvtx_annotate + @_performance_tracking def factorize( self, sort: bool = False, use_na_sentinel: bool = True ) -> tuple[cupy.ndarray, cudf.Index]: @@ -268,7 +268,7 @@ def factorize( use_na_sentinel=use_na_sentinel, ) - @_cudf_nvtx_annotate + @_performance_tracking def _make_operands_for_binop( self, other: Any, @@ -323,7 +323,7 @@ def _make_operands_for_binop( return {result_name: (self._column, other, reflect, fill_value)} - @_cudf_nvtx_annotate + @_performance_tracking def nunique(self, dropna: bool = True) -> int: """ Return count of unique values for the column. @@ -369,7 +369,7 @@ def _get_elements_from_column(self, arg) -> ScalarLike | ColumnBase: return self._column.apply_boolean_mask(arg) raise NotImplementedError(f"Unknown indexer {type(arg)}") - @_cudf_nvtx_annotate + @_performance_tracking def where(self, cond, other=None, inplace=False): from cudf.core._internals.where import ( _check_and_cast_columns_with_other, diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index 06d9296ca0f..265b87350ae 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -30,7 +30,7 @@ _supported_dtypes_from_frame, ) from cudf.utils._numba import _CUDFNumbaConfig -from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.performance_tracking import _performance_tracking def _get_frame_groupby_type(dtype, index_dtype): @@ -126,7 +126,7 @@ def _get_groupby_apply_kernel(frame, func, args): return kernel, return_type -@_cudf_nvtx_annotate +@_performance_tracking def jit_groupby_apply(offsets, grouped_values, function, *args): """ Main entrypoint for JIT Groupby.apply via Numba. diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index f1704e4ea78..d616761cb3b 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -38,7 +38,7 @@ STRING_TYPES, TIMEDELTA_TYPES, ) -from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import initfunc # Maximum size of a string column is 2 GiB @@ -71,7 +71,7 @@ def _ptx_file(): ) -@_cudf_nvtx_annotate +@_performance_tracking def _get_udf_return_type(argty, func: Callable, args=()): """ Get the return type of a masked UDF for a given set of argument dtypes. 
It @@ -236,7 +236,7 @@ def _generate_cache_key(frame, func: Callable, args, suffix="__APPLY_UDF"): ) -@_cudf_nvtx_annotate +@_performance_tracking def _compile_or_get( frame, func, args, kernel_getter=None, suffix="__APPLY_UDF" ): diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index f07764e2ce4..e909d96309e 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -12,10 +12,10 @@ from cudf.api.types import is_scalar from cudf.utils import ioutils from cudf.utils.dtypes import _maybe_convert_to_default_type -from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.performance_tracking import _performance_tracking -@_cudf_nvtx_annotate +@_performance_tracking @ioutils.doc_read_csv() def read_csv( filepath_or_buffer, @@ -151,7 +151,7 @@ def read_csv( return df -@_cudf_nvtx_annotate +@_performance_tracking @ioutils.doc_to_csv() def to_csv( df, diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 58b104b84e9..7733e770d99 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -22,7 +22,7 @@ from cudf.api.types import is_list_like from cudf.core.column import as_column, build_categorical_column, column_empty from cudf.utils import ioutils -from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.performance_tracking import _performance_tracking BYTE_SIZES = { "kb": 1000, @@ -50,7 +50,7 @@ } -@_cudf_nvtx_annotate +@_performance_tracking def _write_parquet( df, paths, @@ -130,7 +130,7 @@ def _write_parquet( # Logic chosen to match: https://arrow.apache.org/ # docs/_modules/pyarrow/parquet.html#write_to_dataset -@_cudf_nvtx_annotate +@_performance_tracking def write_to_dataset( df, root_path, @@ -318,7 +318,7 @@ def write_to_dataset( @ioutils.doc_read_parquet_metadata() -@_cudf_nvtx_annotate +@_performance_tracking def read_parquet_metadata(filepath_or_buffer): """{docstring}""" # Multiple sources are passed as a list. 
If a single source is passed, @@ -360,7 +360,7 @@ def read_parquet_metadata(filepath_or_buffer): return libparquet.read_parquet_metadata(filepaths_or_buffers) -@_cudf_nvtx_annotate +@_performance_tracking def _process_dataset( paths, fs, @@ -515,7 +515,7 @@ def _process_dataset( @ioutils.doc_read_parquet() -@_cudf_nvtx_annotate +@_performance_tracking def read_parquet( filepath_or_buffer, engine="cudf", @@ -785,7 +785,7 @@ def _handle_is(column: cudf.Series, value, *, negate) -> cudf.Series: return df -@_cudf_nvtx_annotate +@_performance_tracking def _parquet_to_frame( paths_or_buffers, *args, @@ -885,7 +885,7 @@ def _parquet_to_frame( return dfs[0] -@_cudf_nvtx_annotate +@_performance_tracking def _read_parquet( filepaths_or_buffers, engine, @@ -908,12 +908,20 @@ def _read_parquet( "cudf engine doesn't support the " f"following positional arguments: {list(args)}" ) - return libparquet.read_parquet( - filepaths_or_buffers, - columns=columns, - row_groups=row_groups, - use_pandas_metadata=use_pandas_metadata, - ) + if cudf.get_option("mode.pandas_compatible"): + return libparquet.ParquetReader( + filepaths_or_buffers, + columns=columns, + row_groups=row_groups, + use_pandas_metadata=use_pandas_metadata, + ).read() + else: + return libparquet.read_parquet( + filepaths_or_buffers, + columns=columns, + row_groups=row_groups, + use_pandas_metadata=use_pandas_metadata, + ) else: if ( isinstance(filepaths_or_buffers, list) @@ -933,7 +941,7 @@ def _read_parquet( @ioutils.doc_to_parquet() -@_cudf_nvtx_annotate +@_performance_tracking def to_parquet( df, path, @@ -1099,7 +1107,7 @@ def _get_estimated_file_size(df): return file_size -@_cudf_nvtx_annotate +@_performance_tracking def _get_partitioned( df, root_path, @@ -1137,7 +1145,7 @@ def _get_partitioned( return full_paths, metadata_file_paths, grouped_df, part_offsets, filename -@_cudf_nvtx_annotate +@_performance_tracking def _get_groups_and_offsets( df, partition_cols, @@ -1297,7 +1305,7 @@ class ParquetDatasetWriter: """ - @_cudf_nvtx_annotate + @_performance_tracking def __init__( self, path, @@ -1347,7 +1355,7 @@ def __init__( self._file_sizes: dict[str, int] = {} - @_cudf_nvtx_annotate + @_performance_tracking def write_table(self, df): """ Write a dataframe to the file/dataset @@ -1478,7 +1486,7 @@ def write_table(self, df): self.path_cw_map.update({k: new_cw_idx for k in new_paths}) self._chunked_writers[-1][0].write_table(grouped_df, part_info) - @_cudf_nvtx_annotate + @_performance_tracking def close(self, return_metadata=False): """ Close all open files and optionally return footer metadata as a binary diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py index 0e19972f6e0..4329480bb2c 100644 --- a/python/cudf/cudf/io/text.py +++ b/python/cudf/cudf/io/text.py @@ -1,14 +1,14 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
from io import BytesIO, StringIO import cudf from cudf._lib import text as libtext from cudf.utils import ioutils -from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.performance_tracking import _performance_tracking -@_cudf_nvtx_annotate +@_performance_tracking @ioutils.doc_read_text() def read_text( filepath_or_buffer, diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index fb5a963f008..1f539e7f266 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -311,6 +311,20 @@ def _integer_and_none_validator(val): _make_contains_validator([False, True]), ) +_register_option( + "memory_profiling", + _env_get_bool("CUDF_MEMORY_PROFILING", False), + textwrap.dedent( + """ + If set to `False`, disables memory profiling. + If set to `True`, enables memory profiling. + Read more at: :ref:`memory-profiling-user-doc` + \tValid values are True or False. Default is False. + """ + ), + _make_contains_validator([False, True]), +) + class option_context(ContextDecorator): """ diff --git a/python/cudf/cudf/pandas/_wrappers/numpy.py b/python/cudf/cudf/pandas/_wrappers/numpy.py index c445be46f58..3b012169676 100644 --- a/python/cudf/cudf/pandas/_wrappers/numpy.py +++ b/python/cudf/cudf/pandas/_wrappers/numpy.py @@ -129,6 +129,19 @@ def wrap_ndarray(cls, arr: cupy.ndarray | numpy.ndarray, constructor): }, ) + +flatiter = make_final_proxy_type( + "flatiter", + cupy.flatiter, + numpy.flatiter, + fast_to_slow=lambda fast: cupy.asnumpy(fast.base).flat, + slow_to_fast=lambda slow: cupy.asarray(slow).flat, + additional_attributes={ + "__array__": array_method, + }, +) + + # Mapping flags between slow and fast types _ndarray_flags = make_intermediate_proxy_type( "_ndarray_flags", diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index 0ba432d6d0e..a64bf7772fe 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -522,6 +522,22 @@ def Index__new__(cls, *args, **kwargs): }, ) +ArrowStringArrayNumpySemantics = make_final_proxy_type( + "ArrowStringArrayNumpySemantics", + _Unusable, + pd.core.arrays.string_arrow.ArrowStringArrayNumpySemantics, + fast_to_slow=_Unusable(), + slow_to_fast=_Unusable(), +) + +ArrowStringArray = make_final_proxy_type( + "ArrowStringArray", + _Unusable, + pd.core.arrays.string_arrow.ArrowStringArray, + fast_to_slow=_Unusable(), + slow_to_fast=_Unusable(), +) + StringDtype = make_final_proxy_type( "StringDtype", _Unusable, diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh index cd9f90d50fe..a66f63c09b3 100755 --- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh +++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh @@ -133,7 +133,8 @@ and not test_s3_roundtrip" TEST_THAT_CRASH_PYTEST_WORKERS="not test_bitmasks_pyarrow \ and not test_large_string_pyarrow \ and not test_interchange_from_corrected_buffer_dtypes \ -and not test_eof_states" +and not test_eof_states \ +and not test_array_tz" # TODO: Remove "not db" once a postgres & mysql container is set up on the CI PANDAS_CI="1" timeout 30m python -m pytest -p cudf.pandas \ diff --git a/python/cudf/cudf/pylibcudf_tests/test_join.py b/python/cudf/cudf/pylibcudf_tests/test_join.py new file mode 100644 index 00000000000..eb25ed915b1 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_join.py @@ -0,0 +1,29 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import numpy as np +import pyarrow as pa +from utils import assert_table_eq + +from cudf._lib import pylibcudf as plc + + +def test_cross_join(): + left = pa.Table.from_arrays([[0, 1, 2], [3, 4, 5]], names=["a", "b"]) + right = pa.Table.from_arrays( + [[6, 7, 8, 9], [10, 11, 12, 13]], names=["c", "d"] + ) + + pleft = plc.interop.from_arrow(left) + pright = plc.interop.from_arrow(right) + + expect = pa.Table.from_arrays( + [ + *(np.repeat(c.to_numpy(), len(right)) for c in left.columns), + *(np.tile(c.to_numpy(), len(left)) for c in right.columns), + ], + names=["a", "b", "c", "d"], + ) + + got = plc.join.cross_join(pleft, pright) + + assert_table_eq(expect, got) diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py index ef4ab8b86fe..13570a48cc2 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_lists.py +++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py @@ -7,15 +7,28 @@ from cudf._lib import pylibcudf as plc -def test_concatenate_rows(): - test_data = [[[0, 1], [2], [5], [6, 7]], [[8], [9], [], [13, 14, 15]]] +@pytest.fixture +def test_data(): + return [[[[0, 1], [2], [5], [6, 7]], [[8], [9], [], [13, 14, 15]]]] - arrow_tbl = pa.Table.from_arrays(test_data, names=["a", "b"]) + +@pytest.fixture +def scalar(): + return pa.scalar(1) + + +@pytest.fixture +def column(): + return pa.array([3, 2, 5, 6]), pa.array([-1, 0, 0, 0], type=pa.int32()) + + +def test_concatenate_rows(test_data): + arrow_tbl = pa.Table.from_arrays(test_data[0], names=["a", "b"]) plc_tbl = plc.interop.from_arrow(arrow_tbl) res = plc.lists.concatenate_rows(plc_tbl) - expect = pa.array([pair[0] + pair[1] for pair in zip(*test_data)]) + expect = pa.array([pair[0] + pair[1] for pair in zip(*test_data[0])]) assert_column_eq(expect, res) @@ -46,12 +59,88 @@ def test_concatenate_list_elements(test_data, dropna, expected): assert_column_eq(expect, res) -def test_count_elements(): - test_data = [[1, 2, 3], [4], [5, 6]] - arr = pa.array(test_data) +def test_contains_scalar(test_data, scalar): + list_column = test_data[0][0] + arr = pa.array(list_column) + + plc_column = plc.interop.from_arrow(arr) + plc_scalar = plc.interop.from_arrow(scalar) + res = plc.lists.contains(plc_column, plc_scalar) + + expect = pa.array([True, False, False, False]) + + assert_column_eq(expect, res) + + +def test_contains_list_column(test_data): + list_column1 = test_data[0][0] + list_column2 = [1, 3, 5, 1] + arr1 = pa.array(list_column1) + arr2 = pa.array(list_column2) + + plc_column1 = plc.interop.from_arrow(arr1) + plc_column2 = plc.interop.from_arrow(arr2) + res = plc.lists.contains(plc_column1, plc_column2) + + expect = pa.array([True, False, True, False]) + + assert_column_eq(expect, res) + + +@pytest.mark.parametrize( + "list_column, expected", + [ + ( + [[1, None], [1, 3, 4], [5, None]], + [True, False, True], + ), + ( + [[1, None], None, [5]], + [True, None, False], + ), + ], +) +def test_contains_nulls(list_column, expected): + arr = pa.array(list_column) + plc_column = plc.interop.from_arrow(arr) + res = plc.lists.contains_nulls(plc_column) + + expect = pa.array(expected) + + assert_column_eq(expect, res) + + +def test_index_of_scalar(test_data, scalar): + list_column = test_data[0][0] + arr = pa.array(list_column) + + plc_column = plc.interop.from_arrow(arr) + plc_scalar = plc.interop.from_arrow(scalar) + res = plc.lists.index_of(plc_column, plc_scalar, True) + + expect = pa.array([1, -1, -1, -1], type=pa.int32()) + + assert_column_eq(expect, res) + + +def 
test_index_of_list_column(test_data, column): + list_column = test_data[0][0] + arr1 = pa.array(list_column) + arr2, expect = column + plc_column1 = plc.interop.from_arrow(arr1) + plc_column2 = plc.interop.from_arrow(arr2) + res = plc.lists.index_of(plc_column1, plc_column2, True) + + assert_column_eq(expect, res) + + +def test_count_elements(test_data): + arr = pa.array(test_data[0][1]) plc_column = plc.interop.from_arrow(arr) - res = plc.lists.contains(plc_column) + res = plc.lists.count_elements(plc_column) - expect = pa.array([3, 1, 2]) + expect = pa.array([1, 1, 0, 3], type=pa.int32()) assert_column_eq(expect, res) diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_slice.py b/python/cudf/cudf/pylibcudf_tests/test_string_slice.py new file mode 100644 index 00000000000..bd63987b30f --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_string_slice.py @@ -0,0 +1,116 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import cudf._lib.pylibcudf as plc + + +@pytest.fixture(scope="module") +def pa_col(): + return pa.array(["AbC", "123abc", "", " ", None]) + + +@pytest.fixture(scope="module") +def plc_col(pa_col): + return plc.interop.from_arrow(pa_col) + + +@pytest.fixture( + scope="module", + params=[(1, 3, 1), (0, 3, -1), (3, 2, 1), (1, 5, 5), (1, 100, 2)], +) +def pa_start_stop_step(request): + return tuple(pa.scalar(x, type=pa.int32()) for x in request.param) + + +@pytest.fixture(scope="module") +def plc_start_stop_step(pa_start_stop_step): + return tuple(plc.interop.from_arrow(x) for x in pa_start_stop_step) + + +@pytest.fixture(scope="module") +def pa_starts_col(): + return pa.array([0, 1, 3, -1, 100]) + + +@pytest.fixture(scope="module") +def plc_starts_col(pa_starts_col): + return plc.interop.from_arrow(pa_starts_col) + + +@pytest.fixture(scope="module") +def pa_stops_col(): + return pa.array([1, 3, 4, -1, 100]) + + +@pytest.fixture(scope="module") +def plc_stops_col(pa_stops_col): + return plc.interop.from_arrow(pa_stops_col) + + +def test_slice(pa_col, plc_col, pa_start_stop_step, plc_start_stop_step): + pa_start, pa_stop, pa_step = pa_start_stop_step + plc_start, plc_stop, plc_step = plc_start_stop_step + + def slice_string(st, start, stop, step): + return st[start:stop:step] if st is not None else None + + expected = pa.array( + [ + slice_string(x, pa_start.as_py(), pa_stop.as_py(), pa_step.as_py()) + for x in pa_col.to_pylist() + ], + type=pa.string(), + ) + + got = plc.strings.slice.slice_strings( + plc_col, start=plc_start, stop=plc_stop, step=plc_step + ) + + assert_column_eq(expected, got) + + +def test_slice_column( + pa_col, plc_col, pa_starts_col, plc_starts_col, pa_stops_col, plc_stops_col ): + def slice_string(st, start, stop): + if st is None: + return None + if stop < 0: + stop = len(st) + return st[start:stop] + + expected = pa.array( + [ + slice_string(x, start, stop) + for x, start, stop in zip( + pa_col.to_pylist(), + pa_starts_col.to_pylist(), + pa_stops_col.to_pylist(), + ) + ], + type=pa.string(), + ) + + got = plc.strings.slice.slice_strings( + plc_col, plc_starts_col, plc_stops_col + ) + + assert_column_eq(expected, got) + + +def test_slice_invalid(plc_col, plc_starts_col, plc_stops_col): + with pytest.raises(TypeError): + # no matching signature + plc.strings.slice.slice_strings(None, plc_starts_col, plc_stops_col) + with pytest.raises(ValueError): + # signature found but wrong value passed + 
plc.strings.slice.slice_strings(plc_col, plc_starts_col, None)
+    with pytest.raises(TypeError):
+        # no matching signature (2nd arg)
+        plc.strings.slice.slice_strings(plc_col, None, plc_stops_col)
+    with pytest.raises(TypeError):
+        # can't provide step for columnwise api
+        plc.strings.slice.slice_strings(
+            plc_col, plc_starts_col, plc_stops_col, plc_starts_col
+        )
diff --git a/python/cudf/cudf/pylibcudf_tests/test_table.py b/python/cudf/cudf/pylibcudf_tests/test_table.py
new file mode 100644
index 00000000000..cf1d51f6491
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/test_table.py
@@ -0,0 +1,22 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pytest
+
+import cudf._lib.pylibcudf as plc
+
+
+@pytest.mark.parametrize(
+    "arrow_tbl",
+    [
+        pa.table([]),
+        pa.table({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}),
+        pa.table({"a": [1, 2, 3]}),
+        pa.table({"a": [1], "b": [2], "c": [3]}),
+    ],
+)
+def test_table_shape(arrow_tbl):
+    plc_tbl = plc.interop.from_arrow(arrow_tbl)
+
+    plc_tbl_shape = (plc_tbl.num_rows(), plc_tbl.num_columns())
+    assert plc_tbl_shape == arrow_tbl.shape
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 05ee8346afa..f40106a30f4 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -10024,6 +10024,14 @@ def test_dataframe_rename_duplicate_column():
         gdf.rename(columns={"a": "b"}, inplace=True)
 
 
+def test_dataframe_rename_columns_keep_type():
+    gdf = cudf.DataFrame([[1, 2, 3]])
+    gdf.columns = cudf.Index([4, 5, 6], dtype=np.int8)
+    result = gdf.rename({4: 50}, axis="columns").columns
+    expected = pd.Index([50, 5, 6], dtype=np.int8)
+    assert_eq(result, expected)
+
+
 @pytest_unmark_spilling
 @pytest.mark.skipif(
     PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
@@ -11070,3 +11078,27 @@ def test_dataframe_loc_int_float(dtype1, dtype2):
     expected = pdf.loc[pidx]
 
     assert_eq(actual, expected, check_index_type=True, check_dtype=True)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        cudf.DataFrame(range(2)),
+        None,
+        [cudf.Series(range(2))],
+        [[0], [1]],
+        {1: range(2)},
+        cupy.arange(2),
+    ],
+)
+def test_init_with_index_no_shallow_copy(data):
+    idx = cudf.RangeIndex(2)
+    df = cudf.DataFrame(data, index=idx)
+    assert df.index is idx
+
+
+def test_from_records_with_index_no_shallow_copy():
+    idx = cudf.RangeIndex(2)
+    data = np.array([(1.0, 2), (3.0, 4)], dtype=[("x", "<f8"), ("y", "<i8")])
+
+    out = StringIO()
+    print_memory_report(file=out)
+    assert "DataFrame.merge" in out.getvalue()
diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py
index 8f65bd26bd1..193d64a9e7f 100644
--- a/python/cudf/cudf/tests/test_repr.py
+++ b/python/cudf/cudf/tests/test_repr.py
@@ -1210,7 +1210,7 @@ def test_multiindex_repr(pmi, max_seq_items):
         .index,
         textwrap.dedent(
             """
-            MultiIndex([('abc', 'NaT', 0.345),
+            MultiIndex([('abc', NaT, 0.345),
                         (<NA>, '0 days 00:00:00.000000001', <NA>),
                         ('xyz', '0 days 00:00:00.000000002', 100.0),
                         (<NA>, '0 days 00:00:00.000000003', 10.0)],
@@ -1252,10 +1252,10 @@ def test_multiindex_repr(pmi, max_seq_items):
         .index,
         textwrap.dedent(
             """
-            MultiIndex([('NaT', <NA>),
-                        ('NaT', <NA>),
-                        ('NaT', <NA>),
-                        ('NaT', <NA>)],
+            MultiIndex([(NaT, <NA>),
+                        (NaT, <NA>),
+                        (NaT, <NA>),
+                        (NaT, <NA>)],
                        names=['b', 'a'])
             """
         ),
diff --git a/python/cudf/cudf/utils/nvtx_annotation.py b/python/cudf/cudf/utils/nvtx_annotation.py
deleted file mode 100644
index a4404e51232..00000000000
--- a/python/cudf/cudf/utils/nvtx_annotation.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# Copyright (c)
2023, NVIDIA CORPORATION. - -import hashlib -from functools import partial - -from nvtx import annotate - -_NVTX_COLORS = ["green", "blue", "purple", "rapids"] - - -def _get_color_for_nvtx(name): - m = hashlib.sha256() - m.update(name.encode()) - hash_value = int(m.hexdigest(), 16) - idx = hash_value % len(_NVTX_COLORS) - return _NVTX_COLORS[idx] - - -def _cudf_nvtx_annotate(func, domain="cudf_python"): - """Decorator for applying nvtx annotations to methods in cudf.""" - return annotate( - message=func.__qualname__, - color=_get_color_for_nvtx(func.__qualname__), - domain=domain, - )(func) - - -_dask_cudf_nvtx_annotate = partial( - _cudf_nvtx_annotate, domain="dask_cudf_python" -) diff --git a/python/cudf/cudf/utils/performance_tracking.py b/python/cudf/cudf/utils/performance_tracking.py new file mode 100644 index 00000000000..30c891d0d5a --- /dev/null +++ b/python/cudf/cudf/utils/performance_tracking.py @@ -0,0 +1,82 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from __future__ import annotations + +import contextlib +import functools +import hashlib +import sys + +import nvtx + +import rmm.statistics + +from cudf.options import get_option + +_NVTX_COLORS = ["green", "blue", "purple", "rapids"] + + +def _get_color_for_nvtx(name): + m = hashlib.sha256() + m.update(name.encode()) + hash_value = int(m.hexdigest(), 16) + idx = hash_value % len(_NVTX_COLORS) + return _NVTX_COLORS[idx] + + +def _performance_tracking(func, domain="cudf_python"): + """Decorator for applying performance tracking (if enabled).""" + + @functools.wraps(func) + def wrapper(*args, **kwargs): + with contextlib.ExitStack() as stack: + if get_option("memory_profiling"): + # NB: the user still needs to call `rmm.statistics.enable_statistics()` + # to enable memory profiling. + stack.enter_context( + rmm.statistics.profiler( + name=rmm.statistics._get_descriptive_name_of_object( + func + ) + ) + ) + if nvtx.enabled(): + stack.enter_context( + nvtx.annotate( + message=func.__qualname__, + color=_get_color_for_nvtx(func.__qualname__), + domain=domain, + ) + ) + return func(*args, **kwargs) + + return wrapper + + +_dask_cudf_performance_tracking = functools.partial( + _performance_tracking, domain="dask_cudf_python" +) + + +def get_memory_records() -> ( + dict[str, rmm.statistics.ProfilerRecords.MemoryRecord] +): + """Get the memory records from the memory profiling + + Returns + ------- + Dict that maps function names to memory records. Empty if + memory profiling is disabled + """ + return rmm.statistics.default_profiler_records.records + + +def print_memory_report(file=sys.stdout) -> None: + """Pretty print the result of the memory profiling + + Parameters + ---------- + file + The output stream + """ + print(rmm.statistics.default_profiler_records.report(), file=file) diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 2e4dfc4bb14..7347ec7866a 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -159,8 +159,9 @@ def _external_only_api(func, alternative=""): @functools.wraps(func) def wrapper(*args, **kwargs): # Check the immediately preceding frame to see if it's in cudf. - frame, lineno = next(traceback.walk_stack(None)) - fn = frame.f_code.co_filename + pre_frame = traceback.extract_stack(limit=2)[0] + fn = pre_frame.filename + lineno = pre_frame.lineno if _cudf_root in fn and _tests_root not in fn: raise RuntimeError( f"External-only API called in {fn} at line {lineno}. 
" diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index eed5037cbea..f51ce103677 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -1533,3 +1533,36 @@ def test_is_proxy_object(): assert is_proxy_object(np_arr_proxy) assert is_proxy_object(s1) assert not is_proxy_object(s2) + + +def test_numpy_cupy_flatiter(series): + cp = pytest.importorskip("cupy") + + _, s = series + arr = s.values + + assert type(arr.flat._fsproxy_fast) == cp.flatiter + assert type(arr.flat._fsproxy_slow) == np.flatiter + + +def test_arrow_string_arrays(): + cu_s = xpd.Series(["a", "b", "c"]) + pd_s = pd.Series(["a", "b", "c"]) + + cu_arr = xpd.arrays.ArrowStringArray._from_sequence( + cu_s, dtype=xpd.StringDtype("pyarrow") + ) + pd_arr = pd.arrays.ArrowStringArray._from_sequence( + pd_s, dtype=pd.StringDtype("pyarrow") + ) + + tm.assert_equal(cu_arr, pd_arr) + + cu_arr = xpd.core.arrays.string_arrow.ArrowStringArray._from_sequence( + cu_s, dtype=xpd.StringDtype("pyarrow_numpy") + ) + pd_arr = pd.core.arrays.string_arrow.ArrowStringArray._from_sequence( + pd_s, dtype=pd.StringDtype("pyarrow_numpy") + ) + + tm.assert_equal(cu_arr, pd_arr) diff --git a/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu b/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu index 941e61e6787..b924995cf4b 100644 --- a/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu +++ b/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu @@ -15,10 +15,10 @@ */ #include -#include #include #include #include +#include #include #include @@ -57,7 +57,7 @@ std::unique_ptr to_string_view_array(cudf::column_view const rmm::cuda_stream_view stream) { return std::make_unique( - std::move(cudf::strings::detail::create_string_vector_from_column( + std::move(cudf::strings::create_string_vector_from_column( cudf::strings_column_view(input), stream, rmm::mr::get_current_device_resource()) .release())); } diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 871134665af..17d7d15e4e5 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -51,6 +51,7 @@ "GroupedRollingWindow", "Cast", "Agg", + "Ternary", "BinOp", ] @@ -443,12 +444,12 @@ def __init__( ): # With ignore_nulls == False, polars uses Kleene logic raise NotImplementedError(f"Kleene logic for {self.name}") - if self.name in ( - pl_expr.BooleanFunction.IsFinite, - pl_expr.BooleanFunction.IsInfinite, - pl_expr.BooleanFunction.IsIn, + if self.name == pl_expr.BooleanFunction.IsIn and not all( + c.dtype == self.children[0].dtype for c in self.children ): - raise NotImplementedError(f"{self.name}") + # TODO: If polars IR doesn't put the casts in, we need to + # mimic the supertype promotion rules. + raise NotImplementedError("IsIn doesn't support supertype casting") @staticmethod def _distinct( @@ -506,6 +507,33 @@ def do_evaluate( mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" + if self.name in ( + pl_expr.BooleanFunction.IsFinite, + pl_expr.BooleanFunction.IsInfinite, + ): + # Avoid evaluating the child if the dtype tells us it's unnecessary. 
+            (child,) = self.children
+            is_finite = self.name == pl_expr.BooleanFunction.IsFinite
+            if child.dtype.id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64):
+                value = plc.interop.from_arrow(
+                    pa.scalar(value=is_finite, type=plc.interop.to_arrow(self.dtype))
+                )
+                return Column(plc.Column.from_scalar(value, df.num_rows))
+            needles = child.evaluate(df, context=context, mapping=mapping)
+            to_search = [-float("inf"), float("inf")]
+            if is_finite:
+                # NaN is neither finite nor infinite
+                to_search.append(float("nan"))
+            haystack = plc.interop.from_arrow(
+                pa.array(
+                    to_search,
+                    type=plc.interop.to_arrow(needles.obj.type()),
+                )
+            )
+            result = plc.search.contains(haystack, needles.obj)
+            if is_finite:
+                result = plc.unary.unary_operation(result, plc.unary.UnaryOperator.NOT)
+            return Column(result)
         columns = [
             child.evaluate(df, context=context, mapping=mapping)
             for child in self.children
@@ -612,31 +640,13 @@ def do_evaluate(
                 (c.obj for c in columns),
             )
         )
-        elif self.name == pl_expr.BooleanFunction.IsBetween:
-            column, lo, hi = columns
-            (closed,) = self.options
-            lop, rop = self._BETWEEN_OPS[closed]
-            lo_obj = (
-                lo.obj_scalar
-                if lo.is_scalar and lo.obj.size() != column.obj.size()
-                else lo.obj
-            )
-            hi_obj = (
-                hi.obj_scalar
-                if hi.is_scalar and hi.obj.size() != column.obj.size()
-                else hi.obj
-            )
+        elif self.name == pl_expr.BooleanFunction.IsIn:
+            needles, haystack = columns
+            return Column(plc.search.contains(haystack.obj, needles.obj))
+        elif self.name == pl_expr.BooleanFunction.Not:
+            (column,) = columns
             return Column(
-                plc.binaryop.binary_operation(
-                    plc.binaryop.binary_operation(
-                        column.obj, lo_obj, lop, output_type=self.dtype
-                    ),
-                    plc.binaryop.binary_operation(
-                        column.obj, hi_obj, rop, output_type=self.dtype
-                    ),
-                    plc.binaryop.BinaryOperator.LOGICAL_AND,
-                    self.dtype,
-                )
+                plc.unary.unary_operation(column.obj, plc.unary.UnaryOperator.NOT)
             )
         else:
             raise NotImplementedError(
@@ -1103,6 +1113,34 @@ def do_evaluate(
         return self.op(child.evaluate(df, context=context, mapping=mapping))
 
 
+class Ternary(Expr):
+    __slots__ = ("children",)
+    _non_child = ("dtype",)
+    children: tuple[Expr, Expr, Expr]
+
+    def __init__(
+        self, dtype: plc.DataType, when: Expr, then: Expr, otherwise: Expr
+    ) -> None:
+        super().__init__(dtype)
+        self.children = (when, then, otherwise)
+
+    def do_evaluate(
+        self,
+        df: DataFrame,
+        *,
+        context: ExecutionContext = ExecutionContext.FRAME,
+        mapping: Mapping[Expr, Column] | None = None,
+    ) -> Column:
+        """Evaluate this expression given a dataframe for context."""
+        when, then, otherwise = (
+            child.evaluate(df, context=context, mapping=mapping)
+            for child in self.children
+        )
+        then_obj = then.obj_scalar if then.is_scalar else then.obj
+        otherwise_obj = otherwise.obj_scalar if otherwise.is_scalar else otherwise.obj
+        return Column(plc.copying.copy_if_else(then_obj, otherwise_obj, when.obj))
+
+
 class BinOp(Expr):
     __slots__ = ("op", "children")
     _non_child = ("dtype", "op")
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index b3dd6ae7cc3..3f5f3c74050 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -123,7 +123,7 @@ def broadcast(
     ]
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class IR:
     """Abstract plan node, representing an unevaluated dataframe."""
 
@@ -157,7 +157,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         )  # pragma: no cover
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class
PythonScan(IR): """Representation of input from a python function.""" @@ -171,7 +171,7 @@ def __post_init__(self): raise NotImplementedError("PythonScan not implemented") -@dataclasses.dataclass(slots=True) +@dataclasses.dataclass class Scan(IR): """Input from files.""" @@ -248,7 +248,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return df.filter(mask) -@dataclasses.dataclass(slots=True) +@dataclasses.dataclass class Cache(IR): """ Return a cached plan node. @@ -269,7 +269,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return cache.setdefault(self.key, self.value.evaluate(cache=cache)) -@dataclasses.dataclass(slots=True) +@dataclasses.dataclass class DataFrameScan(IR): """ Input from an existing polars DataFrame. @@ -315,7 +315,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return df -@dataclasses.dataclass(slots=True) +@dataclasses.dataclass class Select(IR): """Produce a new dataframe selecting given expressions from an input.""" @@ -336,7 +336,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return DataFrame(columns) -@dataclasses.dataclass(slots=True) +@dataclasses.dataclass class Reduce(IR): """ Produce a new dataframe selecting given expressions from an input. @@ -389,7 +389,7 @@ def placeholder_column(n: int) -> plc.Column: ) -@dataclasses.dataclass(slots=False) +@dataclasses.dataclass class GroupBy(IR): """Perform a groupby.""" @@ -490,7 +490,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return DataFrame([*result_keys, *results]).slice(self.options.slice) -@dataclasses.dataclass(slots=True) +@dataclasses.dataclass class Join(IR): """A join of two dataframes.""" @@ -503,7 +503,7 @@ class Join(IR): right_on: list[expr.NamedExpr] """List of expressions used as keys in the right frame.""" options: tuple[ - Literal["inner", "left", "full", "leftsemi", "leftanti"], + Literal["inner", "left", "full", "leftsemi", "leftanti", "cross"], bool, tuple[int, int] | None, str | None, @@ -520,11 +520,14 @@ class Join(IR): def __post_init__(self) -> None: """Validate preconditions.""" - if self.options[0] == "cross": - raise NotImplementedError("cross join not implemented") + if any( + isinstance(e.value, expr.Literal) + for e in itertools.chain(self.left_on, self.right_on) + ): + raise NotImplementedError("Join with literal as join key.") - @cache @staticmethod + @cache def _joiners( how: Literal["inner", "left", "full", "leftsemi", "leftanti"], ) -> tuple[ @@ -567,35 +570,42 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" left = self.left.evaluate(cache=cache) right = self.right.evaluate(cache=cache) - left_on = DataFrame( - broadcast( - *(e.evaluate(left) for e in self.left_on), target_length=left.num_rows - ) - ) - right_on = DataFrame( - broadcast( - *(e.evaluate(right) for e in self.right_on), - target_length=right.num_rows, - ) - ) how, join_nulls, zlice, suffix, coalesce = self.options + suffix = "_right" if suffix is None else suffix + if how == "cross": + # Separate implementation, since cross_join returns the + # result, not the gather maps + columns = plc.join.cross_join(left.table, right.table).columns() + left_cols = [ + NamedColumn(new, old.name).sorted_like(old) + for new, old in zip(columns[: left.num_columns], left.columns) + ] + right_cols = [ + NamedColumn( + new, + old.name + if old.name not in left.column_names_set + else f"{old.name}{suffix}", + ) 
+ for new, old in zip(columns[left.num_columns :], right.columns) + ] + return DataFrame([*left_cols, *right_cols]) + # TODO: Waiting on clarity based on https://github.com/pola-rs/polars/issues/17184 + left_on = DataFrame(broadcast(*(e.evaluate(left) for e in self.left_on))) + right_on = DataFrame(broadcast(*(e.evaluate(right) for e in self.right_on))) null_equality = ( plc.types.NullEquality.EQUAL if join_nulls else plc.types.NullEquality.UNEQUAL ) - suffix = "_right" if suffix is None else suffix join_fn, left_policy, right_policy = Join._joiners(how) if right_policy is None: # Semi join lg = join_fn(left_on.table, right_on.table, null_equality) - left = left.replace_columns(*left_on.columns) table = plc.copying.gather(left.table, lg, left_policy) result = DataFrame.from_table(table, left.column_names) else: lg, rg = join_fn(left_on.table, right_on.table, null_equality) - left = left.replace_columns(*left_on.columns) - right = right.replace_columns(*right_on.columns) if coalesce and how == "inner": right = right.discard_columns(right_on.column_names_set) left = DataFrame.from_table( @@ -629,7 +639,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return result.slice(zlice) -@dataclasses.dataclass(slots=True) +@dataclasses.dataclass class HStack(IR): """Add new columns to a dataframe.""" @@ -658,7 +668,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return df.with_columns(columns) -@dataclasses.dataclass(slots=True) +@dataclasses.dataclass class Distinct(IR): """Produce a new dataframe with distinct rows.""" @@ -728,7 +738,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return result.slice(self.zlice) -@dataclasses.dataclass(slots=True) +@dataclasses.dataclass class Sort(IR): """Sort a dataframe.""" @@ -797,7 +807,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return DataFrame(columns).slice(self.zlice) -@dataclasses.dataclass(slots=True) +@dataclasses.dataclass class Slice(IR): """Slice a dataframe.""" @@ -814,7 +824,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return df.slice((self.offset, self.length)) -@dataclasses.dataclass(slots=True) +@dataclasses.dataclass class Filter(IR): """Filter a dataframe with a boolean mask.""" @@ -830,7 +840,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return df.filter(mask) -@dataclasses.dataclass(slots=True) +@dataclasses.dataclass class Projection(IR): """Select a subset of columns from a dataframe.""" @@ -847,7 +857,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return DataFrame(columns) -@dataclasses.dataclass(slots=True) +@dataclasses.dataclass class MapFunction(IR): """Apply some function to a dataframe.""" @@ -881,6 +891,13 @@ def __post_init__(self) -> None: # polars requires that all to-explode columns have the # same sub-shapes raise NotImplementedError("Explode with more than one column") + elif self.name == "rename": + old, new, _ = self.options + # TODO: perhaps polars should validate renaming in the IR? 
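+            # Only simple renames are supported: two old names must not
+            # collapse onto one new name, and a new name must not collide
+            # with an existing column that is not itself renamed away.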
+            if len(new) != len(set(new)) or (
+                set(new) & (set(self.df.schema.keys()) - set(old))
+            ):
+                raise NotImplementedError("Duplicate new names in rename.")
 
     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
@@ -906,7 +923,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         raise AssertionError("Should never be reached")  # pragma: no cover
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class Union(IR):
     """Concatenate dataframes vertically."""
 
@@ -930,7 +947,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         ).slice(self.zlice)
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class HConcat(IR):
     """Concatenate dataframes horizontally."""
 
diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
index 5d289885f47..953ff636cce 100644
--- a/python/cudf_polars/cudf_polars/dsl/translate.py
+++ b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -342,6 +342,16 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex
             *(translate_expr(visitor, n=n) for n in node.input),
         )
     elif isinstance(name, pl_expr.BooleanFunction):
+        if name == pl_expr.BooleanFunction.IsBetween:
+            column, lo, hi = (translate_expr(visitor, n=n) for n in node.input)
+            (closed,) = options
+            lop, rop = expr.BooleanFunction._BETWEEN_OPS[closed]
+            return expr.BinOp(
+                dtype,
+                plc.binaryop.BinaryOperator.LOGICAL_AND,
+                expr.BinOp(dtype, lop, column, lo),
+                expr.BinOp(dtype, rop, column, hi),
+            )
         return expr.BooleanFunction(
             dtype,
             name,
@@ -436,6 +446,16 @@ def _(node: pl_expr.Agg, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Ex
     )
 
 
+@_translate_expr.register
+def _(node: pl_expr.Ternary, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+    return expr.Ternary(
+        dtype,
+        translate_expr(visitor, n=node.predicate),
+        translate_expr(visitor, n=node.truthy),
+        translate_expr(visitor, n=node.falsy),
+    )
+
+
 @_translate_expr.register
 def _(
     node: pl_expr.BinaryExpr, visitor: NodeTraverser, dtype: plc.DataType
diff --git a/python/cudf_polars/cudf_polars/typing/__init__.py b/python/cudf_polars/cudf_polars/typing/__init__.py
index 6d597a91724..c04eac41bb7 100644
--- a/python/cudf_polars/cudf_polars/typing/__init__.py
+++ b/python/cudf_polars/cudf_polars/typing/__init__.py
@@ -6,7 +6,7 @@
 from __future__ import annotations
 
 from collections.abc import Mapping
-from typing import TYPE_CHECKING, Literal, Protocol, TypeAlias
+from typing import TYPE_CHECKING, Literal, Protocol, Union
 
 from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir
 
@@ -15,43 +15,45 @@
 if TYPE_CHECKING:
     from typing import Callable
 
+    from typing_extensions import TypeAlias
+
     import polars as pl
 
-IR: TypeAlias = (
-    pl_ir.PythonScan
-    | pl_ir.Scan
-    | pl_ir.Cache
-    | pl_ir.DataFrameScan
-    | pl_ir.Select
-    | pl_ir.GroupBy
-    | pl_ir.Join
-    | pl_ir.HStack
-    | pl_ir.Distinct
-    | pl_ir.Sort
-    | pl_ir.Slice
-    | pl_ir.Filter
-    | pl_ir.SimpleProjection
-    | pl_ir.MapFunction
-    | pl_ir.Union
-    | pl_ir.HConcat
-    | pl_ir.ExtContext
-)
-
-Expr: TypeAlias = (
-    pl_expr.Function
-    | pl_expr.Window
-    | pl_expr.Literal
-    | pl_expr.Sort
-    | pl_expr.SortBy
-    | pl_expr.Gather
-    | pl_expr.Filter
-    | pl_expr.Cast
-    | pl_expr.Column
-    | pl_expr.Agg
-    | pl_expr.BinaryExpr
-    | pl_expr.Len
-    | pl_expr.PyExprIR
-)
+IR: TypeAlias = Union[
+    pl_ir.PythonScan,
+    pl_ir.Scan,
+    pl_ir.Cache,
+    pl_ir.DataFrameScan,
+    pl_ir.Select,
+    pl_ir.GroupBy,
+    pl_ir.Join,
+
pl_ir.HStack, + pl_ir.Distinct, + pl_ir.Sort, + pl_ir.Slice, + pl_ir.Filter, + pl_ir.SimpleProjection, + pl_ir.MapFunction, + pl_ir.Union, + pl_ir.HConcat, + pl_ir.ExtContext, +] + +Expr: TypeAlias = Union[ + pl_expr.Function, + pl_expr.Window, + pl_expr.Literal, + pl_expr.Sort, + pl_expr.SortBy, + pl_expr.Gather, + pl_expr.Filter, + pl_expr.Cast, + pl_expr.Column, + pl_expr.Agg, + pl_expr.BinaryExpr, + pl_expr.Len, + pl_expr.PyExprIR, +] Schema: TypeAlias = Mapping[str, plc.DataType] diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py index 2ffa1c4af6d..267d0a99692 100644 --- a/python/cudf_polars/tests/expressions/test_agg.py +++ b/python/cudf_polars/tests/expressions/test_agg.py @@ -52,7 +52,7 @@ def test_agg(df, agg): # https://github.com/rapidsai/cudf/issues/15852 check_dtypes = agg not in {"n_unique", "median"} - if not check_dtypes and q.schema["a"] != pl.Float64: + if not check_dtypes and q.collect_schema()["a"] != pl.Float64: with pytest.raises(AssertionError): assert_gpu_result_equal(q) assert_gpu_result_equal(q, check_dtypes=check_dtypes, check_exact=False) @@ -65,7 +65,7 @@ def test_agg(df, agg): ) @pytest.mark.parametrize("op", ["min", "max"]) def test_agg_float_with_nans(propagate_nans, op): - df = pl.LazyFrame({"a": [1, 2, float("nan")]}) + df = pl.LazyFrame({"a": pl.Series([1, 2, float("nan")], dtype=pl.Float64())}) op = getattr(pl.Expr, f"nan_{op}" if propagate_nans else op) q = df.select(op(pl.col("a"))) diff --git a/python/cudf_polars/tests/expressions/test_booleanfunction.py b/python/cudf_polars/tests/expressions/test_booleanfunction.py index 951b749e670..97421008669 100644 --- a/python/cudf_polars/tests/expressions/test_booleanfunction.py +++ b/python/cudf_polars/tests/expressions/test_booleanfunction.py @@ -6,7 +6,10 @@ import polars as pl -from cudf_polars.testing.asserts import assert_gpu_result_equal +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) @pytest.fixture(params=[False, True], ids=["no_nulls", "nulls"]) @@ -26,7 +29,7 @@ def has_nulls(request): def test_booleanfunction_reduction(ignore_nulls): ldf = pl.LazyFrame( { - "a": [1, 2, 3.0, 2, 5], + "a": pl.Series([1, 2, 3.0, 2, 5], dtype=pl.Float64()), "b": [0, 3, 1, -1, None], "c": [1, 6, 5, 3, 2], } @@ -67,22 +70,27 @@ def test_boolean_function_unary(request, expr, has_nans, has_nulls): df = pl.LazyFrame({"a": pl.Series(values, dtype=pl.Float32())}) - q = df.select(expr(pl.col("a"))) + q = df.select(expr(pl.col("a")), expr(pl.col("a")).not_().alias("b")) assert_gpu_result_equal(q) -@pytest.mark.xfail(reason="Evaluation handlers not yet implemented") @pytest.mark.parametrize( "expr", [ pl.col("a").is_finite(), pl.col("a").is_infinite(), - pl.col("a").is_in(pl.col("b")), + [pl.col("a").is_infinite(), pl.col("b").is_finite()], ], ) -def test_unsupported_boolean_function(expr): - df = pl.LazyFrame({"a": [1, float("nan"), 2, 4], "b": [1, 2, 3, 4]}) +def test_boolean_finite(expr): + df = pl.LazyFrame( + { + "a": pl.Series([1, float("nan"), 2, float("inf")], dtype=pl.Float64()), + "b": [1, 2, 3, 4], + "c": pl.Series([1, 2, 3, 4], dtype=pl.Float64()), + } + ) q = df.select(expr) @@ -95,7 +103,11 @@ def test_unsupported_boolean_function(expr): ) def test_boolean_isbetween(closed, bounds): df = pl.LazyFrame( - {"a": [1, float("nan"), 2, 4], "lo": [1, 2, 2, 3], "hi": [10, 4, 2, 4]} + { + "a": pl.Series([1, float("nan"), 2, 4], dtype=pl.Float32()), + "lo": [1, 2, 2, 3], + "hi": [10, 4, 2, 4], + } ) q 
= df.select(pl.col("a").is_between(*bounds, closed=closed)) @@ -127,3 +139,33 @@ def test_boolean_horizontal(request, expr, has_nulls, wide): q = ldf.select(expr) assert_gpu_result_equal(q) + + +@pytest.mark.parametrize( + "expr", + [ + pl.col("a").is_in(pl.col("b")), + pl.col("a").is_in(pl.col("c")), + pl.col("c").is_in(pl.col("d")), + ], +) +def test_boolean_is_in(expr): + ldf = pl.LazyFrame( + { + "a": pl.Series([1, 2, 3], dtype=pl.Int64()), + "b": pl.Series([3, 4, 2], dtype=pl.Int64()), + "c": pl.Series([1, None, 3], dtype=pl.Int64()), + "d": pl.Series([10, None, 11], dtype=pl.Int64()), + } + ) + + q = ldf.select(expr) + + assert_gpu_result_equal(q) + + +def test_boolean_is_in_raises_unsupported(): + ldf = pl.LazyFrame({"a": pl.Series([1, 2, 3], dtype=pl.Int64)}) + q = ldf.select(pl.col("a").is_in(pl.lit(1, dtype=pl.Int32()))) + + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/expressions/test_rolling.py b/python/cudf_polars/tests/expressions/test_rolling.py index d4920d35f14..992efe0ba79 100644 --- a/python/cudf_polars/tests/expressions/test_rolling.py +++ b/python/cudf_polars/tests/expressions/test_rolling.py @@ -3,11 +3,9 @@ from __future__ import annotations -import pytest - import polars as pl -from cudf_polars import translate_ir +from cudf_polars.testing.asserts import assert_ir_translation_raises def test_rolling(): @@ -29,13 +27,13 @@ def test_rolling(): min_a=pl.min("a").rolling(index_column="dt", period="2d"), max_a=pl.max("a").rolling(index_column="dt", period="2d"), ) - with pytest.raises(NotImplementedError): - _ = translate_ir(q._ldf.visit()) + + assert_ir_translation_raises(q, NotImplementedError) def test_grouped_rolling(): df = pl.LazyFrame({"a": [1, 2, 3, 4, 5, 6], "b": [1, 2, 1, 3, 1, 2]}) q = df.select(pl.col("a").min().over("b")) - with pytest.raises(NotImplementedError): - _ = translate_ir(q._ldf.visit()) + + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py index 3c498fe7286..9729e765948 100644 --- a/python/cudf_polars/tests/expressions/test_stringfunction.py +++ b/python/cudf_polars/tests/expressions/test_stringfunction.py @@ -8,8 +8,11 @@ import polars as pl -from cudf_polars import execute_with_cudf, translate_ir -from cudf_polars.testing.asserts import assert_gpu_result_equal +from cudf_polars import execute_with_cudf +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) @pytest.fixture @@ -47,22 +50,19 @@ def test_supported_stringfunction_expression(ldf): def test_unsupported_stringfunction(ldf): q = ldf.select(pl.col("a").str.count_matches("e", literal=True)) - with pytest.raises(NotImplementedError): - _ = translate_ir(q._ldf.visit()) + assert_ir_translation_raises(q, NotImplementedError) def test_contains_re_non_strict_raises(ldf): q = ldf.select(pl.col("a").str.contains(".", strict=False)) - with pytest.raises(NotImplementedError): - _ = translate_ir(q._ldf.visit()) + assert_ir_translation_raises(q, NotImplementedError) def test_contains_re_non_literal_raises(ldf): q = ldf.select(pl.col("a").str.contains(pl.col("b"), literal=False)) - with pytest.raises(NotImplementedError): - _ = translate_ir(q._ldf.visit()) + assert_ir_translation_raises(q, NotImplementedError) @pytest.mark.parametrize( diff --git a/python/cudf_polars/tests/expressions/test_when_then.py b/python/cudf_polars/tests/expressions/test_when_then.py new file mode 
100644 index 00000000000..cf1c0fe7fce --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_when_then.py @@ -0,0 +1,27 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.mark.parametrize("then_scalar", [False, True]) +@pytest.mark.parametrize("otherwise_scalar", [False, True]) +@pytest.mark.parametrize("expr", [pl.col("c"), pl.col("c").is_not_null()]) +def test_when_then(then_scalar, otherwise_scalar, expr): + ldf = pl.LazyFrame( + { + "a": [1, 2, 3, 4, 5, 6, 7], + "b": [10, 13, 11, 15, 16, 11, 10], + "c": [None, True, False, False, True, True, False], + } + ) + + then = pl.lit(10) if then_scalar else pl.col("a") + otherwise = pl.lit(-2) if otherwise_scalar else pl.col("b") + q = ldf.select(pl.when(expr).then(then).otherwise(otherwise)) + assert_gpu_result_equal(q) diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index e70f923b097..aefad59eb91 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -6,8 +6,10 @@ import polars as pl -from cudf_polars import translate_ir -from cudf_polars.testing.asserts import assert_gpu_result_equal +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) @pytest.fixture @@ -72,7 +74,7 @@ def test_groupby(df: pl.LazyFrame, maintain_order, keys, exprs): q = df.group_by(*keys, maintain_order=maintain_order).agg(*exprs) if not maintain_order: - sort_keys = list(q.schema.keys())[: len(keys)] + sort_keys = list(q.collect_schema().keys())[: len(keys)] q = q.sort(*sort_keys) assert_gpu_result_equal(q, check_exact=False) @@ -97,5 +99,4 @@ def test_groupby_len(df, keys): def test_groupby_unsupported(df, expr): q = df.group_by("key1").agg(expr) - with pytest.raises(NotImplementedError): - _ = translate_ir(q._ldf.visit()) + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index f4a4704f3cc..89f6fd3455b 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -6,7 +6,10 @@ import polars as pl -from cudf_polars.testing.asserts import assert_gpu_result_equal +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) @pytest.mark.parametrize( @@ -16,10 +19,6 @@ "left", "semi", "anti", - pytest.param( - "cross", - marks=pytest.mark.xfail(reason="cross join not implemented"), - ), "full", ], ) @@ -55,3 +54,34 @@ def test_join(how, coalesce, join_nulls, join_expr): right, on=join_expr, how=how, join_nulls=join_nulls, coalesce=coalesce ) assert_gpu_result_equal(query, check_row_order=False) + + +def test_cross_join(): + left = pl.DataFrame( + { + "a": [1, 2, 3, 1, None], + "b": [1, 2, 3, 4, 5], + "c": [2, 3, 4, 5, 6], + } + ).lazy() + right = pl.DataFrame( + { + "a": [1, 4, 3, 7, None, None], + "c": [2, 3, 4, 5, 6, 7], + } + ).lazy() + + q = left.join(right, how="cross") + + assert_gpu_result_equal(q) + + +@pytest.mark.parametrize( + "left_on,right_on", [(pl.col("a"), pl.lit(2)), (pl.lit(2), pl.col("a"))] +) +def test_join_literal_key_unsupported(left_on, right_on): + left = pl.LazyFrame({"a": [1, 2, 3], "b": [3, 4, 5]}) + right = pl.LazyFrame({"a": [1, 2, 3], "b": [5, 6, 7]}) + q = left.join(right, left_on=left_on, 
right_on=right_on, how="inner") + + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_mapfunction.py b/python/cudf_polars/tests/test_mapfunction.py index ec6b3f3fc0a..77032108e6f 100644 --- a/python/cudf_polars/tests/test_mapfunction.py +++ b/python/cudf_polars/tests/test_mapfunction.py @@ -6,8 +6,10 @@ import polars as pl -from cudf_polars import translate_ir -from cudf_polars.testing.asserts import assert_gpu_result_equal +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) def test_merge_sorted_raises(): @@ -17,16 +19,14 @@ def test_merge_sorted_raises(): q = df1.merge_sorted(df2, key="a").merge_sorted(df3, key="a") - with pytest.raises(NotImplementedError): - _ = translate_ir(q._ldf.visit()) + assert_ir_translation_raises(q, NotImplementedError) def test_explode_multiple_raises(): df = pl.LazyFrame({"a": [[1, 2], [3, 4]], "b": [[5, 6], [7, 8]]}) q = df.explode("a", "b") - with pytest.raises(NotImplementedError): - _ = translate_ir(q._ldf.visit()) + assert_ir_translation_raises(q, NotImplementedError) @pytest.mark.parametrize("column", ["a", "b"]) @@ -41,3 +41,23 @@ def test_explode_single(column): q = df.explode(column) assert_gpu_result_equal(q) + + +@pytest.mark.parametrize("mapping", [{"b": "a"}, {"a": "c", "b": "c"}]) +def test_rename_duplicate_raises(mapping): + df = pl.LazyFrame({"a": [1, 2, 3], "b": [3, 4, 5]}) + + q = df.rename(mapping) + + assert_ir_translation_raises(q, NotImplementedError) + + +@pytest.mark.parametrize( + "mapping", [{}, {"b": "c"}, {"b": "a", "a": "b"}, {"a": "c", "b": "d"}] +) +def test_rename_columns(mapping): + df = pl.LazyFrame({"a": [1, 2, 3], "b": [3, 4, 5]}) + + q = df.rename(mapping) + + assert_gpu_result_equal(q) diff --git a/python/cudf_polars/tests/test_python_scan.py b/python/cudf_polars/tests/test_python_scan.py index c03474e3dc8..fd8453b77c4 100644 --- a/python/cudf_polars/tests/test_python_scan.py +++ b/python/cudf_polars/tests/test_python_scan.py @@ -2,11 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -import pytest - import polars as pl -from cudf_polars import translate_ir +from cudf_polars.testing.asserts import assert_ir_translation_raises def test_python_scan(): @@ -14,7 +12,6 @@ def source(with_columns, predicate, nrows): return pl.DataFrame({"a": pl.Series([1, 2, 3], dtype=pl.Int8())}) q = pl.LazyFrame._scan_python_function({"a": pl.Int8}, source, pyarrow=False) - with pytest.raises(NotImplementedError): - _ = translate_ir(q._ldf.visit()) + assert_ir_translation_raises(q, NotImplementedError) assert q.collect().equals(source(None, None, None)) diff --git a/python/cudf_polars/tests/test_union.py b/python/cudf_polars/tests/test_union.py index 6c9122bc260..b021d832910 100644 --- a/python/cudf_polars/tests/test_union.py +++ b/python/cudf_polars/tests/test_union.py @@ -2,12 +2,12 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -import pytest - import polars as pl -from cudf_polars import translate_ir -from cudf_polars.testing.asserts import assert_gpu_result_equal +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) def test_union(): @@ -31,8 +31,8 @@ def test_union_schema_mismatch_raises(): ).lazy() ldf2 = ldf.select(pl.col("a").cast(pl.Float32)) query = pl.concat([ldf, ldf2], how="diagonal") - with pytest.raises(NotImplementedError): - _ = translate_ir(query._ldf.visit()) + + assert_ir_translation_raises(query, 
NotImplementedError) def test_concat_vertical(): diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index d250589e389..1f55a59ea55 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -43,7 +43,7 @@ import cudf from cudf.api.types import is_string_dtype -from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate +from cudf.utils.performance_tracking import _dask_cudf_performance_tracking from .core import DataFrame, Index, Series @@ -53,7 +53,7 @@ @meta_nonempty.register(cudf.BaseIndex) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def _nonempty_index(idx): if isinstance(idx, cudf.core.index.RangeIndex): return cudf.core.index.RangeIndex(2, name=idx.name) @@ -100,7 +100,7 @@ def _nest_list_data(data, leaf_type): return data -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def _get_non_empty_data(s): if isinstance(s, cudf.core.column.CategoricalColumn): categories = ( @@ -147,7 +147,7 @@ def _get_non_empty_data(s): @meta_nonempty.register(cudf.Series) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def _nonempty_series(s, idx=None): if idx is None: idx = _nonempty_index(s.index) @@ -157,7 +157,7 @@ def _nonempty_series(s, idx=None): @meta_nonempty.register(cudf.DataFrame) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def meta_nonempty_cudf(x): idx = meta_nonempty(x.index) columns_with_dtype = dict() @@ -182,18 +182,18 @@ def meta_nonempty_cudf(x): @make_meta_dispatch.register((cudf.Series, cudf.DataFrame)) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def make_meta_cudf(x, index=None): return x.head(0) @make_meta_dispatch.register(cudf.BaseIndex) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def make_meta_cudf_index(x, index=None): return x[:0] -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def _empty_series(name, dtype, index=None): if isinstance(dtype, str) and dtype == "category": return cudf.Series( @@ -203,7 +203,7 @@ def _empty_series(name, dtype, index=None): @make_meta_obj.register(object) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def make_meta_object_cudf(x, index=None): """Create an empty cudf object containing the desired metadata. 
@@ -274,7 +274,7 @@ def make_meta_object_cudf(x, index=None):
 
 
 @concat_dispatch.register((cudf.DataFrame, cudf.Series, cudf.BaseIndex))
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def concat_cudf(
     dfs,
     axis=0,
@@ -299,13 +299,13 @@ def concat_cudf(
 @categorical_dtype_dispatch.register(
     (cudf.DataFrame, cudf.Series, cudf.BaseIndex)
 )
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def categorical_dtype_cudf(categories=None, ordered=False):
     return cudf.CategoricalDtype(categories=categories, ordered=ordered)
 
 
 @tolist_dispatch.register((cudf.Series, cudf.BaseIndex))
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def tolist_cudf(obj):
     return obj.to_pandas().tolist()
 
@@ -313,7 +313,7 @@ def tolist_cudf(obj):
 @is_categorical_dtype_dispatch.register(
     (cudf.Series, cudf.BaseIndex, cudf.CategoricalDtype, Series)
 )
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def is_categorical_dtype_cudf(obj):
     return cudf.api.types._is_categorical_dtype(obj)
 
@@ -324,7 +324,7 @@ def get_grouper_cudf(obj):
 
 
 @percentile_lookup.register((cudf.Series, cp.ndarray, cudf.BaseIndex))
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def percentile_cudf(a, q, interpolation="linear"):
     # Cudf dispatch to the equivalent of `np.percentile`:
     # https://numpy.org/doc/stable/reference/generated/numpy.percentile.html
@@ -400,7 +400,7 @@ def _table_to_cudf(obj, table, self_destruct=None, **kwargs):
 
 
 @union_categoricals_dispatch.register((cudf.Series, cudf.BaseIndex))
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def union_categoricals_cudf(
     to_union, sort_categories=False, ignore_order=False
 ):
@@ -410,7 +410,7 @@ def union_categoricals_cudf(
 
 
 @hash_object_dispatch.register((cudf.DataFrame, cudf.Series))
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def hash_object_cudf(frame, index=True):
     if index:
         frame = frame.reset_index()
@@ -418,7 +418,7 @@ def hash_object_cudf(frame, index=True):
 
 
 @hash_object_dispatch.register(cudf.BaseIndex)
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def hash_object_cudf_index(ind, index=None):
     if isinstance(ind, cudf.MultiIndex):
         return ind.to_frame(index=False).hash_values()
@@ -428,7 +428,7 @@ def hash_object_cudf_index(ind, index=None):
 
 
 @group_split_dispatch.register((cudf.Series, cudf.DataFrame))
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def group_split_cudf(df, c, k, ignore_index=False):
     return dict(
         zip(
@@ -443,7 +443,7 @@ def group_split_cudf(df, c, k, ignore_index=False):
 
 
 @sizeof_dispatch.register(cudf.DataFrame)
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def sizeof_cudf_dataframe(df):
     return int(
         sum(col.memory_usage for col in df._data.columns)
@@ -452,7 +452,7 @@ def sizeof_cudf_dataframe(df):
 
 
 @sizeof_dispatch.register((cudf.Series, cudf.BaseIndex))
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def sizeof_cudf_series_index(obj):
     return obj.memory_usage()
 
diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py
index 3bd455a3a57..aab56e3a1b0 100644
--- a/python/dask_cudf/dask_cudf/core.py
+++ b/python/dask_cudf/dask_cudf/core.py
@@ -22,7 +22,7 @@
 import cudf
 from cudf import _lib as libcudf
-from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _dask_cudf_performance_tracking
 
 from dask_cudf import sorting
 from dask_cudf.accessors import ListMethods, StructMethods
@@ -53,7 +53,7 @@ def __repr__(self):
         s = "<dask_cudf.%s | %d tasks | %d npartitions>"
         return s % (type(self).__name__,
len(self.dask), self.npartitions) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def to_dask_dataframe(self, **kwargs): """Create a dask.dataframe object from a dask_cudf object @@ -92,7 +92,7 @@ class DataFrame(_Frame, dd.core.DataFrame): _partition_type = cudf.DataFrame - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def _assign_column(self, k, v): def assigner(df, k, v): out = df.copy() @@ -102,7 +102,7 @@ def assigner(df, k, v): meta = assigner(self._meta, k, dask_make_meta(v)) return self.map_partitions(assigner, k, v, meta=meta) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def apply_rows(self, func, incols, outcols, kwargs=None, cache_key=None): import uuid @@ -123,7 +123,7 @@ def do_apply_rows(df, func, incols, outcols, kwargs): ) @_deprecate_shuffle_kwarg - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def merge(self, other, shuffle_method=None, **kwargs): on = kwargs.pop("on", None) if isinstance(on, tuple): @@ -136,7 +136,7 @@ def merge(self, other, shuffle_method=None, **kwargs): ) @_deprecate_shuffle_kwarg - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def join(self, other, shuffle_method=None, **kwargs): # CuDF doesn't support "right" join yet how = kwargs.pop("how", "left") @@ -155,7 +155,7 @@ def join(self, other, shuffle_method=None, **kwargs): ) @_deprecate_shuffle_kwarg - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def set_index( self, other, @@ -237,7 +237,7 @@ def set_index( ) @_deprecate_shuffle_kwarg - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def sort_values( self, by, @@ -275,14 +275,14 @@ def sort_values( return df.reset_index(drop=True) return df - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def to_parquet(self, path, *args, **kwargs): """Calls dask.dataframe.io.to_parquet with CudfEngine backend""" from dask_cudf.io import to_parquet return to_parquet(self, path, *args, **kwargs) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def to_orc(self, path, **kwargs): """Calls dask_cudf.io.to_orc""" from dask_cudf.io import to_orc @@ -290,7 +290,7 @@ def to_orc(self, path, **kwargs): return to_orc(self, path, **kwargs) @derived_from(pd.DataFrame) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def var( self, axis=None, @@ -324,28 +324,28 @@ def var( return _parallel_var(self, meta, skipna, split_every, out) @_deprecate_shuffle_kwarg - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def shuffle(self, *args, shuffle_method=None, **kwargs): """Wraps dask.dataframe DataFrame.shuffle method""" return super().shuffle( *args, shuffle_method=_get_shuffle_method(shuffle_method), **kwargs ) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def groupby(self, by=None, **kwargs): from .groupby import CudfDataFrameGroupBy return CudfDataFrameGroupBy(self, by=by, **kwargs) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def sum_of_squares(x): x = x.astype("f8")._column outcol = libcudf.reduce.reduce("sum_of_squares", x) return cudf.Series(outcol) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def var_aggregate(x2, x, n, ddof): try: with warnings.catch_warnings(record=True): @@ -358,12 +358,12 @@ def var_aggregate(x2, x, n, ddof): return np.float64(np.nan) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def nlargest_agg(x, **kwargs): return cudf.concat(x).nlargest(**kwargs) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def nsmallest_agg(x, 
**kwargs): return cudf.concat(x).nsmallest(**kwargs) @@ -371,7 +371,7 @@ def nsmallest_agg(x, **kwargs): class Series(_Frame, dd.core.Series): _partition_type = cudf.Series - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def count(self, split_every=False): return reduction( [self], @@ -381,14 +381,14 @@ def count(self, split_every=False): meta="i8", ) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def mean(self, split_every=False): sum = self.sum(split_every=split_every) n = self.count(split_every=split_every) return sum / n @derived_from(pd.DataFrame) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def var( self, axis=None, @@ -417,19 +417,19 @@ def var( else: return _parallel_var(self, meta, skipna, split_every, out) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def groupby(self, *args, **kwargs): from .groupby import CudfSeriesGroupBy return CudfSeriesGroupBy(self, *args, **kwargs) @property # type: ignore - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def list(self): return ListMethods(self) @property # type: ignore - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def struct(self): return StructMethods(self) @@ -438,7 +438,7 @@ class Index(Series, dd.core.Index): _partition_type = cudf.Index # type: ignore -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def _naive_var(ddf, meta, skipna, ddof, split_every, out): num = ddf._get_numeric_data() x = 1.0 * num.sum(skipna=skipna, split_every=split_every) @@ -453,7 +453,7 @@ def _naive_var(ddf, meta, skipna, ddof, split_every, out): return handle_out(out, result) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def _parallel_var(ddf, meta, skipna, split_every, out): def _local_var(x, skipna): if skipna: @@ -520,7 +520,7 @@ def _finalize_var(vals): return handle_out(out, result) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def _extract_meta(x): """ Extract internal cache data (``_meta``) from dask_cudf objects @@ -536,7 +536,7 @@ def _extract_meta(x): return x -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def _emulate(func, *args, **kwargs): """ Apply a function using args / kwargs. If arguments contain dd.DataFrame / @@ -546,7 +546,7 @@ def _emulate(func, *args, **kwargs): return func(*_extract_meta(args), **_extract_meta(kwargs)) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def align_partitions(args): """Align partitions between dask_cudf objects. 
@@ -563,7 +563,7 @@ def align_partitions(args): return args -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def reduction( args, chunk=None, @@ -702,7 +702,7 @@ def reduction( return dd.core.new_dd_object(graph, b, meta, (None, None)) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): from dask_cudf import QUERY_PLANNING_ON @@ -746,7 +746,7 @@ def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): ) -@_dask_cudf_nvtx_annotate +@_dask_cudf_performance_tracking def from_dask_dataframe(df): """ Convert a Dask :class:`dask.dataframe.DataFrame` to a Dask-cuDF diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index 2e72461b43d..bbbcde17b51 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -16,7 +16,7 @@ import cudf from cudf.core.groupby.groupby import _deprecate_collect -from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate +from cudf.utils.performance_tracking import _dask_cudf_performance_tracking from dask_cudf.sorting import _deprecate_shuffle_kwarg @@ -56,13 +56,13 @@ def wrapper(*args, **kwargs): class CudfDataFrameGroupBy(DataFrameGroupBy): - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def __init__(self, *args, sort=None, **kwargs): self.sep = kwargs.pop("sep", "___") self.as_index = kwargs.pop("as_index", True) super().__init__(*args, sort=sort, **kwargs) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def __getitem__(self, key): if isinstance(key, list): g = CudfDataFrameGroupBy( @@ -84,7 +84,7 @@ def __getitem__(self, key): g._meta = g._meta[key] return g - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def _make_groupby_method_aggs(self, agg_name): """Create aggs dictionary for aggregation methods""" @@ -92,7 +92,7 @@ def _make_groupby_method_aggs(self, agg_name): return {c: agg_name for c in self.obj.columns if c not in self.by} return {c: agg_name for c in self.obj.columns if c != self.by} - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def count(self, split_every=None, split_out=1): return _make_groupby_agg_call( @@ -102,7 +102,7 @@ def count(self, split_every=None, split_out=1): split_out, ) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def mean(self, split_every=None, split_out=1): return _make_groupby_agg_call( @@ -112,7 +112,7 @@ def mean(self, split_every=None, split_out=1): split_out, ) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def std(self, split_every=None, split_out=1): return _make_groupby_agg_call( @@ -122,7 +122,7 @@ def std(self, split_every=None, split_out=1): split_out, ) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def var(self, split_every=None, split_out=1): return _make_groupby_agg_call( @@ -132,7 +132,7 @@ def var(self, split_every=None, split_out=1): split_out, ) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def sum(self, split_every=None, split_out=1): return _make_groupby_agg_call( @@ -142,7 +142,7 @@ def sum(self, split_every=None, split_out=1): split_out, ) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def min(self, split_every=None, split_out=1): return _make_groupby_agg_call( @@ -152,7 +152,7 @@ def min(self, split_every=None, 
split_out=1): split_out, ) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def max(self, split_every=None, split_out=1): return _make_groupby_agg_call( @@ -162,7 +162,7 @@ def max(self, split_every=None, split_out=1): split_out, ) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def collect(self, split_every=None, split_out=1): _deprecate_collect() @@ -173,7 +173,7 @@ def collect(self, split_every=None, split_out=1): split_out, ) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def first(self, split_every=None, split_out=1): return _make_groupby_agg_call( @@ -183,7 +183,7 @@ def first(self, split_every=None, split_out=1): split_out, ) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def last(self, split_every=None, split_out=1): return _make_groupby_agg_call( @@ -194,7 +194,7 @@ def last(self, split_every=None, split_out=1): ) @_deprecate_shuffle_kwarg - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def aggregate( self, arg, split_every=None, split_out=1, shuffle_method=None ): @@ -231,13 +231,13 @@ def aggregate( class CudfSeriesGroupBy(SeriesGroupBy): - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking def __init__(self, *args, sort=None, **kwargs): self.sep = kwargs.pop("sep", "___") self.as_index = kwargs.pop("as_index", True) super().__init__(*args, sort=sort, **kwargs) - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def count(self, split_every=None, split_out=1): return _make_groupby_agg_call( @@ -247,7 +247,7 @@ def count(self, split_every=None, split_out=1): split_out, )[self._slice] - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def mean(self, split_every=None, split_out=1): return _make_groupby_agg_call( @@ -257,7 +257,7 @@ def mean(self, split_every=None, split_out=1): split_out, )[self._slice] - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def std(self, split_every=None, split_out=1): return _make_groupby_agg_call( @@ -267,7 +267,7 @@ def std(self, split_every=None, split_out=1): split_out, )[self._slice] - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def var(self, split_every=None, split_out=1): return _make_groupby_agg_call( @@ -277,7 +277,7 @@ def var(self, split_every=None, split_out=1): split_out, )[self._slice] - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def sum(self, split_every=None, split_out=1): return _make_groupby_agg_call( @@ -287,7 +287,7 @@ def sum(self, split_every=None, split_out=1): split_out, )[self._slice] - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def min(self, split_every=None, split_out=1): return _make_groupby_agg_call( @@ -297,7 +297,7 @@ def min(self, split_every=None, split_out=1): split_out, )[self._slice] - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def max(self, split_every=None, split_out=1): return _make_groupby_agg_call( @@ -307,7 +307,7 @@ def max(self, split_every=None, split_out=1): split_out, )[self._slice] - @_dask_cudf_nvtx_annotate + @_dask_cudf_performance_tracking @_check_groupby_optimized def collect(self, split_every=None, split_out=1): _deprecate_collect() @@ -318,7 +318,7 @@ def collect(self, split_every=None, split_out=1): split_out, )[self._slice] - 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def first(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -328,7 +328,7 @@ def first(self, split_every=None, split_out=1):
             split_out,
         )[self._slice]
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def last(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -339,7 +339,7 @@ def last(self, split_every=None, split_out=1):
         )[self._slice]
 
     @_deprecate_shuffle_kwarg
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def aggregate(
         self, arg, split_every=None, split_out=1, shuffle_method=None
     ):
@@ -429,7 +429,7 @@ def _shuffle_aggregate(
     return result
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def groupby_agg(
     ddf,
     gb_cols,
@@ -641,7 +641,7 @@ def groupby_agg(
     )
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _make_groupby_agg_call(
     gb, aggs, split_every, split_out, shuffle_method=None
 ):
@@ -663,7 +663,7 @@ def _make_groupby_agg_call(
     )
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _redirect_aggs(arg):
     """Redirect aggregations to their corresponding name in cuDF"""
     redirects = {
@@ -690,7 +690,7 @@ def _redirect_aggs(arg):
     return redirects.get(arg, arg)
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _aggs_optimized(arg, supported: set):
     """Check that aggregations in `arg` are a subset of `supported`"""
     if isinstance(arg, (list, dict)):
@@ -712,7 +712,7 @@ def _aggs_optimized(arg, supported: set):
     return False
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _groupby_optimized(gb):
     """Check that groupby input can use dask-cudf optimized codepath"""
     return isinstance(gb.obj, DaskDataFrame) and (
@@ -730,7 +730,7 @@ def _make_name(col_name, sep="_"):
     return sep.join(name for name in col_name if name != "")
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _groupby_partition_agg(df, gb_cols, aggs, columns, dropna, sort, sep):
     """Initial partition-level aggregation task.
 
@@ -768,7 +768,7 @@ def _groupby_partition_agg(df, gb_cols, aggs, columns, dropna, sort, sep):
     return gb[sorted(output_columns)]
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _tree_node_agg(df, gb_cols, dropna, sort, sep):
     """Node in groupby-aggregation reduction tree.
 
@@ -807,7 +807,7 @@ def _tree_node_agg(df, gb_cols, dropna, sort, sep):
     return gb[sorted(output_columns)]
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1):
     """Calculate variance (given count, sum, and sum-squared columns)."""
 
@@ -829,7 +829,7 @@ def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1):
     return var
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _finalize_gb_agg(
     gb_in,
     gb_cols,
diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py
index f3774e20d32..a2ba4d1878e 100644
--- a/python/dask_cudf/dask_cudf/sorting.py
+++ b/python/dask_cudf/dask_cudf/sorting.py
@@ -18,7 +18,7 @@
 
 import cudf
 from cudf.api.types import _is_categorical_dtype
-from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _dask_cudf_performance_tracking
 
 _SHUFFLE_SUPPORT = ("tasks", "p2p")  # "disk" not supported
 
@@ -48,14 +48,14 @@ def wrapper(*args, **kwargs):
     return wrapper
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def set_index_post(df, index_name, drop, column_dtype):
     df2 = df.set_index(index_name, drop=drop)
     df2.columns = df2.columns.astype(column_dtype)
     return df2
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _set_partitions_pre(s, divisions, ascending=True, na_position="last"):
     if ascending:
         partitions = divisions.searchsorted(s, side="right") - 1
@@ -72,7 +72,7 @@ def _set_partitions_pre(s, divisions, ascending=True, na_position="last"):
     return partitions
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _quantile(a, q):
     n = len(a)
     if not len(a):
@@ -83,7 +83,7 @@ def _quantile(a, q):
     )
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def merge_quantiles(finalq, qs, vals):
     """Combine several quantile calculations of different data.
     [NOTE: Same logic as dask.array merge_percentiles]
@@ -146,7 +146,7 @@ def _append_counts(val, count):
     return rv.reset_index(drop=True)
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _approximate_quantile(df, q):
     """Approximate quantiles of DataFrame or Series.
     [NOTE: Same logic as dask.dataframe Series quantile]
@@ -220,7 +220,7 @@ def set_quantile_index(df):
         return df
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def quantile_divisions(df, by, npartitions):
     qn = np.linspace(0.0, 1.0, npartitions + 1).tolist()
     divisions = _approximate_quantile(df[by], qn).compute()
@@ -257,7 +257,7 @@ def quantile_divisions(df, by, npartitions):
 
 
 @_deprecate_shuffle_kwarg
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def sort_values(
     df,
     by,
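
Note for reviewers (not part of the applied patch): every hunk above is the same mechanical rename, swapping the NVTX-based `_dask_cudf_nvtx_annotate` decorator for `_dask_cudf_performance_tracking`, now imported from `cudf.utils.performance_tracking` rather than `cudf.utils.nvtx_annotation`. For context, below is a minimal sketch of how a decorator of this shape can be built on the `nvtx` PyPI package; `_performance_tracking_sketch` and `_demo` are illustrative names, not the cudf implementation (the real decorator also integrates with cudf's memory-profiling option).

    import functools

    import nvtx  # assumes the `nvtx` PyPI package is installed


    def _performance_tracking_sketch(func, domain="dask_cudf_python"):
        # Wrap `func` in an NVTX range named after the function, so each
        # call shows up in an Nsight Systems timeline under `domain`.
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            with nvtx.annotate(message=func.__qualname__, domain=domain):
                return func(*args, **kwargs)

        return wrapper


    @_performance_tracking_sketch
    def _demo(x):
        # Hypothetical stand-in for any decorated dask_cudf function.
        return x + 1


    print(_demo(1))  # prints 2; the call is bracketed by an NVTX range when profiled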