diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 8190b5d0297..315a389339a 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -31,6 +31,6 @@ ENV PYTHONDONTWRITEBYTECODE="1" ENV SCCACHE_REGION="us-east-2" ENV SCCACHE_BUCKET="rapids-sccache-devs" -ENV VAULT_HOST="https://vault.ops.k8s.rapids.ai" +ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs" ENV HISTFILE="/home/coder/.cache/._bash_history" ENV LIBCUDF_KERNEL_CACHE_PATH="/home/coder/cudf/cpp/build/${PYTHON_PACKAGE_MANAGER}/cuda-${CUDA_VERSION}/latest/jitify_cache" diff --git a/.github/workflows/auto-assign.yml b/.github/workflows/auto-assign.yml new file mode 100644 index 00000000000..673bebd4ecc --- /dev/null +++ b/.github/workflows/auto-assign.yml @@ -0,0 +1,17 @@ +name: "Auto Assign PR" + +on: + pull_request_target: + types: + - opened + - reopened + - synchronize + +jobs: + add_assignees: + runs-on: ubuntu-latest + steps: + - uses: actions-ecosystem/action-add-assignees@v1 + with: + repo_token: "${{ secrets.GITHUB_TOKEN }}" + assignees: ${{ github.actor }} diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index 31e78f82a62..f5cb71bfc14 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -1,4 +1,5 @@ name: "Pull Request Labeler" + on: - pull_request_target diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 174dc72bf02..f5234f58efe 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,17 +16,6 @@ repos: ^cpp/cmake/thirdparty/patches/.*| ^python/cudf/cudf/tests/data/subword_tokenizer_data/.* ) - - repo: https://github.com/PyCQA/isort - rev: 5.13.2 - hooks: - - id: isort - # Use the config file specific to each subproject so that each - # project can specify its own first/third-party packages. 
- args: ["--config-root=python/", "--resolve-all-configs"] - files: python/.* - exclude: | - (?x)^(^python/cudf_polars/.*) - types_or: [python, cython, pyi] - repo: https://github.com/MarcoGorelli/cython-lint rev: v0.16.2 hooks: @@ -150,6 +139,7 @@ repos: rev: v0.4.8 hooks: - id: ruff + args: ["--fix"] files: python/.*$ - id: ruff-format files: python/.*$ @@ -165,7 +155,7 @@ repos: ) - id: verify-alpha-spec - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.13.11 + rev: v1.16.0 hooks: - id: rapids-dependency-file-generator args: ["--clean"] diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f9cdde7c2b7..3db1ed35294 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -38,6 +38,7 @@ conduct. More information can be found at: 8. Verify that CI passes all [status checks](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/about-status-checks). Fix if needed. 9. Wait for other developers to review your code and update code as needed. + Changes to any C++ files require at least 2 approvals from the cudf-cpp-codeowners before merging. 10. Once reviewed and approved, a RAPIDS developer will merge your pull request. If you are unsure about anything, don't hesitate to comment on issues and ask for clarification! @@ -293,8 +294,8 @@ In order to run doxygen as a linter on C++/CUDA code, run ./ci/checks/doxygen.sh ``` -Python code runs several linters including [Black](https://black.readthedocs.io/en/stable/), -[isort](https://pycqa.github.io/isort/), and [flake8](https://flake8.pycqa.org/en/latest/). +Python code runs several linters including [Ruff](https://docs.astral.sh/ruff/) +with its various rules for Black-like formatting or Isort. cuDF also uses [codespell](https://github.com/codespell-project/codespell) to find spelling mistakes, and this check is run as a pre-commit hook. 
To apply the suggested spelling fixes, diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index e5fcef17a83..3d06eacf9ff 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -15,8 +15,12 @@ rapids-print-env rapids-logger "Begin cpp build" +sccache --zero-stats + # With boa installed conda build forward to boa RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) rapids-conda-retry mambabuild \ conda/recipes/libcudf +sccache --show-adv-stats + rapids-upload-conda-to-s3 cpp diff --git a/ci/build_python.sh b/ci/build_python.sh index 823d7f62290..ed90041cc77 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -19,6 +19,8 @@ rapids-logger "Begin py build" CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) +sccache --zero-stats + # TODO: Remove `--no-test` flag once importing on a CPU # node works correctly # With boa installed conda build forwards to the boa builder @@ -28,12 +30,18 @@ RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ --channel "${CPP_CHANNEL}" \ conda/recipes/pylibcudf +sccache --show-adv-stats +sccache --zero-stats + RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/cudf +sccache --show-adv-stats +sccache --zero-stats + RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ @@ -46,6 +54,8 @@ RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/cudf_kafka +sccache --show-adv-stats + RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index bf76f4ed29a..78b8a8a08cf 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -3,7 +3,8 @@ set -euo pipefail -package_dir=$1 +package_name=$1 +package_dir=$2 source rapids-configure-sccache source 
rapids-date-string @@ -12,4 +13,14 @@ rapids-generate-version > ./VERSION cd "${package_dir}" -python -m pip wheel . -w dist -v --no-deps --disable-pip-version-check +sccache --zero-stats + +rapids-logger "Building '${package_name}' wheel" +python -m pip wheel \ + -w dist \ + -v \ + --no-deps \ + --disable-pip-version-check \ + . + +sccache --show-adv-stats diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh index fb93b06dbe2..fef4416a366 100755 --- a/ci/build_wheel_cudf.sh +++ b/ci/build_wheel_cudf.sh @@ -18,7 +18,7 @@ echo "libcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/libcudf_dist/libcudf echo "pylibcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/pylibcudf_dist/pylibcudf_*.whl)" >> /tmp/constraints.txt export PIP_CONSTRAINT="/tmp/constraints.txt" -./ci/build_wheel.sh ${package_dir} +./ci/build_wheel.sh cudf ${package_dir} python -m auditwheel repair \ --exclude libcudf.so \ diff --git a/ci/build_wheel_cudf_polars.sh b/ci/build_wheel_cudf_polars.sh index 9c945e11c00..79853cdbdb2 100755 --- a/ci/build_wheel_cudf_polars.sh +++ b/ci/build_wheel_cudf_polars.sh @@ -5,7 +5,7 @@ set -euo pipefail package_dir="python/cudf_polars" -./ci/build_wheel.sh ${package_dir} +./ci/build_wheel.sh cudf-polars ${package_dir} RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 ${package_dir}/dist +RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 python ${package_dir}/dist diff --git a/ci/build_wheel_dask_cudf.sh b/ci/build_wheel_dask_cudf.sh index eb2a91289f7..00c64afa2ef 100755 --- a/ci/build_wheel_dask_cudf.sh +++ b/ci/build_wheel_dask_cudf.sh @@ -5,7 +5,7 @@ set -euo pipefail package_dir="python/dask_cudf" -./ci/build_wheel.sh ${package_dir} +./ci/build_wheel.sh dask-cudf ${package_dir} RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" 
-RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 ${package_dir}/dist +RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 python ${package_dir}/dist diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh index 91bc071583e..b3d6778ea04 100755 --- a/ci/build_wheel_libcudf.sh +++ b/ci/build_wheel_libcudf.sh @@ -3,10 +3,30 @@ set -euo pipefail +package_name="libcudf" package_dir="python/libcudf" +rapids-logger "Generating build requirements" + +rapids-dependency-file-generator \ + --output requirements \ + --file-key "py_build_${package_name}" \ + --file-key "py_rapids_build_${package_name}" \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};cuda_suffixed=true" \ +| tee /tmp/requirements-build.txt + +rapids-logger "Installing build requirements" +python -m pip install \ + -v \ + --prefer-binary \ + -r /tmp/requirements-build.txt + +# build with '--no-build-isolation', for better sccache hit rate +# 0 really means "add --no-build-isolation" (ref: https://github.com/pypa/pip/issues/5735) +export PIP_NO_BUILD_ISOLATION=0 + export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON" -./ci/build_wheel.sh ${package_dir} +./ci/build_wheel.sh "${package_name}" "${package_dir}" RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" @@ -16,4 +36,4 @@ python -m auditwheel repair \ -w ${package_dir}/final_dist \ ${package_dir}/dist/* -RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp ${package_dir}/final_dist +RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp "${package_dir}/final_dist" diff --git a/ci/build_wheel_pylibcudf.sh b/ci/build_wheel_pylibcudf.sh index 5e9f7f8a0c4..839d98846fe 100755 --- a/ci/build_wheel_pylibcudf.sh +++ b/ci/build_wheel_pylibcudf.sh @@ -16,7 +16,7 @@ 
RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-f echo "libcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/libcudf_dist/libcudf_*.whl)" > /tmp/constraints.txt export PIP_CONSTRAINT="/tmp/constraints.txt" -./ci/build_wheel.sh ${package_dir} +./ci/build_wheel.sh pylibcudf ${package_dir} python -m auditwheel repair \ --exclude libcudf.so \ @@ -24,4 +24,4 @@ python -m auditwheel repair \ -w ${package_dir}/final_dist \ ${package_dir}/dist/* -RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist +RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python ${package_dir}/final_dist diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index bd5e6c3d569..c3716c4759a 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -65,7 +65,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.8,<1.9 +- polars>=1.11,<1.12 - pre-commit - ptxcompiler - pyarrow>=14.0.0,<18.0.0a0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 565a3ebfa3c..38e131e79cb 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -63,7 +63,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.8,<1.9 +- polars>=1.11,<1.12 - pre-commit - pyarrow>=14.0.0,<18.0.0a0 - pydata-sphinx-theme!=0.14.2 diff --git a/conda/recipes/cudf-polars/meta.yaml b/conda/recipes/cudf-polars/meta.yaml index e8fef715c60..edf92b930d9 100644 --- a/conda/recipes/cudf-polars/meta.yaml +++ b/conda/recipes/cudf-polars/meta.yaml @@ -43,7 +43,7 @@ requirements: run: - python - pylibcudf ={{ version }} - - polars >=1.8,<1.9 + - polars >=1.11,<1.12 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} test: diff --git 
a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index e4b9cbf8921..60132f651d2 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -369,6 +369,8 @@ add_library( src/filling/sequence.cu src/groupby/groupby.cu src/groupby/hash/compute_groupby.cu + src/groupby/hash/compute_mapping_indices.cu + src/groupby/hash/compute_mapping_indices_null.cu src/groupby/hash/compute_single_pass_aggs.cu src/groupby/hash/create_sparse_results_table.cu src/groupby/hash/flatten_single_pass_aggs.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index e61a8e6e1e6..2a4ac789046 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -49,7 +49,7 @@ target_compile_options( target_link_libraries( ndsh_data_generator - PUBLIC cudf GTest::gmock GTest::gtest cudf::cudftestutil nvtx3::nvtx3-cpp + PUBLIC cudf cudf::cudftestutil nvtx3::nvtx3-cpp PRIVATE $ ) @@ -345,11 +345,11 @@ ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binary # ################################################################################################## # * nvtext benchmark ------------------------------------------------------------------- -ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp) +ConfigureBench(TEXT_BENCH text/subword.cpp) ConfigureNVBench( - TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp - text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp text/word_minhash.cpp + TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/ngrams.cpp + text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/ast/transform.cpp b/cpp/benchmarks/ast/transform.cpp index f44f26e4d2c..7fe61054a26 100644 --- a/cpp/benchmarks/ast/transform.cpp +++ b/cpp/benchmarks/ast/transform.cpp @@ -16,16 +16,29 @@ #include +#include + 
+#include +#include +#include +#include #include #include +#include +#include #include #include #include +#include #include +#include +#include +#include +#include #include #include #include @@ -86,7 +99,71 @@ static void BM_ast_transform(nvbench::state& state) auto const& expression_tree_root = expressions.back(); // Use the number of bytes read from global memory - state.add_global_memory_reads(table_size * (tree_levels + 1)); + state.add_global_memory_reads(static_cast(table_size) * (tree_levels + 1)); + state.add_global_memory_writes(table_size); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); }); +} + +template +static void BM_string_compare_ast_transform(nvbench::state& state) +{ + auto const string_width = static_cast(state.get_int64("string_width")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const num_comparisons = static_cast(state.get_int64("num_comparisons")); + auto const hit_rate = static_cast(state.get_int64("hit_rate")); + + CUDF_EXPECTS(num_comparisons > 0, "benchmarks require 1 or more comparisons"); + + // Create table data + auto const num_cols = num_comparisons * 2; + std::vector> columns; + std::for_each( + thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_cols), [&](size_t) { + columns.emplace_back(create_string_column(num_rows, string_width, hit_rate)); + }); + + cudf::table table{std::move(columns)}; + cudf::table_view const table_view = table.view(); + + int64_t const chars_size = std::accumulate( + table_view.begin(), + table_view.end(), + static_cast(0), + [](int64_t size, auto& column) -> int64_t { + return size + cudf::strings_column_view{column}.chars_size(cudf::get_default_stream()); + }); + + // Create column references + auto column_refs = std::vector(); + std::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_cols), + std::back_inserter(column_refs), + [](auto const& column_id) { 
return cudf::ast::column_reference(column_id); }); + + // Create expression trees + std::list expressions; + + // Construct AST tree (a == b && c == d && e == f && ...) + + expressions.emplace_back(cudf::ast::operation(cmp_op, column_refs[0], column_refs[1])); + + std::for_each(thrust::make_counting_iterator(1), + thrust::make_counting_iterator(num_comparisons), + [&](size_t idx) { + auto const& lhs = expressions.back(); + auto const& rhs = expressions.emplace_back( + cudf::ast::operation(cmp_op, column_refs[idx * 2], column_refs[idx * 2 + 1])); + expressions.emplace_back(cudf::ast::operation(reduce_op, lhs, rhs)); + }); + + auto const& expression_tree_root = expressions.back(); + + // Use the number of bytes read from global memory + state.add_element_count(chars_size, "chars_size"); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(num_rows); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); }); @@ -115,3 +192,19 @@ AST_TRANSFORM_BENCHMARK_DEFINE( ast_int32_imbalanced_reuse_nulls, int32_t, TreeType::IMBALANCED_LEFT, true, true); AST_TRANSFORM_BENCHMARK_DEFINE( ast_double_imbalanced_unique_nulls, double, TreeType::IMBALANCED_LEFT, false, true); + +#define AST_STRING_COMPARE_TRANSFORM_BENCHMARK_DEFINE(name, cmp_op, reduce_op) \ + static void name(::nvbench::state& st) \ + { \ + ::BM_string_compare_ast_transform(st); \ + } \ + NVBENCH_BENCH(name) \ + .set_name(#name) \ + .add_int64_axis("string_width", {32, 64, 128, 256}) \ + .add_int64_axis("num_rows", {32768, 262144, 2097152}) \ + .add_int64_axis("num_comparisons", {1, 2, 3, 4}) \ + .add_int64_axis("hit_rate", {50, 100}) + +AST_STRING_COMPARE_TRANSFORM_BENCHMARK_DEFINE(ast_string_equal_logical_and, + cudf::ast::ast_operator::EQUAL, + cudf::ast::ast_operator::LOGICAL_AND); diff --git a/cpp/benchmarks/binaryop/binaryop.cpp b/cpp/benchmarks/binaryop/binaryop.cpp index 7d267a88764..35e41c6c2a4 100644 --- 
a/cpp/benchmarks/binaryop/binaryop.cpp +++ b/cpp/benchmarks/binaryop/binaryop.cpp @@ -17,12 +17,18 @@ #include #include +#include +#include #include #include +#include + #include #include +#include +#include // This set of benchmarks is designed to be a comparison for the AST benchmarks @@ -44,7 +50,8 @@ static void BM_binaryop_transform(nvbench::state& state) cudf::table_view table{*source_table}; // Use the number of bytes read from global memory - state.add_global_memory_reads(table_size * (tree_levels + 1)); + state.add_global_memory_reads(static_cast(table_size) * (tree_levels + 1)); + state.add_global_memory_writes(table_size); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) { // Execute tree that chains additions like (((a + b) + c) + d) @@ -64,11 +71,65 @@ static void BM_binaryop_transform(nvbench::state& state) }); } +template +static void BM_string_compare_binaryop_transform(nvbench::state& state) +{ + auto const string_width = static_cast(state.get_int64("string_width")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const num_comparisons = static_cast(state.get_int64("num_comparisons")); + auto const hit_rate = static_cast(state.get_int64("hit_rate")); + + CUDF_EXPECTS(num_comparisons > 0, "benchmarks require 1 or more comparisons"); + + // Create table data + auto const num_cols = num_comparisons * 2; + std::vector> columns; + std::for_each( + thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_cols), [&](size_t) { + columns.emplace_back(create_string_column(num_rows, string_width, hit_rate)); + }); + + cudf::table table{std::move(columns)}; + cudf::table_view const table_view = table.view(); + + int64_t const chars_size = std::accumulate( + table_view.begin(), table_view.end(), static_cast(0), [](int64_t size, auto& column) { + return size + cudf::strings_column_view{column}.chars_size(cudf::get_default_stream()); + }); + + // Create column references + + // Use the number of bytes read from 
global memory + state.add_element_count(chars_size, "chars_size"); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(num_rows); + + // Construct binary operations (a == b && c == d && e == f && ...) + auto constexpr bool_type = cudf::data_type{cudf::type_id::BOOL8}; + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream{launch.get_stream().get_stream()}; + std::unique_ptr reduction = + cudf::binary_operation(table.get_column(0), table.get_column(1), cmp_op, bool_type, stream); + std::for_each( + thrust::make_counting_iterator(1), + thrust::make_counting_iterator(num_comparisons), + [&](size_t idx) { + std::unique_ptr comparison = cudf::binary_operation( + table.get_column(idx * 2), table.get_column(idx * 2 + 1), cmp_op, bool_type, stream); + std::unique_ptr reduced = + cudf::binary_operation(*comparison, *reduction, reduce_op, bool_type, stream); + stream.synchronize(); + reduction = std::move(reduced); + }); + }); +} + #define BINARYOP_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns) \ \ static void name(::nvbench::state& st) \ { \ - BM_binaryop_transform(st); \ + ::BM_binaryop_transform(st); \ } \ NVBENCH_BENCH(name) \ .add_int64_axis("tree_levels", {1, 2, 5, 10}) \ @@ -86,3 +147,20 @@ BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_double_imbalanced_unique, double, TreeType::IMBALANCED_LEFT, false); + +#define STRING_COMPARE_BINARYOP_TRANSFORM_BENCHMARK_DEFINE(name, cmp_op, reduce_op) \ + \ + static void name(::nvbench::state& st) \ + { \ + ::BM_string_compare_binaryop_transform(st); \ + } \ + NVBENCH_BENCH(name) \ + .set_name(#name) \ + .add_int64_axis("string_width", {32, 64, 128, 256}) \ + .add_int64_axis("num_rows", {32768, 262144, 2097152}) \ + .add_int64_axis("num_comparisons", {1, 2, 3, 4}) \ + .add_int64_axis("hit_rate", {50, 100}) + +STRING_COMPARE_BINARYOP_TRANSFORM_BENCHMARK_DEFINE(string_compare_binaryop_transform, + cudf::binary_operator::EQUAL, + 
cudf::binary_operator::LOGICAL_AND); diff --git a/cpp/benchmarks/binaryop/compiled_binaryop.cpp b/cpp/benchmarks/binaryop/compiled_binaryop.cpp index bc0ff69bce9..cd3c3871a2e 100644 --- a/cpp/benchmarks/binaryop/compiled_binaryop.cpp +++ b/cpp/benchmarks/binaryop/compiled_binaryop.cpp @@ -39,7 +39,7 @@ void BM_compiled_binaryop(nvbench::state& state, cudf::binary_operator binop) // use number of bytes read and written to global memory state.add_global_memory_reads(table_size); state.add_global_memory_reads(table_size); - state.add_global_memory_reads(table_size); + state.add_global_memory_writes(table_size); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) { cudf::binary_operation(lhs, rhs, binop, output_dtype); }); diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index dc258e32dc5..bdce8a31176 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -17,13 +17,17 @@ #include "generate_input.hpp" #include "random_distribution_factory.cuh" +#include + #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -918,6 +922,58 @@ std::unique_ptr create_sequence_table(std::vector co return std::make_unique(std::move(columns)); } +std::unique_ptr create_string_column(cudf::size_type num_rows, + cudf::size_type row_width, + int32_t hit_rate) +{ + // build input table using the following data + auto raw_data = cudf::test::strings_column_wrapper( + { + "123 abc 4567890 DEFGHI 0987 5W43", // matches both patterns; + "012345 6789 01234 56789 0123 456", // the rest do not match + "abc 4567890 DEFGHI 0987 Wxyz 123", + "abcdefghijklmnopqrstuvwxyz 01234", + "", + "AbcéDEFGHIJKLMNOPQRSTUVWXYZ 01", + "9876543210,abcdefghijklmnopqrstU", + "9876543210,abcdefghijklmnopqrstU", + "123 édf 4567890 DéFG 0987 X5", + "1", + }) + .release(); + + if (row_width / 32 > 1) { + std::vector columns; + for (int i = 0; i < row_width / 32; ++i) { + 
columns.push_back(raw_data->view()); + } + raw_data = cudf::strings::concatenate(cudf::table_view(columns)); + } + auto data_view = raw_data->view(); + + // compute number of rows in n_rows that should match + auto const num_matches = (static_cast(num_rows) * hit_rate) / 100; + + // Create a randomized gather-map to build a column out of the strings in data. + data_profile gather_profile = + data_profile_builder().cardinality(0).null_probability(0.0).distribution( + cudf::type_id::INT32, distribution_id::UNIFORM, 1, data_view.size() - 1); + auto gather_table = + create_random_table({cudf::type_id::INT32}, row_count{num_rows}, gather_profile); + gather_table->get_column(0).set_null_mask(rmm::device_buffer{}, 0); + + // Create scatter map by placing 0-index values throughout the gather-map + auto scatter_data = cudf::sequence(num_matches, + cudf::numeric_scalar(0), + cudf::numeric_scalar(num_rows / num_matches)); + auto zero_scalar = cudf::numeric_scalar(0); + auto table = cudf::scatter({zero_scalar}, scatter_data->view(), gather_table->view()); + auto gather_map = table->view().column(0); + table = cudf::gather(cudf::table_view({data_view}), gather_map); + + return std::move(table->release().front()); +} + std::pair create_random_null_mask( cudf::size_type size, std::optional null_probability, unsigned seed) { diff --git a/cpp/benchmarks/common/generate_input.hpp b/cpp/benchmarks/common/generate_input.hpp index 68d3dc492f5..57834fd11d2 100644 --- a/cpp/benchmarks/common/generate_input.hpp +++ b/cpp/benchmarks/common/generate_input.hpp @@ -670,6 +670,18 @@ std::unique_ptr create_random_column(cudf::type_id dtype_id, data_profile const& data_params = data_profile{}, unsigned seed = 1); +/** + * @brief Deterministically generates a large string column filled with data with the given + * parameters. 
+ * + * @param num_rows Number of rows in the output column + * @param row_width Width of each string in the column + * @param hit_rate The hit rate percentage, ranging from 0 to 100 + */ +std::unique_ptr create_string_column(cudf::size_type num_rows, + cudf::size_type row_width, + int32_t hit_rate); + /** * @brief Generate sequence columns starting with value 0 in first row and increasing by 1 in * subsequent rows. diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp index fe24fb58728..45b46005c47 100644 --- a/cpp/benchmarks/io/cuio_common.cpp +++ b/cpp/benchmarks/io/cuio_common.cpp @@ -186,7 +186,7 @@ std::string exec_cmd(std::string_view cmd) std::fflush(nullptr); // Switch stderr and stdout to only capture stderr auto const redirected_cmd = std::string{"( "}.append(cmd).append(" 3>&2 2>&1 1>&3) 2>/dev/null"); - std::unique_ptr pipe(popen(redirected_cmd.c_str(), "r"), pclose); + std::unique_ptr pipe(popen(redirected_cmd.c_str(), "r"), pclose); CUDF_EXPECTS(pipe != nullptr, "popen() failed"); std::array buffer; diff --git a/cpp/benchmarks/ndsh/q01.cpp b/cpp/benchmarks/ndsh/q01.cpp index ef709926ae9..485e8e5497c 100644 --- a/cpp/benchmarks/ndsh/q01.cpp +++ b/cpp/benchmarks/ndsh/q01.cpp @@ -104,7 +104,7 @@ } void run_ndsh_q1(nvbench::state& state, - std::unordered_map& sources) + std::unordered_map& sources) { // Define the column projections and filter predicate for `lineitem` table std::vector const lineitem_cols = {"l_returnflag", @@ -124,8 +124,8 @@ void run_ndsh_q1(nvbench::state& state, cudf::ast::ast_operator::LESS_EQUAL, shipdate_ref, shipdate_upper_literal); // Read out the `lineitem` table from parquet file - auto lineitem = - read_parquet(sources["lineitem"].make_source_info(), lineitem_cols, std::move(lineitem_pred)); + auto lineitem = read_parquet( + sources.at("lineitem").make_source_info(), lineitem_cols, std::move(lineitem_pred)); // Calculate the discount price and charge columns and append to lineitem table auto 
disc_price = @@ -170,7 +170,7 @@ void ndsh_q1(nvbench::state& state) { // Generate the required parquet files in device buffers double const scale_factor = state.get_float64("scale_factor"); - std::unordered_map sources; + std::unordered_map sources; generate_parquet_data_sources(scale_factor, {"lineitem"}, sources); auto stream = cudf::get_default_stream(); diff --git a/cpp/benchmarks/ndsh/q05.cpp b/cpp/benchmarks/ndsh/q05.cpp index 522bc4789c2..1c2d657913e 100644 --- a/cpp/benchmarks/ndsh/q05.cpp +++ b/cpp/benchmarks/ndsh/q05.cpp @@ -89,7 +89,7 @@ } void run_ndsh_q5(nvbench::state& state, - std::unordered_map& sources) + std::unordered_map& sources) { // Define the column projection and filter predicate for the `orders` table std::vector const orders_cols = {"o_custkey", "o_orderkey", "o_orderdate"}; @@ -120,17 +120,17 @@ void run_ndsh_q5(nvbench::state& state, // Read out the tables from parquet files // while pushing down the column projections and filter predicates auto const customer = - read_parquet(sources["customer"].make_source_info(), {"c_custkey", "c_nationkey"}); + read_parquet(sources.at("customer").make_source_info(), {"c_custkey", "c_nationkey"}); auto const orders = - read_parquet(sources["orders"].make_source_info(), orders_cols, std::move(orders_pred)); - auto const lineitem = read_parquet(sources["lineitem"].make_source_info(), + read_parquet(sources.at("orders").make_source_info(), orders_cols, std::move(orders_pred)); + auto const lineitem = read_parquet(sources.at("lineitem").make_source_info(), {"l_orderkey", "l_suppkey", "l_extendedprice", "l_discount"}); auto const supplier = - read_parquet(sources["supplier"].make_source_info(), {"s_suppkey", "s_nationkey"}); + read_parquet(sources.at("supplier").make_source_info(), {"s_suppkey", "s_nationkey"}); auto const nation = - read_parquet(sources["nation"].make_source_info(), {"n_nationkey", "n_regionkey", "n_name"}); + read_parquet(sources.at("nation").make_source_info(), {"n_nationkey", 
"n_regionkey", "n_name"}); auto const region = - read_parquet(sources["region"].make_source_info(), region_cols, std::move(region_pred)); + read_parquet(sources.at("region").make_source_info(), region_cols, std::move(region_pred)); // Perform the joins auto const join_a = apply_inner_join(region, nation, {"r_regionkey"}, {"n_regionkey"}); @@ -165,7 +165,7 @@ void ndsh_q5(nvbench::state& state) { // Generate the required parquet files in device buffers double const scale_factor = state.get_float64("scale_factor"); - std::unordered_map sources; + std::unordered_map sources; generate_parquet_data_sources( scale_factor, {"customer", "orders", "lineitem", "supplier", "nation", "region"}, sources); diff --git a/cpp/benchmarks/ndsh/q06.cpp b/cpp/benchmarks/ndsh/q06.cpp index 04078547973..e1e56c3622e 100644 --- a/cpp/benchmarks/ndsh/q06.cpp +++ b/cpp/benchmarks/ndsh/q06.cpp @@ -64,7 +64,7 @@ } void run_ndsh_q6(nvbench::state& state, - std::unordered_map& sources) + std::unordered_map& sources) { // Read out the `lineitem` table from parquet file std::vector const lineitem_cols = { @@ -83,8 +83,8 @@ void run_ndsh_q6(nvbench::state& state, cudf::ast::operation(cudf::ast::ast_operator::LESS, shipdate_ref, shipdate_upper_literal); auto const lineitem_pred = std::make_unique( cudf::ast::ast_operator::LOGICAL_AND, shipdate_pred_a, shipdate_pred_b); - auto lineitem = - read_parquet(sources["lineitem"].make_source_info(), lineitem_cols, std::move(lineitem_pred)); + auto lineitem = read_parquet( + sources.at("lineitem").make_source_info(), lineitem_cols, std::move(lineitem_pred)); // Cast the discount and quantity columns to float32 and append to lineitem table auto discout_float = @@ -134,7 +134,7 @@ void ndsh_q6(nvbench::state& state) { // Generate the required parquet files in device buffers double const scale_factor = state.get_float64("scale_factor"); - std::unordered_map sources; + std::unordered_map sources; generate_parquet_data_sources(scale_factor, {"lineitem"}, sources); 
auto stream = cudf::get_default_stream(); diff --git a/cpp/benchmarks/ndsh/q09.cpp b/cpp/benchmarks/ndsh/q09.cpp index 59218ab8912..2e9a69d9ee2 100644 --- a/cpp/benchmarks/ndsh/q09.cpp +++ b/cpp/benchmarks/ndsh/q09.cpp @@ -112,20 +112,21 @@ } void run_ndsh_q9(nvbench::state& state, - std::unordered_map& sources) + std::unordered_map& sources) { // Read out the table from parquet files auto const lineitem = read_parquet( - sources["lineitem"].make_source_info(), + sources.at("lineitem").make_source_info(), {"l_suppkey", "l_partkey", "l_orderkey", "l_extendedprice", "l_discount", "l_quantity"}); - auto const nation = read_parquet(sources["nation"].make_source_info(), {"n_nationkey", "n_name"}); + auto const nation = + read_parquet(sources.at("nation").make_source_info(), {"n_nationkey", "n_name"}); auto const orders = - read_parquet(sources["orders"].make_source_info(), {"o_orderkey", "o_orderdate"}); - auto const part = read_parquet(sources["part"].make_source_info(), {"p_partkey", "p_name"}); - auto const partsupp = read_parquet(sources["partsupp"].make_source_info(), + read_parquet(sources.at("orders").make_source_info(), {"o_orderkey", "o_orderdate"}); + auto const part = read_parquet(sources.at("part").make_source_info(), {"p_partkey", "p_name"}); + auto const partsupp = read_parquet(sources.at("partsupp").make_source_info(), {"ps_suppkey", "ps_partkey", "ps_supplycost"}); auto const supplier = - read_parquet(sources["supplier"].make_source_info(), {"s_suppkey", "s_nationkey"}); + read_parquet(sources.at("supplier").make_source_info(), {"s_suppkey", "s_nationkey"}); // Generating the `profit` table // Filter the part table using `p_name like '%green%'` @@ -178,7 +179,7 @@ void ndsh_q9(nvbench::state& state) { // Generate the required parquet files in device buffers double const scale_factor = state.get_float64("scale_factor"); - std::unordered_map sources; + std::unordered_map sources; generate_parquet_data_sources( scale_factor, {"part", "supplier", "lineitem", 
"partsupp", "orders", "nation"}, sources); diff --git a/cpp/benchmarks/ndsh/q10.cpp b/cpp/benchmarks/ndsh/q10.cpp index a520480020a..72edd15083d 100644 --- a/cpp/benchmarks/ndsh/q10.cpp +++ b/cpp/benchmarks/ndsh/q10.cpp @@ -94,7 +94,7 @@ } void run_ndsh_q10(nvbench::state& state, - std::unordered_map& sources) + std::unordered_map& sources) { // Define the column projection and filter predicate for the `orders` table std::vector const orders_cols = {"o_custkey", "o_orderkey", "o_orderdate"}; @@ -122,15 +122,16 @@ void run_ndsh_q10(nvbench::state& state, // Read out the tables from parquet files // while pushing down the column projections and filter predicates auto const customer = read_parquet( - sources["customer"].make_source_info(), + sources.at("customer").make_source_info(), {"c_custkey", "c_name", "c_nationkey", "c_acctbal", "c_address", "c_phone", "c_comment"}); auto const orders = - read_parquet(sources["orders"].make_source_info(), orders_cols, std::move(orders_pred)); + read_parquet(sources.at("orders").make_source_info(), orders_cols, std::move(orders_pred)); auto const lineitem = - read_parquet(sources["lineitem"].make_source_info(), + read_parquet(sources.at("lineitem").make_source_info(), {"l_extendedprice", "l_discount", "l_orderkey", "l_returnflag"}, std::move(lineitem_pred)); - auto const nation = read_parquet(sources["nation"].make_source_info(), {"n_name", "n_nationkey"}); + auto const nation = + read_parquet(sources.at("nation").make_source_info(), {"n_name", "n_nationkey"}); // Perform the joins auto const join_a = apply_inner_join(customer, nation, {"c_nationkey"}, {"n_nationkey"}); @@ -163,7 +164,7 @@ void ndsh_q10(nvbench::state& state) { // Generate the required parquet files in device buffers double const scale_factor = state.get_float64("scale_factor"); - std::unordered_map sources; + std::unordered_map sources; generate_parquet_data_sources( scale_factor, {"customer", "orders", "lineitem", "nation"}, sources); diff --git 
a/cpp/benchmarks/ndsh/utilities.cpp b/cpp/benchmarks/ndsh/utilities.cpp index 62116ddf661..9f9849860c9 100644 --- a/cpp/benchmarks/ndsh/utilities.cpp +++ b/cpp/benchmarks/ndsh/utilities.cpp @@ -17,6 +17,8 @@ #include "utilities.hpp" #include "common/ndsh_data_generator/ndsh_data_generator.hpp" +#include "common/table_utilities.hpp" +#include "cudf/detail/utilities/integer_utils.hpp" #include #include @@ -30,8 +32,15 @@ #include #include +#include +#include +#include + +#include #include #include +#include +#include namespace { @@ -85,6 +94,15 @@ std::vector const NATION_SCHEMA = { "n_nationkey", "n_name", "n_regionkey", "n_comment"}; std::vector const REGION_SCHEMA = {"r_regionkey", "r_name", "r_comment"}; +std::unordered_map const> const SCHEMAS = { + {"orders", ORDERS_SCHEMA}, + {"lineitem", LINEITEM_SCHEMA}, + {"part", PART_SCHEMA}, + {"partsupp", PARTSUPP_SCHEMA}, + {"supplier", SUPPLIER_SCHEMA}, + {"customer", CUSTOMER_SCHEMA}, + {"nation", NATION_SCHEMA}, + {"region", REGION_SCHEMA}}; } // namespace cudf::table_view table_with_names::table() const { return tbl->view(); } @@ -337,7 +355,7 @@ int32_t days_since_epoch(int year, int month, int day) void write_to_parquet_device_buffer(std::unique_ptr const& table, std::vector const& col_names, - parquet_device_buffer& source) + cuio_source_sink_pair& source) { CUDF_FUNC_RANGE(); auto const stream = cudf::get_default_stream(); @@ -351,55 +369,124 @@ void write_to_parquet_device_buffer(std::unique_ptr const& table, metadata.schema_info = col_name_infos; auto const table_input_metadata = cudf::io::table_input_metadata{metadata}; - // Declare a host and device buffer - std::vector h_buffer; - + auto est_size = static_cast(estimate_size(table->view())); + constexpr auto PQ_MAX_TABLE_BYTES = 8ul << 30; // 8GB + // TODO: best to get this limit from percent_of_free_device_memory(50) of device memory resource. 
+ if (est_size > PQ_MAX_TABLE_BYTES) { + auto builder = cudf::io::chunked_parquet_writer_options::builder(source.make_sink_info()); + builder.metadata(table_input_metadata); + auto const options = builder.build(); + auto num_splits = static_cast( + std::ceil(static_cast(est_size) / (PQ_MAX_TABLE_BYTES))); + std::vector splits(num_splits - 1); + auto num_rows = table->num_rows(); + auto num_row_per_chunk = cudf::util::div_rounding_up_safe(num_rows, num_splits); + std::generate_n(splits.begin(), splits.size(), [num_row_per_chunk, i = 0]() mutable { + return (i += num_row_per_chunk); + }); + std::vector split_tables = cudf::split(table->view(), splits, stream); + auto writer = cudf::io::parquet_chunked_writer(options, stream); + for (auto const& chunk_table : split_tables) { + writer.write(chunk_table); + } + writer.close(); + return; + } // Write parquet data to host buffer - auto builder = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&h_buffer), table->view()); + auto builder = cudf::io::parquet_writer_options::builder(source.make_sink_info(), table->view()); builder.metadata(table_input_metadata); auto const options = builder.build(); - cudf::io::write_parquet(options); + cudf::io::write_parquet(options, stream); +} - // Copy host buffer to device buffer - source.d_buffer.resize(h_buffer.size(), stream); - CUDF_CUDA_TRY(cudaMemcpyAsync( - source.d_buffer.data(), h_buffer.data(), h_buffer.size(), cudaMemcpyDefault, stream.value())); +inline auto make_managed_pool() +{ + return rmm::mr::make_owning_wrapper( + std::make_shared(), rmm::percent_of_free_device_memory(50)); } void generate_parquet_data_sources(double scale_factor, std::vector const& table_names, - std::unordered_map& sources) + std::unordered_map& sources) { CUDF_FUNC_RANGE(); - std::for_each(table_names.begin(), table_names.end(), [&](auto const& table_name) { - sources[table_name] = parquet_device_buffer(); - }); - auto [orders, lineitem, part] = 
cudf::datagen::generate_orders_lineitem_part( - scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + // Set the memory resource to the managed pool + auto old_mr = cudf::get_current_device_resource(); + // if already managed pool or managed, don't create new one. + using managed_pool_mr_t = decltype(make_managed_pool()); + managed_pool_mr_t managed_pool_mr; + bool const is_managed = + dynamic_cast*>(old_mr) or + dynamic_cast(old_mr); + if (!is_managed) { + std::cout << "Creating managed pool just for data generation\n"; + managed_pool_mr = make_managed_pool(); + cudf::set_current_device_resource(managed_pool_mr.get()); + // drawback: if already pool takes 50% of free memory, we are left with 50% of 50% of free + // memory. + } - auto partsupp = cudf::datagen::generate_partsupp( - scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + std::unordered_set const requested_table_names = [&table_names]() { + if (table_names.empty()) { + return std::unordered_set{ + "orders", "lineitem", "part", "partsupp", "supplier", "customer", "nation", "region"}; + } + return std::unordered_set(table_names.begin(), table_names.end()); + }(); + std::for_each( + requested_table_names.begin(), requested_table_names.end(), [&](auto const& table_name) { + sources.emplace(table_name, cuio_source_sink_pair(io_type::HOST_BUFFER)); + }); + std::unordered_map> tables; + + if (sources.count("orders") or sources.count("lineitem") or sources.count("part")) { + auto [orders, lineitem, part] = cudf::datagen::generate_orders_lineitem_part( + scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + if (sources.count("orders")) { + write_to_parquet_device_buffer(orders, SCHEMAS.at("orders"), sources.at("orders")); + orders = {}; + } + if (sources.count("part")) { + write_to_parquet_device_buffer(part, SCHEMAS.at("part"), sources.at("part")); + part = {}; + } + if (sources.count("lineitem")) { + 
write_to_parquet_device_buffer(lineitem, SCHEMAS.at("lineitem"), sources.at("lineitem")); + lineitem = {}; + } + } + + if (sources.count("partsupp")) { + auto partsupp = cudf::datagen::generate_partsupp( + scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + write_to_parquet_device_buffer(partsupp, SCHEMAS.at("partsupp"), sources.at("partsupp")); + } - auto supplier = cudf::datagen::generate_supplier( - scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + if (sources.count("supplier")) { + auto supplier = cudf::datagen::generate_supplier( + scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + write_to_parquet_device_buffer(supplier, SCHEMAS.at("supplier"), sources.at("supplier")); + } - auto customer = cudf::datagen::generate_customer( - scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + if (sources.count("customer")) { + auto customer = cudf::datagen::generate_customer( + scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + write_to_parquet_device_buffer(customer, SCHEMAS.at("customer"), sources.at("customer")); + } - auto nation = cudf::datagen::generate_nation(cudf::get_default_stream(), - cudf::get_current_device_resource_ref()); + if (sources.count("nation")) { + auto nation = cudf::datagen::generate_nation(cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + write_to_parquet_device_buffer(nation, SCHEMAS.at("nation"), sources.at("nation")); + } - auto region = cudf::datagen::generate_region(cudf::get_default_stream(), - cudf::get_current_device_resource_ref()); + if (sources.count("region")) { + auto region = cudf::datagen::generate_region(cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + write_to_parquet_device_buffer(region, SCHEMAS.at("region"), sources.at("region")); + } - write_to_parquet_device_buffer(std::move(orders), ORDERS_SCHEMA, 
sources["orders"]); - write_to_parquet_device_buffer(std::move(lineitem), LINEITEM_SCHEMA, sources["lineitem"]); - write_to_parquet_device_buffer(std::move(part), PART_SCHEMA, sources["part"]); - write_to_parquet_device_buffer(std::move(partsupp), PARTSUPP_SCHEMA, sources["partsupp"]); - write_to_parquet_device_buffer(std::move(customer), CUSTOMER_SCHEMA, sources["customer"]); - write_to_parquet_device_buffer(std::move(supplier), SUPPLIER_SCHEMA, sources["supplier"]); - write_to_parquet_device_buffer(std::move(nation), NATION_SCHEMA, sources["nation"]); - write_to_parquet_device_buffer(std::move(region), REGION_SCHEMA, sources["region"]); + // Restore the original memory resource + if (!is_managed) { cudf::set_current_device_resource(old_mr); } } diff --git a/cpp/benchmarks/ndsh/utilities.hpp b/cpp/benchmarks/ndsh/utilities.hpp index 762e43deccf..cae07f86a98 100644 --- a/cpp/benchmarks/ndsh/utilities.hpp +++ b/cpp/benchmarks/ndsh/utilities.hpp @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#include "io/cuio_common.hpp" + #include #include #include @@ -196,24 +198,15 @@ std::tm make_tm(int year, int month, int day); int32_t days_since_epoch(int year, int month, int day); /** - * @brief Struct representing a parquet device buffer - */ -struct parquet_device_buffer { - parquet_device_buffer() : d_buffer{0, cudf::get_default_stream()} {}; - cudf::io::source_info make_source_info() { return cudf::io::source_info(d_buffer); } - rmm::device_uvector d_buffer; -}; - -/** - * @brief Write a `cudf::table` to a parquet device buffer + * @brief Write a `cudf::table` to a parquet cuio sink * * @param table The `cudf::table` to write * @param col_names The column names of the table - * @param parquet_device_buffer The parquet device buffer to write the table to + * @param source The source sink pair to write the table to */ void write_to_parquet_device_buffer(std::unique_ptr const& table, std::vector const& col_names, - parquet_device_buffer& source); + cuio_source_sink_pair& source); /** * @brief Generate NDS-H tables and write to parquet device buffers @@ -224,4 +217,4 @@ void write_to_parquet_device_buffer(std::unique_ptr const& table, */ void generate_parquet_data_sources(double scale_factor, std::vector const& table_names, - std::unordered_map& sources); + std::unordered_map& sources); diff --git a/cpp/benchmarks/string/contains.cpp b/cpp/benchmarks/string/contains.cpp index ae6c8b844c8..a73017dda18 100644 --- a/cpp/benchmarks/string/contains.cpp +++ b/cpp/benchmarks/string/contains.cpp @@ -17,10 +17,6 @@ #include #include -#include - -#include -#include #include #include #include @@ -28,57 +24,6 @@ #include -std::unique_ptr build_input_column(cudf::size_type n_rows, - cudf::size_type row_width, - int32_t hit_rate) -{ - // build input table using the following data - auto raw_data = cudf::test::strings_column_wrapper( - { - "123 abc 4567890 DEFGHI 0987 5W43", // matches both patterns; - "012345 6789 01234 56789 0123 456", // the rest do not match - "abc 
4567890 DEFGHI 0987 Wxyz 123", - "abcdefghijklmnopqrstuvwxyz 01234", - "", - "AbcéDEFGHIJKLMNOPQRSTUVWXYZ 01", - "9876543210,abcdefghijklmnopqrstU", - "9876543210,abcdefghijklmnopqrstU", - "123 édf 4567890 DéFG 0987 X5", - "1", - }) - .release(); - - if (row_width / 32 > 1) { - std::vector columns; - for (int i = 0; i < row_width / 32; ++i) { - columns.push_back(raw_data->view()); - } - raw_data = cudf::strings::concatenate(cudf::table_view(columns)); - } - auto data_view = raw_data->view(); - - // compute number of rows in n_rows that should match - auto matches = static_cast(n_rows * hit_rate) / 100; - - // Create a randomized gather-map to build a column out of the strings in data. - data_profile gather_profile = - data_profile_builder().cardinality(0).null_probability(0.0).distribution( - cudf::type_id::INT32, distribution_id::UNIFORM, 1, data_view.size() - 1); - auto gather_table = - create_random_table({cudf::type_id::INT32}, row_count{n_rows}, gather_profile); - gather_table->get_column(0).set_null_mask(rmm::device_buffer{}, 0); - - // Create scatter map by placing 0-index values throughout the gather-map - auto scatter_data = cudf::sequence( - matches, cudf::numeric_scalar(0), cudf::numeric_scalar(n_rows / matches)); - auto zero_scalar = cudf::numeric_scalar(0); - auto table = cudf::scatter({zero_scalar}, scatter_data->view(), gather_table->view()); - auto gather_map = table->view().column(0); - table = cudf::gather(cudf::table_view({data_view}), gather_map); - - return std::move(table->release().front()); -} - // longer pattern lengths demand more working memory per string std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$", "5W43"}; @@ -94,7 +39,7 @@ static void bench_contains(nvbench::state& state) state.skip("Skip benchmarks greater than size_type limit"); } - auto col = build_input_column(n_rows, row_width, hit_rate); + auto col = create_string_column(n_rows, row_width, hit_rate); auto input = 
cudf::strings_column_view(col->view()); auto pattern = patterns[pattern_index]; diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp index a9c620e4bf0..996bdcf0332 100644 --- a/cpp/benchmarks/string/find.cpp +++ b/cpp/benchmarks/string/find.cpp @@ -19,7 +19,6 @@ #include -#include #include #include #include @@ -29,10 +28,6 @@ #include -std::unique_ptr build_input_column(cudf::size_type n_rows, - cudf::size_type row_width, - int32_t hit_rate); - static void bench_find_string(nvbench::state& state) { auto const n_rows = static_cast(state.get_int64("num_rows")); @@ -46,7 +41,7 @@ static void bench_find_string(nvbench::state& state) } auto const stream = cudf::get_default_stream(); - auto const col = build_input_column(n_rows, row_width, hit_rate); + auto const col = create_string_column(n_rows, row_width, hit_rate); auto const input = cudf::strings_column_view(col->view()); std::vector h_targets({"5W", "5W43", "0987 5W43"}); diff --git a/cpp/benchmarks/string/like.cpp b/cpp/benchmarks/string/like.cpp index 99cef640dc3..105ae65cbe8 100644 --- a/cpp/benchmarks/string/like.cpp +++ b/cpp/benchmarks/string/like.cpp @@ -18,68 +18,12 @@ #include -#include -#include -#include #include #include #include #include -namespace { -std::unique_ptr build_input_column(cudf::size_type n_rows, - cudf::size_type row_width, - int32_t hit_rate) -{ - // build input table using the following data - auto raw_data = cudf::test::strings_column_wrapper( - { - "123 abc 4567890 DEFGHI 0987 5W43", // matches always; - "012345 6789 01234 56789 0123 456", // the rest do not match - "abc 4567890 DEFGHI 0987 Wxyz 123", - "abcdefghijklmnopqrstuvwxyz 01234", - "", - "AbcéDEFGHIJKLMNOPQRSTUVWXYZ 01", - "9876543210,abcdefghijklmnopqrstU", - "9876543210,abcdefghijklmnopqrstU", - "123 édf 4567890 DéFG 0987 X5", - "1", - }) - .release(); - if (row_width / 32 > 1) { - std::vector columns; - for (int i = 0; i < row_width / 32; ++i) { - columns.push_back(raw_data->view()); - } - raw_data 
= cudf::strings::concatenate(cudf::table_view(columns)); - } - auto data_view = raw_data->view(); - - // compute number of rows in n_rows that should match - auto matches = static_cast(n_rows * hit_rate) / 100; - - // Create a randomized gather-map to build a column out of the strings in data. - data_profile gather_profile = - data_profile_builder().cardinality(0).null_probability(0.0).distribution( - cudf::type_id::INT32, distribution_id::UNIFORM, 1, data_view.size() - 1); - auto gather_table = - create_random_table({cudf::type_id::INT32}, row_count{n_rows}, gather_profile); - gather_table->get_column(0).set_null_mask(rmm::device_buffer{}, 0); - - // Create scatter map by placing 0-index values throughout the gather-map - auto scatter_data = cudf::sequence( - matches, cudf::numeric_scalar(0), cudf::numeric_scalar(n_rows / matches)); - auto zero_scalar = cudf::numeric_scalar(0); - auto table = cudf::scatter({zero_scalar}, scatter_data->view(), gather_table->view()); - auto gather_map = table->view().column(0); - table = cudf::gather(cudf::table_view({data_view}), gather_map); - - return std::move(table->release().front()); -} - -} // namespace - static void bench_like(nvbench::state& state) { auto const n_rows = static_cast(state.get_int64("num_rows")); @@ -91,7 +35,7 @@ static void bench_like(nvbench::state& state) state.skip("Skip benchmarks greater than size_type limit"); } - auto col = build_input_column(n_rows, row_width, hit_rate); + auto col = create_string_column(n_rows, row_width, hit_rate); auto input = cudf::strings_column_view(col->view()); // This pattern forces reading the entire target string (when matched expected) diff --git a/cpp/benchmarks/text/ngrams.cpp b/cpp/benchmarks/text/ngrams.cpp index 8e48f8e9a05..43d57201b20 100644 --- a/cpp/benchmarks/text/ngrams.cpp +++ b/cpp/benchmarks/text/ngrams.cpp @@ -15,58 +15,45 @@ */ #include -#include -#include -#include #include #include #include -class TextNGrams : public cudf::benchmark {}; +#include -enum 
class ngrams_type { tokens, characters }; - -static void BM_ngrams(benchmark::State& state, ngrams_type nt) +static void bench_ngrams(nvbench::state& state) { - auto const n_rows = static_cast(state.range(0)); - auto const max_str_length = static_cast(state.range(1)); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const ngram_type = state.get_string("type"); + data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); auto const separator = cudf::string_scalar("_"); - for (auto _ : state) { - cuda_event_timer raii(state, true); - switch (nt) { - case ngrams_type::tokens: nvtext::generate_ngrams(input, 2, separator); break; - case ngrams_type::characters: nvtext::generate_character_ngrams(input); break; - } - } + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream())); -} + auto chars_size = input.chars_size(cudf::get_default_stream()); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(chars_size * 2); -static void generate_bench_args(benchmark::internal::Benchmark* b) -{ - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_mult = 8; - int const min_rowlen = 5; - int const max_rowlen = 40; - int const len_mult = 2; - generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); + if (ngram_type == "chars") { + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto 
result = nvtext::generate_character_ngrams(input); + }); + } else { + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = nvtext::generate_ngrams(input, 2, separator); + }); + } } -#define NVTEXT_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(TextNGrams, name) \ - (::benchmark::State & st) { BM_ngrams(st, ngrams_type::name); } \ - BENCHMARK_REGISTER_F(TextNGrams, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -NVTEXT_BENCHMARK_DEFINE(tokens) -NVTEXT_BENCHMARK_DEFINE(characters) +NVBENCH_BENCH(bench_ngrams) + .set_name("ngrams") + .add_int64_axis("num_rows", {131072, 262144, 524288, 1048578}) + .add_int64_axis("row_width", {10, 20, 40, 100}) + .add_string_axis("type", {"chars", "tokens"}); diff --git a/cpp/include/cudf/ast/detail/expression_parser.hpp b/cpp/include/cudf/ast/detail/expression_parser.hpp index a254171ef11..f4cce8e6da6 100644 --- a/cpp/include/cudf/ast/detail/expression_parser.hpp +++ b/cpp/include/cudf/ast/detail/expression_parser.hpp @@ -17,12 +17,8 @@ #include #include -#include #include #include -#include - -#include #include #include diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp index 6bbe32de134..e72661ce49a 100644 --- a/cpp/include/cudf/column/column_factories.hpp +++ b/cpp/include/cudf/column/column_factories.hpp @@ -24,8 +24,6 @@ #include -#include - namespace CUDF_EXPORT cudf { /** * @addtogroup column_factories diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index 48f89b8be25..6db5c8b3c7b 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -16,7 +16,6 @@ #pragma once #include -#include #include #include #include diff --git a/cpp/include/cudf/detail/aggregation/result_cache.hpp b/cpp/include/cudf/detail/aggregation/result_cache.hpp index ec5a511bb7c..486808ebe18 100644 --- 
a/cpp/include/cudf/detail/aggregation/result_cache.hpp +++ b/cpp/include/cudf/detail/aggregation/result_cache.hpp @@ -19,7 +19,6 @@ #include #include #include -#include #include diff --git a/cpp/include/cudf/detail/is_element_valid.hpp b/cpp/include/cudf/detail/is_element_valid.hpp index 4b74d12f306..26b1bec2ced 100644 --- a/cpp/include/cudf/detail/is_element_valid.hpp +++ b/cpp/include/cudf/detail/is_element_valid.hpp @@ -17,7 +17,6 @@ #pragma once #include -#include #include #include diff --git a/cpp/include/cudf/detail/iterator.cuh b/cpp/include/cudf/detail/iterator.cuh index 4349e1b70fd..30f36d6a5da 100644 --- a/cpp/include/cudf/detail/iterator.cuh +++ b/cpp/include/cudf/detail/iterator.cuh @@ -38,18 +38,19 @@ #include #include +#include +#include #include #include #include #include -#include - namespace cudf { namespace detail { /** * @brief Convenience wrapper for creating a `thrust::transform_iterator` over a - * `thrust::counting_iterator`. + * `thrust::counting_iterator` within the range [0, INT_MAX]. + * * * Example: * @code{.cpp} @@ -62,14 +63,21 @@ namespace detail { * iter[n] == n * n * @endcode * - * @param start The starting value of the counting iterator + * @param start The starting value of the counting iterator (must be size_type or smaller type). * @param f The unary function to apply to the counting iterator. 
* @return A transform iterator that applies `f` to a counting iterator */ -template -CUDF_HOST_DEVICE inline auto make_counting_transform_iterator(cudf::size_type start, +template +CUDF_HOST_DEVICE inline auto make_counting_transform_iterator(CountingIterType start, UnaryFunction f) { + // Check if the `start` for counting_iterator is of size_type or a smaller integral type + static_assert( + cuda::std::is_integral_v and + cuda::std::numeric_limits::digits <= + cuda::std::numeric_limits::digits, + "The `start` for the counting_transform_iterator must be size_type or smaller type"); + return thrust::make_transform_iterator(thrust::make_counting_iterator(start), f); } diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 482265d633e..025e2ccc3ec 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -166,16 +166,9 @@ size_type inplace_bitmask_binop(Binop op, rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref(); cudf::detail::device_scalar d_counter{0, stream, mr}; - rmm::device_uvector d_masks(masks.size(), stream, mr); - rmm::device_uvector d_begin_bits(masks_begin_bits.size(), stream, mr); - - CUDF_CUDA_TRY(cudaMemcpyAsync( - d_masks.data(), masks.data(), masks.size_bytes(), cudaMemcpyDefault, stream.value())); - CUDF_CUDA_TRY(cudaMemcpyAsync(d_begin_bits.data(), - masks_begin_bits.data(), - masks_begin_bits.size_bytes(), - cudaMemcpyDefault, - stream.value())); + + auto d_masks = cudf::detail::make_device_uvector_async(masks, stream, mr); + auto d_begin_bits = cudf::detail::make_device_uvector_async(masks_begin_bits, stream, mr); auto constexpr block_size = 256; cudf::detail::grid_1d config(dest_mask.size(), block_size); diff --git a/cpp/include/cudf/detail/unary.hpp b/cpp/include/cudf/detail/unary.hpp index 18b1e9b2d2e..0f852db0c54 100644 --- a/cpp/include/cudf/detail/unary.hpp +++ b/cpp/include/cudf/detail/unary.hpp @@ -59,7 +59,7 @@ std::unique_ptr 
true_if(InputIterator begin, auto output_mutable_view = output->mutable_view(); auto output_data = output_mutable_view.data(); - thrust::transform(rmm::exec_policy(stream), begin, end, output_data, p); + thrust::transform(rmm::exec_policy_nosync(stream), begin, end, output_data, p); return output; } diff --git a/cpp/include/cudf/detail/utilities/batched_memset.hpp b/cpp/include/cudf/detail/utilities/batched_memset.hpp index 75f738f7529..78be5b91248 100644 --- a/cpp/include/cudf/detail/utilities/batched_memset.hpp +++ b/cpp/include/cudf/detail/utilities/batched_memset.hpp @@ -53,8 +53,8 @@ void batched_memset(std::vector> const& bufs, cudf::detail::make_device_uvector_async(bufs, stream, cudf::get_current_device_resource_ref()); // get a vector with the sizes of all buffers - auto sizes = cudf::detail::make_counting_transform_iterator( - static_cast(0), + auto sizes = thrust::make_transform_iterator( + thrust::counting_iterator(0), cuda::proclaim_return_type( [gpu_bufs = gpu_bufs.data()] __device__(std::size_t i) { return gpu_bufs[i].size(); })); diff --git a/cpp/include/cudf/dictionary/dictionary_column_view.hpp b/cpp/include/cudf/dictionary/dictionary_column_view.hpp index 5596f78a90b..0a799f27d00 100644 --- a/cpp/include/cudf/dictionary/dictionary_column_view.hpp +++ b/cpp/include/cudf/dictionary/dictionary_column_view.hpp @@ -15,7 +15,6 @@ */ #pragma once -#include #include /** diff --git a/cpp/include/cudf/io/config_utils.hpp b/cpp/include/cudf/io/config_utils.hpp index 1827ba0e3e6..13a76d50346 100644 --- a/cpp/include/cudf/io/config_utils.hpp +++ b/cpp/include/cudf/io/config_utils.hpp @@ -18,7 +18,8 @@ #include namespace CUDF_EXPORT cudf { -namespace io::cufile_integration { +namespace io { +namespace cufile_integration { /** * @brief Returns true if cuFile and its compatibility mode are enabled. 
@@ -35,9 +36,15 @@ bool is_gds_enabled(); */ bool is_kvikio_enabled(); -} // namespace io::cufile_integration +/** + * @brief Set kvikIO thread pool size according to the environment variable KVIKIO_NTHREADS. If + * KVIKIO_NTHREADS is not set, use 8 threads by default. + */ +void set_thread_pool_nthreads_from_env(); + +} // namespace cufile_integration -namespace io::nvcomp_integration { +namespace nvcomp_integration { /** * @brief Returns true if all nvCOMP uses are enabled. @@ -49,5 +56,6 @@ bool is_all_enabled(); */ bool is_stable_enabled(); -} // namespace io::nvcomp_integration +} // namespace nvcomp_integration +} // namespace io } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/text/detail/bgzip_utils.hpp b/cpp/include/cudf/io/text/detail/bgzip_utils.hpp index 11eb4518210..5659f86b0c4 100644 --- a/cpp/include/cudf/io/text/detail/bgzip_utils.hpp +++ b/cpp/include/cudf/io/text/detail/bgzip_utils.hpp @@ -16,16 +16,10 @@ #pragma once -#include #include #include -#include - -#include -#include #include -#include namespace CUDF_EXPORT cudf { namespace io::text::detail::bgzip { diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh index fb0b25cf9f1..de2f1770e28 100644 --- a/cpp/include/cudf/strings/detail/strings_children.cuh +++ b/cpp/include/cudf/strings/detail/strings_children.cuh @@ -65,19 +65,20 @@ rmm::device_uvector make_chars_buffer(column_view const& offsets, auto chars_data = rmm::device_uvector(chars_size, stream, mr); auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets); - auto const src_ptrs = cudf::detail::make_counting_transform_iterator( - 0u, cuda::proclaim_return_type([begin] __device__(uint32_t idx) { + auto const src_ptrs = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type([begin] __device__(uint32_t idx) { // Due to a bug in cub (https://github.com/NVIDIA/cccl/issues/586), 
// we have to use `const_cast` to remove `const` qualifier from the source pointer. // This should be fine as long as we only read but not write anything to the source. return reinterpret_cast(const_cast(begin[idx].first)); })); - auto const src_sizes = cudf::detail::make_counting_transform_iterator( - 0u, cuda::proclaim_return_type([begin] __device__(uint32_t idx) { - return begin[idx].second; - })); - auto const dst_ptrs = cudf::detail::make_counting_transform_iterator( - 0u, + auto const src_sizes = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type( + [begin] __device__(uint32_t idx) { return begin[idx].second; })); + auto const dst_ptrs = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), cuda::proclaim_return_type([offsets = d_offsets, output = chars_data.data()] __device__( uint32_t idx) { return output + offsets[idx]; })); diff --git a/cpp/include/cudf/utilities/default_stream.hpp b/cpp/include/cudf/utilities/default_stream.hpp index 97a42243250..3e740b81cc9 100644 --- a/cpp/include/cudf/utilities/default_stream.hpp +++ b/cpp/include/cudf/utilities/default_stream.hpp @@ -16,10 +16,8 @@ #pragma once -#include #include -#include #include namespace CUDF_EXPORT cudf { diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index 3f37ae02151..cf8413b597f 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -22,8 +22,6 @@ #include #include -#include - namespace CUDF_EXPORT cudf { /** diff --git a/cpp/include/cudf/utilities/type_dispatcher.hpp b/cpp/include/cudf/utilities/type_dispatcher.hpp index 15b5f921c1b..6351a84e38f 100644 --- a/cpp/include/cudf/utilities/type_dispatcher.hpp +++ b/cpp/include/cudf/utilities/type_dispatcher.hpp @@ -16,7 +16,6 @@ #pragma once -#include #include #include #include diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index 7c909f1a948..42124461cdf 100644 --- 
a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -41,6 +41,8 @@ namespace CUDF_EXPORT nvtext { * * This function uses MurmurHash3_x86_32 for the hash algorithm. * + * @deprecated Deprecated in 24.12 + * * @throw std::invalid_argument if the width < 2 * * @param input Strings column to compute minhash @@ -51,7 +53,7 @@ namespace CUDF_EXPORT nvtext { * @param mr Device memory resource used to allocate the returned column's device memory * @return Minhash values for each string in input */ -std::unique_ptr minhash( +[[deprecated]] std::unique_ptr minhash( cudf::strings_column_view const& input, cudf::numeric_scalar seed = 0, cudf::size_type width = 4, @@ -71,6 +73,8 @@ std::unique_ptr minhash( * * Any null row entries result in corresponding null output rows. * + * @deprecated Deprecated in 24.12 - to be replaced in a future release + * * @throw std::invalid_argument if the width < 2 * @throw std::invalid_argument if seeds is empty * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit @@ -83,7 +87,7 @@ std::unique_ptr minhash( * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr minhash( +[[deprecated]] std::unique_ptr minhash( cudf::strings_column_view const& input, cudf::device_span seeds, cudf::size_type width = 4, @@ -102,6 +106,8 @@ std::unique_ptr minhash( * The hash function returns 2 uint64 values but only the first value * is used with the minhash calculation. 
* + * @deprecated Deprecated in 24.12 + * * @throw std::invalid_argument if the width < 2 * * @param input Strings column to compute minhash @@ -112,7 +118,7 @@ std::unique_ptr minhash( * @param mr Device memory resource used to allocate the returned column's device memory * @return Minhash values as UINT64 for each string in input */ -std::unique_ptr minhash64( +[[deprecated]] std::unique_ptr minhash64( cudf::strings_column_view const& input, cudf::numeric_scalar seed = 0, cudf::size_type width = 4, @@ -132,6 +138,8 @@ std::unique_ptr minhash64( * * Any null row entries result in corresponding null output rows. * + * @deprecated Deprecated in 24.12 - to be replaced in a future release + * * @throw std::invalid_argument if the width < 2 * @throw std::invalid_argument if seeds is empty * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit @@ -144,7 +152,7 @@ std::unique_ptr minhash64( * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr minhash64( +[[deprecated]] std::unique_ptr minhash64( cudf::strings_column_view const& input, cudf::device_span seeds, cudf::size_type width = 4, @@ -164,6 +172,8 @@ std::unique_ptr minhash64( * * Any null row entries result in corresponding null output rows. 
* + * @deprecated Deprecated in 24.12 + * * @throw std::invalid_argument if seeds is empty * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit * @@ -173,7 +183,7 @@ std::unique_ptr minhash64( * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr word_minhash( +[[deprecated]] std::unique_ptr word_minhash( cudf::lists_column_view const& input, cudf::device_span seeds, rmm::cuda_stream_view stream = cudf::get_default_stream(), @@ -193,6 +203,8 @@ std::unique_ptr word_minhash( * * Any null row entries result in corresponding null output rows. * + * @deprecated Deprecated in 24.12 + * * @throw std::invalid_argument if seeds is empty * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit * @@ -202,7 +214,7 @@ std::unique_ptr word_minhash( * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr word_minhash64( +[[deprecated]] std::unique_ptr word_minhash64( cudf::lists_column_view const& input, cudf::device_span seeds, rmm::cuda_stream_view stream = cudf::get_default_stream(), diff --git a/cpp/include/nvtext/replace.hpp b/cpp/include/nvtext/replace.hpp index bbd0503379b..822edcbdb43 100644 --- a/cpp/include/nvtext/replace.hpp +++ b/cpp/include/nvtext/replace.hpp @@ -82,7 +82,7 @@ namespace CUDF_EXPORT nvtext { * The default of empty string will identify tokens using whitespace. 
* @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return New strings columns of with replaced strings + * @return New strings column with replaced strings */ std::unique_ptr replace_tokens( cudf::strings_column_view const& input, @@ -131,7 +131,7 @@ std::unique_ptr replace_tokens( * The default of empty string will identify tokens using whitespace. * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return New strings columns of with replaced strings + * @return New strings column of filtered strings */ std::unique_ptr filter_tokens( cudf::strings_column_view const& input, diff --git a/cpp/include/nvtext/stemmer.hpp b/cpp/include/nvtext/stemmer.hpp index 55a4124bfd0..e5b2a4cc21b 100644 --- a/cpp/include/nvtext/stemmer.hpp +++ b/cpp/include/nvtext/stemmer.hpp @@ -51,7 +51,7 @@ enum class letter_type { * * @code{.pseudo} * Example: - * st = ["trouble", "toy", "sygyzy"] + * st = ["trouble", "toy", "syzygy"] * b1 = is_letter(st, VOWEL, 1) * b1 is now [false, true, true] * @endcode @@ -62,7 +62,7 @@ enum class letter_type { * * @code{.pseudo} * Example: - * st = ["trouble", "toy", "sygyzy"] + * st = ["trouble", "toy", "syzygy"] * b2 = is_letter(st, CONSONANT, -1) // last letter checked in each string * b2 is now [false, true, false] * @endcode @@ -99,7 +99,7 @@ std::unique_ptr is_letter( * * @code{.pseudo} * Example: - * st = ["trouble", "toy", "sygyzy"] + * st = ["trouble", "toy", "syzygy"] * ix = [3, 1, 4] * b1 = is_letter(st, VOWEL, ix) * b1 is now [true, true, false] @@ -111,7 +111,7 @@ std::unique_ptr is_letter( * * @code{.pseudo} * Example: - * st = ["trouble", "toy", "sygyzy"] + * st = ["trouble", "toy", "syzygy"] * ix = [3, -2, 4] // 2nd to last character in st[1] is checked * b2 = is_letter(st, CONSONANT, ix) * b2 is 
now [false, false, true] diff --git a/cpp/src/ast/expression_parser.cpp b/cpp/src/ast/expression_parser.cpp index 3b650d791aa..5815ce33e33 100644 --- a/cpp/src/ast/expression_parser.cpp +++ b/cpp/src/ast/expression_parser.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,9 +16,6 @@ #include #include #include -#include -#include -#include #include #include #include diff --git a/cpp/src/ast/expressions.cpp b/cpp/src/ast/expressions.cpp index b45b9d0c78c..4c2b56dd4f5 100644 --- a/cpp/src/ast/expressions.cpp +++ b/cpp/src/ast/expressions.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,9 +17,6 @@ #include #include #include -#include -#include -#include #include #include diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index a6c878efbbc..1b23ea12a5e 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -27,15 +27,10 @@ #include #include #include -#include #include -#include #include #include -#include -#include #include -#include #include #include diff --git a/cpp/src/column/column_factories.cpp b/cpp/src/column/column_factories.cpp index 482413d0ccb..972f97e8668 100644 --- a/cpp/src/column/column_factories.cpp +++ b/cpp/src/column/column_factories.cpp @@ -15,19 +15,13 @@ */ #include -#include #include #include #include -#include #include -#include #include -#include #include -#include - namespace cudf { namespace { struct size_of_helper { diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index 386c5ebe478..e831aa9645d 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include #include @@ -27,9 +26,7 @@ #include #include -#include #include -#include #include namespace cudf { diff --git a/cpp/src/copying/copy.cpp b/cpp/src/copying/copy.cpp index d60fb5ce110..5e2065ba844 100644 --- a/cpp/src/copying/copy.cpp +++ b/cpp/src/copying/copy.cpp @@ -20,16 +20,11 @@ #include #include #include -#include #include -#include -#include #include #include -#include - #include namespace cudf { diff --git a/cpp/src/copying/pack.cpp b/cpp/src/copying/pack.cpp index 1282eec6c44..a001807c82b 100644 --- a/cpp/src/copying/pack.cpp +++ b/cpp/src/copying/pack.cpp @@ -18,7 +18,6 @@ #include #include #include -#include #include diff --git a/cpp/src/copying/split.cpp b/cpp/src/copying/split.cpp index 832a72ed5b0..116e3516460 100644 --- a/cpp/src/copying/split.cpp +++ b/cpp/src/copying/split.cpp @@ -14,10 +14,8 @@ * limitations under the License. 
*/ -#include #include #include -#include #include #include diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp index 2196ee97fee..f786624680c 100644 --- a/cpp/src/datetime/timezone.cpp +++ b/cpp/src/datetime/timezone.cpp @@ -13,12 +13,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include #include #include #include #include -#include #include #include diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cu b/cpp/src/groupby/hash/compute_mapping_indices.cu new file mode 100644 index 00000000000..519d7cd2f1c --- /dev/null +++ b/cpp/src/groupby/hash/compute_mapping_indices.cu @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "compute_mapping_indices.cuh" +#include "compute_mapping_indices.hpp" + +namespace cudf::groupby::detail::hash { +template cudf::size_type max_occupancy_grid_size>( + cudf::size_type n); + +template void compute_mapping_indices>( + cudf::size_type grid_size, + cudf::size_type num, + hash_set_ref_t global_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cuda::std::atomic_flag* needs_global_memory_fallback, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cuh b/cpp/src/groupby/hash/compute_mapping_indices.cuh new file mode 100644 index 00000000000..d353830780f --- /dev/null +++ b/cpp/src/groupby/hash/compute_mapping_indices.cuh @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "compute_mapping_indices.hpp" +#include "helpers.cuh" + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +template +__device__ void find_local_mapping(cooperative_groups::thread_block const& block, + cudf::size_type idx, + cudf::size_type num_input_rows, + SetType shared_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* cardinality, + cudf::size_type* local_mapping_index, + cudf::size_type* shared_set_indices) +{ + auto const is_valid_input = + idx < num_input_rows and (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, idx)); + auto const [result_idx, inserted] = [&]() { + if (is_valid_input) { + auto const result = shared_set.insert_and_find(idx); + auto const matched_idx = *result.first; + auto const inserted = result.second; + // inserted a new element + if (result.second) { + auto const shared_set_index = atomicAdd(cardinality, 1); + shared_set_indices[shared_set_index] = idx; + local_mapping_index[idx] = shared_set_index; + } + return cuda::std::pair{matched_idx, inserted}; + } + return cuda::std::pair{0, false}; // dummy values + }(); + // Syncing the thread block is needed so that updates in `local_mapping_index` are visible to all + // threads in the thread block. 
+ block.sync(); + if (is_valid_input) { + // element was already in set + if (!inserted) { local_mapping_index[idx] = local_mapping_index[result_idx]; } + } +} + +template +__device__ void find_global_mapping(cooperative_groups::thread_block const& block, + cudf::size_type cardinality, + SetRef global_set, + cudf::size_type* shared_set_indices, + cudf::size_type* global_mapping_index) +{ + // for all unique keys in shared memory hash set, stores their matches in + // global hash set to `global_mapping_index` + for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) { + auto const input_idx = shared_set_indices[idx]; + global_mapping_index[block.group_index().x * GROUPBY_SHM_MAX_ELEMENTS + idx] = + *global_set.insert_and_find(input_idx).first; + } +} + +/* + * @brief Inserts keys into the shared memory hash set, and stores the block-wise rank for a given + * row index in `local_mapping_index`. If the number of unique keys found in a threadblock exceeds + * `GROUPBY_CARDINALITY_THRESHOLD`, the threads in that block will exit without updating + * `global_set` or setting `global_mapping_index`. Else, we insert the unique keys found to the + * global hash set, and save the row index of the global sparse table in `global_mapping_index`. 
+ */ +template +CUDF_KERNEL void mapping_indices_kernel(cudf::size_type num_input_rows, + SetRef global_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cuda::std::atomic_flag* needs_global_memory_fallback) +{ + __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS]; + + // Shared set initialization + __shared__ cuco::window windows[window_extent.value()]; + + auto raw_set = cuco::static_set_ref{ + cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, + global_set.key_eq(), + probing_scheme_t{global_set.hash_function()}, + cuco::thread_scope_block, + cuco::aow_storage_ref{ + window_extent, windows}}; + auto shared_set = raw_set.rebind_operators(cuco::insert_and_find); + + auto const block = cooperative_groups::this_thread_block(); + shared_set.initialize(block); + + __shared__ cudf::size_type cardinality; + if (block.thread_rank() == 0) { cardinality = 0; } + block.sync(); + + auto const stride = cudf::detail::grid_1d::grid_stride(); + + for (auto idx = cudf::detail::grid_1d::global_thread_id(); + idx - block.thread_rank() < num_input_rows; + idx += stride) { + find_local_mapping(block, + idx, + num_input_rows, + shared_set, + row_bitmask, + skip_rows_with_nulls, + &cardinality, + local_mapping_index, + shared_set_indices); + + block.sync(); + + if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { + if (block.thread_rank() == 0) { needs_global_memory_fallback->test_and_set(); } + break; + } + } + + // Insert unique keys from shared to global hash set if block-cardinality + // doesn't exceed the threshold upper-limit + if (cardinality < GROUPBY_CARDINALITY_THRESHOLD) { + find_global_mapping(block, cardinality, global_set, shared_set_indices, global_mapping_index); + } + + if (block.thread_rank() == 0) { block_cardinality[block.group_index().x] = cardinality; } +} + +template +cudf::size_type 
max_occupancy_grid_size(cudf::size_type n) +{ + cudf::size_type max_active_blocks{-1}; + CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &max_active_blocks, mapping_indices_kernel, GROUPBY_BLOCK_SIZE, 0)); + auto const grid_size = max_active_blocks * cudf::detail::num_multiprocessors(); + auto const num_blocks = cudf::util::div_rounding_up_safe(n, GROUPBY_BLOCK_SIZE); + return std::min(grid_size, num_blocks); +} + +template +void compute_mapping_indices(cudf::size_type grid_size, + cudf::size_type num, + SetRef global_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cuda::std::atomic_flag* needs_global_memory_fallback, + rmm::cuda_stream_view stream) +{ + mapping_indices_kernel<<>>( + num, + global_set, + row_bitmask, + skip_rows_with_nulls, + local_mapping_index, + global_mapping_index, + block_cardinality, + needs_global_memory_fallback); +} +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_mapping_indices.hpp b/cpp/src/groupby/hash/compute_mapping_indices.hpp new file mode 100644 index 00000000000..473ad99e650 --- /dev/null +++ b/cpp/src/groupby/hash/compute_mapping_indices.hpp @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include + +#include + +#include + +namespace cudf::groupby::detail::hash { +/* + * @brief Computes the maximum number of active blocks of the given kernel that can be executed on + * the underlying device + */ +template +[[nodiscard]] cudf::size_type max_occupancy_grid_size(cudf::size_type n); + +template +void compute_mapping_indices(cudf::size_type grid_size, + cudf::size_type num, + SetRef global_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cuda::std::atomic_flag* needs_global_memory_fallback, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_mapping_indices_null.cu b/cpp/src/groupby/hash/compute_mapping_indices_null.cu new file mode 100644 index 00000000000..81c3c9e456f --- /dev/null +++ b/cpp/src/groupby/hash/compute_mapping_indices_null.cu @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "compute_mapping_indices.cuh" +#include "compute_mapping_indices.hpp" + +namespace cudf::groupby::detail::hash { +template cudf::size_type +max_occupancy_grid_size>(cudf::size_type n); + +template void compute_mapping_indices>( + cudf::size_type grid_size, + cudf::size_type num, + nullable_hash_set_ref_t global_set, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cuda::std::atomic_flag* needs_global_memory_fallback, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp index 2bf983e5e90..dfad51f27d4 100644 --- a/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp +++ b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp @@ -17,7 +17,6 @@ #include #include -#include #include #include diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index a9085a1f1fd..3041e261945 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/src/interop/arrow_utilities.cpp b/cpp/src/interop/arrow_utilities.cpp index a99262fb3bf..c69ebe12d2c 100644 --- a/cpp/src/interop/arrow_utilities.cpp +++ b/cpp/src/interop/arrow_utilities.cpp @@ -20,11 +20,6 @@ #include #include -#include - -#include -#include - #include namespace cudf { diff --git a/cpp/src/interop/arrow_utilities.hpp b/cpp/src/interop/arrow_utilities.hpp index 1b79fbf9eda..e4bdedf6603 100644 --- a/cpp/src/interop/arrow_utilities.hpp +++ b/cpp/src/interop/arrow_utilities.hpp @@ -17,7 +17,6 @@ #pragma once #include -#include #include #include diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp index a1be6aade4e..4395b741e53 100644 --- a/cpp/src/interop/dlpack.cpp +++ 
b/cpp/src/interop/dlpack.cpp @@ -16,11 +16,8 @@ #include #include #include -#include -#include #include #include -#include #include #include #include diff --git a/cpp/src/io/avro/avro.cpp b/cpp/src/io/avro/avro.cpp index d5caa4720ac..b3fcca62314 100644 --- a/cpp/src/io/avro/avro.cpp +++ b/cpp/src/io/avro/avro.cpp @@ -17,7 +17,6 @@ #include "avro.hpp" #include -#include #include namespace cudf { diff --git a/cpp/src/io/avro/avro.hpp b/cpp/src/io/avro/avro.hpp index 2e992546ccc..fd2c781b8a1 100644 --- a/cpp/src/io/avro/avro.hpp +++ b/cpp/src/io/avro/avro.hpp @@ -18,11 +18,9 @@ #include "avro_common.hpp" -#include #include #include #include -#include #include #include #include diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp b/cpp/src/io/comp/nvcomp_adapter.hpp index 583bd6a3523..2e1cda2d6b7 100644 --- a/cpp/src/io/comp/nvcomp_adapter.hpp +++ b/cpp/src/io/comp/nvcomp_adapter.hpp @@ -18,9 +18,7 @@ #include "gpuinflate.hpp" -#include #include -#include #include #include diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp index 1af45b41d8e..fb8c308065d 100644 --- a/cpp/src/io/comp/uncomp.cpp +++ b/cpp/src/io/comp/uncomp.cpp @@ -24,8 +24,6 @@ #include #include -#include - #include // uncompress #include // memset @@ -538,8 +536,10 @@ size_t decompress_zstd(host_span src, CUDF_EXPECTS(hd_stats[0].status == compression_status::SUCCESS, "ZSTD decompression failed"); // Copy temporary output to `dst` - CUDF_CUDA_TRY(cudaMemcpyAsync( - dst.data(), d_dst.data(), hd_stats[0].bytes_written, cudaMemcpyDefault, stream.value())); + cudf::detail::cuda_memcpy_async( + dst.subspan(0, hd_stats[0].bytes_written), + device_span{d_dst.data(), hd_stats[0].bytes_written}, + stream); return hd_stats[0].bytes_written; } diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 8c32fc85f78..72fca75c56b 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -21,6 +21,7 @@ #include "csv_common.hpp" #include "csv_gpu.hpp" +#include 
"cudf/detail/utilities/cuda_memcpy.hpp" #include "io/comp/io_uncomp.hpp" #include "io/utilities/column_buffer.hpp" #include "io/utilities/hostdevice_vector.hpp" @@ -275,11 +276,10 @@ std::pair, selected_rows_offsets> load_data_and_gather auto const read_offset = byte_range_offset + input_pos + previous_data_size; auto const read_size = target_pos - input_pos - previous_data_size; if (data.has_value()) { - CUDF_CUDA_TRY(cudaMemcpyAsync(d_data.data() + previous_data_size, - data->data() + read_offset, - target_pos - input_pos - previous_data_size, - cudaMemcpyDefault, - stream.value())); + cudf::detail::cuda_memcpy_async( + device_span{d_data.data() + previous_data_size, read_size}, + data->subspan(read_offset, read_size), + stream); } else { if (source->is_device_read_preferred(read_size)) { source->device_read(read_offset, @@ -288,12 +288,11 @@ std::pair, selected_rows_offsets> load_data_and_gather stream); } else { auto const buffer = source->host_read(read_offset, read_size); - CUDF_CUDA_TRY(cudaMemcpyAsync(d_data.data() + previous_data_size, - buffer->data(), - buffer->size(), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); // To prevent buffer going out of scope before we copy the data. + // Use sync version to prevent buffer going out of scope before we copy the data. 
+ cudf::detail::cuda_memcpy( + device_span{d_data.data() + previous_data_size, read_size}, + host_span{reinterpret_cast(buffer->data()), buffer->size()}, + stream); } } @@ -311,12 +310,10 @@ std::pair, selected_rows_offsets> load_data_and_gather range_end, skip_rows, stream); - CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(), - row_ctx.device_ptr(), - num_blocks * sizeof(uint64_t), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); + + cudf::detail::cuda_memcpy(host_span{row_ctx}.subspan(0, num_blocks), + device_span{row_ctx}.subspan(0, num_blocks), + stream); // Sum up the rows in each character block, selecting the row count that // corresponds to the current input context. Also stores the now known input @@ -331,11 +328,9 @@ std::pair, selected_rows_offsets> load_data_and_gather // At least one row in range in this batch all_row_offsets.resize(total_rows - skip_rows, stream); - CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.device_ptr(), - row_ctx.host_ptr(), - num_blocks * sizeof(uint64_t), - cudaMemcpyDefault, - stream.value())); + cudf::detail::cuda_memcpy_async(device_span{row_ctx}.subspan(0, num_blocks), + host_span{row_ctx}.subspan(0, num_blocks), + stream); // Pass 2: Output row offsets cudf::io::csv::gpu::gather_row_offsets(parse_opts.view(), @@ -352,12 +347,9 @@ std::pair, selected_rows_offsets> load_data_and_gather stream); // With byte range, we want to keep only one row out of the specified range if (range_end < data_size) { - CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(), - row_ctx.device_ptr(), - num_blocks * sizeof(uint64_t), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); + cudf::detail::cuda_memcpy(host_span{row_ctx}.subspan(0, num_blocks), + device_span{row_ctx}.subspan(0, num_blocks), + stream); size_t rows_out_of_range = 0; for (uint32_t i = 0; i < num_blocks; i++) { @@ -401,12 +393,9 @@ std::pair, selected_rows_offsets> load_data_and_gather // Remove header rows and extract header auto const header_row_index = 
std::max(header_rows, 1) - 1; if (header_row_index + 1 < row_offsets.size()) { - CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(), - row_offsets.data() + header_row_index, - 2 * sizeof(uint64_t), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); + cudf::detail::cuda_memcpy(host_span{row_ctx}.subspan(0, 2), + device_span{row_offsets.data() + header_row_index, 2}, + stream); auto const header_start = input_pos + row_ctx[0]; auto const header_end = input_pos + row_ctx[1]; diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index b84446b5f3e..2bbe05ced84 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -405,13 +406,8 @@ void write_chunked(data_sink* out_sink, out_sink->device_write(ptr_all_bytes, total_num_bytes, stream); } else { // copy the bytes to host to write them out - thrust::host_vector h_bytes(total_num_bytes); - CUDF_CUDA_TRY(cudaMemcpyAsync(h_bytes.data(), - ptr_all_bytes, - total_num_bytes * sizeof(char), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); + auto const h_bytes = cudf::detail::make_host_vector_sync( + device_span{ptr_all_bytes, total_num_bytes}, stream); out_sink->host_write(h_bytes.data(), total_num_bytes); } diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index a8682e6a760..ceaeb5d8f85 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -32,10 +32,8 @@ #include #include #include -#include #include #include -#include #include #include diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu index 7ee652e0239..570a00cbfc2 100644 --- a/cpp/src/io/json/host_tree_algorithms.cu +++ b/cpp/src/io/json/host_tree_algorithms.cu @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -198,17 +199,13 @@ NodeIndexT get_row_array_parent_col_id(device_span col_ids, bool is_enabled_lines, 
rmm::cuda_stream_view stream) { - NodeIndexT value = parent_node_sentinel; - if (!col_ids.empty()) { - auto const list_node_index = is_enabled_lines ? 0 : 1; - CUDF_CUDA_TRY(cudaMemcpyAsync(&value, - col_ids.data() + list_node_index, - sizeof(NodeIndexT), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); - } - return value; + if (col_ids.empty()) { return parent_node_sentinel; } + + auto const list_node_index = is_enabled_lines ? 0 : 1; + auto const value = cudf::detail::make_host_vector_sync( + device_span{col_ids.data() + list_node_index, 1}, stream); + + return value[0]; } /** * @brief Holds member data pointers of `d_json_column` @@ -818,11 +815,7 @@ std::pair, hashmap_of_device_columns> build_tree column_categories.cbegin(), expected_types.begin(), [](auto exp, auto cat) { return exp == NUM_NODE_CLASSES ? cat : exp; }); - cudaMemcpyAsync(d_column_tree.node_categories.begin(), - expected_types.data(), - expected_types.size() * sizeof(column_categories[0]), - cudaMemcpyDefault, - stream.value()); + cudf::detail::cuda_memcpy_async(d_column_tree.node_categories, expected_types, stream); return {is_pruned, columns}; } diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 4584f71775f..7e4d975e431 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -513,16 +513,14 @@ table_with_metadata device_parse_nested_json(device_span d_input, #endif bool const is_array_of_arrays = [&]() { - std::array h_node_categories = {NC_ERR, NC_ERR}; - auto const size_to_copy = std::min(size_t{2}, gpu_tree.node_categories.size()); - CUDF_CUDA_TRY(cudaMemcpyAsync(h_node_categories.data(), - gpu_tree.node_categories.data(), - sizeof(node_t) * size_to_copy, - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); + auto const size_to_copy = std::min(size_t{2}, gpu_tree.node_categories.size()); + if (size_to_copy == 0) return false; + auto const h_node_categories = cudf::detail::make_host_vector_sync( + 
device_span{gpu_tree.node_categories.data(), size_to_copy}, stream); + if (options.is_enabled_lines()) return h_node_categories[0] == NC_LIST; - return h_node_categories[0] == NC_LIST and h_node_categories[1] == NC_LIST; + return h_node_categories.size() >= 2 and h_node_categories[0] == NC_LIST and + h_node_categories[1] == NC_LIST; }(); auto [gpu_col_id, gpu_row_offsets] = diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index d949635c1cc..e2fe926ea19 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -264,16 +264,13 @@ tree_meta_t get_tree_representation(device_span tokens, error_count > 0) { auto const error_location = thrust::find(rmm::exec_policy(stream), tokens.begin(), tokens.end(), token_t::ErrorBegin); - SymbolOffsetT error_index; - CUDF_CUDA_TRY( - cudaMemcpyAsync(&error_index, - token_indices.data() + thrust::distance(tokens.begin(), error_location), - sizeof(SymbolOffsetT), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); + auto error_index = cudf::detail::make_host_vector_sync( + device_span{ + token_indices.data() + thrust::distance(tokens.begin(), error_location), 1}, + stream); + CUDF_FAIL("JSON Parser encountered an invalid format at location " + - std::to_string(error_index)); + std::to_string(error_index[0])); } auto const num_tokens = tokens.size(); diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index f6be4539d7f..7b3b04dea16 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -19,10 +19,7 @@ #include #include #include -#include -#include #include -#include #include #include diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 534b30a6089..60e78f4763d 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1448,10 +1448,6 @@ void get_stack_context(device_span json_in, // Number of stack operations in the input (i.e., number of '{', '}', 
'[', ']' outside of quotes) cudf::detail::device_scalar d_num_stack_ops(stream); - // Sequence of stack symbols and their position in the original input (sparse representation) - rmm::device_uvector stack_ops{json_in.size(), stream}; - rmm::device_uvector stack_op_indices{json_in.size(), stream}; - // Prepare finite-state transducer that only selects '{', '}', '[', ']' outside of quotes constexpr auto max_translation_table_size = to_stack_op::NUM_SYMBOL_GROUPS * to_stack_op::TT_NUM_STATES; @@ -1468,11 +1464,26 @@ void get_stack_context(device_span json_in, // "Search" for relevant occurrence of brackets and braces that indicate the beginning/end // of structs/lists + // Run FST to estimate the sizes of translated buffers + json_to_stack_ops_fst.Transduce(json_in.begin(), + static_cast(json_in.size()), + thrust::make_discard_iterator(), + thrust::make_discard_iterator(), + d_num_stack_ops.data(), + to_stack_op::start_state, + stream); + + auto stack_ops_bufsize = d_num_stack_ops.value(stream); + // Sequence of stack symbols and their position in the original input (sparse representation) + rmm::device_uvector stack_ops{stack_ops_bufsize, stream}; + rmm::device_uvector stack_op_indices{stack_ops_bufsize, stream}; + + // Run bracket-brace FST to retrieve starting positions of structs and lists json_to_stack_ops_fst.Transduce(json_in.begin(), static_cast(json_in.size()), stack_ops.data(), stack_op_indices.data(), - d_num_stack_ops.data(), + thrust::make_discard_iterator(), to_stack_op::start_state, stream); @@ -1508,6 +1519,7 @@ std::pair, rmm::device_uvector> pr device_span token_indices, rmm::cuda_stream_view stream) { + CUDF_FUNC_RANGE(); // Instantiate FST for post-processing the token stream to remove all tokens that belong to an // invalid JSON line token_filter::UnwrapTokenFromSymbolOp sgid_op{}; @@ -1643,21 +1655,28 @@ std::pair, rmm::device_uvector> ge // see a JSON-line delimiter as the very first item SymbolOffsetT const delimiter_offset = (format == 
tokenizer_pda::json_format_cfg_t::JSON_LINES_RECOVER ? 1 : 0); - rmm::device_uvector tokens{max_token_out_count + delimiter_offset, stream, mr}; - rmm::device_uvector tokens_indices{ - max_token_out_count + delimiter_offset, stream, mr}; + // Run FST to estimate the size of output buffers json_to_tokens_fst.Transduce(zip_in, static_cast(json_in.size()), - tokens.data() + delimiter_offset, - tokens_indices.data() + delimiter_offset, + thrust::make_discard_iterator(), + thrust::make_discard_iterator(), num_written_tokens.data(), tokenizer_pda::start_state, stream); auto const num_total_tokens = num_written_tokens.value(stream) + delimiter_offset; - tokens.resize(num_total_tokens, stream); - tokens_indices.resize(num_total_tokens, stream); + rmm::device_uvector tokens{num_total_tokens, stream, mr}; + rmm::device_uvector tokens_indices{num_total_tokens, stream, mr}; + + // Run FST to translate the input JSON string into tokens and indices at which they occur + json_to_tokens_fst.Transduce(zip_in, + static_cast(json_in.size()), + tokens.data() + delimiter_offset, + tokens_indices.data() + delimiter_offset, + thrust::make_discard_iterator(), + tokenizer_pda::start_state, + stream); if (delimiter_offset == 1) { tokens.set_element(0, token_t::LineEnd, stream); diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index c424d2b3b62..8a740ae17ef 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -315,13 +315,12 @@ device_span ingest_raw_input(device_span buffer, // Reading to host because decompression of a single block is much faster on the CPU sources[0]->host_read(range_offset, remaining_bytes_to_read, hbuffer.data()); auto uncomp_data = decompress(compression, hbuffer); - CUDF_CUDA_TRY(cudaMemcpyAsync(buffer.data(), - reinterpret_cast(uncomp_data.data()), - uncomp_data.size() * sizeof(char), - cudaMemcpyHostToDevice, - stream.value())); - stream.synchronize(); - return buffer.first(uncomp_data.size()); + auto ret_buffer = 
buffer.first(uncomp_data.size()); + cudf::detail::cuda_memcpy( + ret_buffer, + host_span{reinterpret_cast(uncomp_data.data()), uncomp_data.size()}, + stream); + return ret_buffer; } table_with_metadata read_json(host_span> sources, diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 9cc77e8e738..fcaee9c548e 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -516,10 +516,10 @@ void reader_impl::load_next_stripe_data(read_mode mode) _stream.synchronize(); stream_synchronized = true; } - device_read_tasks.push_back( - std::pair(source_ptr->device_read_async( - read_info.offset, read_info.length, dst_base + read_info.dst_pos, _stream), - read_info.length)); + device_read_tasks.emplace_back( + source_ptr->device_read_async( + read_info.offset, read_info.length, dst_base + read_info.dst_pos, _stream), + read_info.length); } else { auto buffer = source_ptr->host_read(read_info.offset, read_info.length); diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index c42348a165f..0081ed30d17 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -23,6 +23,7 @@ #include #include +#include #include #include #include diff --git a/cpp/src/io/orc/reader_impl_helpers.cpp b/cpp/src/io/orc/reader_impl_helpers.cpp index 4c1079cffe8..7e5db4b7617 100644 --- a/cpp/src/io/orc/reader_impl_helpers.cpp +++ b/cpp/src/io/orc/reader_impl_helpers.cpp @@ -16,8 +16,6 @@ #include "reader_impl_helpers.hpp" -#include - namespace cudf::io::orc::detail { std::unique_ptr create_empty_column(size_type orc_col_id, diff --git a/cpp/src/io/orc/reader_impl_helpers.hpp b/cpp/src/io/orc/reader_impl_helpers.hpp index 5528b2ee763..4cded30d89b 100644 --- a/cpp/src/io/orc/reader_impl_helpers.hpp +++ b/cpp/src/io/orc/reader_impl_helpers.hpp @@ -20,9 +20,6 @@ #include "io/orc/orc.hpp" #include "io/utilities/column_buffer.hpp" -#include -#include - 
#include #include diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 03020eb649f..d432deb8e79 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -19,6 +19,7 @@ * @brief cuDF-IO ORC writer class implementation */ +#include "cudf/detail/utilities/cuda_memcpy.hpp" #include "io/comp/nvcomp_adapter.hpp" #include "io/orc/orc_gpu.hpp" #include "io/statistics/column_statistics.cuh" @@ -1408,7 +1409,8 @@ encoded_footer_statistics finish_statistic_blobs(Footer const& footer, num_entries_seen += stripes_per_col; } - std::vector file_stats_merge(num_file_blobs); + auto file_stats_merge = + cudf::detail::make_host_vector(num_file_blobs, stream); for (auto i = 0u; i < num_file_blobs; ++i) { auto col_stats = &file_stats_merge[i]; col_stats->col_dtype = per_chunk_stats.col_types[i]; @@ -1418,11 +1420,10 @@ encoded_footer_statistics finish_statistic_blobs(Footer const& footer, } auto d_file_stats_merge = stats_merge.device_ptr(num_stripe_blobs); - CUDF_CUDA_TRY(cudaMemcpyAsync(d_file_stats_merge, - file_stats_merge.data(), - num_file_blobs * sizeof(statistics_merge_group), - cudaMemcpyDefault, - stream.value())); + cudf::detail::cuda_memcpy_async( + device_span{stats_merge.device_ptr(num_stripe_blobs), num_file_blobs}, + file_stats_merge, + stream); auto file_stat_chunks = stat_chunks.data() + num_stripe_blobs; detail::merge_group_statistics( @@ -1573,7 +1574,7 @@ void write_index_stream(int32_t stripe_id, * @param[in] strm_desc Stream's descriptor * @param[in] enc_stream Chunk's streams * @param[in] compressed_data Compressed stream data - * @param[in,out] stream_out Temporary host output buffer + * @param[in,out] bounce_buffer Pinned memory bounce buffer for D2H data transfer * @param[in,out] stripe Stream's parent stripe * @param[in,out] streams List of all streams * @param[in] compression_kind The compression kind @@ -1584,7 +1585,7 @@ void write_index_stream(int32_t stripe_id, std::future 
write_data_stream(gpu::StripeStream const& strm_desc, gpu::encoder_chunk_streams const& enc_stream, uint8_t const* compressed_data, - uint8_t* stream_out, + host_span bounce_buffer, StripeInformation* stripe, orc_streams* streams, CompressionKind compression_kind, @@ -1604,11 +1605,10 @@ std::future write_data_stream(gpu::StripeStream const& strm_desc, if (out_sink->is_device_write_preferred(length)) { return out_sink->device_write_async(stream_in, length, stream); } else { - CUDF_CUDA_TRY( - cudaMemcpyAsync(stream_out, stream_in, length, cudaMemcpyDefault, stream.value())); - stream.synchronize(); + cudf::detail::cuda_memcpy( + bounce_buffer.subspan(0, length), device_span{stream_in, length}, stream); - out_sink->host_write(stream_out, length); + out_sink->host_write(bounce_buffer.data(), length); return std::async(std::launch::deferred, [] {}); } }(); @@ -2616,7 +2616,7 @@ void writer::impl::write_orc_data_to_sink(encoded_data const& enc_data, strm_desc, enc_data.streams[strm_desc.column_id][segmentation.stripes[stripe_id].first], compressed_data.data(), - bounce_buffer.data(), + bounce_buffer, &stripe, &streams, _compression_kind, diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index ddf65e9020f..d15435b2553 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -27,7 +27,6 @@ #include "ipc/Schema_generated.h" #include "writer_impl_helpers.hpp" -#include #include #include diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp index 9bc435bf6c8..66810ee163a 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.hpp +++ b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -22,10 +22,9 @@ #pragma once #include -#include -#include +#include +#include #include -#include namespace cudf::io::parquet::detail { diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp index 
12c24e2b848..b87f2e9c692 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.hpp +++ b/cpp/src/io/parquet/compact_protocol_reader.hpp @@ -22,10 +22,7 @@ #include #include -#include -#include #include -#include namespace CUDF_EXPORT cudf { namespace io::parquet::detail { diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp b/cpp/src/io/parquet/compact_protocol_writer.hpp index d4778b1ea15..05859d60c03 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.hpp +++ b/cpp/src/io/parquet/compact_protocol_writer.hpp @@ -17,7 +17,6 @@ #pragma once #include "parquet.hpp" -#include "parquet_common.hpp" #include #include diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index b3276c81c1f..0d24fa4236f 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -21,6 +21,7 @@ #include +#include #include namespace cudf::io::parquet::detail { @@ -476,9 +477,9 @@ void WriteFinalOffsets(host_span offsets, auto d_src_data = cudf::detail::make_device_uvector_async( offsets, stream, cudf::get_current_device_resource_ref()); // Iterator for the source (scalar) data - auto src_iter = cudf::detail::make_counting_transform_iterator( - static_cast(0), - cuda::proclaim_return_type( + auto src_iter = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type( [src = d_src_data.begin()] __device__(std::size_t i) { return src + i; })); // Copy buffer addresses to device and create an iterator diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index f0a0bc0b51b..a965f3325d5 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -454,15 +453,18 @@ std::optional>> aggregate_reader_metadata::fi CUDF_EXPECTS(predicate.type().id() == cudf::type_id::BOOL8, "Filter expression must return a boolean column"); - auto 
num_bitmasks = num_bitmask_words(predicate.size()); - std::vector host_bitmask(num_bitmasks, ~bitmask_type{0}); - if (predicate.nullable()) { - CUDF_CUDA_TRY(cudaMemcpyAsync(host_bitmask.data(), - predicate.null_mask(), - num_bitmasks * sizeof(bitmask_type), - cudaMemcpyDefault, - stream.value())); - } + auto const host_bitmask = [&] { + auto const num_bitmasks = num_bitmask_words(predicate.size()); + if (predicate.nullable()) { + return cudf::detail::make_host_vector_sync( + device_span(predicate.null_mask(), num_bitmasks), stream); + } else { + auto bitmask = cudf::detail::make_host_vector(num_bitmasks, stream); + std::fill(bitmask.begin(), bitmask.end(), ~bitmask_type{0}); + return bitmask; + } + }(); + auto validity_it = cudf::detail::make_counting_transform_iterator( 0, [bitmask = host_bitmask.data()](auto bit_index) { return bit_is_set(bitmask, bit_index); }); diff --git a/cpp/src/io/parquet/reader.cpp b/cpp/src/io/parquet/reader.cpp index dd354b905f3..170c6e8857f 100644 --- a/cpp/src/io/parquet/reader.cpp +++ b/cpp/src/io/parquet/reader.cpp @@ -16,8 +16,6 @@ #include "reader_impl.hpp" -#include - namespace cudf::io::parquet::detail { reader::reader() = default; diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index f0865c715bc..fed1a309064 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -21,11 +21,9 @@ #include #include #include -#include #include #include -#include #include #include @@ -78,7 +76,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num // TODO: This step is somewhat redundant if size info has already been calculated (nested schema, // chunked reader). 
auto const has_strings = (kernel_mask & STRINGS_MASK) != 0; - std::vector col_string_sizes(_input_columns.size(), 0L); + auto col_string_sizes = cudf::detail::make_host_vector(_input_columns.size(), _stream); if (has_strings) { // need to compute pages bounds/sizes if we lack page indexes or are using custom bounds // TODO: we could probably dummy up size stats for FLBA data since we know the width diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 62ffc4d3077..3aa9b94ed6b 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -284,7 +284,7 @@ class reader::impl { * * @return Vector of total string data sizes for each column */ - std::vector calculate_page_string_offsets(); + cudf::detail::host_vector calculate_page_string_offsets(); /** * @brief Converts the page data and outputs to columns. diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp index 3a3cdd34a58..a0c2dbd3e44 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.hpp +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -107,7 +107,7 @@ struct subpass_intermediate_data { * rowgroups may represent less than all of the rowgroups to be read for the file. */ struct pass_intermediate_data { - std::vector> raw_page_data; + std::vector raw_page_data; // rowgroup, chunk and page information for the current pass. 
bool has_compressed_data{false}; diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 5138a92ac14..f03f1214b9a 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -218,7 +218,7 @@ void generate_depth_remappings( */ [[nodiscard]] std::future read_column_chunks_async( std::vector> const& sources, - std::vector>& page_data, + cudf::host_span page_data, cudf::detail::hostdevice_vector& chunks, size_t begin_chunk, size_t end_chunk, @@ -251,23 +251,24 @@ void generate_depth_remappings( if (source->is_device_read_preferred(io_size)) { // Buffer needs to be padded. // Required by `gpuDecodePageData`. - auto buffer = + page_data[chunk] = rmm::device_buffer(cudf::util::round_up_safe(io_size, BUFFER_PADDING_MULTIPLE), stream); auto fut_read_size = source->device_read_async( - io_offset, io_size, static_cast(buffer.data()), stream); + io_offset, io_size, static_cast(page_data[chunk].data()), stream); read_tasks.emplace_back(std::move(fut_read_size)); - page_data[chunk] = datasource::buffer::create(std::move(buffer)); } else { auto const read_buffer = source->host_read(io_offset, io_size); // Buffer needs to be padded. // Required by `gpuDecodePageData`. 
- auto tmp_buffer = rmm::device_buffer( + page_data[chunk] = rmm::device_buffer( cudf::util::round_up_safe(read_buffer->size(), BUFFER_PADDING_MULTIPLE), stream); - CUDF_CUDA_TRY(cudaMemcpyAsync( - tmp_buffer.data(), read_buffer->data(), read_buffer->size(), cudaMemcpyDefault, stream)); - page_data[chunk] = datasource::buffer::create(std::move(tmp_buffer)); + CUDF_CUDA_TRY(cudaMemcpyAsync(page_data[chunk].data(), + read_buffer->data(), + read_buffer->size(), + cudaMemcpyDefault, + stream)); } - auto d_compdata = page_data[chunk]->data(); + auto d_compdata = static_cast(page_data[chunk].data()); do { chunks[chunk].compressed_data = d_compdata; d_compdata += chunks[chunk].compressed_size; @@ -980,7 +981,7 @@ std::pair> reader::impl::read_column_chunks() std::vector chunk_source_map(num_chunks); // Tracker for eventually deallocating compressed and uncompressed data - raw_page_data = std::vector>(num_chunks); + raw_page_data = std::vector(num_chunks); // Keep track of column chunk file offsets std::vector column_chunk_offsets(num_chunks); @@ -1629,10 +1630,10 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num get_page_nesting_size{ d_cols_info.data(), max_depth, subpass.pages.size(), subpass.pages.device_begin()}); - // Manually create a int64_t `key_start` compatible counting_transform_iterator to avoid - // implicit casting to size_type. - auto const reduction_keys = thrust::make_transform_iterator( - thrust::make_counting_iterator(key_start), get_reduction_key{subpass.pages.size()}); + // Manually create a size_t `key_start` compatible counting_transform_iterator. 
+ auto const reduction_keys = + thrust::make_transform_iterator(thrust::make_counting_iterator(key_start), + get_reduction_key{subpass.pages.size()}); // Find the size of each column thrust::reduce_by_key(rmm::exec_policy_nosync(_stream), @@ -1695,15 +1696,14 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num nullmask_bufs, std::numeric_limits::max(), _stream); } -std::vector reader::impl::calculate_page_string_offsets() +cudf::detail::host_vector reader::impl::calculate_page_string_offsets() { auto& pass = *_pass_itm_data; auto& subpass = *pass.subpass; auto page_keys = make_page_key_iterator(subpass.pages); - std::vector col_sizes(_input_columns.size(), 0L); - rmm::device_uvector d_col_sizes(col_sizes.size(), _stream); + rmm::device_uvector d_col_sizes(_input_columns.size(), _stream); // use page_index to fetch page string sizes in the proper order auto val_iter = thrust::make_transform_iterator(subpass.pages.device_begin(), @@ -1717,7 +1717,7 @@ std::vector reader::impl::calculate_page_string_offsets() page_offset_output_iter{subpass.pages.device_ptr()}); // now sum up page sizes - rmm::device_uvector reduce_keys(col_sizes.size(), _stream); + rmm::device_uvector reduce_keys(d_col_sizes.size(), _stream); thrust::reduce_by_key(rmm::exec_policy_nosync(_stream), page_keys, page_keys + subpass.pages.size(), @@ -1725,14 +1725,7 @@ std::vector reader::impl::calculate_page_string_offsets() reduce_keys.begin(), d_col_sizes.begin()); - cudaMemcpyAsync(col_sizes.data(), - d_col_sizes.data(), - sizeof(size_t) * col_sizes.size(), - cudaMemcpyDeviceToHost, - _stream); - _stream.synchronize(); - - return col_sizes; + return cudf::detail::make_host_vector_sync(d_col_sizes, _stream); } } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 190f13eb688..f865c9a7643 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -183,7 +183,7 @@ 
struct aggregate_writer_metadata { std::vector row_groups; std::vector key_value_metadata; std::vector offset_indexes; - std::vector> column_indexes; + std::vector> column_indexes; }; std::vector files; std::optional> column_orders = std::nullopt; @@ -1543,12 +1543,7 @@ void encode_pages(hostdevice_2dvector& chunks, d_chunks.flat_view(), {column_stats, pages.size()}, column_index_truncate_length, stream); } - auto h_chunks = chunks.host_view(); - CUDF_CUDA_TRY(cudaMemcpyAsync(h_chunks.data(), - d_chunks.data(), - d_chunks.flat_view().size_bytes(), - cudaMemcpyDefault, - stream.value())); + chunks.device_to_host_async(stream); if (comp_stats.has_value()) { comp_stats.value() += collect_compression_statistics(comp_in, comp_res, stream); @@ -2559,12 +2554,11 @@ void writer::impl::write_parquet_data_to_sink( } else { CUDF_EXPECTS(bounce_buffer.size() >= ck.compressed_size, "Bounce buffer was not properly initialized."); - CUDF_CUDA_TRY(cudaMemcpyAsync(bounce_buffer.data(), - dev_bfr + ck.ck_stat_size, - ck.compressed_size, - cudaMemcpyDefault, - _stream.value())); - _stream.synchronize(); + cudf::detail::cuda_memcpy( + host_span{bounce_buffer}.subspan(0, ck.compressed_size), + device_span{dev_bfr + ck.ck_stat_size, ck.compressed_size}, + _stream); + _out_sink[p]->host_write(bounce_buffer.data(), ck.compressed_size); } @@ -2600,13 +2594,8 @@ void writer::impl::write_parquet_data_to_sink( auto const& column_chunk_meta = row_group.columns[i].meta_data; // start transfer of the column index - std::vector column_idx; - column_idx.resize(ck.column_index_size); - CUDF_CUDA_TRY(cudaMemcpyAsync(column_idx.data(), - ck.column_index_blob, - ck.column_index_size, - cudaMemcpyDefault, - _stream.value())); + auto column_idx = cudf::detail::make_host_vector_async( + device_span{ck.column_index_blob, ck.column_index_size}, _stream); // calculate offsets while the column index is transferring int64_t curr_pg_offset = column_chunk_meta.data_page_offset; diff --git 
a/cpp/src/io/text/bgzip_data_chunk_source.cu b/cpp/src/io/text/bgzip_data_chunk_source.cu index badcd3f58f9..06069630685 100644 --- a/cpp/src/io/text/bgzip_data_chunk_source.cu +++ b/cpp/src/io/text/bgzip_data_chunk_source.cu @@ -74,8 +74,8 @@ class bgzip_data_chunk_reader : public data_chunk_reader { // Buffer needs to be padded. // Required by `inflate_kernel`. device.resize(cudf::util::round_up_safe(host.size(), BUFFER_PADDING_MULTIPLE), stream); - CUDF_CUDA_TRY(cudaMemcpyAsync( - device.data(), host.data(), host.size() * sizeof(T), cudaMemcpyDefault, stream.value())); + cudf::detail::cuda_memcpy_async( + device_span{device}.subspan(0, host.size()), host, stream); } struct decompression_blocks { diff --git a/cpp/src/io/text/data_chunk_source_factories.cpp b/cpp/src/io/text/data_chunk_source_factories.cpp index 58faa0ebfe4..f4a2f29026a 100644 --- a/cpp/src/io/text/data_chunk_source_factories.cpp +++ b/cpp/src/io/text/data_chunk_source_factories.cpp @@ -22,10 +22,6 @@ #include #include -#include - -#include - #include namespace cudf::io::text { @@ -87,8 +83,10 @@ class datasource_chunk_reader : public data_chunk_reader { _source->host_read(_offset, read_size, reinterpret_cast(h_ticket.buffer.data())); // copy the host-pinned data on to device - CUDF_CUDA_TRY(cudaMemcpyAsync( - chunk.data(), h_ticket.buffer.data(), read_size, cudaMemcpyDefault, stream.value())); + cudf::detail::cuda_memcpy_async( + device_span{chunk}.subspan(0, read_size), + host_span{h_ticket.buffer}.subspan(0, read_size), + stream); // record the host-to-device copy. 
CUDF_CUDA_TRY(cudaEventRecord(h_ticket.event, stream.value())); @@ -153,8 +151,10 @@ class istream_data_chunk_reader : public data_chunk_reader { auto chunk = rmm::device_uvector(read_size, stream); // copy the host-pinned data on to device - CUDF_CUDA_TRY(cudaMemcpyAsync( - chunk.data(), h_ticket.buffer.data(), read_size, cudaMemcpyDefault, stream.value())); + cudf::detail::cuda_memcpy_async( + device_span{chunk}.subspan(0, read_size), + host_span{h_ticket.buffer}.subspan(0, read_size), + stream); // record the host-to-device copy. CUDF_CUDA_TRY(cudaEventRecord(h_ticket.event, stream.value())); @@ -193,12 +193,10 @@ class host_span_data_chunk_reader : public data_chunk_reader { auto chunk = rmm::device_uvector(read_size, stream); // copy the host data to device - CUDF_CUDA_TRY(cudaMemcpyAsync( // - chunk.data(), - _data.data() + _position, - read_size, - cudaMemcpyDefault, - stream.value())); + cudf::detail::cuda_memcpy_async( + cudf::device_span{chunk}.subspan(0, read_size), + cudf::host_span{_data}.subspan(_position, read_size), + stream); _position += read_size; diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index 249dc3b5875..6d954753af8 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -21,12 +21,12 @@ #include "column_buffer.hpp" +#include #include #include #include #include -#include #include namespace cudf::io::detail { diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index e73b2bc88de..31c8b781e77 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -22,12 +22,9 @@ #pragma once #include -#include #include -#include #include #include -#include #include #include @@ -35,6 +32,8 @@ #include +#include + namespace cudf { namespace io { namespace detail { diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp index a3afbd52896..b66742569d9 
100644 --- a/cpp/src/io/utilities/config_utils.cpp +++ b/cpp/src/io/utilities/config_utils.cpp @@ -16,11 +16,10 @@ #include "getenv_or.hpp" -#include #include -#include -#include +#include + #include namespace cudf::io { @@ -53,6 +52,14 @@ bool is_gds_enabled() { return is_always_enabled() or get_env_policy() == usage_ bool is_kvikio_enabled() { return get_env_policy() == usage_policy::KVIKIO; } +void set_thread_pool_nthreads_from_env() +{ + static std::once_flag flag{}; + std::call_once(flag, [] { + auto nthreads = getenv_or("KVIKIO_NTHREADS", 8U); + kvikio::defaults::thread_pool_nthreads_reset(nthreads); + }); +} } // namespace cufile_integration namespace nvcomp_integration { @@ -81,5 +88,4 @@ bool is_all_enabled() { return get_env_policy() == usage_policy::ALWAYS; } bool is_stable_enabled() { return is_all_enabled() or get_env_policy() == usage_policy::STABLE; } } // namespace nvcomp_integration - } // namespace cudf::io diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 0b76f3d3e8f..a8a275919d8 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -42,6 +42,7 @@ class file_sink : public data_sink { if (!_output_stream.is_open()) { detail::throw_on_file_open_failure(filepath, true); } if (cufile_integration::is_kvikio_enabled()) { + cufile_integration::set_thread_pool_nthreads_from_env(); _kvikio_file = kvikio::FileHandle(filepath, "w"); CUDF_LOG_INFO("Writing a file using kvikIO, with compatibility mode {}.", _kvikio_file.is_compat_mode_on() ? 
"on" : "off"); diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 2daaecadca6..9668b30e9a9 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -18,6 +18,7 @@ #include "getenv_or.hpp" #include +#include #include #include #include @@ -32,7 +33,6 @@ #include #include -#include #include namespace cudf { @@ -48,6 +48,7 @@ class file_source : public datasource { { detail::force_init_cuda_context(); if (cufile_integration::is_kvikio_enabled()) { + cufile_integration::set_thread_pool_nthreads_from_env(); _kvikio_file = kvikio::FileHandle(filepath); CUDF_LOG_INFO("Reading a file using kvikIO, with compatibility mode {}.", _kvikio_file.is_compat_mode_on() ? "on" : "off"); @@ -246,17 +247,18 @@ class device_buffer_source final : public datasource { size_t host_read(size_t offset, size_t size, uint8_t* dst) override { auto const count = std::min(size, this->size() - offset); - auto const stream = cudf::get_default_stream(); - CUDF_CUDA_TRY( - cudaMemcpyAsync(dst, _d_buffer.data() + offset, count, cudaMemcpyDefault, stream.value())); - stream.synchronize(); + auto const stream = cudf::detail::global_cuda_stream_pool().get_stream(); + cudf::detail::cuda_memcpy(host_span{dst, count}, + device_span{ + reinterpret_cast(_d_buffer.data() + offset), count}, + stream); return count; } std::unique_ptr host_read(size_t offset, size_t size) override { auto const count = std::min(size, this->size() - offset); - auto const stream = cudf::get_default_stream(); + auto const stream = cudf::detail::global_cuda_stream_pool().get_stream(); auto h_data = cudf::detail::make_host_vector_async( cudf::device_span{_d_buffer.data() + offset, count}, stream); stream.synchronize(); diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index 98ed9b28f0a..93cdccfbb9f 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -22,8 
+22,6 @@ #include #include -#include - #include #include diff --git a/cpp/src/io/utilities/row_selection.cpp b/cpp/src/io/utilities/row_selection.cpp index c0bbca39167..cf252fe63af 100644 --- a/cpp/src/io/utilities/row_selection.cpp +++ b/cpp/src/io/utilities/row_selection.cpp @@ -16,10 +16,7 @@ #include "io/utilities/row_selection.hpp" -#include - #include -#include namespace cudf::io::detail { diff --git a/cpp/src/io/utilities/row_selection.hpp b/cpp/src/io/utilities/row_selection.hpp index 7c607099cdc..e826feff201 100644 --- a/cpp/src/io/utilities/row_selection.hpp +++ b/cpp/src/io/utilities/row_selection.hpp @@ -15,7 +15,7 @@ */ #pragma once -#include +#include #include #include diff --git a/cpp/src/jit/cache.cpp b/cpp/src/jit/cache.cpp index 89c47d246d0..34a0bdce124 100644 --- a/cpp/src/jit/cache.cpp +++ b/cpp/src/jit/cache.cpp @@ -16,11 +16,8 @@ #include -#include - #include -#include #include namespace cudf { diff --git a/cpp/src/jit/util.cpp b/cpp/src/jit/util.cpp index 0585e02a031..d9a29203133 100644 --- a/cpp/src/jit/util.cpp +++ b/cpp/src/jit/util.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,8 +19,6 @@ #include #include -#include - namespace cudf { namespace jit { struct get_data_ptr_functor { diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index b0a84a6d50c..d27420658d6 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -1126,12 +1126,8 @@ std::pair, rmm::device_uvector> generate_mer * `max` of 0. * * @param tdv input tdigests. The tdigests within this column are grouped by key. - * @param h_group_offsets a host iterator of the offsets to the start of each group. 
A group is - * counted as one even when the cluster is empty in it. The offsets should have the same values as - * the ones in `group_offsets`. * @param group_offsets a device iterator of the offsets to the start of each group. A group is - * counted as one even when the cluster is empty in it. The offsets should have the same values as - * the ones in `h_group_offsets`. + * counted as one even when the cluster is empty in it. * @param group_labels a device iterator of the the group label for each tdigest cluster including * empty clusters. * @param num_group_labels the number of unique group labels. @@ -1142,9 +1138,8 @@ std::pair, rmm::device_uvector> generate_mer * * @return A column containing the merged tdigests. */ -template +template std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, - HGroupOffsetIter h_group_offsets, GroupOffsetIter group_offsets, GroupLabelIter group_labels, size_t num_group_labels, @@ -1313,21 +1308,13 @@ std::unique_ptr reduce_merge_tdigest(column_view const& input, if (input.size() == 0) { return cudf::tdigest::detail::make_empty_tdigest_scalar(stream, mr); } - auto group_offsets_ = group_offsets_fn{input.size()}; - auto h_group_offsets = cudf::detail::make_counting_transform_iterator(0, group_offsets_); - auto group_offsets = cudf::detail::make_counting_transform_iterator(0, group_offsets_); - auto group_labels = thrust::make_constant_iterator(0); - return to_tdigest_scalar(merge_tdigests(tdv, - h_group_offsets, - group_offsets, - group_labels, - input.size(), - 1, - max_centroids, - stream, - mr), - stream, - mr); + auto group_offsets_ = group_offsets_fn{input.size()}; + auto group_offsets = cudf::detail::make_counting_transform_iterator(0, group_offsets_); + auto group_labels = thrust::make_constant_iterator(0); + return to_tdigest_scalar( + merge_tdigests(tdv, group_offsets, group_labels, input.size(), 1, max_centroids, stream, mr), + stream, + mr); } std::unique_ptr group_tdigest(column_view const& col, @@ -1376,16 
+1363,7 @@ std::unique_ptr group_merge_tdigest(column_view const& input, return cudf::tdigest::detail::make_empty_tdigests_column(num_groups, stream, mr); } - // bring group offsets back to the host - std::vector h_group_offsets(group_offsets.size()); - cudaMemcpyAsync(h_group_offsets.data(), - group_offsets.begin(), - sizeof(size_type) * group_offsets.size(), - cudaMemcpyDefault, - stream); - return merge_tdigests(tdv, - h_group_offsets.begin(), group_offsets.data(), group_labels.data(), group_labels.size(), diff --git a/cpp/src/quantiles/tdigest/tdigest_column_view.cpp b/cpp/src/quantiles/tdigest/tdigest_column_view.cpp index a9f86ac1b5f..17844b6bb0a 100644 --- a/cpp/src/quantiles/tdigest/tdigest_column_view.cpp +++ b/cpp/src/quantiles/tdigest/tdigest_column_view.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include #include diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index d187375b69f..75ebc078930 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -26,8 +26,6 @@ #include #include #include -#include -#include #include #include #include diff --git a/cpp/src/reductions/scan/scan.cpp b/cpp/src/reductions/scan/scan.cpp index d3c0b54f286..b91ae19b51a 100644 --- a/cpp/src/reductions/scan/scan.cpp +++ b/cpp/src/reductions/scan/scan.cpp @@ -14,13 +14,10 @@ * limitations under the License. 
*/ -#include #include #include #include #include -#include -#include namespace cudf { diff --git a/cpp/src/reductions/segmented/reductions.cpp b/cpp/src/reductions/segmented/reductions.cpp index 40d1d8a0a53..c4f6c135dde 100644 --- a/cpp/src/reductions/segmented/reductions.cpp +++ b/cpp/src/reductions/segmented/reductions.cpp @@ -13,16 +13,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include -#include #include #include #include #include #include -#include #include -#include #include #include diff --git a/cpp/src/rolling/detail/optimized_unbounded_window.cpp b/cpp/src/rolling/detail/optimized_unbounded_window.cpp index 72c23395a93..7cad31c0658 100644 --- a/cpp/src/rolling/detail/optimized_unbounded_window.cpp +++ b/cpp/src/rolling/detail/optimized_unbounded_window.cpp @@ -18,13 +18,10 @@ #include #include #include -#include #include #include #include #include -#include -#include #include namespace cudf::detail { diff --git a/cpp/src/rolling/detail/range_window_bounds.hpp b/cpp/src/rolling/detail/range_window_bounds.hpp index 8a53e937f98..77cb2a8c7f5 100644 --- a/cpp/src/rolling/detail/range_window_bounds.hpp +++ b/cpp/src/rolling/detail/range_window_bounds.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,10 +16,7 @@ #pragma once #include -#include #include -#include -#include namespace cudf { namespace detail { diff --git a/cpp/src/rolling/range_window_bounds.cpp b/cpp/src/rolling/range_window_bounds.cpp index 69792136c64..7f698dfcd6b 100644 --- a/cpp/src/rolling/range_window_bounds.cpp +++ b/cpp/src/rolling/range_window_bounds.cpp @@ -19,7 +19,6 @@ #include #include #include -#include namespace cudf { namespace { diff --git a/cpp/src/scalar/scalar.cpp b/cpp/src/scalar/scalar.cpp index 31535198c58..4ec2174a96f 100644 --- a/cpp/src/scalar/scalar.cpp +++ b/cpp/src/scalar/scalar.cpp @@ -26,8 +26,6 @@ #include #include -#include - #include namespace cudf { diff --git a/cpp/src/scalar/scalar_factories.cpp b/cpp/src/scalar/scalar_factories.cpp index 656fe61fbbe..9f242bdffe0 100644 --- a/cpp/src/scalar/scalar_factories.cpp +++ b/cpp/src/scalar/scalar_factories.cpp @@ -16,10 +16,8 @@ #include #include -#include #include #include -#include #include #include diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu index 0db1adf1223..f5d052c6657 100644 --- a/cpp/src/strings/convert/convert_durations.cu +++ b/cpp/src/strings/convert/convert_durations.cu @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -152,12 +153,8 @@ struct format_compiler { } // create program in device memory - d_items.resize(items.size(), stream); - CUDF_CUDA_TRY(cudaMemcpyAsync(d_items.data(), - items.data(), - items.size() * sizeof(items[0]), - cudaMemcpyDefault, - stream.value())); + d_items = cudf::detail::make_device_uvector_sync( + items, stream, cudf::get_current_device_resource_ref()); } format_item const* compiled_format_items() { return d_items.data(); } diff --git a/cpp/src/strings/regex/regexec.cpp b/cpp/src/strings/regex/regexec.cpp index d1990733e81..60ad714dfec 100644 --- a/cpp/src/strings/regex/regexec.cpp +++ b/cpp/src/strings/regex/regexec.cpp @@ -24,7 +24,6 @@ #include #include -#include #include 
#include diff --git a/cpp/src/strings/strings_scalar_factories.cpp b/cpp/src/strings/strings_scalar_factories.cpp index 219d1174d42..1cc405234b2 100644 --- a/cpp/src/strings/strings_scalar_factories.cpp +++ b/cpp/src/strings/strings_scalar_factories.cpp @@ -16,7 +16,6 @@ #include #include -#include #include diff --git a/cpp/src/structs/structs_column_view.cpp b/cpp/src/structs/structs_column_view.cpp index b0284e9cb96..e14142a9ad1 100644 --- a/cpp/src/structs/structs_column_view.cpp +++ b/cpp/src/structs/structs_column_view.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include #include diff --git a/cpp/src/structs/utilities.cpp b/cpp/src/structs/utilities.cpp index 5df9943303d..4012ee3d21c 100644 --- a/cpp/src/structs/utilities.cpp +++ b/cpp/src/structs/utilities.cpp @@ -21,13 +21,10 @@ #include #include #include -#include #include #include #include -#include #include -#include #include #include diff --git a/cpp/src/table/table.cpp b/cpp/src/table/table.cpp index cb707c94288..41c64c6decb 100644 --- a/cpp/src/table/table.cpp +++ b/cpp/src/table/table.cpp @@ -18,7 +18,6 @@ #include #include #include -#include #include diff --git a/cpp/src/table/table_view.cpp b/cpp/src/table/table_view.cpp index 8a5340dc20d..659beb749af 100644 --- a/cpp/src/table/table_view.cpp +++ b/cpp/src/table/table_view.cpp @@ -20,10 +20,7 @@ #include #include -#include - #include -#include #include namespace cudf { diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp index 52b96bc9039..b919ac16956 100644 --- a/cpp/src/transform/transform.cpp +++ b/cpp/src/transform/transform.cpp @@ -23,8 +23,6 @@ #include #include #include -#include -#include #include #include diff --git 
a/cpp/src/utilities/cuda.cpp b/cpp/src/utilities/cuda.cpp index 53ca0608170..d979bda41d0 100644 --- a/cpp/src/utilities/cuda.cpp +++ b/cpp/src/utilities/cuda.cpp @@ -18,8 +18,6 @@ #include #include -#include - namespace cudf::detail { cudf::size_type num_multiprocessors() diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index 9d8e3cf2fa6..e30806a5011 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -18,7 +18,6 @@ #include #include #include -#include #include #include diff --git a/cpp/src/utilities/prefetch.cpp b/cpp/src/utilities/prefetch.cpp index 58971552758..000526723c4 100644 --- a/cpp/src/utilities/prefetch.cpp +++ b/cpp/src/utilities/prefetch.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include diff --git a/cpp/src/utilities/stream_pool.cpp b/cpp/src/utilities/stream_pool.cpp index 8c29182bfb5..7069b59be26 100644 --- a/cpp/src/utilities/stream_pool.cpp +++ b/cpp/src/utilities/stream_pool.cpp @@ -23,7 +23,6 @@ #include #include -#include #include #include diff --git a/cpp/src/utilities/traits.cpp b/cpp/src/utilities/traits.cpp index a68dc84e340..c1e71f5f8f9 100644 --- a/cpp/src/utilities/traits.cpp +++ b/cpp/src/utilities/traits.cpp @@ -19,8 +19,6 @@ #include #include -#include - namespace cudf { namespace { diff --git a/cpp/src/utilities/type_checks.cpp b/cpp/src/utilities/type_checks.cpp index 3095b342748..84c8529641d 100644 --- a/cpp/src/utilities/type_checks.cpp +++ b/cpp/src/utilities/type_checks.cpp @@ -21,8 +21,6 @@ #include #include -#include - #include namespace cudf { diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index a4213dcbe94..b78a64d0e55 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -611,7 +611,6 @@ ConfigureTest( text/bpe_tests.cpp text/edit_distance_tests.cpp text/jaccard_tests.cpp - text/minhash_tests.cpp text/ngrams_tests.cpp text/ngrams_tokenize_tests.cpp text/normalize_tests.cpp diff 
--git a/cpp/tests/ast/transform_tests.cpp b/cpp/tests/ast/transform_tests.cpp index a4bde50a21e..7af88d8aa34 100644 --- a/cpp/tests/ast/transform_tests.cpp +++ b/cpp/tests/ast/transform_tests.cpp @@ -18,7 +18,6 @@ #include #include #include -#include #include #include @@ -26,14 +25,8 @@ #include #include #include -#include -#include -#include #include #include -#include - -#include #include @@ -41,7 +34,6 @@ #include #include #include -#include #include template diff --git a/cpp/tests/binaryop/binop-compiled-test.cpp b/cpp/tests/binaryop/binop-compiled-test.cpp index aa5b49567e6..3bd67001c16 100644 --- a/cpp/tests/binaryop/binop-compiled-test.cpp +++ b/cpp/tests/binaryop/binop-compiled-test.cpp @@ -26,9 +26,7 @@ #include #include #include -#include #include -#include #include diff --git a/cpp/tests/binaryop/binop-generic-ptx-test.cpp b/cpp/tests/binaryop/binop-generic-ptx-test.cpp index 03cc87a1968..e9a2761db4a 100644 --- a/cpp/tests/binaryop/binop-generic-ptx-test.cpp +++ b/cpp/tests/binaryop/binop-generic-ptx-test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Copyright 2018-2019 BlazingDB, Inc. 
* Copyright 2018 Christian Noboa Mardini @@ -19,7 +19,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/bitmask/bitmask_tests.cpp b/cpp/tests/bitmask/bitmask_tests.cpp index fe221fb1c48..799bf646e52 100644 --- a/cpp/tests/bitmask/bitmask_tests.cpp +++ b/cpp/tests/bitmask/bitmask_tests.cpp @@ -16,7 +16,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/column/bit_cast_test.cpp b/cpp/tests/column/bit_cast_test.cpp index ab230ab036e..5570a7d498c 100644 --- a/cpp/tests/column/bit_cast_test.cpp +++ b/cpp/tests/column/bit_cast_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -26,8 +25,6 @@ #include -#include - template struct rep_type_impl { using type = void; diff --git a/cpp/tests/column/column_test.cpp b/cpp/tests/column/column_test.cpp index 631f5150829..d700adaebd5 100644 --- a/cpp/tests/column/column_test.cpp +++ b/cpp/tests/column/column_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/tests/column/column_view_device_span_test.cpp b/cpp/tests/column/column_view_device_span_test.cpp index 6de9121158b..470437f4112 100644 --- a/cpp/tests/column/column_view_device_span_test.cpp +++ b/cpp/tests/column/column_view_device_span_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/column/column_view_shallow_test.cpp b/cpp/tests/column/column_view_shallow_test.cpp index 37ab4b8f387..ad344476332 100644 --- a/cpp/tests/column/column_view_shallow_test.cpp +++ b/cpp/tests/column/column_view_shallow_test.cpp @@ -15,9 +15,7 @@ */ #include -#include #include -#include #include #include diff --git a/cpp/tests/column/factories_test.cpp b/cpp/tests/column/factories_test.cpp index 603187f0330..aa9d508b6aa 100644 --- a/cpp/tests/column/factories_test.cpp +++ b/cpp/tests/column/factories_test.cpp @@ -26,11 +26,8 @@ #include #include #include -#include #include -#include - #include class 
ColumnFactoryTest : public cudf::test::BaseFixture { diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp index 18140c34abd..aedc498964a 100644 --- a/cpp/tests/copying/concatenate_tests.cpp +++ b/cpp/tests/copying/concatenate_tests.cpp @@ -34,8 +34,6 @@ #include #include -#include - #include #include #include diff --git a/cpp/tests/copying/copy_if_else_nested_tests.cpp b/cpp/tests/copying/copy_if_else_nested_tests.cpp index cfbd181f944..e1cdfe9beed 100644 --- a/cpp/tests/copying/copy_if_else_nested_tests.cpp +++ b/cpp/tests/copying/copy_if_else_nested_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/copying/copy_range_tests.cpp b/cpp/tests/copying/copy_range_tests.cpp index 25d93da277b..e2133a546e4 100644 --- a/cpp/tests/copying/copy_range_tests.cpp +++ b/cpp/tests/copying/copy_range_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/copying/copy_tests.cpp b/cpp/tests/copying/copy_tests.cpp index 4124f749012..9c00725d5d2 100644 --- a/cpp/tests/copying/copy_tests.cpp +++ b/cpp/tests/copying/copy_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/copying/gather_list_tests.cpp b/cpp/tests/copying/gather_list_tests.cpp index 247090aac90..93f71345c5c 100644 --- a/cpp/tests/copying/gather_list_tests.cpp +++ b/cpp/tests/copying/gather_list_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,8 +17,6 @@ #include #include #include -#include -#include #include #include diff --git a/cpp/tests/copying/gather_str_tests.cpp b/cpp/tests/copying/gather_str_tests.cpp index 28098878086..795e3f30aa1 100644 --- a/cpp/tests/copying/gather_str_tests.cpp +++ b/cpp/tests/copying/gather_str_tests.cpp @@ -16,7 +16,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/copying/gather_struct_tests.cpp b/cpp/tests/copying/gather_struct_tests.cpp index 1598ab2646a..b2c0f7acc3a 100644 --- a/cpp/tests/copying/gather_struct_tests.cpp +++ b/cpp/tests/copying/gather_struct_tests.cpp @@ -17,20 +17,15 @@ #include #include #include -#include #include #include #include #include #include -#include -#include -#include #include #include #include -#include #include diff --git a/cpp/tests/copying/gather_tests.cpp b/cpp/tests/copying/gather_tests.cpp index 07ce672b14d..908dcd67673 100644 --- a/cpp/tests/copying/gather_tests.cpp +++ b/cpp/tests/copying/gather_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/tests/copying/get_value_tests.cpp b/cpp/tests/copying/get_value_tests.cpp index 90ff97e7355..b2d64dac7c8 100644 --- a/cpp/tests/copying/get_value_tests.cpp +++ b/cpp/tests/copying/get_value_tests.cpp @@ -16,10 +16,8 @@ #include #include -#include #include #include -#include #include #include diff --git a/cpp/tests/copying/purge_nonempty_nulls_tests.cpp b/cpp/tests/copying/purge_nonempty_nulls_tests.cpp index 4f28ff12941..1f76efdc4c3 100644 --- a/cpp/tests/copying/purge_nonempty_nulls_tests.cpp +++ b/cpp/tests/copying/purge_nonempty_nulls_tests.cpp @@ -16,13 +16,10 @@ #include #include #include -#include #include #include #include -#include -#include #include #include #include diff --git a/cpp/tests/copying/reverse_tests.cpp b/cpp/tests/copying/reverse_tests.cpp index e4b2d319ddf..46516436901 100644 --- a/cpp/tests/copying/reverse_tests.cpp +++ b/cpp/tests/copying/reverse_tests.cpp @@ -1,5 +1,5 
@@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,17 +17,13 @@ #include #include #include -#include #include #include #include -#include -#include #include #include -#include #include #include #include diff --git a/cpp/tests/copying/sample_tests.cpp b/cpp/tests/copying/sample_tests.cpp index 2f76e3f1fcd..8be5d8c1fbb 100644 --- a/cpp/tests/copying/sample_tests.cpp +++ b/cpp/tests/copying/sample_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,12 +15,9 @@ */ #include -#include #include -#include #include -#include #include #include #include diff --git a/cpp/tests/copying/scatter_list_scalar_tests.cpp b/cpp/tests/copying/scatter_list_scalar_tests.cpp index 42d2e004d6b..23faa6e5b86 100644 --- a/cpp/tests/copying/scatter_list_scalar_tests.cpp +++ b/cpp/tests/copying/scatter_list_scalar_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,7 +21,6 @@ #include #include -#include using mask_vector = std::vector; using size_column = cudf::test::fixed_width_column_wrapper; diff --git a/cpp/tests/copying/scatter_list_tests.cpp b/cpp/tests/copying/scatter_list_tests.cpp index a82860a3eec..1f87fcfcc99 100644 --- a/cpp/tests/copying/scatter_list_tests.cpp +++ b/cpp/tests/copying/scatter_list_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/copying/scatter_struct_scalar_tests.cpp b/cpp/tests/copying/scatter_struct_scalar_tests.cpp index 78572b0bb37..1d1da8a1b1e 100644 --- a/cpp/tests/copying/scatter_struct_scalar_tests.cpp +++ b/cpp/tests/copying/scatter_struct_scalar_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/copying/scatter_struct_tests.cpp b/cpp/tests/copying/scatter_struct_tests.cpp index c92244d047b..7d88e9af85f 100644 --- a/cpp/tests/copying/scatter_struct_tests.cpp +++ b/cpp/tests/copying/scatter_struct_tests.cpp @@ -21,7 +21,6 @@ #include #include -#include #include using namespace cudf::test::iterators; diff --git a/cpp/tests/copying/scatter_tests.cpp b/cpp/tests/copying/scatter_tests.cpp index 41a753cd0ac..74c04446bdd 100644 --- a/cpp/tests/copying/scatter_tests.cpp +++ b/cpp/tests/copying/scatter_tests.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include #include @@ -23,7 +22,6 @@ #include #include #include -#include #include diff --git a/cpp/tests/copying/segmented_gather_list_tests.cpp b/cpp/tests/copying/segmented_gather_list_tests.cpp index 8881fb344a2..a133ae43872 100644 --- a/cpp/tests/copying/segmented_gather_list_tests.cpp +++ b/cpp/tests/copying/segmented_gather_list_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright 
(c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/copying/shift_tests.cpp b/cpp/tests/copying/shift_tests.cpp index ff6808d9a79..72a8e7357bc 100644 --- a/cpp/tests/copying/shift_tests.cpp +++ b/cpp/tests/copying/shift_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -30,7 +29,6 @@ #include #include -#include using TestTypes = cudf::test::Types; diff --git a/cpp/tests/copying/slice_tests.cpp b/cpp/tests/copying/slice_tests.cpp index aef0d4ad78a..3868a147fa8 100644 --- a/cpp/tests/copying/slice_tests.cpp +++ b/cpp/tests/copying/slice_tests.cpp @@ -22,12 +22,8 @@ #include #include -#include #include #include -#include -#include -#include #include #include diff --git a/cpp/tests/copying/utility_tests.cpp b/cpp/tests/copying/utility_tests.cpp index 0905f9babdc..90457f8d74c 100644 --- a/cpp/tests/copying/utility_tests.cpp +++ b/cpp/tests/copying/utility_tests.cpp @@ -23,7 +23,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp index 603edb27c7c..44f99adc0e9 100644 --- a/cpp/tests/datetime/datetime_ops_test.cpp +++ b/cpp/tests/datetime/datetime_ops_test.cpp @@ -23,14 +23,11 @@ #include #include -#include #include #include #include #include -#include - #define XXX false // stub for null values constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; diff --git a/cpp/tests/dictionary/add_keys_test.cpp b/cpp/tests/dictionary/add_keys_test.cpp index 46bf5468922..ebc8c11e86c 100644 --- a/cpp/tests/dictionary/add_keys_test.cpp +++ b/cpp/tests/dictionary/add_keys_test.cpp @@ -24,8 +24,6 @@ #include #include -#include - struct DictionaryAddKeysTest : 
public cudf::test::BaseFixture {}; TEST_F(DictionaryAddKeysTest, StringsColumn) diff --git a/cpp/tests/dictionary/encode_test.cpp b/cpp/tests/dictionary/encode_test.cpp index 5db0e9fa1e4..dfa3ede5d46 100644 --- a/cpp/tests/dictionary/encode_test.cpp +++ b/cpp/tests/dictionary/encode_test.cpp @@ -21,8 +21,6 @@ #include #include -#include - struct DictionaryEncodeTest : public cudf::test::BaseFixture {}; TEST_F(DictionaryEncodeTest, EncodeStringColumn) diff --git a/cpp/tests/dictionary/fill_test.cpp b/cpp/tests/dictionary/fill_test.cpp index 18696b66e48..bc7d19201aa 100644 --- a/cpp/tests/dictionary/fill_test.cpp +++ b/cpp/tests/dictionary/fill_test.cpp @@ -18,13 +18,10 @@ #include #include -#include #include #include #include -#include - struct DictionaryFillTest : public cudf::test::BaseFixture {}; TEST_F(DictionaryFillTest, StringsColumn) diff --git a/cpp/tests/dictionary/search_test.cpp b/cpp/tests/dictionary/search_test.cpp index 25501b4fde7..2774173b80a 100644 --- a/cpp/tests/dictionary/search_test.cpp +++ b/cpp/tests/dictionary/search_test.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include diff --git a/cpp/tests/dictionary/slice_test.cpp b/cpp/tests/dictionary/slice_test.cpp index d80f8dee079..8c15d6dbecd 100644 --- a/cpp/tests/dictionary/slice_test.cpp +++ b/cpp/tests/dictionary/slice_test.cpp @@ -19,7 +19,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/filling/fill_tests.cpp b/cpp/tests/filling/fill_tests.cpp index 26badefe698..a5e2db6a005 100644 --- a/cpp/tests/filling/fill_tests.cpp +++ b/cpp/tests/filling/fill_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/filling/repeat_tests.cpp b/cpp/tests/filling/repeat_tests.cpp index 6326765c68b..c856984a4a3 100644 --- a/cpp/tests/filling/repeat_tests.cpp +++ b/cpp/tests/filling/repeat_tests.cpp @@ -17,14 +17,11 @@ #include #include #include -#include #include #include #include #include -#include -#include 
#include #include @@ -33,7 +30,6 @@ #include #include -#include constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; diff --git a/cpp/tests/filling/sequence_tests.cpp b/cpp/tests/filling/sequence_tests.cpp index 0783b4e5bbb..53782c90c26 100644 --- a/cpp/tests/filling/sequence_tests.cpp +++ b/cpp/tests/filling/sequence_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/fixed_point/fixed_point_tests.cpp b/cpp/tests/fixed_point/fixed_point_tests.cpp index a222289216d..b96c6909e55 100644 --- a/cpp/tests/fixed_point/fixed_point_tests.cpp +++ b/cpp/tests/fixed_point/fixed_point_tests.cpp @@ -18,17 +18,14 @@ #include #include #include -#include #include #include -#include #include #include #include #include -#include #include using namespace numeric; diff --git a/cpp/tests/groupby/collect_list_tests.cpp b/cpp/tests/groupby/collect_list_tests.cpp index a79b6a32916..ba456084a7c 100644 --- a/cpp/tests/groupby/collect_list_tests.cpp +++ b/cpp/tests/groupby/collect_list_tests.cpp @@ -20,8 +20,6 @@ #include #include -#include - template struct groupby_collect_list_test : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/groupby/collect_set_tests.cpp b/cpp/tests/groupby/collect_set_tests.cpp index 61d2838590b..dfd7eb82c4a 100644 --- a/cpp/tests/groupby/collect_set_tests.cpp +++ b/cpp/tests/groupby/collect_set_tests.cpp @@ -19,7 +19,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/groupby/correlation_tests.cpp b/cpp/tests/groupby/correlation_tests.cpp index 26f714632dd..f8cc813e877 100644 --- a/cpp/tests/groupby/correlation_tests.cpp +++ b/cpp/tests/groupby/correlation_tests.cpp @@ -25,7 +25,6 @@ #include #include -#include using namespace cudf::test::iterators; diff --git a/cpp/tests/groupby/covariance_tests.cpp b/cpp/tests/groupby/covariance_tests.cpp index e3eb2da201f..81378bb91e8 100644 --- a/cpp/tests/groupby/covariance_tests.cpp +++ 
b/cpp/tests/groupby/covariance_tests.cpp @@ -23,10 +23,8 @@ #include #include -#include #include -#include using namespace cudf::test::iterators; diff --git a/cpp/tests/groupby/groupby_test_util.cpp b/cpp/tests/groupby/groupby_test_util.cpp index 5d99d15ae77..df0375d6a09 100644 --- a/cpp/tests/groupby/groupby_test_util.cpp +++ b/cpp/tests/groupby/groupby_test_util.cpp @@ -17,8 +17,8 @@ #include "groupby_test_util.hpp" #include -#include #include +#include #include #include @@ -27,9 +27,6 @@ #include #include #include -#include - -#include void test_single_agg(cudf::column_view const& keys, cudf::column_view const& values, diff --git a/cpp/tests/groupby/groupby_test_util.hpp b/cpp/tests/groupby/groupby_test_util.hpp index 755b0c20f17..9d2e613be3e 100644 --- a/cpp/tests/groupby/groupby_test_util.hpp +++ b/cpp/tests/groupby/groupby_test_util.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,11 +16,8 @@ #pragma once -#include #include -#include #include -#include enum class force_use_sort_impl : bool { NO, YES }; diff --git a/cpp/tests/groupby/histogram_tests.cpp b/cpp/tests/groupby/histogram_tests.cpp index 2d447025919..783cfb17e49 100644 --- a/cpp/tests/groupby/histogram_tests.cpp +++ b/cpp/tests/groupby/histogram_tests.cpp @@ -20,7 +20,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/groupby/max_scan_tests.cpp b/cpp/tests/groupby/max_scan_tests.cpp index d86de798844..6195e0179ec 100644 --- a/cpp/tests/groupby/max_scan_tests.cpp +++ b/cpp/tests/groupby/max_scan_tests.cpp @@ -22,7 +22,6 @@ #include #include -#include using namespace cudf::test::iterators; diff --git a/cpp/tests/groupby/merge_lists_tests.cpp b/cpp/tests/groupby/merge_lists_tests.cpp index 279d71560b4..4481e2dc022 100644 --- a/cpp/tests/groupby/merge_lists_tests.cpp +++ b/cpp/tests/groupby/merge_lists_tests.cpp @@ -21,7 +21,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/groupby/merge_sets_tests.cpp b/cpp/tests/groupby/merge_sets_tests.cpp index 9736bb84dd6..1bfba265478 100644 --- a/cpp/tests/groupby/merge_sets_tests.cpp +++ b/cpp/tests/groupby/merge_sets_tests.cpp @@ -21,7 +21,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/groupby/rank_scan_tests.cpp b/cpp/tests/groupby/rank_scan_tests.cpp index 7f31bc9089f..f2a50248b4a 100644 --- a/cpp/tests/groupby/rank_scan_tests.cpp +++ b/cpp/tests/groupby/rank_scan_tests.cpp @@ -22,8 +22,6 @@ #include #include -#include - using namespace cudf::test::iterators; template diff --git a/cpp/tests/groupby/shift_tests.cpp b/cpp/tests/groupby/shift_tests.cpp index 14c9ceb4508..49f9d7cb10a 100644 --- a/cpp/tests/groupby/shift_tests.cpp +++ b/cpp/tests/groupby/shift_tests.cpp @@ -21,7 +21,6 @@ #include #include -#include #include template diff --git a/cpp/tests/hashing/md5_test.cpp b/cpp/tests/hashing/md5_test.cpp index 69e518cbf8d..b54adb52496 100644 --- 
a/cpp/tests/hashing/md5_test.cpp +++ b/cpp/tests/hashing/md5_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/hashing/murmurhash3_x86_32_test.cpp b/cpp/tests/hashing/murmurhash3_x86_32_test.cpp index c1a6e6ff6e1..b4622f5eb81 100644 --- a/cpp/tests/hashing/murmurhash3_x86_32_test.cpp +++ b/cpp/tests/hashing/murmurhash3_x86_32_test.cpp @@ -17,11 +17,9 @@ #include #include #include -#include #include #include -#include #include constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; diff --git a/cpp/tests/hashing/sha1_test.cpp b/cpp/tests/hashing/sha1_test.cpp index e28e71442a6..1e86751bb4c 100644 --- a/cpp/tests/hashing/sha1_test.cpp +++ b/cpp/tests/hashing/sha1_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/hashing/sha224_test.cpp b/cpp/tests/hashing/sha224_test.cpp index 61b584f94df..259e7102ee2 100644 --- a/cpp/tests/hashing/sha224_test.cpp +++ b/cpp/tests/hashing/sha224_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/hashing/sha256_test.cpp b/cpp/tests/hashing/sha256_test.cpp index 8bc47c92c6b..a4affc87874 100644 --- a/cpp/tests/hashing/sha256_test.cpp +++ b/cpp/tests/hashing/sha256_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/hashing/sha384_test.cpp b/cpp/tests/hashing/sha384_test.cpp index 4c79934f98d..8a5c090eeea 100644 --- a/cpp/tests/hashing/sha384_test.cpp +++ b/cpp/tests/hashing/sha384_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/hashing/sha512_test.cpp b/cpp/tests/hashing/sha512_test.cpp index 0eb1c60b8fc..77fc56b5f13 100644 --- a/cpp/tests/hashing/sha512_test.cpp +++ b/cpp/tests/hashing/sha512_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/hashing/xxhash_64_test.cpp 
b/cpp/tests/hashing/xxhash_64_test.cpp index ab4ed829681..d8694a72d94 100644 --- a/cpp/tests/hashing/xxhash_64_test.cpp +++ b/cpp/tests/hashing/xxhash_64_test.cpp @@ -17,11 +17,8 @@ #include #include #include -#include #include -#include -#include #include using NumericTypesNoBools = diff --git a/cpp/tests/interop/from_arrow_device_test.cpp b/cpp/tests/interop/from_arrow_device_test.cpp index 2151ec6e22f..1ddc33e749a 100644 --- a/cpp/tests/interop/from_arrow_device_test.cpp +++ b/cpp/tests/interop/from_arrow_device_test.cpp @@ -17,17 +17,13 @@ #include "nanoarrow_utils.hpp" #include -#include #include #include -#include #include #include #include #include -#include -#include #include #include #include diff --git a/cpp/tests/interop/from_arrow_host_test.cpp b/cpp/tests/interop/from_arrow_host_test.cpp index ef9936b214c..d93ef28aab8 100644 --- a/cpp/tests/interop/from_arrow_host_test.cpp +++ b/cpp/tests/interop/from_arrow_host_test.cpp @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -28,7 +27,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/tests/interop/from_arrow_stream_test.cpp b/cpp/tests/interop/from_arrow_stream_test.cpp index 80a2e4b2ffd..3916025bf22 100644 --- a/cpp/tests/interop/from_arrow_stream_test.cpp +++ b/cpp/tests/interop/from_arrow_stream_test.cpp @@ -17,27 +17,14 @@ #include "nanoarrow_utils.hpp" #include -#include -#include #include -#include -#include -#include -#include #include -#include -#include -#include -#include #include #include #include -#include #include -#include - struct VectorOfArrays { std::vector arrays; nanoarrow::UniqueSchema schema; diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index 6e742b9e4cf..18efae75cb1 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -25,9 +25,7 @@ #include #include #include -#include #include -#include #include #include #include @@ -37,8 +35,6 @@ 
#include #include -#include -#include std::unique_ptr get_cudf_table() { diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp index 7ba586461dc..29aa928c277 100644 --- a/cpp/tests/interop/to_arrow_device_test.cpp +++ b/cpp/tests/interop/to_arrow_device_test.cpp @@ -17,21 +17,15 @@ #include "nanoarrow_utils.hpp" #include -#include #include -#include -#include #include #include -#include -#include #include #include #include #include #include -#include #include #include diff --git a/cpp/tests/interop/to_arrow_host_test.cpp b/cpp/tests/interop/to_arrow_host_test.cpp index fcb4433b42e..fa3aa82fee2 100644 --- a/cpp/tests/interop/to_arrow_host_test.cpp +++ b/cpp/tests/interop/to_arrow_host_test.cpp @@ -17,20 +17,14 @@ #include "nanoarrow_utils.hpp" #include -#include #include -#include -#include #include #include #include #include -#include #include #include -#include -#include #include #include #include diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index a6aa4b22eca..86295d8efb1 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -19,14 +19,12 @@ #include #include #include -#include #include #include #include #include #include -#include #include #include #include diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index b265dcf9273..cc1e367d114 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -17,14 +17,12 @@ #include #include #include -#include #include #include #include #include #include -#include #include #include #include @@ -32,18 +30,12 @@ #include #include #include -#include -#include #include -#include #include -#include - #include #include -#include #include #include #include diff --git a/cpp/tests/io/file_io_test.cpp b/cpp/tests/io/file_io_test.cpp index 3c41f21b0a4..1b85541687a 100644 --- a/cpp/tests/io/file_io_test.cpp +++ b/cpp/tests/io/file_io_test.cpp @@ -15,13 +15,10 @@ */ #include 
-#include #include #include -#include - // Base test fixture for tests struct CuFileIOTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/io/json/json_quote_normalization_test.cpp b/cpp/tests/io/json/json_quote_normalization_test.cpp index d23acf3ae00..c8c2d18903f 100644 --- a/cpp/tests/io/json/json_quote_normalization_test.cpp +++ b/cpp/tests/io/json/json_quote_normalization_test.cpp @@ -20,7 +20,6 @@ #include #include -#include #include #include #include @@ -29,7 +28,6 @@ #include #include -#include #include diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index cb6716f4a18..5f070bd53b9 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -39,8 +39,6 @@ #include -#include - #include #include #include diff --git a/cpp/tests/io/json/json_tree.cpp b/cpp/tests/io/json/json_tree.cpp index 15682c6ae6b..887d4fa783f 100644 --- a/cpp/tests/io/json/json_tree.cpp +++ b/cpp/tests/io/json/json_tree.cpp @@ -15,12 +15,8 @@ */ #include "io/json/nested_json.hpp" -#include "io/utilities/hostdevice_vector.hpp" #include -#include -#include -#include #include #include @@ -29,9 +25,9 @@ #include #include -#include #include +#include #include #include #include diff --git a/cpp/tests/io/json/nested_json_test.cpp b/cpp/tests/io/json/nested_json_test.cpp index f32aba0e632..e0e955c4f48 100644 --- a/cpp/tests/io/json/nested_json_test.cpp +++ b/cpp/tests/io/json/nested_json_test.cpp @@ -21,24 +21,16 @@ #include #include #include -#include #include -#include #include -#include #include -#include #include -#include #include #include #include #include -#include - -#include #include #include diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu index 8ad1fea649d..5f1aea71f73 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cu +++ b/cpp/tests/io/orc_chunked_reader_test.cu @@ -1358,10 +1358,11 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) int64_t constexpr 
total_rows = num_rows * num_reps; static_assert(total_rows > std::numeric_limits::max()); - auto const it = cudf::detail::make_counting_transform_iterator(0l, [num_rows](int64_t i) { - return (i % num_rows) % static_cast(std::numeric_limits::max() / 2); - }); - auto const col = data_col(it, it + num_rows); + auto const it = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [num_rows](int64_t i) { + return (i % num_rows) % static_cast(std::numeric_limits::max() / 2); + }); + auto const col = data_col(it, it + num_rows); auto const chunk_table = cudf::table_view{{col}}; std::vector data_buffer; diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index cce0adbf317..fce99187516 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/tests/io/parquet_common.hpp b/cpp/tests/io/parquet_common.hpp index c90b81ed27a..d66aa3bde9d 100644 --- a/cpp/tests/io/parquet_common.hpp +++ b/cpp/tests/io/parquet_common.hpp @@ -22,13 +22,11 @@ #include #include -#include #include #include #include #include -#include #include #include diff --git a/cpp/tests/io/parquet_misc_test.cpp b/cpp/tests/io/parquet_misc_test.cpp index f1286a00d22..d66f685cd9c 100644 --- a/cpp/tests/io/parquet_misc_test.cpp +++ b/cpp/tests/io/parquet_misc_test.cpp @@ -20,8 +20,6 @@ #include #include -#include -#include #include diff --git a/cpp/tests/io/parquet_reader_test.cpp b/cpp/tests/io/parquet_reader_test.cpp index 7986a3c6d70..177e6163d4f 100644 --- a/cpp/tests/io/parquet_reader_test.cpp +++ b/cpp/tests/io/parquet_reader_test.cpp @@ -29,6 +29,8 @@ #include #include +#include + #include TEST_F(ParquetReaderTest, UserBounds) diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index be2ecd56424..5c3c8342cd2 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -14,7 +14,6 @@ * limitations under the License. 
*/ -#include #include // NOTE: this file exists to define the parquet test's `main()` function. diff --git a/cpp/tests/io/row_selection_test.cpp b/cpp/tests/io/row_selection_test.cpp index ebadd870091..c40d3bbd299 100644 --- a/cpp/tests/io/row_selection_test.cpp +++ b/cpp/tests/io/row_selection_test.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include diff --git a/cpp/tests/io/text/data_chunk_source_test.cpp b/cpp/tests/io/text/data_chunk_source_test.cpp index 6f46df20633..79ce908f3e0 100644 --- a/cpp/tests/io/text/data_chunk_source_test.cpp +++ b/cpp/tests/io/text/data_chunk_source_test.cpp @@ -15,14 +15,11 @@ */ #include -#include #include #include #include -#include - #include #include diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp index 74d08061df9..60244462e2c 100644 --- a/cpp/tests/io/text/multibyte_split_test.cpp +++ b/cpp/tests/io/text/multibyte_split_test.cpp @@ -19,16 +19,12 @@ #include #include #include -#include -#include #include -#include #include #include #include #include -#include #include using cudf::test::strings_column_wrapper; diff --git a/cpp/tests/iterator/value_iterator.cpp b/cpp/tests/iterator/value_iterator.cpp index 22bc7475dbe..f7f7c0f2721 100644 --- a/cpp/tests/iterator/value_iterator.cpp +++ b/cpp/tests/iterator/value_iterator.cpp @@ -13,7 +13,6 @@ * the License. 
*/ -#include #include CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/jit/parse_ptx_function.cpp b/cpp/tests/jit/parse_ptx_function.cpp index 6f9dfd06730..c9bb691907a 100644 --- a/cpp/tests/jit/parse_ptx_function.cpp +++ b/cpp/tests/jit/parse_ptx_function.cpp @@ -16,7 +16,6 @@ #include "jit/parser.hpp" -#include #include #include diff --git a/cpp/tests/join/cross_join_tests.cpp b/cpp/tests/join/cross_join_tests.cpp index d87f5e54153..971913443e5 100644 --- a/cpp/tests/join/cross_join_tests.cpp +++ b/cpp/tests/join/cross_join_tests.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include #include diff --git a/cpp/tests/join/distinct_join_tests.cpp b/cpp/tests/join/distinct_join_tests.cpp index 178edc52dd3..9070efa38fe 100644 --- a/cpp/tests/join/distinct_join_tests.cpp +++ b/cpp/tests/join/distinct_join_tests.cpp @@ -15,12 +15,8 @@ */ #include -#include #include -#include #include -#include -#include #include #include @@ -31,7 +27,6 @@ #include #include -#include #include template diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index 3431e941359..6a8a54c8465 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -20,17 +20,12 @@ #include #include #include -#include #include -#include #include #include -#include -#include #include #include -#include #include #include #include diff --git a/cpp/tests/join/semi_anti_join_tests.cpp b/cpp/tests/join/semi_anti_join_tests.cpp index 554d5754e39..ddc65c3f379 100644 --- a/cpp/tests/join/semi_anti_join_tests.cpp +++ b/cpp/tests/join/semi_anti_join_tests.cpp @@ -22,7 +22,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/json/json_tests.cpp b/cpp/tests/json/json_tests.cpp index 42a574ac5c0..53166e04173 100644 --- a/cpp/tests/json/json_tests.cpp +++ b/cpp/tests/json/json_tests.cpp @@ -19,7 +19,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/large_strings/large_strings_fixture.cpp 
b/cpp/tests/large_strings/large_strings_fixture.cpp index 7b61be113f9..f1404990354 100644 --- a/cpp/tests/large_strings/large_strings_fixture.cpp +++ b/cpp/tests/large_strings/large_strings_fixture.cpp @@ -16,12 +16,10 @@ #include "large_strings_fixture.hpp" -#include #include #include #include -#include #include #include diff --git a/cpp/tests/large_strings/parquet_tests.cpp b/cpp/tests/large_strings/parquet_tests.cpp index 007c08ce0fb..f47782a2d02 100644 --- a/cpp/tests/large_strings/parquet_tests.cpp +++ b/cpp/tests/large_strings/parquet_tests.cpp @@ -16,8 +16,6 @@ #include "large_strings_fixture.hpp" -#include -#include #include #include diff --git a/cpp/tests/lists/contains_tests.cpp b/cpp/tests/lists/contains_tests.cpp index 8fb2b403051..7ae7a6a7414 100644 --- a/cpp/tests/lists/contains_tests.cpp +++ b/cpp/tests/lists/contains_tests.cpp @@ -22,7 +22,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/lists/extract_tests.cpp b/cpp/tests/lists/extract_tests.cpp index 92dd5df5ec7..2c24f695c29 100644 --- a/cpp/tests/lists/extract_tests.cpp +++ b/cpp/tests/lists/extract_tests.cpp @@ -21,12 +21,8 @@ #include #include -#include -#include #include -#include - #include #include #include diff --git a/cpp/tests/lists/sequences_tests.cpp b/cpp/tests/lists/sequences_tests.cpp index 74545903eb3..dcb906cd2ef 100644 --- a/cpp/tests/lists/sequences_tests.cpp +++ b/cpp/tests/lists/sequences_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/lists/stream_compaction/apply_boolean_mask_tests.cpp b/cpp/tests/lists/stream_compaction/apply_boolean_mask_tests.cpp index 5625b47e7ea..18aa118bb81 100644 --- a/cpp/tests/lists/stream_compaction/apply_boolean_mask_tests.cpp +++ b/cpp/tests/lists/stream_compaction/apply_boolean_mask_tests.cpp @@ -20,8 +20,6 @@ #include #include -#include -#include #include namespace cudf::test { diff --git a/cpp/tests/merge/merge_dictionary_test.cpp 
b/cpp/tests/merge/merge_dictionary_test.cpp index dd528c19e4e..1d7a31fd797 100644 --- a/cpp/tests/merge/merge_dictionary_test.cpp +++ b/cpp/tests/merge/merge_dictionary_test.cpp @@ -17,9 +17,7 @@ #include #include #include -#include -#include #include #include #include diff --git a/cpp/tests/merge/merge_string_test.cpp b/cpp/tests/merge/merge_string_test.cpp index bea044496b3..d9fdb6099f0 100644 --- a/cpp/tests/merge/merge_string_test.cpp +++ b/cpp/tests/merge/merge_string_test.cpp @@ -17,10 +17,8 @@ #include #include #include -#include #include -#include #include #include #include @@ -30,10 +28,6 @@ #include -#include -#include -#include -#include #include #include diff --git a/cpp/tests/merge/merge_test.cpp b/cpp/tests/merge/merge_test.cpp index 6208d395f0a..fad390105d7 100644 --- a/cpp/tests/merge/merge_test.cpp +++ b/cpp/tests/merge/merge_test.cpp @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -34,7 +33,6 @@ #include #include -#include #include diff --git a/cpp/tests/partitioning/round_robin_test.cpp b/cpp/tests/partitioning/round_robin_test.cpp index 89d23c39dca..3693cfbcc72 100644 --- a/cpp/tests/partitioning/round_robin_test.cpp +++ b/cpp/tests/partitioning/round_robin_test.cpp @@ -17,10 +17,8 @@ #include #include #include -#include #include -#include #include #include #include @@ -30,12 +28,7 @@ #include -#include -#include -#include -#include #include -#include #include using cudf::test::fixed_width_column_wrapper; diff --git a/cpp/tests/quantiles/quantile_test.cpp b/cpp/tests/quantiles/quantile_test.cpp index 6e88365b6e8..23b58618fe1 100644 --- a/cpp/tests/quantiles/quantile_test.cpp +++ b/cpp/tests/quantiles/quantile_test.cpp @@ -22,7 +22,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/quantiles/quantiles_test.cpp b/cpp/tests/quantiles/quantiles_test.cpp index 44d4ec61852..c7e11af8c85 100644 --- a/cpp/tests/quantiles/quantiles_test.cpp +++ b/cpp/tests/quantiles/quantiles_test.cpp @@ -16,7 +16,6 @@ 
#include #include -#include #include #include diff --git a/cpp/tests/reductions/ewm_tests.cpp b/cpp/tests/reductions/ewm_tests.cpp index 09cec688509..1117b0d1acf 100644 --- a/cpp/tests/reductions/ewm_tests.cpp +++ b/cpp/tests/reductions/ewm_tests.cpp @@ -18,9 +18,7 @@ #include #include -#include -#include #include template diff --git a/cpp/tests/reductions/list_rank_test.cpp b/cpp/tests/reductions/list_rank_test.cpp index f5470f7d881..736b5081d8f 100644 --- a/cpp/tests/reductions/list_rank_test.cpp +++ b/cpp/tests/reductions/list_rank_test.cpp @@ -14,14 +14,9 @@ * limitations under the License. */ -#include - #include #include -#include -#include -#include #include struct ListRankScanTest : public cudf::test::BaseFixture { diff --git a/cpp/tests/reductions/rank_tests.cpp b/cpp/tests/reductions/rank_tests.cpp index 3ab1fc01eaa..19633211192 100644 --- a/cpp/tests/reductions/rank_tests.cpp +++ b/cpp/tests/reductions/rank_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,7 +21,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp index bdb98372836..c09cde8f9e4 100644 --- a/cpp/tests/reductions/reduction_tests.cpp +++ b/cpp/tests/reductions/reduction_tests.cpp @@ -22,9 +22,7 @@ #include #include -#include #include -#include #include #include #include @@ -33,11 +31,9 @@ #include #include -#include #include #include -#include #include #include diff --git a/cpp/tests/reductions/scan_tests.cpp b/cpp/tests/reductions/scan_tests.cpp index c4463d68a68..72d92c5ac53 100644 --- a/cpp/tests/reductions/scan_tests.cpp +++ b/cpp/tests/reductions/scan_tests.cpp @@ -20,13 +20,11 @@ #include #include -#include #include #include #include #include -#include #include #include diff --git a/cpp/tests/reductions/scan_tests.hpp b/cpp/tests/reductions/scan_tests.hpp index 858697d8ef5..c2cce4bbbfa 100644 --- a/cpp/tests/reductions/scan_tests.hpp +++ b/cpp/tests/reductions/scan_tests.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,9 +20,7 @@ #include #include -#include #include -#include #include #include @@ -30,7 +28,6 @@ #include #include -#include template struct TypeParam_to_host_type { diff --git a/cpp/tests/replace/clamp_test.cpp b/cpp/tests/replace/clamp_test.cpp index 239c9ce6ddd..e972ea35ed0 100644 --- a/cpp/tests/replace/clamp_test.cpp +++ b/cpp/tests/replace/clamp_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/replace/normalize_replace_tests.cpp b/cpp/tests/replace/normalize_replace_tests.cpp index 2de17388ee8..c35f385329a 100644 --- a/cpp/tests/replace/normalize_replace_tests.cpp +++ b/cpp/tests/replace/normalize_replace_tests.cpp @@ -19,7 +19,6 @@ #include #include -#include #include // This is the main test fixture diff --git a/cpp/tests/replace/replace_nans_tests.cpp b/cpp/tests/replace/replace_nans_tests.cpp index 35232204db7..1b9fe92066a 100644 --- a/cpp/tests/replace/replace_nans_tests.cpp +++ b/cpp/tests/replace/replace_nans_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/replace/replace_nulls_tests.cpp b/cpp/tests/replace/replace_nulls_tests.cpp index fcee27305f2..0c8ccea52a6 100644 --- a/cpp/tests/replace/replace_nulls_tests.cpp +++ b/cpp/tests/replace/replace_nulls_tests.cpp @@ -20,13 +20,11 @@ #include #include #include -#include #include #include #include #include -#include #include #include #include diff --git a/cpp/tests/replace/replace_tests.cpp b/cpp/tests/replace/replace_tests.cpp index b12bf08520f..ae4041bcfaf 100644 --- a/cpp/tests/replace/replace_tests.cpp +++ b/cpp/tests/replace/replace_tests.cpp @@ -20,20 +20,16 @@ #include #include #include -#include #include #include #include #include -#include -#include #include #include #include #include -#include #include diff --git a/cpp/tests/reshape/byte_cast_tests.cpp b/cpp/tests/reshape/byte_cast_tests.cpp index b3d9b2e2f5f..59585c0e947 100644 --- a/cpp/tests/reshape/byte_cast_tests.cpp +++ 
b/cpp/tests/reshape/byte_cast_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/reshape/tile_tests.cpp b/cpp/tests/reshape/tile_tests.cpp index ed76b9d2ea5..25cfc5c5108 100644 --- a/cpp/tests/reshape/tile_tests.cpp +++ b/cpp/tests/reshape/tile_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,7 +15,6 @@ */ #include -#include #include #include #include diff --git a/cpp/tests/rolling/collect_ops_test.cpp b/cpp/tests/rolling/collect_ops_test.cpp index 165e0347785..e8a36d9ab48 100644 --- a/cpp/tests/rolling/collect_ops_test.cpp +++ b/cpp/tests/rolling/collect_ops_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/rolling/empty_input_test.cpp b/cpp/tests/rolling/empty_input_test.cpp index e7d1e3f0b10..2e1815671a9 100644 --- a/cpp/tests/rolling/empty_input_test.cpp +++ b/cpp/tests/rolling/empty_input_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,9 +15,7 @@ */ #include -#include #include -#include #include #include diff --git a/cpp/tests/rolling/grouped_rolling_range_test.cpp b/cpp/tests/rolling/grouped_rolling_range_test.cpp index fcfbd0eee78..2cb9b60000b 100644 --- a/cpp/tests/rolling/grouped_rolling_range_test.cpp +++ b/cpp/tests/rolling/grouped_rolling_range_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,21 +17,16 @@ #include #include #include -#include #include #include #include #include -#include -#include #include #include #include #include -#include -#include #include #include diff --git a/cpp/tests/rolling/grouped_rolling_test.cpp b/cpp/tests/rolling/grouped_rolling_test.cpp index 78d5daf7e83..78b444bcd93 100644 --- a/cpp/tests/rolling/grouped_rolling_test.cpp +++ b/cpp/tests/rolling/grouped_rolling_test.cpp @@ -19,7 +19,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/rolling/lead_lag_test.cpp b/cpp/tests/rolling/lead_lag_test.cpp index de057e96320..6519b0ed4ee 100644 --- a/cpp/tests/rolling/lead_lag_test.cpp +++ b/cpp/tests/rolling/lead_lag_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,6 @@ #include #include #include -#include #include #include @@ -26,7 +25,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/rolling/nth_element_test.cpp b/cpp/tests/rolling/nth_element_test.cpp index 2444992e68f..5f2b383ed55 100644 --- a/cpp/tests/rolling/nth_element_test.cpp +++ b/cpp/tests/rolling/nth_element_test.cpp @@ -17,22 +17,15 @@ #include #include #include -#include #include #include #include -#include -#include #include -#include - #include #include -#include - #include #include diff --git a/cpp/tests/rolling/offset_row_window_test.cpp b/cpp/tests/rolling/offset_row_window_test.cpp index 0eaab0c9f7a..dcaa47e722b 100644 --- a/cpp/tests/rolling/offset_row_window_test.cpp +++ b/cpp/tests/rolling/offset_row_window_test.cpp @@ -17,14 +17,10 @@ #include #include #include -#include #include #include -#include -#include #include -#include template using fwcw = cudf::test::fixed_width_column_wrapper; diff --git a/cpp/tests/rolling/range_rolling_window_test.cpp 
b/cpp/tests/rolling/range_rolling_window_test.cpp index 461c41025e9..daf5fcc1d96 100644 --- a/cpp/tests/rolling/range_rolling_window_test.cpp +++ b/cpp/tests/rolling/range_rolling_window_test.cpp @@ -17,22 +17,17 @@ #include #include #include -#include #include #include -#include #include #include -#include -#include #include #include #include #include -#include #include #include diff --git a/cpp/tests/rolling/range_window_bounds_test.cpp b/cpp/tests/rolling/range_window_bounds_test.cpp index b77451bf0bc..a67555280f4 100644 --- a/cpp/tests/rolling/range_window_bounds_test.cpp +++ b/cpp/tests/rolling/range_window_bounds_test.cpp @@ -15,9 +15,6 @@ */ #include -#include -#include -#include #include #include @@ -25,8 +22,6 @@ #include -#include - struct RangeWindowBoundsTest : public cudf::test::BaseFixture {}; template diff --git a/cpp/tests/rolling/rolling_test.cpp b/cpp/tests/rolling/rolling_test.cpp index 6e0dc16dca9..72a511fd5f1 100644 --- a/cpp/tests/rolling/rolling_test.cpp +++ b/cpp/tests/rolling/rolling_test.cpp @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -30,7 +29,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/scalar/factories_test.cpp b/cpp/tests/scalar/factories_test.cpp index 5f132f3ace9..26987ea1b7b 100644 --- a/cpp/tests/scalar/factories_test.cpp +++ b/cpp/tests/scalar/factories_test.cpp @@ -22,11 +22,8 @@ #include #include -#include #include -#include - class ScalarFactoryTest : public cudf::test::BaseFixture {}; template diff --git a/cpp/tests/search/search_dictionary_test.cpp b/cpp/tests/search/search_dictionary_test.cpp index 78f79ccc648..a3bb1dfda10 100644 --- a/cpp/tests/search/search_dictionary_test.cpp +++ b/cpp/tests/search/search_dictionary_test.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include diff --git a/cpp/tests/search/search_list_test.cpp b/cpp/tests/search/search_list_test.cpp index 7584003e800..fb5d0fcc889 100644 --- 
a/cpp/tests/search/search_list_test.cpp +++ b/cpp/tests/search/search_list_test.cpp @@ -20,7 +20,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/search/search_struct_test.cpp b/cpp/tests/search/search_struct_test.cpp index c35d359e75c..05b9deb3463 100644 --- a/cpp/tests/search/search_struct_test.cpp +++ b/cpp/tests/search/search_struct_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,7 +20,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/search/search_test.cpp b/cpp/tests/search/search_test.cpp index 7550cc27161..8d750be5677 100644 --- a/cpp/tests/search/search_test.cpp +++ b/cpp/tests/search/search_test.cpp @@ -20,7 +20,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/sort/is_sorted_tests.cpp b/cpp/tests/sort/is_sorted_tests.cpp index 109095192f9..e3c9f8d349e 100644 --- a/cpp/tests/sort/is_sorted_tests.cpp +++ b/cpp/tests/sort/is_sorted_tests.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include #include diff --git a/cpp/tests/sort/rank_test.cpp b/cpp/tests/sort/rank_test.cpp index e08a2105aea..ded46cb1f31 100644 --- a/cpp/tests/sort/rank_test.cpp +++ b/cpp/tests/sort/rank_test.cpp @@ -18,10 +18,8 @@ #include #include #include -#include #include -#include #include #include #include diff --git a/cpp/tests/sort/sort_nested_types_tests.cpp b/cpp/tests/sort/sort_nested_types_tests.cpp index 8ab23936ceb..ce4148a941e 100644 --- a/cpp/tests/sort/sort_nested_types_tests.cpp +++ b/cpp/tests/sort/sort_nested_types_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/sort/sort_test.cpp b/cpp/tests/sort/sort_test.cpp index 6a35e977b46..e1505c7a474 100644 --- a/cpp/tests/sort/sort_test.cpp +++ b/cpp/tests/sort/sort_test.cpp @@ -28,7 +28,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/sort/stable_sort_tests.cpp b/cpp/tests/sort/stable_sort_tests.cpp index 655166e0d62..88de9d51523 100644 --- a/cpp/tests/sort/stable_sort_tests.cpp +++ b/cpp/tests/sort/stable_sort_tests.cpp @@ -25,9 +25,6 @@ #include #include -#include -#include - #include #include diff --git a/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp b/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp index 6c0582fb846..1204b019739 100644 --- a/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp +++ b/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp @@ -20,9 +20,7 @@ #include #include #include -#include -#include #include #include #include @@ -31,8 +29,6 @@ #include #include -#include -#include #include struct ApplyBooleanMask : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/stream_compaction/distinct_count_tests.cpp b/cpp/tests/stream_compaction/distinct_count_tests.cpp index a2dab649961..ee1bb3ead92 100644 --- a/cpp/tests/stream_compaction/distinct_count_tests.cpp +++ b/cpp/tests/stream_compaction/distinct_count_tests.cpp @@ -15,16 +15,11 @@ */ #include -#include #include #include -#include #include -#include -#include #include -#include #include #include diff --git a/cpp/tests/stream_compaction/distinct_tests.cpp b/cpp/tests/stream_compaction/distinct_tests.cpp index 14d7d8789ac..c618ff68cbb 100644 --- a/cpp/tests/stream_compaction/distinct_tests.cpp +++ b/cpp/tests/stream_compaction/distinct_tests.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include #include @@ -27,8 
+26,6 @@ #include #include -#include - auto constexpr null{0}; // null at current level auto constexpr XXX{0}; // null pushed down from parent level auto constexpr NaN = std::numeric_limits::quiet_NaN(); diff --git a/cpp/tests/stream_compaction/drop_nans_tests.cpp b/cpp/tests/stream_compaction/drop_nans_tests.cpp index bf72da5c840..71321361564 100644 --- a/cpp/tests/stream_compaction/drop_nans_tests.cpp +++ b/cpp/tests/stream_compaction/drop_nans_tests.cpp @@ -15,12 +15,9 @@ */ #include -#include #include #include -#include -#include #include #include #include diff --git a/cpp/tests/stream_compaction/drop_nulls_tests.cpp b/cpp/tests/stream_compaction/drop_nulls_tests.cpp index dbac1d58195..d3b45c2323e 100644 --- a/cpp/tests/stream_compaction/drop_nulls_tests.cpp +++ b/cpp/tests/stream_compaction/drop_nulls_tests.cpp @@ -15,12 +15,10 @@ */ #include -#include #include #include #include -#include #include #include #include diff --git a/cpp/tests/stream_compaction/stable_distinct_tests.cpp b/cpp/tests/stream_compaction/stable_distinct_tests.cpp index 6c6c53331d4..cc847da6340 100644 --- a/cpp/tests/stream_compaction/stable_distinct_tests.cpp +++ b/cpp/tests/stream_compaction/stable_distinct_tests.cpp @@ -15,20 +15,16 @@ */ #include -#include #include #include #include #include -#include #include #include #include #include -#include - auto constexpr null{0}; // null at current level auto constexpr XXX{0}; // null pushed down from parent level auto constexpr NaN = std::numeric_limits::quiet_NaN(); diff --git a/cpp/tests/stream_compaction/unique_count_tests.cpp b/cpp/tests/stream_compaction/unique_count_tests.cpp index 640d159fc4f..bad93e92712 100644 --- a/cpp/tests/stream_compaction/unique_count_tests.cpp +++ b/cpp/tests/stream_compaction/unique_count_tests.cpp @@ -15,16 +15,11 @@ */ #include -#include #include #include -#include #include -#include -#include #include -#include #include #include diff --git a/cpp/tests/stream_compaction/unique_tests.cpp 
b/cpp/tests/stream_compaction/unique_tests.cpp index d5b6915b520..e2b32b898b3 100644 --- a/cpp/tests/stream_compaction/unique_tests.cpp +++ b/cpp/tests/stream_compaction/unique_tests.cpp @@ -15,22 +15,16 @@ */ #include -#include #include #include #include -#include #include -#include #include #include #include #include -#include -#include - using cudf::nan_policy; using cudf::null_equality; using cudf::null_policy; diff --git a/cpp/tests/streams/binaryop_test.cpp b/cpp/tests/streams/binaryop_test.cpp index 2a7b52b1b6b..3dcc6f9e632 100644 --- a/cpp/tests/streams/binaryop_test.cpp +++ b/cpp/tests/streams/binaryop_test.cpp @@ -21,7 +21,6 @@ #include #include -#include #include class BinaryopTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/streams/io/csv_test.cpp b/cpp/tests/streams/io/csv_test.cpp index 42894a0ebcb..a74ee64f8de 100644 --- a/cpp/tests/streams/io/csv_test.cpp +++ b/cpp/tests/streams/io/csv_test.cpp @@ -17,13 +17,9 @@ #include #include #include -#include #include -#include -#include #include -#include #include #include diff --git a/cpp/tests/streams/io/json_test.cpp b/cpp/tests/streams/io/json_test.cpp index f98e685ed0c..d352c6c3b2a 100644 --- a/cpp/tests/streams/io/json_test.cpp +++ b/cpp/tests/streams/io/json_test.cpp @@ -19,9 +19,7 @@ #include #include -#include #include -#include #include #include diff --git a/cpp/tests/streams/io/multibyte_split_test.cpp b/cpp/tests/streams/io/multibyte_split_test.cpp index b0eff1d3340..5bb17226029 100644 --- a/cpp/tests/streams/io/multibyte_split_test.cpp +++ b/cpp/tests/streams/io/multibyte_split_test.cpp @@ -17,7 +17,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/streams/io/orc_test.cpp b/cpp/tests/streams/io/orc_test.cpp index cc43bf15b5d..10722557e6a 100644 --- a/cpp/tests/streams/io/orc_test.cpp +++ b/cpp/tests/streams/io/orc_test.cpp @@ -17,19 +17,11 @@ #include #include #include -#include -#include #include #include -#include #include -#include -#include -#include 
-#include -#include #include #include diff --git a/cpp/tests/streams/io/parquet_test.cpp b/cpp/tests/streams/io/parquet_test.cpp index 9d2dec2d697..18bb80e64af 100644 --- a/cpp/tests/streams/io/parquet_test.cpp +++ b/cpp/tests/streams/io/parquet_test.cpp @@ -17,13 +17,9 @@ #include #include #include -#include -#include #include #include -#include -#include #include #include diff --git a/cpp/tests/streams/join_test.cpp b/cpp/tests/streams/join_test.cpp index 2811bb676fa..27bd7e080c9 100644 --- a/cpp/tests/streams/join_test.cpp +++ b/cpp/tests/streams/join_test.cpp @@ -19,11 +19,9 @@ #include #include -#include #include #include #include -#include #include #include diff --git a/cpp/tests/streams/null_mask_test.cpp b/cpp/tests/streams/null_mask_test.cpp index e96224003f4..ed37a72545f 100644 --- a/cpp/tests/streams/null_mask_test.cpp +++ b/cpp/tests/streams/null_mask_test.cpp @@ -14,15 +14,12 @@ * limitations under the License. */ -#include - #include #include #include #include #include -#include class NullMaskTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/streams/reduction_test.cpp b/cpp/tests/streams/reduction_test.cpp index b4f013fc960..9ab972302e4 100644 --- a/cpp/tests/streams/reduction_test.cpp +++ b/cpp/tests/streams/reduction_test.cpp @@ -17,11 +17,8 @@ #include #include #include -#include -#include #include -#include #include #include diff --git a/cpp/tests/streams/rolling_test.cpp b/cpp/tests/streams/rolling_test.cpp index b352ad2c0d2..4d9899870b4 100644 --- a/cpp/tests/streams/rolling_test.cpp +++ b/cpp/tests/streams/rolling_test.cpp @@ -17,12 +17,10 @@ #include #include #include -#include #include #include #include -#include class RollingTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/streams/stream_compaction_test.cpp b/cpp/tests/streams/stream_compaction_test.cpp index 07b2d77cc04..e7b282601e1 100644 --- a/cpp/tests/streams/stream_compaction_test.cpp +++ b/cpp/tests/streams/stream_compaction_test.cpp @@ -15,20 +15,16 @@ 
*/ #include -#include #include #include #include -#include #include #include #include #include #include -#include - auto constexpr NaN = std::numeric_limits::quiet_NaN(); auto constexpr KEEP_ANY = cudf::duplicate_keep_option::KEEP_ANY; auto constexpr KEEP_FIRST = cudf::duplicate_keep_option::KEEP_FIRST; diff --git a/cpp/tests/streams/strings/factory_test.cpp b/cpp/tests/streams/strings/factory_test.cpp index 36e595ab9fa..449e0830b0c 100644 --- a/cpp/tests/streams/strings/factory_test.cpp +++ b/cpp/tests/streams/strings/factory_test.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include diff --git a/cpp/tests/streams/strings/reverse_test.cpp b/cpp/tests/streams/strings/reverse_test.cpp index 4b4d0a7aff5..154e1c1b715 100644 --- a/cpp/tests/streams/strings/reverse_test.cpp +++ b/cpp/tests/streams/strings/reverse_test.cpp @@ -21,7 +21,6 @@ #include #include -#include class StringsReverseTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/streams/transform_test.cpp b/cpp/tests/streams/transform_test.cpp index cf81dc6fb42..9f168abcb31 100644 --- a/cpp/tests/streams/transform_test.cpp +++ b/cpp/tests/streams/transform_test.cpp @@ -15,17 +15,11 @@ */ #include -#include #include #include -#include -#include #include -#include -#include #include -#include #include #include diff --git a/cpp/tests/strings/array_tests.cpp b/cpp/tests/strings/array_tests.cpp index 9c0ecaa52c0..06b9c2fa3c1 100644 --- a/cpp/tests/strings/array_tests.cpp +++ b/cpp/tests/strings/array_tests.cpp @@ -23,10 +23,8 @@ #include #include #include -#include #include #include -#include #include #include diff --git a/cpp/tests/strings/combine/concatenate_tests.cpp b/cpp/tests/strings/combine/concatenate_tests.cpp index bb57d6f5e8a..e53adcf373a 100644 --- a/cpp/tests/strings/combine/concatenate_tests.cpp +++ b/cpp/tests/strings/combine/concatenate_tests.cpp @@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git 
a/cpp/tests/strings/combine/join_list_elements_tests.cpp b/cpp/tests/strings/combine/join_list_elements_tests.cpp index 00317146088..c92f1cfc8f8 100644 --- a/cpp/tests/strings/combine/join_list_elements_tests.cpp +++ b/cpp/tests/strings/combine/join_list_elements_tests.cpp @@ -22,7 +22,6 @@ #include #include #include -#include using namespace cudf::test::iterators; diff --git a/cpp/tests/strings/concatenate_tests.cpp b/cpp/tests/strings/concatenate_tests.cpp index 5cf4015b9e9..51dcc60d95e 100644 --- a/cpp/tests/strings/concatenate_tests.cpp +++ b/cpp/tests/strings/concatenate_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,7 +20,6 @@ #include #include -#include #include diff --git a/cpp/tests/strings/datetime_tests.cpp b/cpp/tests/strings/datetime_tests.cpp index b3dc3010c67..da0db0fc056 100644 --- a/cpp/tests/strings/datetime_tests.cpp +++ b/cpp/tests/strings/datetime_tests.cpp @@ -19,7 +19,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp index 7e0338f1bf4..37b25d9b287 100644 --- a/cpp/tests/strings/extract_tests.cpp +++ b/cpp/tests/strings/extract_tests.cpp @@ -21,7 +21,6 @@ #include #include -#include #include #include #include diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp index 4821a7fa999..7eb4b32d078 100644 --- a/cpp/tests/strings/findall_tests.cpp +++ b/cpp/tests/strings/findall_tests.cpp @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -28,8 +27,6 @@ #include -#include - struct StringsFindallTests : public cudf::test::BaseFixture {}; TEST_F(StringsFindallTests, FindallTest) diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp index 
79054551498..b788c05c152 100644 --- a/cpp/tests/strings/fixed_point_tests.cpp +++ b/cpp/tests/strings/fixed_point_tests.cpp @@ -23,8 +23,6 @@ #include #include -#include - struct StringsConvertTest : public cudf::test::BaseFixture {}; template diff --git a/cpp/tests/strings/integers_tests.cpp b/cpp/tests/strings/integers_tests.cpp index 26bcfe8028d..c08effdb969 100644 --- a/cpp/tests/strings/integers_tests.cpp +++ b/cpp/tests/strings/integers_tests.cpp @@ -24,9 +24,6 @@ #include #include -#include -#include - #include #include diff --git a/cpp/tests/structs/structs_column_tests.cpp b/cpp/tests/structs/structs_column_tests.cpp index 219bd6d8b01..a34ff25cb69 100644 --- a/cpp/tests/structs/structs_column_tests.cpp +++ b/cpp/tests/structs/structs_column_tests.cpp @@ -17,28 +17,18 @@ #include #include #include -#include #include #include #include -#include #include -#include #include -#include -#include -#include -#include #include #include #include -#include #include -#include -#include #include #include diff --git a/cpp/tests/structs/utilities_tests.cpp b/cpp/tests/structs/utilities_tests.cpp index c33eedf9bd9..c0df2f01a63 100644 --- a/cpp/tests/structs/utilities_tests.cpp +++ b/cpp/tests/structs/utilities_tests.cpp @@ -14,21 +14,15 @@ * limitations under the License. 
*/ -#include "cudf_test/default_stream.hpp" - #include #include #include -#include #include #include #include -#include -#include #include #include -#include #include #include diff --git a/cpp/tests/table/row_operators_tests.cpp b/cpp/tests/table/row_operators_tests.cpp index 5fa63c47cf0..216c4d7b6bb 100644 --- a/cpp/tests/table/row_operators_tests.cpp +++ b/cpp/tests/table/row_operators_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/table/table_tests.cpp b/cpp/tests/table/table_tests.cpp index 1637ba7d7d3..363f1a0ba5d 100644 --- a/cpp/tests/table/table_tests.cpp +++ b/cpp/tests/table/table_tests.cpp @@ -17,17 +17,14 @@ #include #include #include -#include #include #include #include -#include #include #include #include -#include template using column_wrapper = cudf::test::fixed_width_column_wrapper; diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index e23f3f6e7d8..ef35a4472cf 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -21,13 +21,9 @@ #include #include -#include #include -#include -#include - #include struct MinHashTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/text/ngrams_tests.cpp b/cpp/tests/text/ngrams_tests.cpp index 1acb4fc4265..c72c7cfc80e 100644 --- a/cpp/tests/text/ngrams_tests.cpp +++ b/cpp/tests/text/ngrams_tests.cpp @@ -28,8 +28,6 @@ #include -#include - struct TextGenerateNgramsTest : public cudf::test::BaseFixture {}; TEST_F(TextGenerateNgramsTest, Ngrams) diff --git a/cpp/tests/text/normalize_tests.cpp b/cpp/tests/text/normalize_tests.cpp index b0d41004e7e..2515cc917fa 100644 --- a/cpp/tests/text/normalize_tests.cpp +++ b/cpp/tests/text/normalize_tests.cpp @@ -20,7 +20,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/text/stemmer_tests.cpp b/cpp/tests/text/stemmer_tests.cpp index a343913411c..82c4bf53cfc 100644 --- a/cpp/tests/text/stemmer_tests.cpp +++ 
b/cpp/tests/text/stemmer_tests.cpp @@ -19,7 +19,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/text/subword_tests.cpp b/cpp/tests/text/subword_tests.cpp index a615780c02a..782551ad66e 100644 --- a/cpp/tests/text/subword_tests.cpp +++ b/cpp/tests/text/subword_tests.cpp @@ -19,13 +19,11 @@ #include #include -#include #include #include #include -#include #include // Global environment for temporary files diff --git a/cpp/tests/transform/bools_to_mask_test.cpp b/cpp/tests/transform/bools_to_mask_test.cpp index 2684123c08a..9437440f34d 100644 --- a/cpp/tests/transform/bools_to_mask_test.cpp +++ b/cpp/tests/transform/bools_to_mask_test.cpp @@ -20,10 +20,8 @@ #include #include -#include #include #include -#include #include diff --git a/cpp/tests/transform/nans_to_null_test.cpp b/cpp/tests/transform/nans_to_null_test.cpp index ba16c100e7a..42ca872a936 100644 --- a/cpp/tests/transform/nans_to_null_test.cpp +++ b/cpp/tests/transform/nans_to_null_test.cpp @@ -17,12 +17,10 @@ #include #include #include -#include #include #include #include -#include template struct NaNsToNullTest : public cudf::test::BaseFixture { diff --git a/cpp/tests/transpose/transpose_test.cpp b/cpp/tests/transpose/transpose_test.cpp index 5a88c402b8c..7797b2b2cf8 100644 --- a/cpp/tests/transpose/transpose_test.cpp +++ b/cpp/tests/transpose/transpose_test.cpp @@ -22,7 +22,6 @@ #include #include -#include #include #include diff --git a/cpp/tests/types/traits_test.cpp b/cpp/tests/types/traits_test.cpp index 0d9092c33da..46468af515d 100644 --- a/cpp/tests/types/traits_test.cpp +++ b/cpp/tests/types/traits_test.cpp @@ -14,7 +14,6 @@ * limitations under the License. 
*/ -#include #include #include diff --git a/cpp/tests/unary/cast_tests.cpp b/cpp/tests/unary/cast_tests.cpp index 45b89b76070..ed4c1340dbb 100644 --- a/cpp/tests/unary/cast_tests.cpp +++ b/cpp/tests/unary/cast_tests.cpp @@ -20,18 +20,15 @@ #include #include -#include #include #include #include #include #include -#include #include #include -#include #include static auto const test_timestamps_D = std::vector{ diff --git a/cpp/tests/unary/math_ops_test.cpp b/cpp/tests/unary/math_ops_test.cpp index 5bfbf70d5f9..663a919f3f4 100644 --- a/cpp/tests/unary/math_ops_test.cpp +++ b/cpp/tests/unary/math_ops_test.cpp @@ -22,10 +22,6 @@ #include #include #include -#include -#include - -#include #include diff --git a/cpp/tests/unary/unary_ops_test.cpp b/cpp/tests/unary/unary_ops_test.cpp index e7477c34642..3c616461c74 100644 --- a/cpp/tests/unary/unary_ops_test.cpp +++ b/cpp/tests/unary/unary_ops_test.cpp @@ -23,7 +23,6 @@ #include #include -#include #include template diff --git a/cpp/tests/utilities/random_seed.cpp b/cpp/tests/utilities/random_seed.cpp index ab5a31ce161..555d89b7dc5 100644 --- a/cpp/tests/utilities/random_seed.cpp +++ b/cpp/tests/utilities/random_seed.cpp @@ -13,8 +13,9 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +#include -#include +#include namespace cudf { namespace test { diff --git a/cpp/tests/utilities_tests/column_debug_tests.cpp b/cpp/tests/utilities_tests/column_debug_tests.cpp index 7aa05af4591..2a57d678d07 100644 --- a/cpp/tests/utilities_tests/column_debug_tests.cpp +++ b/cpp/tests/utilities_tests/column_debug_tests.cpp @@ -16,12 +16,9 @@ #include #include -#include #include #include -#include - #include #include diff --git a/cpp/tests/utilities_tests/column_utilities_tests.cpp b/cpp/tests/utilities_tests/column_utilities_tests.cpp index 9d6d5ccb9b5..a13ce825d0b 100644 --- a/cpp/tests/utilities_tests/column_utilities_tests.cpp +++ b/cpp/tests/utilities_tests/column_utilities_tests.cpp @@ -17,20 +17,16 @@ #include #include #include -#include #include #include #include #include #include -#include #include #include -#include - template struct ColumnUtilitiesTest : public cudf::test::BaseFixture { cudf::test::UniformRandomGenerator random; diff --git a/cpp/tests/utilities_tests/column_wrapper_tests.cpp b/cpp/tests/utilities_tests/column_wrapper_tests.cpp index 479c6687e75..339678f3be8 100644 --- a/cpp/tests/utilities_tests/column_wrapper_tests.cpp +++ b/cpp/tests/utilities_tests/column_wrapper_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/utilities_tests/lists_column_wrapper_tests.cpp b/cpp/tests/utilities_tests/lists_column_wrapper_tests.cpp index 5e3fda5e6f7..ff50dc39979 100644 --- a/cpp/tests/utilities_tests/lists_column_wrapper_tests.cpp +++ b/cpp/tests/utilities_tests/lists_column_wrapper_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/cpp/tests/utilities_tests/type_check_tests.cpp b/cpp/tests/utilities_tests/type_check_tests.cpp index fecb896f95a..c1c5776be74 100644 --- a/cpp/tests/utilities_tests/type_check_tests.cpp +++ b/cpp/tests/utilities_tests/type_check_tests.cpp @@ -18,7 +18,6 @@ #include #include -#include #include #include #include diff 
--git a/cpp/tests/utilities_tests/type_list_tests.cpp b/cpp/tests/utilities_tests/type_list_tests.cpp index 849457056e4..6c3a84763a0 100644 --- a/cpp/tests/utilities_tests/type_list_tests.cpp +++ b/cpp/tests/utilities_tests/type_list_tests.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include using namespace cudf::test; // this will make reading code way easier @@ -23,6 +22,7 @@ namespace { // Work around to remove parentheses surrounding a type template struct argument_type; + template struct argument_type { using type = U; diff --git a/dependencies.yaml b/dependencies.yaml index ff97b67f0ce..7c7aa43fa41 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -232,7 +232,7 @@ files: key: cudf-pandas-tests includes: - test_python_cudf_pandas - py_rapids_build_cudf_polars: + py_build_cudf_polars: output: pyproject pyproject_dir: python/cudf_polars extras: @@ -727,7 +727,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=1.8,<1.9 + - polars>=1.11,<1.12 run_dask_cudf: common: - output_types: [conda, requirements, pyproject] diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index ecf619ddc44..5942cc16850 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -342,10 +342,7 @@ def clean_all_xml_files(path): "cudf.Series": ("cudf.core.series.Series", "cudf.Series"), "cudf.Index": ("cudf.core.index.Index", "cudf.Index"), "cupy.core.core.ndarray": ("cupy.ndarray", "cupy.ndarray"), - # TODO: Replace the first entry in a follow-up with rmm.pylibrmm.device_buffer.DeviceBuffer - # when the RMM objects inventory is generated from branch-24.12. 
The RMM objects inventory - # can be accessed here : https://docs.rapids.ai/api/rmm/nightly/objects.inv - "DeviceBuffer": ("rmm.DeviceBuffer", "rmm.DeviceBuffer"), + "DeviceBuffer": ("rmm.pylibrmm.device_buffer.DeviceBuffer", "rmm.DeviceBuffer"), } diff --git a/docs/cudf/source/developer_guide/contributing_guide.md b/docs/cudf/source/developer_guide/contributing_guide.md index 6fce268f309..f4d2c7319b3 100644 --- a/docs/cudf/source/developer_guide/contributing_guide.md +++ b/docs/cudf/source/developer_guide/contributing_guide.md @@ -15,8 +15,7 @@ Developers are strongly recommended to set up `pre-commit` prior to any developm The `.pre-commit-config.yaml` file at the root of the repo is the primary source of truth linting. Specifically, cuDF uses the following tools: -- [`ruff`](https://beta.ruff.rs/) checks for general code formatting compliance. -- [`isort`](https://pycqa.github.io/isort/) ensures imports are sorted consistently. +- [`ruff`](https://docs.astral.sh/ruff/) checks for general code formatting compliance. - [`mypy`](http://mypy-lang.org/) performs static type checking. In conjunction with [type hints](https://docs.python.org/3/library/typing.html), `mypy` can help catch various bugs that are otherwise difficult to find. 
diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb index 95f5f9734dd..46221b6015b 100644 --- a/docs/cudf/source/user_guide/10min.ipynb +++ b/docs/cudf/source/user_guide/10min.ipynb @@ -38,10 +38,10 @@ "import os\n", "\n", "import cupy as cp\n", + "import dask_cudf\n", "import pandas as pd\n", "\n", "import cudf\n", - "import dask_cudf\n", "\n", "cp.random.seed(12)\n", "\n", diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst index 3a79c869971..e0735a197fd 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst @@ -10,3 +10,5 @@ nvtext minhash ngrams_tokenize normalize + replace + stemmer diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/replace.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/replace.rst new file mode 100644 index 00000000000..04cee972dc1 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/replace.rst @@ -0,0 +1,6 @@ +======= +replace +======= + +.. automodule:: pylibcudf.nvtext.replace + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/stemmer.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/stemmer.rst new file mode 100644 index 00000000000..b407ff8451a --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/stemmer.rst @@ -0,0 +1,6 @@ +======= +stemmer +======= + +.. 
automodule:: pylibcudf.nvtext.stemmer + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index c8c0016126d..ae670b5bd8a 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -16,6 +16,7 @@ strings regex_flags regex_program repeat + replace_re replace side_type slice diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace_re.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace_re.rst new file mode 100644 index 00000000000..5bf715ef657 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace_re.rst @@ -0,0 +1,6 @@ +========== +replace_re +========== + +.. automodule:: pylibcudf.strings.replace_re + :members: diff --git a/docs/cudf/source/user_guide/guide-to-udfs.ipynb b/docs/cudf/source/user_guide/guide-to-udfs.ipynb index 75eafcc5387..abfe5a1b178 100644 --- a/docs/cudf/source/user_guide/guide-to-udfs.ipynb +++ b/docs/cudf/source/user_guide/guide-to-udfs.ipynb @@ -101,6 +101,8 @@ "outputs": [], "source": [ "# define a scalar function\n", + "\n", + "\n", "def f(x):\n", " return x + 1" ] @@ -247,6 +249,8 @@ "outputs": [], "source": [ "# redefine the same function from above\n", + "\n", + "\n", "def f(x):\n", " return x + 1" ] @@ -1622,6 +1626,8 @@ "outputs": [], "source": [ "# a user defined aggregation function.\n", + "\n", + "\n", "def udaf(df):\n", " return df[\"b\"].max() - df[\"b\"].min() / 2" ] diff --git a/java/src/main/java/ai/rapids/cudf/HostMemoryReservation.java b/java/src/main/java/ai/rapids/cudf/HostMemoryReservation.java deleted file mode 100644 index 72c2e659372..00000000000 --- a/java/src/main/java/ai/rapids/cudf/HostMemoryReservation.java +++ /dev/null @@ -1,32 +0,0 @@ -/* - * - * Copyright (c) 2023, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -package ai.rapids.cudf; - -/** - * Represents some amount of host memory that has been reserved. A reservation guarantees that one - * or more allocations up to the reserved amount, minus padding for alignment will succeed. A - * reservation typically guarantees the amount can be allocated one, meaning when a buffer - * allocated from a reservation is freed it is not returned to the reservation, but to the pool of - * memory the reservation originally came from. If more memory is allocated from the reservation - * an OutOfMemoryError may be thrown, but it is not guaranteed to happen. - * - * When the reservation is closed any unused reservation will be returned to the pool of memory - * the reservation came from. - */ -public interface HostMemoryReservation extends HostMemoryAllocator, AutoCloseable {} diff --git a/java/src/main/java/ai/rapids/cudf/RegexFlag.java b/java/src/main/java/ai/rapids/cudf/RegexFlag.java index 7ed8e0354c9..68a3856f37d 100644 --- a/java/src/main/java/ai/rapids/cudf/RegexFlag.java +++ b/java/src/main/java/ai/rapids/cudf/RegexFlag.java @@ -28,7 +28,16 @@ public enum RegexFlag { DEFAULT(0), // default MULTILINE(8), // the '^' and '$' honor new-line characters DOTALL(16), // the '.' 
matching includes new-line characters - ASCII(256); // use only ASCII when matching built-in character classes + ASCII(256), // use only ASCII when matching built-in character classes + /** + * EXT_NEWLINE(512): Extends line delimiters to include the following Unicode characters + * - NEXT_LINE ('\u0085') + * - LINE_SEPARATOR ('\u2028') + * - PARAGRAPH_SEPARATOR ('\u2029') + * - CARRIAGE_RETURN ('\r') + * - NEW_LINE ('\n') + */ + EXT_NEWLINE(512); final int nativeId; // Native id, for use with libcudf. private RegexFlag(int nativeId) { // Only constant values should be used diff --git a/java/src/main/java/ai/rapids/cudf/Schema.java b/java/src/main/java/ai/rapids/cudf/Schema.java index 76b2799aad6..6da591d659f 100644 --- a/java/src/main/java/ai/rapids/cudf/Schema.java +++ b/java/src/main/java/ai/rapids/cudf/Schema.java @@ -29,26 +29,52 @@ public class Schema { public static final Schema INFERRED = new Schema(); private final DType topLevelType; + + /** + * Default value for precision value, when it is not specified or the column type is not decimal. + */ + private static final int UNKNOWN_PRECISION = -1; + + /** + * Store precision for the top level column, only applicable if the column is a decimal type. + *

+ * This variable is not designed to be used by any libcudf's APIs since libcudf does not support + * precisions for fixed point numbers. + * Instead, it is used only to pass down the precision values from Spark's DecimalType to the + * JNI level, where some JNI functions require these values to perform their operations. + */ + private final int topLevelPrecision; + private final List childNames; private final List childSchemas; private boolean flattened = false; private String[] flattenedNames; private DType[] flattenedTypes; + private int[] flattenedPrecisions; private int[] flattenedCounts; private Schema(DType topLevelType, + int topLevelPrecision, List childNames, List childSchemas) { this.topLevelType = topLevelType; + this.topLevelPrecision = topLevelPrecision; this.childNames = childNames; this.childSchemas = childSchemas; } + private Schema(DType topLevelType, + List childNames, + List childSchemas) { + this(topLevelType, UNKNOWN_PRECISION, childNames, childSchemas); + } + /** * Inferred schema. 
*/ private Schema() { topLevelType = null; + topLevelPrecision = UNKNOWN_PRECISION; childNames = null; childSchemas = null; } @@ -104,14 +130,17 @@ private void flattenIfNeeded() { if (flatLen == 0) { flattenedNames = null; flattenedTypes = null; + flattenedPrecisions = null; flattenedCounts = null; } else { String[] names = new String[flatLen]; DType[] types = new DType[flatLen]; + int[] precisions = new int[flatLen]; int[] counts = new int[flatLen]; - collectFlattened(names, types, counts, 0); + collectFlattened(names, types, precisions, counts, 0); flattenedNames = names; flattenedTypes = types; + flattenedPrecisions = precisions; flattenedCounts = counts; } flattened = true; @@ -128,19 +157,20 @@ private int flattenedLength(int startingLength) { return startingLength; } - private int collectFlattened(String[] names, DType[] types, int[] counts, int offset) { + private int collectFlattened(String[] names, DType[] types, int[] precisions, int[] counts, int offset) { if (childSchemas != null) { for (int i = 0; i < childSchemas.size(); i++) { Schema child = childSchemas.get(i); names[offset] = childNames.get(i); types[offset] = child.topLevelType; + precisions[offset] = child.topLevelPrecision; if (child.childNames != null) { counts[offset] = child.childNames.size(); } else { counts[offset] = 0; } offset++; - offset = this.childSchemas.get(i).collectFlattened(names, types, counts, offset); + offset = this.childSchemas.get(i).collectFlattened(names, types, precisions, counts, offset); } } return offset; @@ -226,6 +256,22 @@ public int[] getFlattenedTypeScales() { return ret; } + /** + * Get decimal precisions of the columns' types flattened from all levels in schema by + * depth-first traversal. + *

+ * This is used to pass down the decimal precisions from Spark to only the JNI layer, where + * some JNI functions require precision values to perform their operations. + * Decimal precisions should not be consumed by any libcudf's APIs since libcudf does not + * support precisions for fixed point numbers. + * + * @return An array containing decimal precision of all columns in schema. + */ + public int[] getFlattenedDecimalPrecisions() { + flattenIfNeeded(); + return flattenedPrecisions; + } + /** * Get the types of the columns in schema flattened from all levels by depth-first traversal. * @return An array containing types of all columns in schema. @@ -307,11 +353,13 @@ public HostColumnVector.DataType asHostDataType() { public static class Builder { private final DType topLevelType; + private final int topLevelPrecision; private final List names; private final List types; - private Builder(DType topLevelType) { + private Builder(DType topLevelType, int topLevelPrecision) { this.topLevelType = topLevelType; + this.topLevelPrecision = topLevelPrecision; if (topLevelType == DType.STRUCT || topLevelType == DType.LIST) { // There can be children names = new ArrayList<>(); @@ -322,14 +370,19 @@ private Builder(DType topLevelType) { } } + private Builder(DType topLevelType) { + this(topLevelType, UNKNOWN_PRECISION); + } + /** * Add a new column * @param type the type of column to add * @param name the name of the column to add (Ignored for list types) + * @param precision the decimal precision, only applicable for decimal types * @return the builder for the new column. This should really only be used when the type * passed in is a LIST or a STRUCT. 
*/ - public Builder addColumn(DType type, String name) { + public Builder addColumn(DType type, String name, int precision) { if (names == null) { throw new IllegalStateException("A column of type " + topLevelType + " cannot have children"); @@ -340,21 +393,31 @@ public Builder addColumn(DType type, String name) { if (names.contains(name)) { throw new IllegalStateException("Cannot add duplicate names to a schema"); } - Builder ret = new Builder(type); + Builder ret = new Builder(type, precision); types.add(ret); names.add(name); return ret; } + public Builder addColumn(DType type, String name) { + return addColumn(type, name, UNKNOWN_PRECISION); + } + /** * Adds a single column to the current schema. addColumn is preferred as it can be used * to support nested types. * @param type the type of the column. * @param name the name of the column. + * @param precision the decimal precision, only applicable for decimal types. * @return this for chaining. */ + public Builder column(DType type, String name, int precision) { + addColumn(type, name, precision); + return this; + } + public Builder column(DType type, String name) { - addColumn(type, name); + addColumn(type, name, UNKNOWN_PRECISION); return this; } diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 708744569df..14c290b300a 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -31,6 +31,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.EnumSet; import java.util.List; import java.util.Optional; import java.util.concurrent.atomic.AtomicInteger; @@ -3877,6 +3878,43 @@ void testExtractRe() { } } + @Test +void testExtractReWithMultiLineDelimiters() { + String NEXT_LINE = "\u0085"; + String LINE_SEPARATOR = "\u2028"; + String PARAGRAPH_SEPARATOR = "\u2029"; + String CARRIAGE_RETURN = "\r"; + String 
NEW_LINE = "\n"; + + try (ColumnVector input = ColumnVector.fromStrings( + "boo:" + NEXT_LINE + "boo::" + LINE_SEPARATOR + "boo:::", + "boo:::" + LINE_SEPARATOR + "zzé" + CARRIAGE_RETURN + "lll", + "boo::", + "", + "boo::" + NEW_LINE, + "boo::" + CARRIAGE_RETURN, + "boo:" + NEXT_LINE + "boo::" + PARAGRAPH_SEPARATOR, + "boo:" + NEW_LINE + "boo::" + LINE_SEPARATOR, + "boo:" + NEXT_LINE + "boo::" + NEXT_LINE); + Table expected_ext_newline = new Table.TestBuilder() + .column("boo:::", null, "boo::", null, "boo::", "boo::", "boo::", "boo::", "boo::") + .build(); + Table expected_default = new Table.TestBuilder() + .column("boo:::", null, "boo::", null, "boo::", null, null, null, null) + .build()) { + + // Regex pattern to match 'boo:' followed by one or more colons at the end of the string + try (Table found = input.extractRe(new RegexProgram("(boo:+)$", EnumSet.of(RegexFlag.EXT_NEWLINE)))) { + assertColumnsAreEqual(expected_ext_newline.getColumns()[0], found.getColumns()[0]); + } + + try (Table found = input.extractRe(new RegexProgram("(boo:+)$", EnumSet.of(RegexFlag.DEFAULT)))) { + assertColumnsAreEqual(expected_default.getColumns()[0], found.getColumns()[0]); + } + } + } + + @Test void testExtractAllRecord() { String pattern = "([ab])(\\d)"; diff --git a/pyproject.toml b/pyproject.toml index 661c68ee62e..6933484f4e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,8 @@ select = [ "F", # pycodestyle Warning "W", + # isort + "I", # no-blank-line-before-function "D201", # one-blank-line-after-class diff --git a/python/cudf/benchmarks/conftest.py b/python/cudf/benchmarks/conftest.py index 7b2b71cf216..0e4afadccf5 100644 --- a/python/cudf/benchmarks/conftest.py +++ b/python/cudf/benchmarks/conftest.py @@ -56,27 +56,23 @@ # into the main repo. 
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common")) -from config import cudf # noqa: W0611, E402, F401 -from utils import ( # noqa: E402 - OrderedSet, - collapse_fixtures, - column_generators, - make_fixture, -) - # Turn off isort until we upgrade to 5.8.0 # https://github.com/pycqa/isort/issues/1594 -# isort: off from config import ( # noqa: W0611, E402, F401 NUM_COLS, NUM_ROWS, collect_ignore, + cudf, # noqa: W0611, E402, F401 pytest_collection_modifyitems, pytest_sessionfinish, pytest_sessionstart, ) - -# isort: on +from utils import ( # noqa: E402 + OrderedSet, + collapse_fixtures, + column_generators, + make_fixture, +) @pytest_cases.fixture(params=[0, 1], ids=["AxisIndex", "AxisColumn"]) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 065655505b8..94dbdf5534d 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -688,15 +688,18 @@ cdef class Column: # special case for string column is_string_column = (cv.type().id() == libcudf_types.type_id.STRING) if is_string_column: - # get the size from offset child column (device to host copy) - offsets_column_index = 0 - offset_child_column = cv.child(offsets_column_index) - if offset_child_column.size() == 0: + if cv.num_children() == 0: base_nbytes = 0 else: - chars_size = get_element( - offset_child_column, offset_child_column.size()-1).value - base_nbytes = chars_size + # get the size from offset child column (device to host copy) + offsets_column_index = 0 + offset_child_column = cv.child(offsets_column_index) + if offset_child_column.size() == 0: + base_nbytes = 0 + else: + chars_size = get_element( + offset_child_column, offset_child_column.size()-1).value + base_nbytes = chars_size if data_ptr: if data_owner is None: diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 30353c4be6c..4221e745e65 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx 
@@ -4,7 +4,7 @@ import pickle from libc.stdint cimport uint8_t, uintptr_t from libcpp cimport bool -from libcpp.memory cimport make_shared, shared_ptr, unique_ptr +from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector @@ -30,10 +30,6 @@ from libcpp.memory cimport make_unique cimport pylibcudf.libcudf.contiguous_split as cpp_contiguous_split from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.lists.gather cimport ( - segmented_gather as cpp_segmented_gather, -) -from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.types cimport size_type @@ -339,26 +335,6 @@ def get_element(Column input_column, size_type index): ) -@acquire_spill_lock() -def segmented_gather(Column source_column, Column gather_map): - cdef shared_ptr[lists_column_view] source_LCV = ( - make_shared[lists_column_view](source_column.view()) - ) - cdef shared_ptr[lists_column_view] gather_map_LCV = ( - make_shared[lists_column_view](gather_map.view()) - ) - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_segmented_gather( - source_LCV.get()[0], gather_map_LCV.get()[0]) - ) - - result = Column.from_unique_ptr(move(c_result)) - return result - - cdef class _CPackedColumns: @staticmethod diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx index 1dc586bb257..1c9d3a01b80 100644 --- a/python/cudf/cudf/_lib/interop.pyx +++ b/python/cudf/cudf/_lib/interop.pyx @@ -1,49 +1,22 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-from cpython cimport pycapsule -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - import pylibcudf -from pylibcudf.libcudf.interop cimport ( - DLManagedTensor, - from_dlpack as cpp_from_dlpack, - to_dlpack as cpp_to_dlpack, -) -from pylibcudf.libcudf.table.table cimport table -from pylibcudf.libcudf.table.table_view cimport table_view - -from cudf._lib.utils cimport ( - columns_from_pylibcudf_table, - columns_from_unique_ptr, - table_view_from_columns, -) +from cudf._lib.utils cimport columns_from_pylibcudf_table from cudf.core.buffer import acquire_spill_lock from cudf.core.dtypes import ListDtype, StructDtype -def from_dlpack(dlpack_capsule): +def from_dlpack(object dlpack_capsule): """ Converts a DLPack Tensor PyCapsule into a list of columns. DLPack Tensor PyCapsule is expected to have the name "dltensor". """ - cdef DLManagedTensor* dlpack_tensor = pycapsule.\ - PyCapsule_GetPointer(dlpack_capsule, 'dltensor') - pycapsule.PyCapsule_SetName(dlpack_capsule, 'used_dltensor') - - cdef unique_ptr[table] c_result - - with nogil: - c_result = move( - cpp_from_dlpack(dlpack_tensor) - ) - - res = columns_from_unique_ptr(move(c_result)) - dlpack_tensor.deleter(dlpack_tensor) - return res + return columns_from_pylibcudf_table( + pylibcudf.interop.from_dlpack(dlpack_capsule) + ) def to_dlpack(list source_columns): @@ -52,39 +25,13 @@ def to_dlpack(list source_columns): DLPack Tensor PyCapsule will have the name "dltensor". """ - if any(column.null_count for column in source_columns): - raise ValueError( - "Cannot create a DLPack tensor with null values. \ - Input is required to have null count as zero." 
- ) - - cdef DLManagedTensor *dlpack_tensor - cdef table_view source_table_view = table_view_from_columns(source_columns) - - with nogil: - dlpack_tensor = cpp_to_dlpack( - source_table_view + return pylibcudf.interop.to_dlpack( + pylibcudf.Table( + [col.to_pylibcudf(mode="read") for col in source_columns] ) - - return pycapsule.PyCapsule_New( - dlpack_tensor, - 'dltensor', - dlmanaged_tensor_pycapsule_deleter ) -cdef void dlmanaged_tensor_pycapsule_deleter(object pycap_obj) noexcept: - cdef DLManagedTensor* dlpack_tensor = 0 - try: - dlpack_tensor = pycapsule.PyCapsule_GetPointer( - pycap_obj, 'used_dltensor') - return # we do not call a used capsule's deleter - except Exception: - dlpack_tensor = pycapsule.PyCapsule_GetPointer( - pycap_obj, 'dltensor') - dlpack_tensor.deleter(dlpack_tensor) - - def gather_metadata(object cols_dtypes): """ Generates a ColumnMetadata vector for each column. diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 7e8710bedb6..12432ac6d5d 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -9,7 +9,7 @@ from pylibcudf.libcudf.types cimport null_order, size_type from cudf._lib.column cimport Column from cudf._lib.utils cimport columns_from_pylibcudf_table -import pylibcudf +import pylibcudf as plc from pylibcudf cimport Scalar @@ -17,7 +17,7 @@ from pylibcudf cimport Scalar @acquire_spill_lock() def count_elements(Column col): return Column.from_pylibcudf( - pylibcudf.lists.count_elements( + plc.lists.count_elements( col.to_pylibcudf(mode="read")) ) @@ -25,8 +25,8 @@ def count_elements(Column col): @acquire_spill_lock() def explode_outer(list source_columns, int explode_column_idx): return columns_from_pylibcudf_table( - pylibcudf.lists.explode_outer( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source_columns]), + plc.lists.explode_outer( + plc.Table([c.to_pylibcudf(mode="read") for c in source_columns]), explode_column_idx, ) ) @@ -35,7 +35,7 @@ def 
explode_outer(list source_columns, int explode_column_idx): @acquire_spill_lock() def distinct(Column col, bool nulls_equal, bool nans_all_equal): return Column.from_pylibcudf( - pylibcudf.lists.distinct( + plc.lists.distinct( col.to_pylibcudf(mode="read"), nulls_equal, nans_all_equal, @@ -46,7 +46,7 @@ def distinct(Column col, bool nulls_equal, bool nans_all_equal): @acquire_spill_lock() def sort_lists(Column col, bool ascending, str na_position): return Column.from_pylibcudf( - pylibcudf.lists.sort_lists( + plc.lists.sort_lists( col.to_pylibcudf(mode="read"), ascending, null_order.BEFORE if na_position == "first" else null_order.AFTER, @@ -58,7 +58,7 @@ def sort_lists(Column col, bool ascending, str na_position): @acquire_spill_lock() def extract_element_scalar(Column col, size_type index): return Column.from_pylibcudf( - pylibcudf.lists.extract_list_element( + plc.lists.extract_list_element( col.to_pylibcudf(mode="read"), index, ) @@ -68,7 +68,7 @@ def extract_element_scalar(Column col, size_type index): @acquire_spill_lock() def extract_element_column(Column col, Column index): return Column.from_pylibcudf( - pylibcudf.lists.extract_list_element( + plc.lists.extract_list_element( col.to_pylibcudf(mode="read"), index.to_pylibcudf(mode="read"), ) @@ -78,7 +78,7 @@ def extract_element_column(Column col, Column index): @acquire_spill_lock() def contains_scalar(Column col, py_search_key): return Column.from_pylibcudf( - pylibcudf.lists.contains( + plc.lists.contains( col.to_pylibcudf(mode="read"), py_search_key.device_value.c_value, ) @@ -88,7 +88,7 @@ def contains_scalar(Column col, py_search_key): @acquire_spill_lock() def index_of_scalar(Column col, object py_search_key): return Column.from_pylibcudf( - pylibcudf.lists.index_of( + plc.lists.index_of( col.to_pylibcudf(mode="read"), py_search_key.device_value.c_value, True, @@ -99,7 +99,7 @@ def index_of_scalar(Column col, object py_search_key): @acquire_spill_lock() def index_of_column(Column col, Column 
search_keys): return Column.from_pylibcudf( - pylibcudf.lists.index_of( + plc.lists.index_of( col.to_pylibcudf(mode="read"), search_keys.to_pylibcudf(mode="read"), True, @@ -110,8 +110,8 @@ def index_of_column(Column col, Column search_keys): @acquire_spill_lock() def concatenate_rows(list source_columns): return Column.from_pylibcudf( - pylibcudf.lists.concatenate_rows( - pylibcudf.Table([ + plc.lists.concatenate_rows( + plc.Table([ c.to_pylibcudf(mode="read") for c in source_columns ]) ) @@ -121,8 +121,18 @@ def concatenate_rows(list source_columns): @acquire_spill_lock() def concatenate_list_elements(Column input_column, dropna=False): return Column.from_pylibcudf( - pylibcudf.lists.concatenate_list_elements( + plc.lists.concatenate_list_elements( input_column.to_pylibcudf(mode="read"), dropna, ) ) + + +@acquire_spill_lock() +def segmented_gather(Column source_column, Column gather_map): + return Column.from_pylibcudf( + plc.lists.segmented_gather( + source_column.to_pylibcudf(mode="read"), + gather_map.to_pylibcudf(mode="read"), + ) + ) diff --git a/python/cudf/cudf/_lib/nvtext/normalize.pyx b/python/cudf/cudf/_lib/nvtext/normalize.pyx index 633bc902db1..cc45123dd0a 100644 --- a/python/cudf/cudf/_lib/nvtext/normalize.pyx +++ b/python/cudf/cudf/_lib/nvtext/normalize.pyx @@ -11,16 +11,18 @@ from pylibcudf import nvtext @acquire_spill_lock() def normalize_spaces(Column input): - result = nvtext.normalize.normalize_spaces( - input.to_pylibcudf(mode="read") + return Column.from_pylibcudf( + nvtext.normalize.normalize_spaces( + input.to_pylibcudf(mode="read") + ) ) - return Column.from_pylibcudf(result) @acquire_spill_lock() def normalize_characters(Column input, bool do_lower=True): - result = nvtext.normalize.normalize_characters( - input.to_pylibcudf(mode="read"), - do_lower, + return Column.from_pylibcudf( + nvtext.normalize.normalize_characters( + input.to_pylibcudf(mode="read"), + do_lower, + ) ) - return Column.from_pylibcudf(result) diff --git 
a/python/cudf/cudf/_lib/nvtext/replace.pyx b/python/cudf/cudf/_lib/nvtext/replace.pyx index 61ae3da5782..bec56ade83c 100644 --- a/python/cudf/cudf/_lib/nvtext/replace.pyx +++ b/python/cudf/cudf/_lib/nvtext/replace.pyx @@ -2,20 +2,10 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.replace cimport ( - filter_tokens as cpp_filter_tokens, - replace_tokens as cpp_replace_tokens, -) -from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar +from pylibcudf import nvtext @acquire_spill_lock() @@ -30,27 +20,14 @@ def replace_tokens(Column strings, provided. """ - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef column_view c_strings = strings.view() - cdef column_view c_targets = targets.view() - cdef column_view c_replacements = replacements.view() - - cdef const string_scalar* c_delimiter = delimiter\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_replace_tokens( - c_strings, - c_targets, - c_replacements, - c_delimiter[0], - ) + return Column.from_pylibcudf( + nvtext.replace.replace_tokens( + strings.to_pylibcudf(mode="read"), + targets.to_pylibcudf(mode="read"), + replacements.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, ) - - return Column.from_unique_ptr(move(c_result)) + ) @acquire_spill_lock() @@ -65,24 +42,11 @@ def filter_tokens(Column strings, character provided. 
""" - cdef DeviceScalar replacement = py_replacement.device_value - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef column_view c_strings = strings.view() - cdef const string_scalar* c_repl = replacement\ - .get_raw_ptr() - cdef const string_scalar* c_delimiter = delimiter\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_filter_tokens( - c_strings, - min_token_length, - c_repl[0], - c_delimiter[0], - ) + return Column.from_pylibcudf( + nvtext.replace.filter_tokens( + strings.to_pylibcudf(mode="read"), + min_token_length, + py_replacement.device_value.c_value, + py_delimiter.device_value.c_value, ) - - return Column.from_unique_ptr(move(c_result)) + ) diff --git a/python/cudf/cudf/_lib/nvtext/stemmer.pyx b/python/cudf/cudf/_lib/nvtext/stemmer.pyx index 5bf25562fed..63a389b64d5 100644 --- a/python/cudf/cudf/_lib/nvtext/stemmer.pyx +++ b/python/cudf/cudf/_lib/nvtext/stemmer.pyx @@ -1,24 +1,19 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-from cudf.core.buffer import acquire_spill_lock - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from enum import IntEnum -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view +from cudf.core.buffer import acquire_spill_lock + from pylibcudf.libcudf.nvtext.stemmer cimport ( - is_letter as cpp_is_letter, letter_type, - porter_stemmer_measure as cpp_porter_stemmer_measure, underlying_type_t_letter_type, ) from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column +from pylibcudf import nvtext + class LetterType(IntEnum): CONSONANT = letter_type.CONSONANT @@ -27,43 +22,34 @@ class LetterType(IntEnum): @acquire_spill_lock() def porter_stemmer_measure(Column strings): - cdef column_view c_strings = strings.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_porter_stemmer_measure(c_strings)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + nvtext.stemmer.porter_stemmer_measure( + strings.to_pylibcudf(mode="read"), + ) + ) @acquire_spill_lock() def is_letter(Column strings, object ltype, size_type index): - cdef column_view c_strings = strings.view() - cdef letter_type c_ltype = ( - ltype + return Column.from_pylibcudf( + nvtext.stemmer.is_letter( + strings.to_pylibcudf(mode="read"), + ltype==LetterType.VOWEL, + index, + ) ) - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_is_letter(c_strings, c_ltype, index)) - - return Column.from_unique_ptr(move(c_result)) @acquire_spill_lock() def is_letter_multi(Column strings, object ltype, Column indices): - cdef column_view c_strings = strings.view() - cdef column_view c_indices = indices.view() - cdef letter_type c_ltype = ( - ltype + return Column.from_pylibcudf( + nvtext.stemmer.is_letter( + strings.to_pylibcudf(mode="read"), + ltype==LetterType.VOWEL, + indices.to_pylibcudf(mode="read"), + ) ) - cdef unique_ptr[column] 
c_result - - with nogil: - c_result = move(cpp_is_letter(c_strings, c_ltype, c_indices)) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/replace_re.pyx b/python/cudf/cudf/_lib/strings/replace_re.pyx index fffc8b7c3f6..462d5c903e8 100644 --- a/python/cudf/cudf/_lib/strings/replace_re.pyx +++ b/python/cudf/cudf/_lib/strings/replace_re.pyx @@ -1,26 +1,11 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cython.operator cimport dereference -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move -from libcpp.vector cimport vector +from pylibcudf.libcudf.types cimport size_type +import pylibcudf as plc from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from pylibcudf.libcudf.strings.regex_program cimport regex_program -from pylibcudf.libcudf.strings.replace_re cimport ( - replace_re as cpp_replace_re, - replace_with_backrefs as cpp_replace_with_backrefs, -) -from pylibcudf.libcudf.types cimport size_type - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar @acquire_spill_lock() @@ -34,28 +19,16 @@ def replace_re(Column source_strings, `n` indicates the number of resplacements to be made from start. 
(-1 indicates all) """ - - cdef DeviceScalar repl = py_repl.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string pattern_string = str(pattern).encode() - cdef const string_scalar* scalar_repl = \ - (repl.get_raw_ptr()) - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_replace_re( - source_view, - dereference(c_prog), - scalar_repl[0], - n - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.replace_re.replace_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT + ), + py_repl.device_value.c_value, + n + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -68,50 +41,29 @@ def replace_with_backrefs( new string with the extracted elements found using `pattern` regular expression in `source_strings`. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string pattern_string = str(pattern).encode() - cdef string repl_string = str(repl).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_replace_with_backrefs( - source_view, - dereference(c_prog), - repl_string - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.replace_re.replace_with_backrefs( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT + ), + repl + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() def replace_multi_re(Column source_strings, - object patterns, + list patterns, Column repl_strings): """ Returns a Column after replacing occurrences of multiple regular expressions `patterns` with their corresponding strings in `repl_strings` in `source_strings`. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef column_view repl_view = repl_strings.view() - - cdef int pattern_size = len(patterns) - cdef vector[string] patterns_vector - patterns_vector.reserve(pattern_size) - - for pattern in patterns: - patterns_vector.push_back(str.encode(pattern)) - - with nogil: - c_result = move(cpp_replace_re( - source_view, - patterns_vector, - repl_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.replace_re.replace_re( + source_strings.to_pylibcudf(mode="read"), + patterns, + repl_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_typing.py b/python/cudf/cudf/_typing.py index 6e8ad556b08..3b13cc258ab 100644 --- a/python/cudf/cudf/_typing.py +++ b/python/cudf/cudf/_typing.py @@ -1,8 +1,8 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. import sys -from collections.abc import Callable -from typing import TYPE_CHECKING, Any, Dict, Iterable, TypeVar, Union +from collections.abc import Callable, Iterable +from typing import TYPE_CHECKING, Any, TypeVar, Union import numpy as np from pandas import Period, Timedelta, Timestamp @@ -42,7 +42,7 @@ SeriesOrSingleColumnIndex = Union["cudf.Series", "cudf.core.index.Index"] # Groupby aggregation -AggType = Union[str, Callable] -MultiColumnAggType = Union[ - AggType, Iterable[AggType], Dict[Any, Iterable[AggType]] +AggType = Union[str, Callable] # noqa: UP007 +MultiColumnAggType = Union[ # noqa: UP007 + AggType, Iterable[AggType], dict[Any, Iterable[AggType]] ] diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index caff019f575..ffa306bf93f 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -6,7 +6,7 @@ import pickle import weakref from types import SimpleNamespace -from typing import Any, Literal, Mapping +from typing import TYPE_CHECKING, Any, Literal import 
numpy from typing_extensions import Self @@ -18,6 +18,9 @@ from cudf.core.abc import Serializable from cudf.utils.string import format_bytes +if TYPE_CHECKING: + from collections.abc import Mapping + def host_memory_allocation(nbytes: int) -> memoryview: """Allocate host memory using NumPy diff --git a/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py b/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py index 0bd8d6054b3..ecf9807cfc2 100644 --- a/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py +++ b/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py @@ -2,13 +2,16 @@ from __future__ import annotations -from typing import Literal, Mapping +from typing import TYPE_CHECKING, Literal from typing_extensions import Self import cudf from cudf.core.buffer.buffer import Buffer, BufferOwner +if TYPE_CHECKING: + from collections.abc import Mapping + class ExposureTrackedBuffer(Buffer): """An exposure tracked buffer. diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index 06791df7dc0..a1e87d04bc9 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -29,4 +29,3 @@ Decimal128Column, DecimalBaseColumn, ) -from cudf.core.column.interval import IntervalColumn # noqa: F401 diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 864e87b5377..087d0ed65f5 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -4,7 +4,7 @@ import warnings from functools import cached_property -from typing import TYPE_CHECKING, Any, Mapping, Sequence, cast +from typing import TYPE_CHECKING, Any, cast import numpy as np import pandas as pd @@ -26,6 +26,7 @@ if TYPE_CHECKING: from collections import abc + from collections.abc import Mapping, Sequence import numba.cuda diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 
7674565e2c3..d2cd6e8ac8f 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -4,10 +4,11 @@ import pickle from collections import abc +from collections.abc import MutableSequence, Sequence from functools import cached_property from itertools import chain from types import SimpleNamespace -from typing import TYPE_CHECKING, Any, Literal, MutableSequence, Sequence, cast +from typing import TYPE_CHECKING, Any, Literal, cast import cupy import numpy as np diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 2c9b0baa9b6..b6dc250e64d 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -8,7 +8,7 @@ import locale import re from locale import nl_langinfo -from typing import TYPE_CHECKING, Literal, Sequence, cast +from typing import TYPE_CHECKING, Literal, cast import numpy as np import pandas as pd @@ -31,6 +31,8 @@ from cudf.utils.utils import _all_bools_with_nulls if TYPE_CHECKING: + from collections.abc import Sequence + from cudf._typing import ( ColumnBinaryOperand, DatetimeLikeScalar, diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 8803ebd6791..8ae06f72d1e 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -3,8 +3,9 @@ from __future__ import annotations import warnings +from collections.abc import Sequence from decimal import Decimal -from typing import TYPE_CHECKING, Sequence, cast +from typing import TYPE_CHECKING, cast import cupy as cp import numpy as np diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index c6a39199e3b..6b25e568f00 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -3,7 +3,7 @@ from __future__ import annotations from functools import cached_property -from typing import TYPE_CHECKING, Sequence, cast +from typing 
import TYPE_CHECKING, cast import numpy as np import pandas as pd @@ -11,7 +11,6 @@ from typing_extensions import Self import cudf -from cudf._lib.copying import segmented_gather from cudf._lib.lists import ( concatenate_list_elements, concatenate_rows, @@ -22,6 +21,7 @@ extract_element_scalar, index_of_column, index_of_scalar, + segmented_gather, sort_lists, ) from cudf._lib.strings.convert.convert_lists import format_list_column @@ -34,6 +34,8 @@ from cudf.core.missing import NA if TYPE_CHECKING: + from collections.abc import Sequence + from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index 05a0ab2e09a..a91c080fe21 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -2,9 +2,7 @@ from __future__ import annotations -from typing import Union, overload - -from typing_extensions import Literal +from typing import Literal, Union, overload import cudf import cudf.core.column diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 78d2814ed26..620cae65374 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -3,7 +3,7 @@ from __future__ import annotations import functools -from typing import TYPE_CHECKING, Any, Sequence, cast +from typing import TYPE_CHECKING, Any, cast import numpy as np import pandas as pd @@ -28,7 +28,7 @@ from .numerical_base import NumericalBaseColumn if TYPE_CHECKING: - from collections.abc import Callable + from collections.abc import Callable, Sequence from cudf._typing import ( ColumnBinaryOperand, diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 45d1a8b087b..856ce0f75de 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5,7 +5,7 @@ import re 
import warnings from functools import cached_property -from typing import TYPE_CHECKING, Sequence, cast, overload +from typing import TYPE_CHECKING, cast, overload import numpy as np import pandas as pd @@ -35,6 +35,8 @@ def str_to_boolean(column: StringColumn): if TYPE_CHECKING: + from collections.abc import Sequence + import cupy import numba.cuda @@ -998,7 +1000,7 @@ def replace( return self._return_or_inplace( libstrings.replace_multi_re( self._column, - pat, + list(pat), column.as_column(repl, dtype="str"), ) if regex diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 6b6f3e517a8..087d6474e7f 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -4,7 +4,7 @@ import datetime import functools -from typing import TYPE_CHECKING, Sequence, cast +from typing import TYPE_CHECKING, cast import numpy as np import pandas as pd @@ -19,6 +19,8 @@ from cudf.utils.utils import _all_bools_with_nulls if TYPE_CHECKING: + from collections.abc import Sequence + from cudf._typing import ColumnBinaryOperand, DatetimeLikeScalar, Dtype _unit_to_nanoseconds_conversion = { diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index bc093fdaa9a..496e86ed709 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -5,8 +5,9 @@ import itertools import sys from collections import abc +from collections.abc import Mapping from functools import cached_property, reduce -from typing import TYPE_CHECKING, Any, Mapping, cast +from typing import TYPE_CHECKING, Any, cast import numpy as np import pandas as pd diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 7d4d34f5b04..bf1c39b23da 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -13,8 +13,8 @@ import textwrap import warnings from collections import abc, defaultdict -from 
collections.abc import Callable, Iterator -from typing import TYPE_CHECKING, Any, Literal, MutableMapping, cast +from collections.abc import Callable, Iterator, MutableMapping +from typing import TYPE_CHECKING, Any, Literal, cast import cupy import numba diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 5250a741d3d..aa601a2b322 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -3,7 +3,7 @@ import enum from collections import abc -from typing import Any, Iterable, Mapping, Sequence, Tuple, cast +from typing import TYPE_CHECKING, Any, cast import cupy as cp import numpy as np @@ -20,6 +20,9 @@ build_column, ) +if TYPE_CHECKING: + from collections.abc import Iterable, Mapping, Sequence + # Implementation of interchange protocol classes # ---------------------------------------------- @@ -61,7 +64,7 @@ class _MaskKind(enum.IntEnum): _DtypeKind.BOOL, _DtypeKind.STRING, } -ProtoDtype = Tuple[_DtypeKind, int, str, str] +ProtoDtype = tuple[_DtypeKind, int, str, str] class _CuDFBuffer: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 37ad6b8fabb..205edd91d9d 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -6,7 +6,7 @@ import pickle import warnings from collections import abc -from typing import TYPE_CHECKING, Any, Literal, MutableMapping +from typing import TYPE_CHECKING, Any, Literal # TODO: The `numpy` import is needed for typing purposes during doc builds # only, need to figure out why the `np` alias is insufficient then remove. 
@@ -36,6 +36,7 @@ from cudf.utils.utils import _array_ufunc, _warn_no_dask_cudf if TYPE_CHECKING: + from collections.abc import MutableMapping from types import ModuleType from cudf._typing import Dtype, ScalarLike diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 81b20488d8d..6630bd96c01 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -8,7 +8,7 @@ import warnings from collections import abc from functools import cached_property -from typing import TYPE_CHECKING, Any, Iterable, Literal +from typing import TYPE_CHECKING, Any, Literal import cupy as cp import numpy as np @@ -36,6 +36,8 @@ from cudf.utils.utils import GetAttrGetItemMixin if TYPE_CHECKING: + from collections.abc import Iterable + from cudf._typing import ( AggType, DataFrameOrSeries, diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index cd07c58c5d9..1b90e9f9df0 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -5,10 +5,10 @@ import operator import pickle import warnings -from collections.abc import Hashable +from collections.abc import Hashable, MutableMapping from functools import cache, cached_property from numbers import Number -from typing import TYPE_CHECKING, Any, Literal, MutableMapping, cast +from typing import TYPE_CHECKING, Any, Literal, cast import cupy import numpy as np diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 5952815deef..e031f2a4e8e 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -10,9 +10,7 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, - MutableMapping, TypeVar, cast, ) @@ -63,6 +61,8 @@ from cudf.utils.utils import _warn_no_dask_cudf if TYPE_CHECKING: + from collections.abc import Callable, MutableMapping + from cudf._typing import ( ColumnLike, DataFrameOrSeries, diff --git 
a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py index 8182e5cede2..ce6a5c960dd 100644 --- a/python/cudf/cudf/core/indexing_utils.py +++ b/python/cudf/cudf/core/indexing_utils.py @@ -3,9 +3,7 @@ from __future__ import annotations from dataclasses import dataclass -from typing import Any, List, Union - -from typing_extensions import TypeAlias +from typing import Any, TypeAlias import cudf from cudf.api.types import _is_scalar_or_zero_d_array, is_integer @@ -46,11 +44,11 @@ class ScalarIndexer: key: GatherMap -IndexingSpec: TypeAlias = Union[ - EmptyIndexer, MapIndexer, MaskIndexer, ScalarIndexer, SliceIndexer -] +IndexingSpec: TypeAlias = ( + EmptyIndexer | MapIndexer | MaskIndexer | ScalarIndexer | SliceIndexer +) -ColumnLabels: TypeAlias = List[str] +ColumnLabels: TypeAlias = list[str] def destructure_iloc_key( diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 92d094d9de5..bfff62f0a89 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -8,7 +8,7 @@ import pickle import warnings from functools import cached_property -from typing import TYPE_CHECKING, Any, MutableMapping +from typing import TYPE_CHECKING, Any import cupy as cp import numpy as np @@ -36,7 +36,7 @@ from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name if TYPE_CHECKING: - from collections.abc import Generator, Hashable + from collections.abc import Generator, Hashable, MutableMapping from typing_extensions import Self diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 29ed18ac0ce..9b60424c924 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -9,7 +9,7 @@ import warnings from collections import abc from shutil import get_terminal_size -from typing import TYPE_CHECKING, Any, Literal, MutableMapping +from typing import TYPE_CHECKING, Any, Literal import cupy import numpy as np @@ -71,6 +71,8 @@ 
from cudf.utils.performance_tracking import _performance_tracking if TYPE_CHECKING: + from collections.abc import MutableMapping + import pyarrow as pa from cudf._typing import ( diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 68f34fa28ff..885e7b16644 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -4,7 +4,7 @@ import math import re import warnings -from typing import Literal, Sequence +from typing import TYPE_CHECKING, Literal import numpy as np import pandas as pd @@ -20,6 +20,9 @@ from cudf.core import column from cudf.core.index import ensure_index +if TYPE_CHECKING: + from collections.abc import Sequence + # https://github.com/pandas-dev/pandas/blob/2.2.x/pandas/core/tools/datetimes.py#L1112 _unit_map = { "year": "year", diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index c364d55e677..73afde407db 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -10,9 +10,9 @@ import pickle import types import warnings -from collections.abc import Callable, Iterator +from collections.abc import Callable, Iterator, Mapping from enum import IntEnum -from typing import Any, Literal, Mapping +from typing import Any, Literal import numpy as np diff --git a/python/cudf/cudf/pandas/module_accelerator.py b/python/cudf/cudf/pandas/module_accelerator.py index f82e300e83d..38103a71908 100644 --- a/python/cudf/cudf/pandas/module_accelerator.py +++ b/python/cudf/cudf/pandas/module_accelerator.py @@ -17,7 +17,7 @@ from abc import abstractmethod from importlib._bootstrap import _ImportLockContext as ImportLock from types import ModuleType -from typing import Any, ContextManager, NamedTuple +from typing import Any, ContextManager, NamedTuple # noqa: UP035 from typing_extensions import Self diff --git a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py 
b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py index 8870fbc5c28..bb2fc00d9fc 100644 --- a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py +++ b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py @@ -9,6 +9,7 @@ python analyze-test-failures.py Example: +------- python analyze-test-failures.py log.json frame/* """ diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py index d12d2697729..59966a5ff0c 100644 --- a/python/cudf/cudf/pandas/scripts/conftest-patch.py +++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py @@ -35,7 +35,7 @@ def null_assert_warnings(*args, **kwargs): @pytest.fixture(scope="session", autouse=True) # type: ignore def patch_testing_functions(): - tm.assert_produces_warning = null_assert_warnings + tm.assert_produces_warning = null_assert_warnings # noqa: F821 pytest.raises = replace_kwargs({"match": None})(pytest.raises) diff --git a/python/cudf/cudf/pandas/scripts/summarize-test-results.py b/python/cudf/cudf/pandas/scripts/summarize-test-results.py index 4ea0b3b4413..a0ad872e4c7 100644 --- a/python/cudf/cudf/pandas/scripts/summarize-test-results.py +++ b/python/cudf/cudf/pandas/scripts/summarize-test-results.py @@ -5,7 +5,8 @@ """ Summarizes the test results per module. 
-Examples: +Examples +-------- python summarize-test-results.py log.json python summarize-test-results.py log.json --output json python summarize-test-results.py log.json --output table diff --git a/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py b/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py index a75a20a4681..63fd9601fc1 100644 --- a/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py +++ b/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py @@ -387,7 +387,8 @@ def test_dir_bound_method( ): """This test will fail because dir for bound methods is currently incorrect, but we have no way to fix it without materializing the slow - type, which is unnecessarily expensive.""" + type, which is unnecessarily expensive. + """ Fast, FastIntermediate = fast_and_intermediate_with_doc Slow, SlowIntermediate = slow_and_intermediate_with_doc diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index feab04ffadc..80201dd84db 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -81,50 +81,6 @@ cudf-pandas-tests = [ Homepage = "https://github.com/rapidsai/cudf" Documentation = "https://docs.rapids.ai/api/cudf/stable/" -[tool.isort] -line_length = 79 -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -combine_as_imports = true -order_by_type = true -known_dask = [ - "dask", - "distributed", - "dask_cuda", -] -known_rapids = [ - "rmm", - "pylibcudf" -] -known_first_party = [ - "cudf", -] -default_section = "THIRDPARTY" -sections = [ - "FUTURE", - "STDLIB", - "THIRDPARTY", - "DASK", - "RAPIDS", - "FIRSTPARTY", - "LOCALFOLDER", -] -skip = [ - "thirdparty", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".tox", - ".venv", - "_build", - "buck-out", - "build", - "dist", - "__init__.py", -] - [tool.pytest.ini_options] addopts = "--tb=native --strict-config --strict-markers" empty_parameter_set_mark = "fail_at_collect" @@ -174,3 +130,18 @@ wheel.packages = ["cudf"] provider = 
"scikit_build_core.metadata.regex" input = "cudf/VERSION" regex = "(?P.*)" + +[tool.ruff] +extend = "../../pyproject.toml" + +[tool.ruff.lint.isort] +combine-as-imports = true +known-first-party = ["cudf"] +section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"] + +[tool.ruff.lint.isort.sections] +dask = ["dask", "distributed", "dask_cuda"] +rapids = ["rmm", "pylibcudf"] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["E402", "F401"] diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 87e19a2bccf..667cd7b1db8 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -32,51 +32,20 @@ test = [ Homepage = "https://github.com/rapidsai/cudf" Documentation = "https://docs.rapids.ai/api/cudf/stable/" -[tool.isort] -line_length = 79 -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -combine_as_imports = true -order_by_type = true -known_dask = [ - "dask", - "distributed", - "dask_cuda", - "streamz", -] -known_rapids = [ - "rmm", - "cudf", - "dask_cudf", -] -known_first_party = [ - "cudf_kafka", -] -default_section = "THIRDPARTY" -sections = [ - "FUTURE", - "STDLIB", - "THIRDPARTY", - "DASK", - "RAPIDS", - "FIRSTPARTY", - "LOCALFOLDER", -] -skip = [ - "thirdparty", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".tox", - ".venv", - "_build", - "buck-out", - "build", - "dist", - "__init__.py", -] +[tool.ruff] +extend = "../../pyproject.toml" + +[tool.ruff.lint.isort] +combine-as-imports = true +known-first-party = ["cudf_kafka"] +section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"] + +[tool.ruff.lint.isort.sections] +dask = ["dask", "distributed", "dask_cuda", "streamz"] +rapids = ["rmm", "cudf", "dask_cudf"] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["E402", "F401"] [tool.pytest.ini_options] addopts = "--tb=native --strict-config 
--strict-markers" diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py index b8b18ec5039..41b1defab39 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py @@ -30,14 +30,13 @@ class Agg(Expr): - __slots__ = ("name", "options", "op", "request", "children") + __slots__ = ("name", "options", "op", "request") _non_child = ("dtype", "name", "options") - children: tuple[Expr, ...] def __init__( self, dtype: plc.DataType, name: str, options: Any, *children: Expr ) -> None: - super().__init__(dtype) + self.dtype = dtype self.name = name self.options = options self.children = children diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/base.py b/python/cudf_polars/cudf_polars/dsl/expressions/base.py index 8d021b0231d..effe8cb2378 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/base.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/base.py @@ -13,9 +13,10 @@ import pylibcudf as plc from cudf_polars.containers import Column +from cudf_polars.dsl.nodebase import Node if TYPE_CHECKING: - from collections.abc import Mapping, Sequence + from collections.abc import Mapping from cudf_polars.containers import Column, DataFrame @@ -32,100 +33,16 @@ class ExecutionContext(IntEnum): ROLLING = enum.auto() -class Expr: - """ - An abstract expression object. +class Expr(Node["Expr"]): + """An abstract expression object.""" - This contains a (potentially empty) tuple of child expressions, - along with non-child data. For uniform reconstruction and - implementation of hashing and equality schemes, child classes need - to provide a certain amount of metadata when they are defined. - Specifically, the ``_non_child`` attribute must list, in-order, - the names of the slots that are passed to the constructor. 
The - constructor must take arguments in the order ``(*_non_child, - *children).`` - """ - - __slots__ = ("dtype", "_hash_value", "_repr_value") + __slots__ = ("dtype",) dtype: plc.DataType """Data type of the expression.""" - _hash_value: int - """Caching slot for the hash of the expression.""" - _repr_value: str - """Caching slot for repr of the expression.""" - children: tuple[Expr, ...] = () - """Children of the expression.""" + # This annotation is needed because of https://github.com/python/mypy/issues/17981 _non_child: ClassVar[tuple[str, ...]] = ("dtype",) """Names of non-child data (not Exprs) for reconstruction.""" - # Constructor must take arguments in order (*_non_child, *children) - def __init__(self, dtype: plc.DataType) -> None: - self.dtype = dtype - - def _ctor_arguments(self, children: Sequence[Expr]) -> Sequence: - return (*(getattr(self, attr) for attr in self._non_child), *children) - - def get_hash(self) -> int: - """ - Return the hash of this expr. - - Override this in subclasses, rather than __hash__. - - Returns - ------- - The integer hash value. - """ - return hash((type(self), self._ctor_arguments(self.children))) - - def __hash__(self) -> int: - """Hash of an expression with caching.""" - try: - return self._hash_value - except AttributeError: - self._hash_value = self.get_hash() - return self._hash_value - - def is_equal(self, other: Any) -> bool: - """ - Equality of two expressions. - - Override this in subclasses, rather than __eq__. - - Parameter - --------- - other - object to compare to - - Returns - ------- - True if the two expressions are equal, false otherwise. 
- """ - if type(self) is not type(other): - return False # pragma: no cover; __eq__ trips first - return self._ctor_arguments(self.children) == other._ctor_arguments( - other.children - ) - - def __eq__(self, other: Any) -> bool: - """Equality of expressions.""" - if type(self) is not type(other) or hash(self) != hash(other): - return False - else: - return self.is_equal(other) - - def __ne__(self, other: Any) -> bool: - """Inequality of expressions.""" - return not self.__eq__(other) - - def __repr__(self) -> str: - """String representation of an expression with caching.""" - try: - return self._repr_value - except AttributeError: - args = ", ".join(f"{arg!r}" for arg in self._ctor_arguments(self.children)) - self._repr_value = f"{type(self).__name__}({args})" - return self._repr_value - def do_evaluate( self, df: DataFrame, @@ -311,11 +228,11 @@ class Col(Expr): __slots__ = ("name",) _non_child = ("dtype", "name") name: str - children: tuple[()] def __init__(self, dtype: plc.DataType, name: str) -> None: self.dtype = dtype self.name = name + self.children = () def do_evaluate( self, diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py b/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py index 19baae3611d..11a47e7ea51 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py @@ -24,9 +24,8 @@ class BinOp(Expr): - __slots__ = ("op", "children") + __slots__ = ("op",) _non_child = ("dtype", "op") - children: tuple[Expr, Expr] def __init__( self, @@ -35,7 +34,7 @@ def __init__( left: Expr, right: Expr, ) -> None: - super().__init__(dtype) + self.dtype = dtype if plc.traits.is_boolean(self.dtype): # For boolean output types, bitand and bitor implement # boolean logic, so translate. 
bitxor also does, but the diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py index ff9973a47d5..9c14a8386f3 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py @@ -31,9 +31,8 @@ class BooleanFunction(Expr): - __slots__ = ("name", "options", "children") + __slots__ = ("name", "options") _non_child = ("dtype", "name", "options") - children: tuple[Expr, ...] def __init__( self, @@ -42,7 +41,7 @@ def __init__( options: tuple[Any, ...], *children: Expr, ) -> None: - super().__init__(dtype) + self.dtype = dtype self.options = options self.name = name self.children = children diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py index f752a23b628..596e193d8fe 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py @@ -25,7 +25,7 @@ class TemporalFunction(Expr): - __slots__ = ("name", "options", "children") + __slots__ = ("name", "options") _COMPONENT_MAP: ClassVar[dict[pl_expr.TemporalFunction, str]] = { pl_expr.TemporalFunction.Year: plc.datetime.DatetimeComponent.YEAR, pl_expr.TemporalFunction.Month: plc.datetime.DatetimeComponent.MONTH, @@ -39,7 +39,6 @@ class TemporalFunction(Expr): pl_expr.TemporalFunction.Nanosecond: plc.datetime.DatetimeComponent.NANOSECOND, } _non_child = ("dtype", "name", "options") - children: tuple[Expr, ...] 
def __init__( self, @@ -48,7 +47,7 @@ def __init__( options: tuple[Any, ...], *children: Expr, ) -> None: - super().__init__(dtype) + self.dtype = dtype self.options = options self.name = name self.children = children diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/literal.py b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py index 562a2255033..c8aa993b994 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/literal.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py @@ -16,7 +16,7 @@ from cudf_polars.utils import dtypes if TYPE_CHECKING: - from collections.abc import Mapping + from collections.abc import Hashable, Mapping import pyarrow as pa @@ -31,12 +31,12 @@ class Literal(Expr): __slots__ = ("value",) _non_child = ("dtype", "value") value: pa.Scalar[Any] - children: tuple[()] def __init__(self, dtype: plc.DataType, value: pa.Scalar[Any]) -> None: - super().__init__(dtype) + self.dtype = dtype assert value.type == plc.interop.to_arrow(dtype) self.value = value + self.children = () def do_evaluate( self, @@ -58,19 +58,19 @@ class LiteralColumn(Expr): __slots__ = ("value",) _non_child = ("dtype", "value") value: pa.Array[Any, Any] - children: tuple[()] def __init__(self, dtype: plc.DataType, value: pl.Series) -> None: - super().__init__(dtype) + self.dtype = dtype data = value.to_arrow() self.value = data.cast(dtypes.downcast_arrow_lists(data.type)) + self.children = () - def get_hash(self) -> int: + def get_hashable(self) -> Hashable: """Compute a hash of the column.""" # This is stricter than necessary, but we only need this hash # for identity in groupby replacements so it's OK. And this # way we avoid doing potentially expensive compute. 
- return hash((type(self), self.dtype, id(self.value))) + return (type(self), self.dtype, id(self.value)) def do_evaluate( self, diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py b/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py index f7dcc3c542c..fa68bcb9426 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py @@ -17,24 +17,22 @@ class RollingWindow(Expr): - __slots__ = ("options", "children") + __slots__ = ("options",) _non_child = ("dtype", "options") - children: tuple[Expr] def __init__(self, dtype: plc.DataType, options: Any, agg: Expr) -> None: - super().__init__(dtype) + self.dtype = dtype self.options = options self.children = (agg,) raise NotImplementedError("Rolling window not implemented") class GroupedRollingWindow(Expr): - __slots__ = ("options", "children") + __slots__ = ("options",) _non_child = ("dtype", "options") - children: tuple[Expr, ...] def __init__(self, dtype: plc.DataType, options: Any, agg: Expr, *by: Expr) -> None: - super().__init__(dtype) + self.dtype = dtype self.options = options self.children = (agg, *by) raise NotImplementedError("Grouped rolling window not implemented") diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/selection.py b/python/cudf_polars/cudf_polars/dsl/expressions/selection.py index a7a3e68a28c..0247256e507 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/selection.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/selection.py @@ -23,12 +23,11 @@ class Gather(Expr): - __slots__ = ("children",) + __slots__ = () _non_child = ("dtype",) - children: tuple[Expr, Expr] def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr) -> None: - super().__init__(dtype) + self.dtype = dtype self.children = (values, indices) def do_evaluate( @@ -65,12 +64,11 @@ def do_evaluate( class Filter(Expr): - __slots__ = ("children",) + __slots__ = () _non_child = ("dtype",) - children: 
tuple[Expr, Expr] def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr): - super().__init__(dtype) + self.dtype = dtype self.children = (values, indices) def do_evaluate( diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py b/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py index 861b73ce6a0..99512e2ef52 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py @@ -23,14 +23,13 @@ class Sort(Expr): - __slots__ = ("options", "children") + __slots__ = ("options",) _non_child = ("dtype", "options") - children: tuple[Expr] def __init__( self, dtype: plc.DataType, options: tuple[bool, bool, bool], column: Expr ) -> None: - super().__init__(dtype) + self.dtype = dtype self.options = options self.children = (column,) @@ -59,9 +58,8 @@ def do_evaluate( class SortBy(Expr): - __slots__ = ("options", "children") + __slots__ = ("options",) _non_child = ("dtype", "options") - children: tuple[Expr, ...] def __init__( self, @@ -70,7 +68,7 @@ def __init__( column: Expr, *by: Expr, ) -> None: - super().__init__(dtype) + self.dtype = dtype self.options = options self.children = (column, *by) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/string.py b/python/cudf_polars/cudf_polars/dsl/expressions/string.py index 6669669aadc..62b54c63a8d 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/string.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/string.py @@ -28,9 +28,8 @@ class StringFunction(Expr): - __slots__ = ("name", "options", "children", "_regex_program") + __slots__ = ("name", "options", "_regex_program") _non_child = ("dtype", "name", "options") - children: tuple[Expr, ...] 
def __init__( self, @@ -39,7 +38,7 @@ def __init__( options: tuple[Any, ...], *children: Expr, ) -> None: - super().__init__(dtype) + self.dtype = dtype self.options = options self.name = name self.children = children diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py b/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py index c7d7a802ded..d2b5d6bae29 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py @@ -26,14 +26,13 @@ class Ternary(Expr): - __slots__ = ("children",) + __slots__ = () _non_child = ("dtype",) - children: tuple[Expr, Expr, Expr] def __init__( self, dtype: plc.DataType, when: Expr, then: Expr, otherwise: Expr ) -> None: - super().__init__(dtype) + self.dtype = dtype self.children = (when, then, otherwise) def do_evaluate( diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py index 3d4d15be1ce..53f6ed29239 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py @@ -26,12 +26,11 @@ class Cast(Expr): """Class representing a cast of an expression.""" - __slots__ = ("children",) + __slots__ = () _non_child = ("dtype",) - children: tuple[Expr] def __init__(self, dtype: plc.DataType, value: Expr) -> None: - super().__init__(dtype) + self.dtype = dtype self.children = (value,) if not dtypes.can_cast(value.dtype, self.dtype): raise NotImplementedError( @@ -60,7 +59,9 @@ def collect_agg(self, *, depth: int) -> AggInfo: class Len(Expr): """Class representing the length of an expression.""" - children: tuple[()] + def __init__(self, dtype: plc.DataType) -> None: + self.dtype = dtype + self.children = () def do_evaluate( self, @@ -90,9 +91,8 @@ def collect_agg(self, *, depth: int) -> AggInfo: class UnaryFunction(Expr): """Class representing unary functions of an expression.""" - __slots__ = ("name", "options", 
"children") + __slots__ = ("name", "options") _non_child = ("dtype", "name", "options") - children: tuple[Expr, ...] # Note: log, and pow are handled via translation to binops _OP_MAPPING: ClassVar[dict[str, plc.unary.UnaryOperator]] = { @@ -142,7 +142,7 @@ class UnaryFunction(Expr): def __init__( self, dtype: plc.DataType, name: str, options: tuple[Any, ...], *children: Expr ) -> None: - super().__init__(dtype) + self.dtype = dtype self.name = name self.options = options self.children = children diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index e319c363a23..f79e229d3f3 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -13,8 +13,8 @@ from __future__ import annotations -import dataclasses import itertools +import json from functools import cache from pathlib import Path from typing import TYPE_CHECKING, Any, ClassVar @@ -27,10 +27,11 @@ import cudf_polars.dsl.expr as expr from cudf_polars.containers import Column, DataFrame -from cudf_polars.utils import dtypes, sorting +from cudf_polars.dsl.nodebase import Node +from cudf_polars.utils import dtypes if TYPE_CHECKING: - from collections.abc import Callable, MutableMapping + from collections.abc import Callable, Hashable, MutableMapping, Sequence from typing import Literal from cudf_polars.typing import Schema @@ -121,16 +122,27 @@ def broadcast(*columns: Column, target_length: int | None = None) -> list[Column ] -@dataclasses.dataclass -class IR: +class IR(Node["IR"]): """Abstract plan node, representing an unevaluated dataframe.""" + __slots__ = ("schema",) + # This annotation is needed because of https://github.com/python/mypy/issues/17981 + _non_child: ClassVar[tuple[str, ...]] = ("schema",) schema: Schema """Mapping from column names to their data types.""" - def __post_init__(self): - """Validate preconditions.""" - pass # noqa: PIE790 + def get_hashable(self) -> Hashable: + """ + Hashable 
representation of node, treating schema dictionary. + + Since the schema is a dictionary, even though it is morally + immutable, it is not hashable. We therefore convert it to + tuples for hashing purposes. + """ + # Schema is the first constructor argument + args = self._ctor_arguments(self.children)[1:] + schema_hash = tuple(self.schema.items()) + return (type(self), schema_hash, args) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """ @@ -159,24 +171,50 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: ) # pragma: no cover -@dataclasses.dataclass class PythonScan(IR): """Representation of input from a python function.""" + __slots__ = ("options", "predicate") + _non_child = ("schema", "options", "predicate") options: Any """Arbitrary options.""" predicate: expr.NamedExpr | None """Filter to apply to the constructed dataframe before returning it.""" - def __post_init__(self): - """Validate preconditions.""" + def __init__(self, schema: Schema, options: Any, predicate: expr.NamedExpr | None): + self.schema = schema + self.options = options + self.predicate = predicate + self.children = () raise NotImplementedError("PythonScan not implemented") -@dataclasses.dataclass class Scan(IR): """Input from files.""" + __slots__ = ( + "typ", + "reader_options", + "cloud_options", + "paths", + "with_columns", + "skip_rows", + "n_rows", + "row_index", + "predicate", + ) + _non_child = ( + "schema", + "typ", + "reader_options", + "cloud_options", + "paths", + "with_columns", + "skip_rows", + "n_rows", + "row_index", + "predicate", + ) typ: str """What type of file are we reading? 
Parquet, CSV, etc...""" reader_options: dict[str, Any] @@ -185,7 +223,7 @@ class Scan(IR): """Cloud-related authentication options, currently ignored.""" paths: list[str] """List of paths to read from.""" - with_columns: list[str] + with_columns: list[str] | None """Projected columns to return.""" skip_rows: int """Rows to skip at the start when reading.""" @@ -196,9 +234,30 @@ class Scan(IR): predicate: expr.NamedExpr | None """Mask to apply to the read dataframe.""" - def __post_init__(self) -> None: - """Validate preconditions.""" - super().__post_init__() + def __init__( + self, + schema: Schema, + typ: str, + reader_options: dict[str, Any], + cloud_options: dict[str, Any] | None, + paths: list[str], + with_columns: list[str] | None, + skip_rows: int, + n_rows: int, + row_index: tuple[str, int] | None, + predicate: expr.NamedExpr | None, + ): + self.schema = schema + self.typ = typ + self.reader_options = reader_options + self.cloud_options = cloud_options + self.paths = paths + self.with_columns = with_columns + self.skip_rows = skip_rows + self.n_rows = n_rows + self.row_index = row_index + self.predicate = predicate + self.children = () if self.typ not in ("csv", "parquet", "ndjson"): # pragma: no cover # This line is unhittable ATM since IPC/Anonymous scan raise # on the polars side @@ -258,6 +317,28 @@ def __post_init__(self) -> None: "Reading only parquet metadata to produce row index." ) + def get_hashable(self) -> Hashable: + """ + Hashable representation of the node. + + The options dictionaries are serialised for hashing purposes + as json strings. 
+ """ + schema_hash = tuple(self.schema.items()) + return ( + type(self), + schema_hash, + self.typ, + json.dumps(self.reader_options), + json.dumps(self.cloud_options), + tuple(self.paths), + tuple(self.with_columns) if self.with_columns is not None else None, + self.skip_rows, + self.n_rows, + self.row_index, + self.predicate, + ) + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" with_columns = self.with_columns @@ -401,7 +482,6 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return df.filter(mask) -@dataclasses.dataclass class Cache(IR): """ Return a cached plan node. @@ -409,20 +489,25 @@ class Cache(IR): Used for CSE at the plan level. """ + __slots__ = ("key",) + _non_child = ("schema", "key") key: int """The cache key.""" - value: IR - """The unevaluated node to cache.""" + + def __init__(self, schema: Schema, key: int, value: IR): + self.schema = schema + self.key = key + self.children = (value,) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" try: return cache[self.key] except KeyError: - return cache.setdefault(self.key, self.value.evaluate(cache=cache)) + (value,) = self.children + return cache.setdefault(self.key, value.evaluate(cache=cache)) -@dataclasses.dataclass class DataFrameScan(IR): """ Input from an existing polars DataFrame. @@ -430,13 +515,38 @@ class DataFrameScan(IR): This typically arises from ``q.collect().lazy()`` """ + __slots__ = ("df", "projection", "predicate") + _non_child = ("schema", "df", "projection", "predicate") df: Any """Polars LazyFrame object.""" - projection: list[str] + projection: tuple[str, ...] 
| None """List of columns to project out.""" predicate: expr.NamedExpr | None """Mask to apply.""" + def __init__( + self, + schema: Schema, + df: Any, + projection: Sequence[str] | None, + predicate: expr.NamedExpr | None, + ): + self.schema = schema + self.df = df + self.projection = tuple(projection) if projection is not None else None + self.predicate = predicate + self.children = () + + def get_hashable(self) -> Hashable: + """ + Hashable representation of the node. + + The (heavy) dataframe object is hashed as its id, so this is + not stable across runs, or repeat instances of the same equal dataframes. + """ + schema_hash = tuple(self.schema.items()) + return (type(self), schema_hash, id(self.df), self.projection, self.predicate) + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" pdf = pl.DataFrame._from_pydf(self.df) @@ -454,28 +564,39 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return df -@dataclasses.dataclass class Select(IR): """Produce a new dataframe selecting given expressions from an input.""" - df: IR - """Input dataframe.""" - expr: list[expr.NamedExpr] + __slots__ = ("exprs", "should_broadcast") + _non_child = ("schema", "exprs", "should_broadcast") + exprs: tuple[expr.NamedExpr, ...] 
"""List of expressions to evaluate to form the new dataframe.""" should_broadcast: bool """Should columns be broadcast?""" + def __init__( + self, + schema: Schema, + exprs: Sequence[expr.NamedExpr], + should_broadcast: bool, # noqa: FBT001 + df: IR, + ): + self.schema = schema + self.exprs = tuple(exprs) + self.should_broadcast = should_broadcast + self.children = (df,) + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) + (child,) = self.children + df = child.evaluate(cache=cache) # Handle any broadcasting - columns = [e.evaluate(df) for e in self.expr] + columns = [e.evaluate(df) for e in self.exprs] if self.should_broadcast: columns = broadcast(*columns) return DataFrame(columns) -@dataclasses.dataclass class Reduce(IR): """ Produce a new dataframe selecting given expressions from an input. @@ -483,36 +604,73 @@ class Reduce(IR): This is a special case of :class:`Select` where all outputs are a single row. """ - df: IR - """Input dataframe.""" - expr: list[expr.NamedExpr] + __slots__ = ("exprs",) + _non_child = ("schema", "exprs") + exprs: tuple[expr.NamedExpr, ...] 
"""List of expressions to evaluate to form the new dataframe.""" + def __init__( + self, schema: Schema, exprs: Sequence[expr.NamedExpr], df: IR + ): # pragma: no cover; polars doesn't emit this node yet + self.schema = schema + self.exprs = tuple(exprs) + self.children = (df,) + def evaluate( self, *, cache: MutableMapping[int, DataFrame] ) -> DataFrame: # pragma: no cover; polars doesn't emit this node yet """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) - columns = broadcast(*(e.evaluate(df) for e in self.expr)) + (child,) = self.children + df = child.evaluate(cache=cache) + columns = broadcast(*(e.evaluate(df) for e in self.exprs)) assert all(column.obj.size() == 1 for column in columns) return DataFrame(columns) -@dataclasses.dataclass class GroupBy(IR): """Perform a groupby.""" - df: IR - """Input dataframe.""" - agg_requests: list[expr.NamedExpr] - """List of expressions to evaluate groupwise.""" - keys: list[expr.NamedExpr] - """List of expressions forming the keys.""" + __slots__ = ( + "agg_requests", + "keys", + "maintain_order", + "options", + "agg_infos", + ) + _non_child = ("schema", "keys", "agg_requests", "maintain_order", "options") + keys: tuple[expr.NamedExpr, ...] + """Grouping keys.""" + agg_requests: tuple[expr.NamedExpr, ...] 
+ """Aggregation expressions.""" maintain_order: bool - """Should the order of the input dataframe be maintained?""" + """Preserve order in groupby.""" options: Any - """Options controlling style of groupby.""" - agg_infos: list[expr.AggInfo] = dataclasses.field(init=False) + """Arbitrary options.""" + + def __init__( + self, + schema: Schema, + keys: Sequence[expr.NamedExpr], + agg_requests: Sequence[expr.NamedExpr], + maintain_order: bool, # noqa: FBT001 + options: Any, + df: IR, + ): + self.schema = schema + self.keys = tuple(keys) + self.agg_requests = tuple(agg_requests) + self.maintain_order = maintain_order + self.options = options + self.children = (df,) + if self.options.rolling: + raise NotImplementedError( + "rolling window/groupby" + ) # pragma: no cover; rollingwindow constructor has already raised + if self.options.dynamic: + raise NotImplementedError("dynamic group by") + if any(GroupBy.check_agg(a.value) > 1 for a in self.agg_requests): + raise NotImplementedError("Nested aggregations in groupby") + self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests] @staticmethod def check_agg(agg: expr.Expr) -> int: @@ -542,22 +700,10 @@ def check_agg(agg: expr.Expr) -> int: else: raise NotImplementedError(f"No handler for {agg=}") - def __post_init__(self) -> None: - """Check whether all the aggregations are implemented.""" - super().__post_init__() - if self.options.rolling: - raise NotImplementedError( - "rolling window/groupby" - ) # pragma: no cover; rollingwindow constructor has already raised - if any(GroupBy.check_agg(a.value) > 1 for a in self.agg_requests): - raise NotImplementedError("Nested aggregations in groupby") - self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests] - if len(self.keys) == 0: - raise NotImplementedError("dynamic groupby") - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) + (child,) = 
self.children + df = child.evaluate(cache=cache) keys = broadcast( *(k.evaluate(df) for k in self.keys), target_length=df.num_rows ) @@ -646,23 +792,20 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return DataFrame(broadcasted).slice(self.options.slice) -@dataclasses.dataclass class Join(IR): """A join of two dataframes.""" - left: IR - """Left frame.""" - right: IR - """Right frame.""" - left_on: list[expr.NamedExpr] + __slots__ = ("left_on", "right_on", "options") + _non_child = ("schema", "left_on", "right_on", "options") + left_on: tuple[expr.NamedExpr, ...] """List of expressions used as keys in the left frame.""" - right_on: list[expr.NamedExpr] + right_on: tuple[expr.NamedExpr, ...] """List of expressions used as keys in the right frame.""" options: tuple[ - Literal["inner", "left", "right", "full", "leftsemi", "leftanti", "cross"], + Literal["inner", "left", "right", "full", "semi", "anti", "cross"], bool, tuple[int, int] | None, - str | None, + str, bool, ] """ @@ -674,9 +817,20 @@ class Join(IR): - coalesce: should key columns be coalesced (only makes sense for outer joins) """ - def __post_init__(self) -> None: - """Validate preconditions.""" - super().__post_init__() + def __init__( + self, + schema: Schema, + left_on: Sequence[expr.NamedExpr], + right_on: Sequence[expr.NamedExpr], + options: Any, + left: IR, + right: IR, + ): + self.schema = schema + self.left_on = tuple(left_on) + self.right_on = tuple(right_on) + self.options = options + self.children = (left, right) if any( isinstance(e.value, expr.Literal) for e in itertools.chain(self.left_on, self.right_on) @@ -686,7 +840,7 @@ def __post_init__(self) -> None: @staticmethod @cache def _joiners( - how: Literal["inner", "left", "right", "full", "leftsemi", "leftanti"], + how: Literal["inner", "left", "right", "full", "semi", "anti"], ) -> tuple[ Callable, plc.copying.OutOfBoundsPolicy, plc.copying.OutOfBoundsPolicy | None ]: @@ -708,13 +862,13 @@ def _joiners( 
plc.copying.OutOfBoundsPolicy.NULLIFY, plc.copying.OutOfBoundsPolicy.NULLIFY, ) - elif how == "leftsemi": + elif how == "semi": return ( plc.join.left_semi_join, plc.copying.OutOfBoundsPolicy.DONT_CHECK, None, ) - elif how == "leftanti": + elif how == "anti": return ( plc.join.left_anti_join, plc.copying.OutOfBoundsPolicy.DONT_CHECK, @@ -777,10 +931,8 @@ def _reorder_maps( def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - left = self.left.evaluate(cache=cache) - right = self.right.evaluate(cache=cache) + left, right = (c.evaluate(cache=cache) for c in self.children) how, join_nulls, zlice, suffix, coalesce = self.options - suffix = "_right" if suffix is None else suffix if how == "cross": # Separate implementation, since cross_join returns the # result, not the gather maps @@ -802,7 +954,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: columns[left.num_columns :], right.column_names, strict=True ) ] - return DataFrame([*left_cols, *right_cols]) + return DataFrame([*left_cols, *right_cols]).slice(zlice) # TODO: Waiting on clarity based on https://github.com/pola-rs/polars/issues/17184 left_on = DataFrame(broadcast(*(e.evaluate(left) for e in self.left_on))) right_on = DataFrame(broadcast(*(e.evaluate(right) for e in self.right_on))) @@ -866,20 +1018,30 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return result.slice(zlice) -@dataclasses.dataclass class HStack(IR): """Add new columns to a dataframe.""" - df: IR - """Input dataframe.""" - columns: list[expr.NamedExpr] - """List of expressions to produce new columns.""" + __slots__ = ("columns", "should_broadcast") + _non_child = ("schema", "columns", "should_broadcast") should_broadcast: bool - """Should columns be broadcast?""" + """Should the resulting evaluated columns be broadcast to the same length.""" + + def __init__( + self, + schema: Schema, + columns: 
Sequence[expr.NamedExpr], + should_broadcast: bool, # noqa: FBT001 + df: IR, + ): + self.schema = schema + self.columns = tuple(columns) + self.should_broadcast = should_broadcast + self.children = (df,) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) + (child,) = self.children + df = child.evaluate(cache=cache) columns = [c.evaluate(df) for c in self.columns] if self.should_broadcast: columns = broadcast(*columns, target_length=df.num_rows) @@ -895,20 +1057,36 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return df.with_columns(columns) -@dataclasses.dataclass class Distinct(IR): """Produce a new dataframe with distinct rows.""" - df: IR - """Input dataframe.""" + __slots__ = ("keep", "subset", "zlice", "stable") + _non_child = ("schema", "keep", "subset", "zlice", "stable") keep: plc.stream_compaction.DuplicateKeepOption - """Which rows to keep.""" - subset: set[str] | None - """Which columns to inspect when computing distinct rows.""" + """Which distinct value to keep.""" + subset: frozenset[str] | None + """Which columns should be used to define distinctness. 
If None, + then all columns are used.""" zlice: tuple[int, int] | None - """Optional slice to perform after compaction.""" + """Optional slice to apply to the result.""" stable: bool - """Should order be preserved?""" + """Should the result maintain ordering.""" + + def __init__( + self, + schema: Schema, + keep: plc.stream_compaction.DuplicateKeepOption, + subset: frozenset[str] | None, + zlice: tuple[int, int] | None, + stable: bool, # noqa: FBT001 + df: IR, + ): + self.schema = schema + self.keep = keep + self.subset = subset + self.zlice = zlice + self.stable = stable + self.children = (df,) _KEEP_MAP: ClassVar[dict[str, plc.stream_compaction.DuplicateKeepOption]] = { "first": plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, @@ -917,18 +1095,10 @@ class Distinct(IR): "any": plc.stream_compaction.DuplicateKeepOption.KEEP_ANY, } - def __init__(self, schema: Schema, df: IR, options: Any) -> None: - self.schema = schema - self.df = df - (keep, subset, maintain_order, zlice) = options - self.keep = Distinct._KEEP_MAP[keep] - self.subset = set(subset) if subset is not None else None - self.stable = maintain_order - self.zlice = zlice - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) + (child,) = self.children + df = child.evaluate(cache=cache) if self.subset is None: indices = list(range(df.num_columns)) keys_sorted = all(c.is_sorted for c in df.column_map.values()) @@ -967,46 +1137,44 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return result.slice(self.zlice) -@dataclasses.dataclass class Sort(IR): """Sort a dataframe.""" - df: IR - """Input.""" - by: list[expr.NamedExpr] - """List of expressions to produce sort keys.""" - do_sort: Callable[..., plc.Table] - """pylibcudf sorting function.""" + __slots__ = ("by", "order", "null_order", "stable", "zlice") + _non_child = ("schema", "by", "order", "null_order", "stable", "zlice") 
+ by: tuple[expr.NamedExpr, ...] + """Sort keys.""" + order: tuple[plc.types.Order, ...] + """Sort order for each sort key.""" + null_order: tuple[plc.types.NullOrder, ...] + """Null sorting location for each sort key.""" + stable: bool + """Should the sort be stable?""" zlice: tuple[int, int] | None - """Optional slice to apply after sorting.""" - order: list[plc.types.Order] - """Order keys should be sorted in.""" - null_order: list[plc.types.NullOrder] - """Where nulls sort to.""" + """Optional slice to apply to the result.""" def __init__( self, schema: Schema, - df: IR, - by: list[expr.NamedExpr], - options: Any, + by: Sequence[expr.NamedExpr], + order: Sequence[plc.types.Order], + null_order: Sequence[plc.types.NullOrder], + stable: bool, # noqa: FBT001 zlice: tuple[int, int] | None, - ) -> None: + df: IR, + ): self.schema = schema - self.df = df - self.by = by + self.by = tuple(by) + self.order = tuple(order) + self.null_order = tuple(null_order) + self.stable = stable self.zlice = zlice - stable, nulls_last, descending = options - self.order, self.null_order = sorting.sort_order( - descending, nulls_last=nulls_last, num_keys=len(by) - ) - self.do_sort = ( - plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key - ) + self.children = (df,) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) + (child,) = self.children + df = child.evaluate(cache=cache) sort_keys = broadcast( *(k.evaluate(df) for k in self.by), target_length=df.num_rows ) @@ -1016,11 +1184,14 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: for i, k in enumerate(sort_keys) if k.name in df.column_map and k.obj is df.column_map[k.name].obj } - table = self.do_sort( + do_sort = ( + plc.sorting.stable_sort_by_key if self.stable else plc.sorting.sort_by_key + ) + table = do_sort( df.table, plc.Table([k.obj for k in sort_keys]), - self.order, - 
self.null_order, + list(self.order), + list(self.null_order), ) columns: list[Column] = [] for name, c in zip(df.column_map, table.columns(), strict=True): @@ -1037,49 +1208,64 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return DataFrame(columns).slice(self.zlice) -@dataclasses.dataclass class Slice(IR): """Slice a dataframe.""" - df: IR - """Input.""" + __slots__ = ("offset", "length") + _non_child = ("schema", "offset", "length") offset: int """Start of the slice.""" length: int """Length of the slice.""" + def __init__(self, schema: Schema, offset: int, length: int, df: IR): + self.schema = schema + self.offset = offset + self.length = length + self.children = (df,) + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) + (child,) = self.children + df = child.evaluate(cache=cache) return df.slice((self.offset, self.length)) -@dataclasses.dataclass class Filter(IR): """Filter a dataframe with a boolean mask.""" - df: IR - """Input.""" + __slots__ = ("mask",) + _non_child = ("schema", "mask") mask: expr.NamedExpr - """Expression evaluating to a mask.""" + """Expression to produce the filter mask.""" + + def __init__(self, schema: Schema, mask: expr.NamedExpr, df: IR): + self.schema = schema + self.mask = mask + self.children = (df,) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) + (child,) = self.children + df = child.evaluate(cache=cache) (mask,) = broadcast(self.mask.evaluate(df), target_length=df.num_rows) return df.filter(mask) -@dataclasses.dataclass class Projection(IR): """Select a subset of columns from a dataframe.""" - df: IR - """Input.""" + __slots__ = () + _non_child = ("schema",) + + def __init__(self, schema: Schema, df: IR): + self.schema = schema + self.children = (df,) def evaluate(self, *, cache: 
MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) + (child,) = self.children + df = child.evaluate(cache=cache) # This can reorder things. columns = broadcast( *(df.column_map[name] for name in self.schema), target_length=df.num_rows @@ -1087,16 +1273,15 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return DataFrame(columns) -@dataclasses.dataclass class MapFunction(IR): """Apply some function to a dataframe.""" - df: IR - """Input.""" + __slots__ = ("name", "options") + _non_child = ("schema", "name", "options") name: str - """Function name.""" + """Name of the function to apply""" options: Any - """Arbitrary options, interpreted per function.""" + """Arbitrary name-specific options""" _NAMES: ClassVar[frozenset[str]] = frozenset( [ @@ -1111,9 +1296,11 @@ class MapFunction(IR): ] ) - def __post_init__(self) -> None: - """Validate preconditions.""" - super().__post_init__() + def __init__(self, schema: Schema, name: str, options: Any, df: IR): + self.schema = schema + self.name = name + self.options = options + self.children = (df,) if self.name not in MapFunction._NAMES: raise NotImplementedError(f"Unhandled map function {self.name}") if self.name == "explode": @@ -1127,7 +1314,7 @@ def __post_init__(self) -> None: old, new, _ = self.options # TODO: perhaps polars should validate renaming in the IR? 
if len(new) != len(set(new)) or ( - set(new) & (set(self.df.schema.keys()) - set(old)) + set(new) & (set(df.schema.keys()) - set(old)) ): raise NotImplementedError("Duplicate new names in rename.") elif self.name == "unpivot": @@ -1136,31 +1323,31 @@ def __post_init__(self) -> None: variable_name = "variable" if variable_name is None else variable_name if len(pivotees) == 0: index = frozenset(indices) - pivotees = [name for name in self.df.schema if name not in index] + pivotees = [name for name in df.schema if name not in index] if not all( - dtypes.can_cast(self.df.schema[p], self.schema[value_name]) - for p in pivotees + dtypes.can_cast(df.schema[p], self.schema[value_name]) for p in pivotees ): raise NotImplementedError( "Unpivot cannot cast all input columns to " f"{self.schema[value_name].id()}" ) - self.options = (indices, pivotees, variable_name, value_name) + self.options = (tuple(indices), tuple(pivotees), variable_name, value_name) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" + (child,) = self.children if self.name == "rechunk": # No-op in our data model # Don't think this appears in a plan tree from python - return self.df.evaluate(cache=cache) # pragma: no cover + return child.evaluate(cache=cache) # pragma: no cover elif self.name == "rename": - df = self.df.evaluate(cache=cache) + df = child.evaluate(cache=cache) # final tag is "swapping" which is useful for the # optimiser (it blocks some pushdown operations) old, new, _ = self.options return df.rename_columns(dict(zip(old, new, strict=True))) elif self.name == "explode": - df = self.df.evaluate(cache=cache) + df = child.evaluate(cache=cache) ((to_explode,),) = self.options index = df.column_names.index(to_explode) subset = df.column_names_set - {to_explode} @@ -1170,7 +1357,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: elif self.name == "unpivot": indices, pivotees, variable_name, value_name = 
self.options npiv = len(pivotees) - df = self.df.evaluate(cache=cache) + df = child.evaluate(cache=cache) index_columns = [ Column(col, name=name) for col, name in zip( @@ -1209,37 +1396,40 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: raise AssertionError("Should never be reached") # pragma: no cover -@dataclasses.dataclass class Union(IR): """Concatenate dataframes vertically.""" - dfs: list[IR] - """List of inputs.""" + __slots__ = ("zlice",) + _non_child = ("schema", "zlice") zlice: tuple[int, int] | None - """Optional slice to apply after concatenation.""" + """Optional slice to apply to the result.""" - def __post_init__(self) -> None: - """Validate preconditions.""" - super().__post_init__() - schema = self.dfs[0].schema - if not all(s.schema == schema for s in self.dfs[1:]): + def __init__(self, schema: Schema, zlice: tuple[int, int] | None, *children: IR): + self.schema = schema + self.zlice = zlice + self.children = children + schema = self.children[0].schema + if not all(s.schema == schema for s in self.children[1:]): raise NotImplementedError("Schema mismatch") def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" # TODO: only evaluate what we need if we have a slice - dfs = [df.evaluate(cache=cache) for df in self.dfs] + dfs = [df.evaluate(cache=cache) for df in self.children] return DataFrame.from_table( plc.concatenate.concatenate([df.table for df in dfs]), dfs[0].column_names ).slice(self.zlice) -@dataclasses.dataclass class HConcat(IR): """Concatenate dataframes horizontally.""" - dfs: list[IR] - """List of inputs.""" + __slots__ = () + _non_child = ("schema",) + + def __init__(self, schema: Schema, *children: IR): + self.schema = schema + self.children = children @staticmethod def _extend_with_nulls(table: plc.Table, *, nrows: int) -> plc.Table: @@ -1271,7 +1461,7 @@ def _extend_with_nulls(table: plc.Table, *, nrows: int) -> plc.Table: def evaluate(self, *, 
cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - dfs = [df.evaluate(cache=cache) for df in self.dfs] + dfs = [df.evaluate(cache=cache) for df in self.children] max_rows = max(df.num_rows for df in dfs) # Horizontal concatenation extends shorter tables with nulls dfs = [ diff --git a/python/cudf_polars/cudf_polars/dsl/nodebase.py b/python/cudf_polars/cudf_polars/dsl/nodebase.py new file mode 100644 index 00000000000..228d300f467 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/nodebase.py @@ -0,0 +1,152 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Base class for IR nodes, and utilities.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, ClassVar, Generic, TypeVar + +if TYPE_CHECKING: + from collections.abc import Hashable, Sequence + + from typing_extensions import Self + + +__all__: list[str] = ["Node"] + +T = TypeVar("T", bound="Node[Any]") + + +class Node(Generic[T]): + """ + An abstract node type. + + Nodes are immutable! + + This contains a (potentially empty) tuple of child nodes, + along with non-child data. For uniform reconstruction and + implementation of hashing and equality schemes, child classes need + to provide a certain amount of metadata when they are defined. + Specifically, the ``_non_child`` attribute must list, in-order, + the names of the slots that are passed to the constructor. The + constructor must take arguments in the order ``(*_non_child, + *children).`` + """ + + __slots__ = ("_hash_value", "_repr_value", "children") + _hash_value: int + _repr_value: str + children: tuple[T, ...] 
+ _non_child: ClassVar[tuple[str, ...]] = () + + def _ctor_arguments(self, children: Sequence[T]) -> Sequence[Any | T]: + return (*(getattr(self, attr) for attr in self._non_child), *children) + + def reconstruct( + self, children: Sequence[T] + ) -> Self: # pragma: no cover; not yet used + """ + Rebuild this node with new children. + + Parameters + ---------- + children + New children + + Returns + ------- + New node with new children. Non-child data is shared with the input. + """ + return type(self)(*self._ctor_arguments(children)) + + def get_hashable(self) -> Hashable: + """ + Return a hashable object for the node. + + Returns + ------- + Hashable object. + + Notes + ----- + This method is used by the :meth:`__hash__` implementation + (which does caching). If your node type needs special-case + handling for some of its attributes, override this method, not + :meth:`__hash__`. + """ + return (type(self), self._ctor_arguments(self.children)) + + def __hash__(self) -> int: + """ + Hash of an expression with caching. + + See Also + -------- + get_hashable + """ + try: + return self._hash_value + except AttributeError: + self._hash_value = hash(self.get_hashable()) + return self._hash_value + + def is_equal(self, other: Self) -> bool: + """ + Equality of two nodes of equal type. + + Override this in subclasses, rather than :meth:`__eq__`. + + Parameters + ---------- + other + object of same type to compare to. + + Notes + ----- + Since nodes are immutable, this does common subexpression + elimination when two nodes are determined to be equal. + + :meth:`__eq__` handles the case where the objects being + compared are not of the same type, so in this method, we only + need to implement equality of equal types. + + Returns + ------- + True if the two nodes are equal, false otherwise. + """ + if self is other: + return True + result = self._ctor_arguments(self.children) == other._ctor_arguments( + other.children + ) + # Eager CSE for nodes that match.
+ if result: + self.children = other.children + return result + + def __eq__(self, other: Any) -> bool: + """ + Equality of expressions. + + See Also + -------- + is_equal + """ + if type(self) is not type(other) or hash(self) != hash(other): + return False + else: + return self.is_equal(other) + + def __ne__(self, other: Any) -> bool: + """Inequality of expressions.""" + return not self.__eq__(other) + + def __repr__(self) -> str: + """String representation of an expression with caching.""" + try: + return self._repr_value + except AttributeError: + args = ", ".join(f"{arg!r}" for arg in self._ctor_arguments(self.children)) + self._repr_value = f"{type(self).__name__}({args})" + return self._repr_value diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index a0291037f01..c28f2c2651a 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -5,10 +5,11 @@ from __future__ import annotations +import functools import json from contextlib import AbstractContextManager, nullcontext from functools import singledispatch -from typing import Any +from typing import TYPE_CHECKING, Any import pyarrow as pa import pylibcudf as plc @@ -19,8 +20,12 @@ from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir from cudf_polars.dsl import expr, ir +from cudf_polars.dsl.traversal import make_recursive, reuse_if_unchanged from cudf_polars.typing import NodeTraverser -from cudf_polars.utils import dtypes +from cudf_polars.utils import dtypes, sorting + +if TYPE_CHECKING: + from cudf_polars.typing import ExprTransformer __all__ = ["translate_ir", "translate_named_expr"] @@ -148,7 +153,7 @@ def _( with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) exprs = [translate_named_expr(visitor, n=e) for e in node.expr] - return ir.Select(schema, inp, exprs, node.should_broadcast) + return ir.Select(schema, exprs, node.should_broadcast, inp) 
@_translate_ir.register @@ -161,11 +166,11 @@ def _( keys = [translate_named_expr(visitor, n=e) for e in node.keys] return ir.GroupBy( schema, - inp, - aggs, keys, + aggs, node.maintain_order, node.options, + inp, ) @@ -182,7 +187,71 @@ def _( with set_node(visitor, node.input_right): inp_right = translate_ir(visitor, n=None) right_on = [translate_named_expr(visitor, n=e) for e in node.right_on] - return ir.Join(schema, inp_left, inp_right, left_on, right_on, node.options) + if (how := node.options[0]) in { + "inner", + "left", + "right", + "full", + "cross", + "semi", + "anti", + }: + return ir.Join(schema, left_on, right_on, node.options, inp_left, inp_right) + else: + how, op1, op2 = how + if how != "ie_join": + raise NotImplementedError( + f"Unsupported join type {how}" + ) # pragma: no cover; asof joins not yet exposed + # No exposure of mixed/conditional joins in pylibcudf yet, so in + # the first instance, implement by doing a cross join followed by + # a filter. + _, join_nulls, zlice, suffix, coalesce = node.options + cross = ir.Join( + schema, + [], + [], + ("cross", join_nulls, None, suffix, coalesce), + inp_left, + inp_right, + ) + dtype = plc.DataType(plc.TypeId.BOOL8) + if op2 is None: + ops = [op1] + else: + ops = [op1, op2] + suffix = cross.options[3] + + # Column references in the right table refer to the post-join + # names, so with suffixes. 
+ def _rename(e: expr.Expr, rec: ExprTransformer) -> expr.Expr: + if isinstance(e, expr.Col) and e.name in inp_left.schema: + return type(e)(e.dtype, f"{e.name}{suffix}") + return reuse_if_unchanged(e, rec) + + mapper = make_recursive(_rename) + right_on = [ + expr.NamedExpr( + f"{old.name}{suffix}" if old.name in inp_left.schema else old.name, new + ) + for new, old in zip( + (mapper(e.value) for e in right_on), right_on, strict=True + ) + ] + mask = functools.reduce( + functools.partial( + expr.BinOp, dtype, plc.binaryop.BinaryOperator.LOGICAL_AND + ), + ( + expr.BinOp(dtype, expr.BinOp._MAPPING[op], left.value, right.value) + for op, left, right in zip(ops, left_on, right_on, strict=True) + ), + ) + filtered = ir.Filter(schema, expr.NamedExpr("mask", mask), cross) + if zlice is not None: + offset, length = zlice + return ir.Slice(schema, offset, length, filtered) + return filtered @_translate_ir.register @@ -192,7 +261,7 @@ def _( with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) exprs = [translate_named_expr(visitor, n=e) for e in node.exprs] - return ir.HStack(schema, inp, exprs, node.should_broadcast) + return ir.HStack(schema, exprs, node.should_broadcast, inp) @_translate_ir.register @@ -202,17 +271,23 @@ def _( with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) exprs = [translate_named_expr(visitor, n=e) for e in node.expr] - return ir.Reduce(schema, inp, exprs) + return ir.Reduce(schema, exprs, inp) @_translate_ir.register def _( node: pl_ir.Distinct, visitor: NodeTraverser, schema: dict[str, plc.DataType] ) -> ir.IR: + (keep, subset, maintain_order, zlice) = node.options + keep = ir.Distinct._KEEP_MAP[keep] + subset = frozenset(subset) if subset is not None else None return ir.Distinct( schema, + keep, + subset, + zlice, + maintain_order, translate_ir(visitor, n=node.input), - node.options, ) @@ -223,14 +298,18 @@ def _( with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) by = 
[translate_named_expr(visitor, n=e) for e in node.by_column] - return ir.Sort(schema, inp, by, node.sort_options, node.slice) + stable, nulls_last, descending = node.sort_options + order, null_order = sorting.sort_order( + descending, nulls_last=nulls_last, num_keys=len(by) + ) + return ir.Sort(schema, by, order, null_order, stable, node.slice, inp) @_translate_ir.register def _( node: pl_ir.Slice, visitor: NodeTraverser, schema: dict[str, plc.DataType] ) -> ir.IR: - return ir.Slice(schema, translate_ir(visitor, n=node.input), node.offset, node.len) + return ir.Slice(schema, node.offset, node.len, translate_ir(visitor, n=node.input)) @_translate_ir.register @@ -240,7 +319,7 @@ def _( with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) mask = translate_named_expr(visitor, n=node.predicate) - return ir.Filter(schema, inp, mask) + return ir.Filter(schema, mask, inp) @_translate_ir.register @@ -259,10 +338,10 @@ def _( name, *options = node.function return ir.MapFunction( schema, - # TODO: merge_sorted breaks this pattern - translate_ir(visitor, n=node.input), name, options, + # TODO: merge_sorted breaks this pattern + translate_ir(visitor, n=node.input), ) @@ -271,7 +350,7 @@ def _( node: pl_ir.Union, visitor: NodeTraverser, schema: dict[str, plc.DataType] ) -> ir.IR: return ir.Union( - schema, [translate_ir(visitor, n=n) for n in node.inputs], node.options + schema, node.options, *(translate_ir(visitor, n=n) for n in node.inputs) ) @@ -279,7 +358,7 @@ def _( def _( node: pl_ir.HConcat, visitor: NodeTraverser, schema: dict[str, plc.DataType] ) -> ir.IR: - return ir.HConcat(schema, [translate_ir(visitor, n=n) for n in node.inputs]) + return ir.HConcat(schema, *(translate_ir(visitor, n=n) for n in node.inputs)) def translate_ir(visitor: NodeTraverser, *, n: int | None = None) -> ir.IR: @@ -309,8 +388,7 @@ def translate_ir(visitor: NodeTraverser, *, n: int | None = None) -> ir.IR: # IR is versioned with major.minor, minor is bumped for backwards # 
compatible changes (e.g. adding new nodes), major is bumped for # incompatible changes (e.g. renaming nodes). - # Polars 1.7 changes definition of the CSV reader options schema name. - if (version := visitor.version()) >= (3, 0): + if (version := visitor.version()) >= (4, 0): raise NotImplementedError( f"No support for polars IR {version=}" ) # pragma: no cover; no such version for now. diff --git a/python/cudf_polars/cudf_polars/dsl/traversal.py b/python/cudf_polars/cudf_polars/dsl/traversal.py new file mode 100644 index 00000000000..be8338cb9a9 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/traversal.py @@ -0,0 +1,175 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Traversal and visitor utilities for nodes.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Generic + +from cudf_polars.typing import U_contra, V_co + +if TYPE_CHECKING: + from collections.abc import Callable, Generator, Mapping, MutableMapping + + from cudf_polars.typing import GenericTransformer, NodeT + + +__all__: list[str] = [ + "traversal", + "reuse_if_unchanged", + "make_recursive", + "CachingVisitor", +] + + +def traversal(node: NodeT) -> Generator[NodeT, None, None]: + """ + Pre-order traversal of nodes in an expression. + + Parameters + ---------- + node + Root of expression to traverse. + + Yields + ------ + Unique nodes in the expression, parent before child, children + in-order from left to right. + """ + seen = {node} + lifo = [node] + + while lifo: + node = lifo.pop() + yield node + for child in reversed(node.children): + if child not in seen: + seen.add(child) + lifo.append(child) + + +def reuse_if_unchanged(node: NodeT, fn: GenericTransformer[NodeT, NodeT]) -> NodeT: + """ + Recipe for transforming nodes that returns the old object if unchanged. 
+ + Parameters + ---------- + node + Node to recurse on + fn + Function to transform children + + Notes + ----- + This can be used as a generic "base case" handler when + writing transforms that take nodes and produce new nodes. + + Returns + ------- + Existing node `node` if transformed children are unchanged, otherwise + reconstructed node with new children. + """ + new_children = [fn(c) for c in node.children] + if all(new == old for new, old in zip(new_children, node.children, strict=True)): + return node + return node.reconstruct(new_children) + + def make_recursive( + fn: Callable[[U_contra, GenericTransformer[U_contra, V_co]], V_co], + *, + state: Mapping[str, Any] | None = None, +) -> GenericTransformer[U_contra, V_co]: + """ + No-op wrapper for recursive visitors. + + Facilitates using visitors that don't need caching but are written + in the same style. + + Parameters + ---------- + fn + Function to transform inputs to outputs. Should take as its + second argument a callable from input to output. + state + Arbitrary *immutable* state that should be accessible to the + visitor through the `state` property. + + Notes + ----- + All transformation functions *must* be free of side-effects. + + Usually, prefer a :class:`CachingVisitor`, but if we know that we + don't need caching in a transformation, then this no-op + approach is slightly cheaper. + + Returns + ------- + Recursive function without caching. + + See Also + -------- + CachingVisitor + """ + + def rec(node: U_contra) -> V_co: + return fn(node, rec) # type: ignore[arg-type] + + rec.state = state if state is not None else {} # type: ignore[attr-defined] + return rec # type: ignore[return-value] + + class CachingVisitor(Generic[U_contra, V_co]): + """ + Caching wrapper for recursive visitors. + + Facilitates writing visitors where already computed results should + be cached and reused. The cache is managed automatically, and is + tied to the lifetime of the wrapper.
+ + Parameters + ---------- + fn + Function to transform inputs to outputs. Should take as its + second argument the recursive cache manager. + state + Arbitrary *immutable* state that should be accessible to the + visitor through the `state` property. + + Notes + ----- + All transformation functions *must* be free of side-effects. + + Returns + ------- + Recursive function with caching. + """ + + def __init__( + self, + fn: Callable[[U_contra, GenericTransformer[U_contra, V_co]], V_co], + *, + state: Mapping[str, Any] | None = None, + ) -> None: + self.fn = fn + self.cache: MutableMapping[U_contra, V_co] = {} + self.state = state if state is not None else {} + + def __call__(self, value: U_contra) -> V_co: + """ + Apply the function to a value. + + Parameters + ---------- + value + The value to transform. + + Returns + ------- + A transformed value. + """ + try: + return self.cache[value] + except KeyError: + return self.cache.setdefault(value, self.fn(value, self)) diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index 05b76d76808..a3607159e01 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -53,12 +53,34 @@ def pytest_configure(config: pytest.Config): "tests/unit/io/test_lazy_parquet.py::test_parquet_is_in_statistics": "Debug output on stderr doesn't match", "tests/unit/io/test_lazy_parquet.py::test_parquet_statistics": "Debug output on stderr doesn't match", "tests/unit/io/test_lazy_parquet.py::test_parquet_different_schema[False]": "Needs cudf#16394", + "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-columns]": "Correctly raises but different error", + "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-row_groups]": "Correctly raises but different error", + "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-prefiltered]": "Correctly raises but different error", + 
"tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-none]": "Correctly raises but different error", "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_mismatch_panic_17067[False]": "Needs cudf#16394", + "tests/unit/io/test_lazy_parquet.py::test_scan_parquet_ignores_dtype_mismatch_for_non_projected_columns_19249[False-False]": "Needs some variant of cudf#16394", + "tests/unit/io/test_lazy_parquet.py::test_scan_parquet_ignores_dtype_mismatch_for_non_projected_columns_19249[True-False]": "Needs some variant of cudf#16394", "tests/unit/io/test_lazy_parquet.py::test_parquet_slice_pushdown_non_zero_offset[False]": "Thrift data not handled correctly/slice pushdown wrong?", "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read[False]": "Incomplete handling of projected reads with mismatching schemas, cudf#16394", "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_dtype_mismatch[False]": "Different exception raised, but correctly raises an exception", "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_missing_cols_from_first[False]": "Different exception raised, but correctly raises an exception", "tests/unit/io/test_parquet.py::test_read_parquet_only_loads_selected_columns_15098": "Memory usage won't be correct due to GPU", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-none]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-none]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-prefiltered]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-prefiltered]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-row_groups]": "Mismatching column read cudf#16394", + 
"tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-row_groups]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-columns]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-columns]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-none]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-none]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-prefiltered]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-prefiltered]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-row_groups]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-row_groups]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-columns]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-columns]": "Mismatching column read cudf#16394", "tests/unit/io/test_scan.py::test_scan[single-csv-async]": "Debug output on stderr doesn't match", "tests/unit/io/test_scan.py::test_scan_with_limit[single-csv-async]": "Debug output on stderr doesn't match", "tests/unit/io/test_scan.py::test_scan_with_filter[single-csv-async]": "Debug output on stderr doesn't match", @@ -107,6 +129,14 @@ def pytest_configure(config: pytest.Config): "tests/unit/operations/aggregation/test_aggregations.py::test_sum_empty_and_null_set": "libcudf sums column of all nulls to null, not zero", 
"tests/unit/operations/aggregation/test_aggregations.py::test_binary_op_agg_context_no_simplify_expr_12423": "groupby-agg of just literals should not produce collect_list", "tests/unit/operations/aggregation/test_aggregations.py::test_nan_inf_aggregation": "treatment of nans and nulls together is different in libcudf and polars in groupby-agg context", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func0-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func1-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func2-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func3-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func0-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func1-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func2-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func3-none]": "cudf-polars doesn't nullify division by zero", "tests/unit/operations/test_abs.py::test_abs_duration": "Need to raise for unsupported uops on timelike values", "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input7-expected7-Float32-Float32]": "Mismatching dtypes, needs cudf#15852", "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported 
groupby-agg for a particular dtype", @@ -124,13 +154,6 @@ def pytest_configure(config: pytest.Config): "tests/unit/operations/test_group_by.py::test_group_by_binary_agg_with_literal": "Incorrect broadcasting of literals in groupby-agg", "tests/unit/operations/test_group_by.py::test_aggregated_scalar_elementwise_15602": "Unsupported boolean function/dtype combination in groupby-agg", "tests/unit/operations/test_group_by.py::test_schemas[data1-expr1-expected_select1-expected_gb1]": "Mismatching dtypes, needs cudf#15852", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_by_monday_and_offset_5444": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[left-expected0]": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[right-expected1]": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[datapoint-expected2]": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_rolling_dynamic_sortedness_check": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_validation": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_15225": "IR needs to expose groupby-dynamic information", "tests/unit/operations/test_join.py::test_cross_join_slice_pushdown": "Need to implement slice pushdown for cross joins", "tests/unit/sql/test_cast.py::test_cast_errors[values0-values::uint8-conversion from `f64` to `u64` failed]": "Casting that raises not supported on GPU", "tests/unit/sql/test_cast.py::test_cast_errors[values1-values::uint4-conversion from `i64` to `u32` failed]": "Casting that raises not supported on GPU", @@ -140,6 +163,7 @@ def 
pytest_configure(config: pytest.Config): "tests/unit/streaming/test_streaming_io.py::test_parquet_eq_statistics": "Debug output on stderr doesn't match", "tests/unit/test_cse.py::test_cse_predicate_self_join": "Debug output on stderr doesn't match", "tests/unit/test_empty.py::test_empty_9137": "Mismatching dtypes, needs cudf#15852", + "tests/unit/test_errors.py::test_error_on_empty_group_by": "Incorrect exception raised", # Maybe flaky, order-dependent? "tests/unit/test_projections.py::test_schema_full_outer_join_projection_pd_13287": "Order-specific result check, query is correct but in different order", "tests/unit/test_queries.py::test_group_by_agg_equals_zero_3535": "libcudf sums all nulls to null, not zero", diff --git a/python/cudf_polars/cudf_polars/typing/__init__.py b/python/cudf_polars/cudf_polars/typing/__init__.py index 240b11bdf59..a27a3395c35 100644 --- a/python/cudf_polars/cudf_polars/typing/__init__.py +++ b/python/cudf_polars/cudf_polars/typing/__init__.py @@ -5,8 +5,8 @@ from __future__ import annotations -from collections.abc import Mapping -from typing import TYPE_CHECKING, Literal, Protocol, Union +from collections.abc import Hashable, Mapping +from typing import TYPE_CHECKING, Any, Literal, Protocol, TypeVar, Union import pylibcudf as plc @@ -18,7 +18,19 @@ import polars as pl -IR: TypeAlias = Union[ + from cudf_polars.dsl import expr, ir, nodebase + +__all__: list[str] = [ + "PolarsIR", + "PolarsExpr", + "NodeTraverser", + "OptimizationArgs", + "GenericTransformer", + "ExprTransformer", + "IRTransformer", +] + +PolarsIR: TypeAlias = Union[ pl_ir.PythonScan, pl_ir.Scan, pl_ir.Cache, @@ -38,7 +50,7 @@ pl_ir.ExtContext, ] -Expr: TypeAlias = Union[ +PolarsExpr: TypeAlias = Union[ pl_expr.Function, pl_expr.Window, pl_expr.Literal, @@ -68,7 +80,7 @@ def set_node(self, n: int) -> None: """Set the current plan node to n.""" ... 
- def view_current_node(self) -> IR: + def view_current_node(self) -> PolarsIR: """Convert current plan node to python rep.""" ... @@ -80,7 +92,7 @@ def get_dtype(self, n: int) -> pl.DataType: """Get the datatype of the given expression id.""" ... - def view_expression(self, n: int) -> Expr: + def view_expression(self, n: int) -> PolarsExpr: """Convert the given expression to python rep.""" ... @@ -107,3 +119,29 @@ def set_udf( "cluster_with_columns", "no_optimization", ] + + +U_contra = TypeVar("U_contra", bound=Hashable, contravariant=True) +V_co = TypeVar("V_co", covariant=True) +NodeT = TypeVar("NodeT", bound="nodebase.Node[Any]") + + +class GenericTransformer(Protocol[U_contra, V_co]): + """Abstract protocol for recursive visitors.""" + + def __call__(self, __value: U_contra) -> V_co: + """Apply the visitor to the node.""" + ... + + @property + def state(self) -> Mapping[str, Any]: + """Arbitrary immutable state.""" + ... + + +# Quotes to avoid circular import +ExprTransformer: TypeAlias = GenericTransformer["expr.Expr", "expr.Expr"] +"""Protocol for transformation of Expr nodes.""" + +IRTransformer: TypeAlias = GenericTransformer["ir.IR", "ir.IR"] +"""Protocol for transformation of IR nodes.""" diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md index 7837a275f20..74b2cd4e5de 100644 --- a/python/cudf_polars/docs/overview.md +++ b/python/cudf_polars/docs/overview.md @@ -11,14 +11,17 @@ You will need: environment](https://github.com/rapidsai/cudf/blob/branch-24.12/CONTRIBUTING.md#setting-up-your-build-environment). The combined devcontainer works, or whatever your favourite approach is. -> ![NOTE] These instructions will get simpler as we merge code in. +:::{note} +These instructions will get simpler as we merge code in. +::: ## Installing polars -`cudf-polars` works with polars >= 1.3, as long as the internal IR -version doesn't get a major version bump. So `pip install polars>=1.3` -should work. 
For development, if we're adding things to the polars -side of things, we will need to build polars from source: +The `cudf-polars` `pyproject.toml` advertises which polars versions it +works with. So for pure `cudf-polars` development, installing as +normal and satisfying the dependencies in the repository is +sufficient. For development, if we're adding things to the polars side +of things, we will need to build polars from source: ```sh git clone https://github.com/pola-rs/polars @@ -36,7 +39,9 @@ pip install --upgrade uv uv pip install --upgrade -r py-polars/requirements-dev.txt ``` -> ![NOTE] plain `pip install` works fine, but `uv` is _much_ faster! +:::{note} +plain `pip install` works fine, but `uv` is _much_ faster! +::: Now we have the necessary machinery to build polars ```sh @@ -83,7 +88,7 @@ representation (IR). Second, an execution phase which executes using our IR. The translation phase receives the a low-level Rust `NodeTraverser` -object which delivers Python representations of the plan nodes (and +object that delivers Python representations of the plan nodes (and expressions) one at a time. During translation, we endeavour to raise `NotImplementedError` for any unsupported functionality. This way, if we can't execute something, we just don't modify the logical plan at @@ -126,7 +131,6 @@ arguments, at the moment, `raise_on_fail` is also supported, which raises, rather than falling back, during translation: ```python - result = q.collect(engine=pl.GPUEngine(raise_on_fail=True)) ``` @@ -144,13 +148,73 @@ changes. We can therefore attempt to detect the IR version appropriately. This should be done during IR translation in `translate.py`. -## Adding a handler for a new plan node +# IR design + +As noted, we translate the polars DSL into our own IR. 
This is both so +that we can smooth out minor version differences (advertised by +`NodeTraverser` version changes) within `cudf-polars`, and so that we +have the freedom to introduce new IR nodes and rewrite rules as might +be appropriate for GPU execution. + +To that end, we provide facilities for definition of nodes as well as +writing traversals and rewrite rules. The abstract base class `Node` +in `dsl/nodebase.py` defines the interface for implementing new nodes, +and provides many useful default methods. See also the docstrings of +the `Node` class. + +:::{note} +This generic implementation relies on nodes being treated as +*immutable*. Do not implement in-place modification of nodes, bad +things will happen. +::: + +## Defining nodes + +A concrete node type (`cudf-polars` has expression nodes, `Expr`; +and plan nodes, `IR`), should inherit from `Node`. Nodes have +two types of data: + +1. `children`: a tuple (possibly empty) of concrete nodes; +2. non-child: arbitrary data attached to the node that is _not_ a + concrete node. + +The base `Node` class requires that one advertise the names of the +non-child attributes in the `_non_child` class variable. The +constructor of the concrete node should take its arguments in the +order `*_non_child` (ordered as the class variable does) and then +`*children`. For example, the `Sort` node, which sorts a column +generated by an expression, has this definition: + +```python +class Expr(Node): + children: tuple[Expr, ...] + +class Sort(Expr): + _non_child = ("dtype", "options") + children: tuple[Expr] + def __init__(self, dtype, options, column: Expr): + self.dtype = dtype + self.options = options + self.children = (column,) +``` + +By following this pattern, we get an automatic (caching) +implementation of `__hash__` and `__eq__`, as well as a useful +`reconstruct` method that will rebuild the node with new children. 
+ +If you want to control the behaviour of `__hash__` and `__eq__` for a +single node, override (respectively) the `get_hashable` and `is_equal` +methods. + +## Adding new translation rules from the polars IR + +### Plan nodes -Plan node definitions live in `cudf_polars/dsl/ir.py`, these are -`dataclasses` that inherit from the base `IR` node. The evaluation of -a plan node is done by implementing the `evaluate` method. +Plan node definitions live in `cudf_polars/dsl/ir.py`, these all +inherit from the base `IR` node. The evaluation of a plan node is done +by implementing the `evaluate` method. -To translate the plan node, add a case handler in `translate_ir` which +To translate the plan node, add a case handler in `translate_ir` that lives in `cudf_polars/dsl/translate.py`. As well as child nodes that are plans, most plan nodes contain child @@ -163,25 +227,12 @@ translating a `Join` node, the left keys (expressions) should be translated with the left input active (and right keys with right input). To facilitate this, use the `set_node` context manager. -## Adding a handler for a new expression node +### Expression nodes Adding a handle for an expression node is very similar to a plan node. -Expressions are all defined in `cudf_polars/dsl/expr.py` and inherit -from `Expr`. Unlike plan nodes, these are not `dataclasses`, since it -is simpler for us to implement efficient hashing, repr, and equality if we -can write that ourselves. - -Every expression consists of two types of data: -1. child data (other `Expr`s) -2. non-child data (anything other than an `Expr`) -The generic implementations of special methods in the base `Expr` base -class require that the subclasses advertise which arguments to the -constructor are non-child in a `_non_child` class slot. The -constructor should then take arguments: -```python -def __init__(self, *non_child_data: Any, *children: Expr): -``` -Read the docstrings in the `Expr` class for more details. 
+Expressions are defined in `cudf_polars/dsl/expressions/` and exported +into the `dsl` namespace via `expr.py`. They inherit +from `Expr`. Expressions are evaluated by implementing a `do_evaluate` method that takes a `DataFrame` as context (this provides columns) along with an @@ -198,6 +249,124 @@ To simplify state tracking, all columns should be considered immutable on construction. This matches the "functional" description coming from the logical plan in any case, so is reasonably natural. +## Traversing and transforming nodes + +In addition to representing and evaluating nodes. We also provide +facilities for traversing a tree of nodes and defining transformation +rules in `dsl/traversal.py`. The simplest is `traversal`, a +[pre-order](https://en.wikipedia.org/wiki/Tree_traversal) visit of all +unique nodes in an expression. Use this if you want to know some +specific thing about an expression. For example, to determine if an +expression contains a `Literal` node: + +```python +def has_literal(node: Expr) -> bool: + return any(isinstance(e, Literal) for e in traversal(node)) +``` + +It is often convenient to provide (immutable) state to a visitor, as +well as some facility to perform DAG-aware rewrites (reusing a +transformation for an expression if we have already seen it). We +therefore adopt the following pattern of writing DAG-aware visitors. +Suppose we want a rewrite rule (`rewrite`) between expressions +(`Expr`) and some new type `T`. We define our general transformation +function `rewrite` with type `Expr -> (Expr -> T) -> T`: + +```python +from cudf_polars.typing import GenericTransformer + +@singledispatch +def rewrite(e: Expr, rec: GenericTransformer[Expr, T]) -> T: + ... +``` + +Note in particular that the function to perform the recursion is +passed as the second argument. Rather than defining methods on each +node in turn for a particular rewrite rule, we prefer free functions +and use `functools.singledispatch` to provide dispatching. 
We now, in +the usual fashion, register handlers for different expression types. +To use this function, we need to be able to provide both the +expression to convert and the recursive function itself. To do this we +must convert our `rewrite` function into something that only takes a +single argument (the expression to rewrite), but carries around +information about how to perform the recursion. To this end, we have +two utilities in `traversal.py`: + +- `make_recursive` and +- `CachingVisitor`. + +These both implement the `GenericTransformer` protocol, and can be +wrapped around a transformation function like `rewrite` to provide a +function `Expr -> T`. They also allow us to attach arbitrary +*immutable* state to our visitor by passing a `state` dictionary. This +dictionary can then be inspected by the concrete transformation +function. `make_recursive` is very simple, and provides no caching of +intermediate results (so any DAGs that are visited will be viewed as +trees). `CachingVisitor` provides the same interface, but maintains a +cache of intermediate results, and reuses them if the same expression +is seen again. + +Finally, for writing transformations that take nodes and deliver new +nodes (e.g. rewrite rules), we have a final utility +`reuse_if_unchanged` that can be used as a base case transformation +for node to node rewrites. It is a depth-first visit that transforms +children but only returns a new node with new children if the rewrite +of children returned new nodes. + +To see how these pieces fit together, let us consider writing a +`rename` function that takes an expression (potentially with +references to columns) along with a mapping defining a renaming +between (some subset of) column names. The goal is to deliver a new +expression with appropriate columns renamed. 
+ +To start, we define the dispatch function +```python +from collections.abc import Mapping +from functools import singledispatch +from cudf_polars.dsl.traversal import ( + CachingVisitor, make_recursive, reuse_if_unchanged +) +from cudf_polars.dsl.expr import Col, Expr +from cudf_polars.typing import ExprTransformer + + +@singledispatch +def _rename(e: Expr, rec: ExprTransformer) -> Expr: + raise NotImplementedError(f"No handler for {type(e)}") +``` +then we register specific handlers, first for columns: +```python +@_rename.register +def _(e: Col, rec: ExprTransformer) -> Expr: + mapping = rec.state["mapping"] # state set on rec + if e.name in mapping: + # If we have a rename, return a new Col reference + # with a new name + return type(e)(e.dtype, mapping[e.name]) + return e +``` +and then for the remaining expressions +```python +_rename.register(Expr)(reuse_if_unchanged) +``` + +:::{note} +In this case, we could have put the generic handler in the `_rename` +function, however, then we would not get a nice error message if we +accidentally sent in an object of the incorrect type. +::: + +Finally we tie everything together with a public function: + +```python +def rename(e: Expr, mapping: Mapping[str, str]) -> Expr: + """Rename column references in an expression.""" + mapper = CachingVisitor(_rename, state={"mapping": mapping}) + # or + # mapper = make_recursive(_rename, state={"mapping": mapping}) + return mapper(e) +``` + # Containers Containers should be constructed as relatively lightweight objects diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 5345fad41a2..2afdab1be4b 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -19,7 +19,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "polars>=1.8,<1.9", + "polars>=1.11,<1.12", "pylibcudf==24.12.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. 
To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -60,7 +60,7 @@ xfail_strict = true [tool.coverage.report] exclude_also = [ "if TYPE_CHECKING:", - "class .*\\bProtocol\\):", + "class .*\\bProtocol(?:\\[[^]]+\\])?\\):", "assert_never\\(" ] # The cudf_polars test suite doesn't exercise the plugin, so we omit diff --git a/python/cudf_polars/tests/dsl/test_expr.py b/python/cudf_polars/tests/dsl/test_expr.py index b7d4672daca..84e33262869 100644 --- a/python/cudf_polars/tests/dsl/test_expr.py +++ b/python/cudf_polars/tests/dsl/test_expr.py @@ -73,3 +73,24 @@ def test_namedexpr_repr_stable(): b2 = expr.NamedExpr("b1", expr.Col(plc.DataType(plc.TypeId.INT8), "a")) assert repr(b1) == repr(b2) + + +def test_equality_cse(): + dt = plc.DataType(plc.TypeId.INT8) + + def make_expr(n1, n2): + a = expr.Col(plc.DataType(plc.TypeId.INT8), n1) + b = expr.Col(plc.DataType(plc.TypeId.INT8), n2) + + return expr.BinOp(dt, plc.binaryop.BinaryOperator.ADD, a, b) + + e1 = make_expr("a", "b") + e2 = make_expr("a", "b") + e3 = make_expr("a", "c") + + assert e1.children is not e2.children + assert e1 == e2 + assert e1.children is e2.children + assert e1 == e2 + assert e1 != e3 + assert e2 != e3 diff --git a/python/cudf_polars/tests/dsl/test_traversal.py b/python/cudf_polars/tests/dsl/test_traversal.py new file mode 100644 index 00000000000..6505a786855 --- /dev/null +++ b/python/cudf_polars/tests/dsl/test_traversal.py @@ -0,0 +1,229 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from functools import singledispatch + +import pylibcudf as plc + +import polars as pl +from polars.testing import assert_frame_equal + +from cudf_polars import translate_ir +from cudf_polars.dsl import expr, ir +from cudf_polars.dsl.traversal import ( + CachingVisitor, + make_recursive, + reuse_if_unchanged, + traversal, +) +from cudf_polars.typing import ExprTransformer, IRTransformer + + +def make_expr(dt, n1, n2): + a1 = expr.Col(dt, n1) + a2 = expr.Col(dt, n2) + + return expr.BinOp(dt, plc.binaryop.BinaryOperator.MUL, a1, a2) + + +def test_traversal_unique(): + dt = plc.DataType(plc.TypeId.INT8) + + e1 = make_expr(dt, "a", "a") + unique_exprs = list(traversal(e1)) + + assert len(unique_exprs) == 2 + assert set(unique_exprs) == {expr.Col(dt, "a"), e1} + assert unique_exprs == [e1, expr.Col(dt, "a")] + + e2 = make_expr(dt, "a", "b") + unique_exprs = list(traversal(e2)) + + assert len(unique_exprs) == 3 + assert set(unique_exprs) == {expr.Col(dt, "a"), expr.Col(dt, "b"), e2} + assert unique_exprs == [e2, expr.Col(dt, "a"), expr.Col(dt, "b")] + + e3 = make_expr(dt, "b", "a") + unique_exprs = list(traversal(e3)) + + assert len(unique_exprs) == 3 + assert set(unique_exprs) == {expr.Col(dt, "a"), expr.Col(dt, "b"), e3} + assert unique_exprs == [e3, expr.Col(dt, "b"), expr.Col(dt, "a")] + + +def rename(e, rec): + mapping = rec.state["mapping"] + if isinstance(e, expr.Col) and e.name in mapping: + return type(e)(e.dtype, mapping[e.name]) + return reuse_if_unchanged(e, rec) + + +def test_caching_visitor(): + dt = plc.DataType(plc.TypeId.INT8) + + e1 = make_expr(dt, "a", "b") + + mapper = CachingVisitor(rename, state={"mapping": {"b": "c"}}) + + renamed = mapper(e1) + assert renamed == make_expr(dt, "a", "c") + assert len(mapper.cache) == 3 + + e2 = make_expr(dt, "a", "a") + mapper = CachingVisitor(rename, state={"mapping": {"b": "c"}}) + + renamed = mapper(e2) + assert renamed == 
make_expr(dt, "a", "a") + assert len(mapper.cache) == 2 + mapper = CachingVisitor(rename, state={"mapping": {"a": "c"}}) + + renamed = mapper(e2) + assert renamed == make_expr(dt, "c", "c") + assert len(mapper.cache) == 2 + + +def test_noop_visitor(): + dt = plc.DataType(plc.TypeId.INT8) + + e1 = make_expr(dt, "a", "b") + + mapper = make_recursive(rename, state={"mapping": {"b": "c"}}) + + renamed = mapper(e1) + assert renamed == make_expr(dt, "a", "c") + + e2 = make_expr(dt, "a", "a") + mapper = make_recursive(rename, state={"mapping": {"b": "c"}}) + + renamed = mapper(e2) + assert renamed == make_expr(dt, "a", "a") + mapper = make_recursive(rename, state={"mapping": {"a": "c"}}) + + renamed = mapper(e2) + assert renamed == make_expr(dt, "c", "c") + + +def test_rewrite_ir_node(): + df = pl.LazyFrame({"a": [1, 2, 1], "b": [1, 3, 4]}) + q = df.group_by("a").agg(pl.col("b").sum()).sort("b") + + orig = translate_ir(q._ldf.visit()) + + new_df = pl.DataFrame({"a": [1, 1, 2], "b": [-1, -2, -4]}) + + def replace_df(node, rec): + if isinstance(node, ir.DataFrameScan): + return ir.DataFrameScan( + node.schema, new_df._df, node.projection, node.predicate + ) + return reuse_if_unchanged(node, rec) + + mapper = CachingVisitor(replace_df) + + new = mapper(orig) + + result = new.evaluate(cache={}).to_polars() + + expect = pl.DataFrame({"a": [2, 1], "b": [-4, -3]}) + + assert_frame_equal(result, expect) + + +def test_rewrite_scan_node(tmp_path): + left = pl.LazyFrame({"a": [1, 2, 3], "b": [1, 3, 4]}) + right = pl.DataFrame({"a": [1, 4, 2], "c": [1, 2, 3]}) + + right.write_parquet(tmp_path / "right.pq") + + right_s = pl.scan_parquet(tmp_path / "right.pq") + + q = left.join(right_s, on="a", how="inner") + + def replace_scan(node, rec): + if isinstance(node, ir.Scan): + return ir.DataFrameScan( + node.schema, right._df, node.with_columns, node.predicate + ) + return reuse_if_unchanged(node, rec) + + mapper = CachingVisitor(replace_scan) + + orig = translate_ir(q._ldf.visit()) + new 
= mapper(orig) + + result = new.evaluate(cache={}).to_polars() + + expect = q.collect() + + assert_frame_equal(result, expect, check_row_order=False) + + +def test_rewrite_names_and_ops(): + df = pl.LazyFrame({"a": [1, 2, 3], "b": [3, 4, 5], "c": [5, 6, 7], "d": [7, 9, 8]}) + + q = df.select(pl.col("a") - (pl.col("b") + pl.col("c") * 2), pl.col("d")).sort("d") + + # We will replace a -> d, c -> d, and addition with multiplication + expect = ( + df.select( + (pl.col("d") - (pl.col("b") * pl.col("d") * 2)).alias("a"), pl.col("d") + ) + .sort("d") + .collect() + ) + + qir = translate_ir(q._ldf.visit()) + + @singledispatch + def _transform(e: expr.Expr, fn: ExprTransformer) -> expr.Expr: + raise NotImplementedError("Unhandled") + + @_transform.register + def _(e: expr.Col, fn: ExprTransformer): + mapping = fn.state["mapping"] + if e.name in mapping: + return type(e)(e.dtype, mapping[e.name]) + return e + + @_transform.register + def _(e: expr.BinOp, fn: ExprTransformer): + if e.op == plc.binaryop.BinaryOperator.ADD: + return type(e)( + e.dtype, plc.binaryop.BinaryOperator.MUL, *map(fn, e.children) + ) + return reuse_if_unchanged(e, fn) + + _transform.register(expr.Expr)(reuse_if_unchanged) + + @singledispatch + def _rewrite(node: ir.IR, fn: IRTransformer) -> ir.IR: + raise NotImplementedError("Unhandled") + + @_rewrite.register + def _(node: ir.Select, fn: IRTransformer): + expr_mapper = fn.state["expr_mapper"] + return type(node)( + node.schema, + [expr.NamedExpr(e.name, expr_mapper(e.value)) for e in node.exprs], + node.should_broadcast, + fn(node.children[0]), + ) + + _rewrite.register(ir.IR)(reuse_if_unchanged) + + rewriter = CachingVisitor( + _rewrite, + state={ + "expr_mapper": CachingVisitor( + _transform, state={"mapping": {"a": "d", "c": "d"}} + ) + }, + ) + + new_ir = rewriter(qir) + + got = new_ir.evaluate(cache={}).to_polars() + + assert_frame_equal(expect, got) diff --git a/python/cudf_polars/tests/test_config.py b/python/cudf_polars/tests/test_config.py 
index 3c3986be19b..9900f598e5f 100644 --- a/python/cudf_polars/tests/test_config.py +++ b/python/cudf_polars/tests/test_config.py @@ -10,7 +10,7 @@ import rmm -from cudf_polars.dsl.ir import IR +from cudf_polars.dsl.ir import DataFrameScan from cudf_polars.testing.asserts import ( assert_gpu_result_equal, assert_ir_translation_raises, @@ -18,10 +18,10 @@ def test_polars_verbose_warns(monkeypatch): - def raise_unimplemented(self): + def raise_unimplemented(self, *args): raise NotImplementedError("We don't support this") - monkeypatch.setattr(IR, "__post_init__", raise_unimplemented) + monkeypatch.setattr(DataFrameScan, "__init__", raise_unimplemented) q = pl.LazyFrame({}) # Ensure that things raise assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index 7d9ec98db97..501560d15b8 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -2,9 +2,12 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +from contextlib import nullcontext + import pytest import polars as pl +from polars.testing import assert_frame_equal from cudf_polars.testing.asserts import ( assert_gpu_result_equal, @@ -22,6 +25,11 @@ def how(request): return request.param +@pytest.fixture(params=[None, (1, 5), (1, None), (0, 2), (0, None)]) +def zlice(request): + return request.param + + @pytest.fixture def left(): return pl.LazyFrame( @@ -37,8 +45,9 @@ def left(): def right(): return pl.LazyFrame( { - "a": [1, 4, 3, 7, None, None], - "c": [2, 3, 4, 5, 6, 7], + "a": [1, 4, 3, 7, None, None, 1], + "c": [2, 3, 4, 5, 6, 7, 8], + "d": [6, None, 7, 8, -1, 2, 4], } ) @@ -70,11 +79,31 @@ def test_coalesce_join(left, right, how, join_nulls, join_expr): query = left.join( right, on=join_expr, how=how, join_nulls=join_nulls, coalesce=True ) - assert_gpu_result_equal(query, check_row_order=False) + assert_gpu_result_equal(query, check_row_order=how == 
"left") -def test_cross_join(left, right): +def test_left_join_with_slice(left, right, join_nulls, zlice): + q = left.join(right, on="a", how="left", join_nulls=join_nulls, coalesce=True) + ctx = nullcontext() + if zlice is not None: + q_expect = q.collect().slice(*zlice) + q = q.slice(*zlice) + if zlice == (1, 5) or zlice == (0, 2): + # https://github.com/pola-rs/polars/issues/19403 + # https://github.com/pola-rs/polars/issues/19405 + ctx = pytest.raises(AssertionError) + assert_frame_equal( + q_expect, q.collect(engine=pl.GPUEngine(raise_on_fail=True)) + ) + + with ctx: + assert_gpu_result_equal(q) + + +def test_cross_join(left, right, zlice): q = left.join(right, how="cross") + if zlice is not None: + q = q.slice(*zlice) assert_gpu_result_equal(q) @@ -86,3 +115,26 @@ def test_join_literal_key_unsupported(left, right, left_on, right_on): q = left.join(right, left_on=left_on, right_on=right_on, how="inner") assert_ir_translation_raises(q, NotImplementedError) + + +@pytest.mark.parametrize( + "conditions", + [ + [pl.col("a") < pl.col("a_right")], + [pl.col("a_right") <= pl.col("a") * 2], + [pl.col("b") * 2 > pl.col("a_right"), pl.col("a") == pl.col("c_right")], + [pl.col("b") * 2 <= pl.col("a_right"), pl.col("a") < pl.col("c_right")], + [pl.col("b") <= pl.col("a_right") * 7, pl.col("a") < pl.col("d") * 2], + ], +) +def test_join_where(left, right, conditions, zlice): + q = left.join_where(right, *conditions) + + assert_gpu_result_equal(q, check_row_order=False) + + if zlice is not None: + q_len = q.slice(*zlice).select(pl.len()) + # Can't compare result, since row order is not guaranteed and + # therefore we only check the length + + assert_gpu_result_equal(q_len) diff --git a/python/custreamz/custreamz/tests/conftest.py b/python/custreamz/custreamz/tests/conftest.py index 1cda9b71387..c5135bc6414 100644 --- a/python/custreamz/custreamz/tests/conftest.py +++ b/python/custreamz/custreamz/tests/conftest.py @@ -2,6 +2,7 @@ import socket import pytest + 
from custreamz import kafka diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index af45f49d9b4..a8ab05a3922 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -65,50 +65,20 @@ include = [ ] exclude = ["*tests*"] -[tool.isort] -line_length = 79 -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -combine_as_imports = true -order_by_type = true -known_dask = [ - "dask", - "distributed", - "dask_cuda", -] -known_rapids = [ - "rmm", - "cudf", - "dask_cudf", -] -known_first_party = [ - "streamz", -] -default_section = "THIRDPARTY" -sections = [ - "FUTURE", - "STDLIB", - "THIRDPARTY", - "DASK", - "RAPIDS", - "FIRSTPARTY", - "LOCALFOLDER", -] -skip = [ - "thirdparty", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".tox", - ".venv", - "_build", - "buck-out", - "build", - "dist", - "__init__.py", -] +[tool.ruff] +extend = "../../pyproject.toml" + +[tool.ruff.lint.isort] +combine-as-imports = true +known-first-party = ["streamz"] +section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"] + +[tool.ruff.lint.isort.sections] +dask = ["dask", "distributed", "dask_cuda"] +rapids = ["rmm", "cudf", "dask_cudf"] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["E402", "F401"] [tool.pytest.ini_options] addopts = "--tb=native --strict-config --strict-markers" diff --git a/python/dask_cudf/dask_cudf/__init__.py b/python/dask_cudf/dask_cudf/__init__.py index 04c2ad65b99..f9df22cc436 100644 --- a/python/dask_cudf/dask_cudf/__init__.py +++ b/python/dask_cudf/dask_cudf/__init__.py @@ -7,15 +7,15 @@ # do anything for dask==2024.2.0) config.set({"dataframe.query-planning-warning": False}) -import dask.dataframe as dd -from dask.dataframe import from_delayed +import dask.dataframe as dd # noqa: E402 +from dask.dataframe import from_delayed # noqa: E402 -import cudf +import cudf # noqa: E402 -from . 
import backends -from ._version import __git_commit__, __version__ -from .core import concat, from_cudf, from_dask_dataframe -from .expr import QUERY_PLANNING_ON +from . import backends # noqa: E402, F401 +from ._version import __git_commit__, __version__ # noqa: E402, F401 +from .core import concat, from_cudf, from_dask_dataframe # noqa: E402 +from .expr import QUERY_PLANNING_ON # noqa: E402 def read_csv(*args, **kwargs): @@ -55,9 +55,9 @@ def inner_func(*args, **kwargs): to_orc = raise_not_implemented_error("to_orc") else: - from .core import DataFrame, Index, Series - from .groupby import groupby_agg - from .io import read_text, to_orc + from .core import DataFrame, Index, Series # noqa: F401 + from .groupby import groupby_agg # noqa: F401 + from .io import read_text, to_orc # noqa: F401 __all__ = [ diff --git a/python/dask_cudf/dask_cudf/expr/__init__.py b/python/dask_cudf/dask_cudf/expr/__init__.py index a76b655ef42..6dadadd5263 100644 --- a/python/dask_cudf/dask_cudf/expr/__init__.py +++ b/python/dask_cudf/dask_cudf/expr/__init__.py @@ -12,8 +12,8 @@ config.set({"dataframe.shuffle.method": "tasks"}) try: - import dask_cudf.expr._collection - import dask_cudf.expr._expr + import dask_cudf.expr._collection # noqa: F401 + import dask_cudf.expr._expr # noqa: F401 except ImportError as err: # Dask *should* raise an error before this. 
diff --git a/python/dask_cudf/dask_cudf/expr/_expr.py b/python/dask_cudf/dask_cudf/expr/_expr.py index 4a9f4de8b9c..c7cf66fbffd 100644 --- a/python/dask_cudf/dask_cudf/expr/_expr.py +++ b/python/dask_cudf/dask_cudf/expr/_expr.py @@ -12,7 +12,7 @@ ) from dask_expr._reductions import Reduction, Var from dask_expr.io.io import FusedParquetIO -from dask_expr.io.parquet import ReadParquetPyarrowFS +from dask_expr.io.parquet import FragmentWrapper, ReadParquetPyarrowFS from dask.dataframe.core import ( _concat, @@ -302,16 +302,34 @@ def _dataset_info(self): return dataset_info @staticmethod - def _table_to_pandas( - table, - index_name, - *args, - ): + def _table_to_pandas(table, index_name): df = cudf.DataFrame.from_arrow(table) if index_name is not None: df = df.set_index(index_name) return df + def _filtered_task(self, index: int): + columns = self.columns.copy() + index_name = self.index.name + if self.index is not None: + index_name = self.index.name + schema = self._dataset_info["schema"].remove_metadata() + if index_name: + if columns is None: + columns = list(schema.names) + columns.append(index_name) + return ( + self._table_to_pandas, + ( + self._fragment_to_table, + FragmentWrapper(self.fragments[index], filesystem=self.fs), + self.filters, + columns, + schema, + ), + index_name, + ) + def _tune_up(self, parent): if self._fusion_compression_factor >= 1: return diff --git a/python/dask_cudf/dask_cudf/io/__init__.py b/python/dask_cudf/dask_cudf/io/__init__.py index 76bb2ea99b4..0421bd755f4 100644 --- a/python/dask_cudf/dask_cudf/io/__init__.py +++ b/python/dask_cudf/dask_cudf/io/__init__.py @@ -1,11 +1,11 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
-from .csv import read_csv -from .json import read_json -from .orc import read_orc, to_orc -from .text import read_text +from .csv import read_csv # noqa: F401 +from .json import read_json # noqa: F401 +from .orc import read_orc, to_orc # noqa: F401 +from .text import read_text # noqa: F401 try: - from .parquet import read_parquet, to_parquet + from .parquet import read_parquet, to_parquet # noqa: F401 except ImportError: pass diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 896c4169f5b..ae5ca480e31 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -15,7 +15,11 @@ import cudf import dask_cudf -from dask_cudf.tests.utils import skip_dask_expr, xfail_dask_expr +from dask_cudf.tests.utils import ( + require_dask_expr, + skip_dask_expr, + xfail_dask_expr, +) # Check if create_metadata_file is supported by # the current dask.dataframe version @@ -615,3 +619,28 @@ def test_timezone_column(tmpdir): got = dask_cudf.read_parquet(path) expect = cudf.read_parquet(path) dd.assert_eq(got, expect) + + +@require_dask_expr() +@pytest.mark.skipif( + not dask_cudf.backends.PYARROW_GE_15, + reason="Requires pyarrow 15", +) +@pytest.mark.parametrize("min_part_size", ["1B", "1GB"]) +def test_read_parquet_arrow_filesystem(tmpdir, min_part_size): + tmp_path = str(tmpdir) + with dask.config.set( + { + "dataframe.backend": "cudf", + "dataframe.parquet.minimum-partition-size": min_part_size, + } + ): + dd.from_dict( + {"x": range(1000), "y": ["a", "b", "c", "d"] * 250}, + npartitions=10, + ).to_parquet(tmp_path, write_index=False) + df = cudf.read_parquet(tmp_path) + ddf = dask_cudf.read_parquet(tmp_path, filesystem="arrow") + dd.assert_eq(df, ddf, check_index=False) + assert isinstance(ddf._meta, cudf.DataFrame) + assert isinstance(ddf.compute(), cudf.DataFrame) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_s3.py 
b/python/dask_cudf/dask_cudf/io/tests/test_s3.py index cf8af82e112..90907f6fb99 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py @@ -11,6 +11,8 @@ from dask.dataframe import assert_eq +import cudf + import dask_cudf from dask_cudf.tests.utils import QUERY_PLANNING_ON @@ -168,6 +170,8 @@ def test_read_parquet_filesystem(s3_base, s3so, pdf, filesystem): filesystem=filesystem, ) assert df.b.sum().compute() == 9 + assert isinstance(df._meta, cudf.DataFrame) + assert isinstance(df.compute(), cudf.DataFrame) def test_read_parquet_filesystem_explicit(s3_base, s3so, pdf): diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index fbcd7ae5dfb..862e8f36eaa 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -69,50 +69,17 @@ version = {file = "dask_cudf/VERSION"} [tool.setuptools.packages.find] exclude = ["*tests*"] -[tool.isort] -line_length = 79 -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -combine_as_imports = true -order_by_type = true +[tool.ruff] +extend = "../../pyproject.toml" -known_dask = [ - "dask", - "distributed", - "dask_cuda", -] -known_rapids = [ - "rmm", - "cudf", -] -known_first_party = [ - "dask_cudf", -] +[tool.ruff.lint.isort] +combine-as-imports = true +known-first-party = ["dask_cudf"] +section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"] -default_section = "THIRDPARTY" -sections = [ - "FUTURE", - "STDLIB", - "THIRDPARTY", - "DASK", - "RAPIDS", - "FIRSTPARTY", - "LOCALFOLDER", -] -skip = [ - "thirdparty", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".tox", - ".venv", - "_build", - "buck-out", - "build", - "dist", -] +[tool.ruff.lint.isort.sections] +dask = ["dask", "distributed", "dask_cuda"] +rapids = ["rmm", "cudf"] [tool.pytest.ini_options] addopts = "--tb=native --strict-config --strict-markers" @@ -126,5 +93,8 @@ filterwarnings = [ # 
https://github.com/dask/partd/blob/main/partd/pandas.py#L198 "ignore:Passing a BlockManager to DataFrame is deprecated and will raise in a future version. Use public APIs instead.:DeprecationWarning", "ignore:String support for `aggregate_files` is experimental. Behavior may change in the future.:FutureWarning:dask", + # Dask now loudly throws warnings: https://github.com/dask/dask/pull/11437 + # When the legacy implementation is removed we can remove this warning and stop running pytests with `DASK_DATAFRAME__QUERY_PLANNING=False` + "ignore:The legacy Dask DataFrame implementation is deprecated and will be removed in a future version.*:FutureWarning", ] xfail_strict = true diff --git a/python/pylibcudf/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd index aa67b4b1149..9bdfdab97c2 100644 --- a/python/pylibcudf/pylibcudf/__init__.pxd +++ b/python/pylibcudf/pylibcudf/__init__.pxd @@ -13,6 +13,7 @@ from . cimport ( expressions, filling, groupby, + interop, join, json, labeling, @@ -62,6 +63,7 @@ __all__ = [ "filling", "gpumemoryview", "groupby", + "interop", "join", "json", "lists", diff --git a/python/pylibcudf/pylibcudf/interop.pxd b/python/pylibcudf/pylibcudf/interop.pxd new file mode 100644 index 00000000000..2a0a8c15fdd --- /dev/null +++ b/python/pylibcudf/pylibcudf/interop.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.table cimport Table + + +cpdef Table from_dlpack(object managed_tensor) + +cpdef object to_dlpack(Table input) diff --git a/python/pylibcudf/pylibcudf/interop.pyx b/python/pylibcudf/pylibcudf/interop.pyx index 642516a1b90..61e812353b7 100644 --- a/python/pylibcudf/pylibcudf/interop.pyx +++ b/python/pylibcudf/pylibcudf/interop.pyx @@ -1,6 +1,11 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. 
-from cpython.pycapsule cimport PyCapsule_GetPointer, PyCapsule_New +from cpython.pycapsule cimport ( + PyCapsule_GetPointer, + PyCapsule_IsValid, + PyCapsule_New, + PyCapsule_SetName, +) from libc.stdlib cimport free from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -16,11 +21,14 @@ from pylibcudf.libcudf.interop cimport ( ArrowArray, ArrowArrayStream, ArrowSchema, + DLManagedTensor, column_metadata, from_arrow_column as cpp_from_arrow_column, from_arrow_stream as cpp_from_arrow_stream, + from_dlpack as cpp_from_dlpack, to_arrow_host_raw, to_arrow_schema_raw, + to_dlpack as cpp_to_dlpack, ) from pylibcudf.libcudf.table.table cimport table @@ -315,3 +323,87 @@ def _to_arrow_scalar(cudf_object, metadata=None): # Note that metadata for scalars is primarily important for preserving # information on nested types since names are otherwise irrelevant. return to_arrow(Column.from_scalar(cudf_object, 1), metadata=metadata)[0] + + +cpdef Table from_dlpack(object managed_tensor): + """ + Convert a DLPack DLTensor into a cudf table. + + For details, see :cpp:func:`cudf::from_dlpack` + + Parameters + ---------- + managed_tensor : PyCapsule + A 1D or 2D column-major (Fortran order) tensor. + + Returns + ------- + Table + Table with a copy of the tensor data. + """ + if not PyCapsule_IsValid(managed_tensor, "dltensor"): + raise ValueError("Invalid PyCapsule object") + cdef unique_ptr[table] c_result + cdef DLManagedTensor* dlpack_tensor = PyCapsule_GetPointer( + managed_tensor, "dltensor" + ) + if dlpack_tensor is NULL: + raise ValueError("PyCapsule object contained a NULL pointer") + PyCapsule_SetName(managed_tensor, "used_dltensor") + + # Note: A copy is always performed when converting the dlpack + # data to a libcudf table. We also delete the dlpack_tensor pointer + # as the pointer is not deleted by libcudf's from_dlpack function. 
+ # TODO: https://github.com/rapidsai/cudf/issues/10874 + # TODO: https://github.com/rapidsai/cudf/issues/10849 + with nogil: + c_result = cpp_from_dlpack(dlpack_tensor) + + cdef Table result = Table.from_libcudf(move(c_result)) + dlpack_tensor.deleter(dlpack_tensor) + return result + + +cpdef object to_dlpack(Table input): + """ + Convert a cudf table into a DLPack DLTensor. + + For details, see :cpp:func:`cudf::to_dlpack` + + Parameters + ---------- + input : Table + A 1D or 2D column-major (Fortran order) tensor. + + Returns + ------- + PyCapsule + 1D or 2D DLPack tensor with a copy of the table data, or nullptr. + """ + for col in input._columns: + if col.null_count(): + raise ValueError( + "Cannot create a DLPack tensor with null values. " + "Input is required to have null count as zero." + ) + cdef DLManagedTensor *dlpack_tensor + + with nogil: + dlpack_tensor = cpp_to_dlpack(input.view()) + + return PyCapsule_New( + dlpack_tensor, + "dltensor", + dlmanaged_tensor_pycapsule_deleter + ) + + +cdef void dlmanaged_tensor_pycapsule_deleter(object pycap_obj) noexcept: + if PyCapsule_IsValid(pycap_obj, "used_dltensor"): + # we do not call a used capsule's deleter + return + cdef DLManagedTensor* dlpack_tensor = PyCapsule_GetPointer( + pycap_obj, "dltensor" + ) + if dlpack_tensor is not NULL: + dlpack_tensor.deleter(dlpack_tensor) diff --git a/python/pylibcudf/pylibcudf/libcudf/interop.pxd b/python/pylibcudf/pylibcudf/libcudf/interop.pxd index 30b97fdec34..b75e9ca7001 100644 --- a/python/pylibcudf/pylibcudf/libcudf/interop.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/interop.pxd @@ -32,11 +32,13 @@ cdef extern from "cudf/interop.hpp" nogil: cdef extern from "cudf/interop.hpp" namespace "cudf" \ nogil: - cdef unique_ptr[table] from_dlpack(const DLManagedTensor* tensor - ) except + + cdef unique_ptr[table] from_dlpack( + const DLManagedTensor* managed_tensor + ) except + - DLManagedTensor* to_dlpack(table_view input_table - ) except + + DLManagedTensor* 
to_dlpack( + const table_view& input + ) except + cdef cppclass column_metadata: column_metadata() except + diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd index 673bffa28ae..be3a2d75718 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd @@ -1,6 +1,7 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. from libc.stdint cimport int32_t +from libcpp cimport bool from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view @@ -8,9 +9,9 @@ from pylibcudf.libcudf.types cimport size_type cdef extern from "nvtext/stemmer.hpp" namespace "nvtext" nogil: - ctypedef enum letter_type: - CONSONANT 'nvtext::letter_type::CONSONANT' - VOWEL 'nvtext::letter_type::VOWEL' + cpdef enum class letter_type: + CONSONANT + VOWEL cdef unique_ptr[column] porter_stemmer_measure( const column_view & strings diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/replace_re.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/replace_re.pxd index 40f0e2fa50c..6b0c90d0acc 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/replace_re.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/replace_re.pxd @@ -6,6 +6,7 @@ from libcpp.vector cimport vector from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings.regex_flags cimport regex_flags from pylibcudf.libcudf.strings.regex_program cimport regex_program from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.types cimport size_type @@ -14,17 +15,18 @@ from pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/strings/replace_re.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] replace_re( - column_view source_strings, - 
regex_program, - string_scalar repl, - size_type maxrepl) except + - - cdef unique_ptr[column] replace_with_backrefs( - column_view source_strings, - regex_program, - string repl) except + + column_view input, + regex_program prog, + string_scalar replacement, + size_type max_replace_count) except + cdef unique_ptr[column] replace_re( - column_view source_strings, + column_view input, vector[string] patterns, - column_view repls) except + + column_view replacements, + regex_flags flags) except + + + cdef unique_ptr[column] replace_with_backrefs( + column_view input, + regex_program prog, + string replacement) except + diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt index e01ca3fbdd3..d97c0a73267 100644 --- a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt @@ -13,7 +13,7 @@ # ============================================================================= set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx - ngrams_tokenize.pyx normalize.pyx + ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd index 08dbec84090..a658e57018e 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd @@ -7,6 +7,8 @@ from . 
cimport ( minhash, ngrams_tokenize, normalize, + replace, + stemmer, ) __all__ = [ @@ -16,4 +18,6 @@ __all__ = [ "minhash", "ngrams_tokenize", "normalize", + "replace", + "stemmer", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py index 6dccf3dd9cf..2c1feb089a2 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.py +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py @@ -7,6 +7,8 @@ minhash, ngrams_tokenize, normalize, + replace, + stemmer, ) __all__ = [ @@ -16,4 +18,6 @@ "minhash", "ngrams_tokenize", "normalize", + "replace", + "stemmer", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/replace.pxd b/python/pylibcudf/pylibcudf/nvtext/replace.pxd new file mode 100644 index 00000000000..624f90e7486 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/replace.pxd @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + + +cpdef Column replace_tokens( + Column input, + Column targets, + Column replacements, + Scalar delimiter=*, +) + +cpdef Column filter_tokens( + Column input, + size_type min_token_length, + Scalar replacement=*, + Scalar delimiter=* +) diff --git a/python/pylibcudf/pylibcudf/nvtext/replace.pyx b/python/pylibcudf/pylibcudf/nvtext/replace.pyx new file mode 100644 index 00000000000..b65348ce14d --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/replace.pyx @@ -0,0 +1,109 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from cython.operator cimport dereference +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.nvtext.replace cimport ( + filter_tokens as cpp_filter_tokens, + replace_tokens as cpp_replace_tokens, +) +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + + +cpdef Column replace_tokens( + Column input, + Column targets, + Column replacements, + Scalar delimiter=None, +): + """ + Replaces specified tokens with corresponding replacement strings. + + For details, see :cpp:func:`replace_tokens` + + Parameters + ---------- + input : Column + Strings column to replace + targets : Column + Strings to compare against tokens found in ``input`` + replacements : Column + Replacement strings for each string in ``targets`` + delimiter : Scalar, optional + Characters used to separate each string into tokens. + The default of empty string will identify tokens using whitespace. + + Returns + ------- + Column + New strings column with replaced strings + """ + cdef unique_ptr[column] c_result + if delimiter is None: + delimiter = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + with nogil: + c_result = cpp_replace_tokens( + input.view(), + targets.view(), + replacements.view(), + dereference(delimiter.get()), + ) + return Column.from_libcudf(move(c_result)) + + +cpdef Column filter_tokens( + Column input, + size_type min_token_length, + Scalar replacement=None, + Scalar delimiter=None +): + """ + Removes tokens whose lengths are less than a specified number of characters. 
+ + For details, see :cpp:func:`filter_tokens` + + Parameters + ---------- + input : Column + Strings column to replace + min_token_length : size_type + The minimum number of characters to retain a + token in the output string + replacement : Scalar, optional + Optional replacement string to be used in place of removed tokens + delimiter : Scalar, optional + Characters used to separate each string into tokens. + The default of empty string will identify tokens using whitespace. + Returns + ------- + Column + New strings column of filtered strings + """ + cdef unique_ptr[column] c_result + if delimiter is None: + delimiter = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + if replacement is None: + replacement = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + + with nogil: + c_result = cpp_filter_tokens( + input.view(), + min_token_length, + dereference(replacement.get()), + dereference(delimiter.get()), + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/nvtext/stemmer.pxd b/python/pylibcudf/pylibcudf/nvtext/stemmer.pxd new file mode 100644 index 00000000000..48762efc01f --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/stemmer.pxd @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool +from pylibcudf.column cimport Column +from pylibcudf.libcudf.nvtext.stemmer cimport letter_type +from pylibcudf.libcudf.types cimport size_type + +ctypedef fused ColumnOrSize: + Column + size_type + +cpdef Column is_letter(Column input, bool check_vowels, ColumnOrSize indices) + +cpdef Column porter_stemmer_measure(Column input) diff --git a/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx new file mode 100644 index 00000000000..854d1053624 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx @@ -0,0 +1,76 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.nvtext.stemmer cimport ( + is_letter as cpp_is_letter, + letter_type, + porter_stemmer_measure as cpp_porter_stemmer_measure, +) +from pylibcudf.libcudf.types cimport size_type + + +cpdef Column is_letter( + Column input, + bool check_vowels, + ColumnOrSize indices +): + """ + Returns boolean column indicating if the character + or characters at the provided character index or + indices (respectively) are consonants or vowels + + For details, see :cpp:func:`is_letter` + + Parameters + ---------- + input : Column + Input strings + check_vowels : bool + If true, the check is for vowels. Otherwise the check is + for consonants. + indices : Union[Column, size_type] + The character position(s) to check in each string + + Returns + ------- + Column + New boolean column. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_is_letter( + input.view(), + letter_type.VOWEL if check_vowels else letter_type.CONSONANT, + indices if ColumnOrSize is size_type else indices.view() + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column porter_stemmer_measure(Column input): + """ + Returns the Porter Stemmer measurements of a strings column. 
+ + For details, see :cpp:func:`porter_stemmer_measure` + + Parameters + ---------- + input : Column + Strings column of words to measure + + Returns + ------- + Column + New column of measure values + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_porter_stemmer_measure(input.view()) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt index 04dd131cd75..5d7fbd24b91 100644 --- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt @@ -28,6 +28,7 @@ set(cython_sources regex_program.pyx repeat.pyx replace.pyx + replace_re.pyx side_type.pyx slice.pyx strip.pyx diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd index 93c61f3f72c..da1c1c576c0 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd @@ -17,6 +17,7 @@ from . cimport ( regex_program, repeat, replace, + replace_re, side_type, slice, split, @@ -42,6 +43,7 @@ __all__ = [ "regex_program", "repeat", "replace", + "replace_re", "slice", "strip", "split", diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index d52b0405f1e..40fa8261905 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -17,6 +17,7 @@ regex_program, repeat, replace, + replace_re, side_type, slice, split, @@ -42,6 +43,7 @@ "regex_program", "repeat", "replace", + "replace_re", "slice", "strip", "split", diff --git a/python/pylibcudf/pylibcudf/strings/replace_re.pxd b/python/pylibcudf/pylibcudf/strings/replace_re.pxd new file mode 100644 index 00000000000..e27ccd55f7d --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/replace_re.pxd @@ -0,0 +1,30 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar +from pylibcudf.strings.regex_flags cimport regex_flags +from pylibcudf.strings.regex_program cimport RegexProgram + +ctypedef fused Replacement: + Column + Scalar + +ctypedef fused Patterns: + RegexProgram + list + + +cpdef Column replace_re( + Column input, + Patterns patterns, + Replacement replacement=*, + size_type max_replace_count=*, + regex_flags flags=* +) + +cpdef Column replace_with_backrefs( + Column input, + RegexProgram prog, + str replacement +) diff --git a/python/pylibcudf/pylibcudf/strings/replace_re.pyx b/python/pylibcudf/pylibcudf/strings/replace_re.pyx new file mode 100644 index 00000000000..ccc33fd4425 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/replace_re.pyx @@ -0,0 +1,134 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from cython.operator cimport dereference +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string +from libcpp.utility cimport move +from libcpp.vector cimport vector +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) +from pylibcudf.libcudf.strings cimport replace_re as cpp_replace_re +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar +from pylibcudf.strings.regex_flags cimport regex_flags +from pylibcudf.strings.regex_program cimport RegexProgram + + +cpdef Column replace_re( + Column input, + Patterns patterns, + Replacement replacement=None, + size_type max_replace_count=-1, + regex_flags flags=regex_flags.DEFAULT, +): + """ + For each string, replaces any character sequence matching the given patterns + with the provided replacement. 
+ + For details, see :cpp:func:`cudf::strings::replace_re` + + Parameters + ---------- + input : Column + Strings instance for this operation. + patterns: RegexProgram or list[str] + If RegexProgram, the regex to match to each string. + If list[str], a list of regex strings to search within each string. + replacement : Scalar or Column + If Scalar, the string used to replace the matched sequence in each string. + ``patterns`` must be a RegexProgram. + If Column, the strings used for replacement. + ``patterns`` must be a list[str]. + max_replace_count : int + The maximum number of times to replace the matched pattern + within each string. ``patterns`` must be a RegexProgram. + Default replaces every substring that is matched. + flags : RegexFlags + Regex flags for interpreting special characters in the patterns. + ``patterns`` must be a list[str] + + Returns + ------- + Column + New strings column + """ + cdef unique_ptr[column] c_result + cdef vector[string] c_patterns + + if Patterns is RegexProgram and Replacement is Scalar: + if replacement is None: + replacement = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + with nogil: + c_result = move( + cpp_replace_re.replace_re( + input.view(), + patterns.c_obj.get()[0], + dereference((replacement.get())), + max_replace_count + ) + ) + + return Column.from_libcudf(move(c_result)) + elif Patterns is list and Replacement is Column: + c_patterns.reserve(len(patterns)) + for pattern in patterns: + c_patterns.push_back(pattern.encode()) + + with nogil: + c_result = move( + cpp_replace_re.replace_re( + input.view(), + c_patterns, + replacement.view(), + flags, + ) + ) + + return Column.from_libcudf(move(c_result)) + else: + raise TypeError("Must pass either a RegexProgram and a Scalar or a list") + + +cpdef Column replace_with_backrefs( + Column input, + RegexProgram prog, + str replacement +): + """ + For each string, replaces any character sequence matching the given regex + using the replacement template 
for back-references. + + For details, see :cpp:func:`cudf::strings::replace_with_backrefs` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + prog: RegexProgram + Regex program instance. + + replacement : str + The replacement template for creating the output string. + + Returns + ------- + Column + New strings column. + """ + cdef unique_ptr[column] c_result + cdef string c_replacement = replacement.encode() + + with nogil: + c_result = cpp_replace_re.replace_with_backrefs( + input.view(), + prog.c_obj.get()[0], + c_replacement, + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/common/utils.py b/python/pylibcudf/pylibcudf/tests/common/utils.py index 9f389fa42c4..d95849ef371 100644 --- a/python/pylibcudf/pylibcudf/tests/common/utils.py +++ b/python/pylibcudf/pylibcudf/tests/common/utils.py @@ -7,10 +7,11 @@ import numpy as np import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from pyarrow.orc import write_table as orc_write_table from pyarrow.parquet import write_table as pq_write_table + +import pylibcudf as plc from pylibcudf.io.types import CompressionType diff --git a/python/pylibcudf/pylibcudf/tests/conftest.py b/python/pylibcudf/pylibcudf/tests/conftest.py index fdce6f353ca..a19a8835498 100644 --- a/python/pylibcudf/pylibcudf/tests/conftest.py +++ b/python/pylibcudf/pylibcudf/tests/conftest.py @@ -8,8 +8,9 @@ import numpy as np import pyarrow as pa -import pylibcudf as plc import pytest + +import pylibcudf as plc from pylibcudf.io.types import CompressionType sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common")) diff --git a/python/pylibcudf/pylibcudf/tests/io/test_avro.py b/python/pylibcudf/pylibcudf/tests/io/test_avro.py index 0cd5064a697..3d9d99ffa61 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_avro.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_avro.py @@ -5,10 +5,11 @@ import fastavro import pyarrow as pa 
-import pylibcudf as plc import pytest from utils import assert_table_and_meta_eq +import pylibcudf as plc + avro_dtype_pairs = [ ("boolean", pa.bool_()), ("int", pa.int32()), diff --git a/python/pylibcudf/pylibcudf/tests/io/test_csv.py b/python/pylibcudf/pylibcudf/tests/io/test_csv.py index ab26f23418d..22c83acc47c 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_csv.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_csv.py @@ -5,9 +5,7 @@ import pandas as pd import pyarrow as pa -import pylibcudf as plc import pytest -from pylibcudf.io.types import CompressionType from utils import ( _convert_types, assert_table_and_meta_eq, @@ -15,6 +13,9 @@ write_source_str, ) +import pylibcudf as plc +from pylibcudf.io.types import CompressionType + # Shared kwargs to pass to make_source _COMMON_CSV_SOURCE_KWARGS = { "format": "csv", diff --git a/python/pylibcudf/pylibcudf/tests/io/test_json.py b/python/pylibcudf/pylibcudf/tests/io/test_json.py index 9d976fedf00..453e5ce32a8 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_json.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_json.py @@ -3,9 +3,7 @@ import pandas as pd import pyarrow as pa -import pylibcudf as plc import pytest -from pylibcudf.io.types import CompressionType from utils import ( assert_table_and_meta_eq, make_source, @@ -13,6 +11,9 @@ write_source_str, ) +import pylibcudf as plc +from pylibcudf.io.types import CompressionType + # Shared kwargs to pass to make_source _COMMON_JSON_SOURCE_KWARGS = {"format": "json", "orient": "records"} diff --git a/python/pylibcudf/pylibcudf/tests/io/test_orc.py b/python/pylibcudf/pylibcudf/tests/io/test_orc.py index 42b14b1feff..5ed660ba6cf 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_orc.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_orc.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa -import pylibcudf as plc import pytest from utils import _convert_types, assert_table_and_meta_eq, make_source +import pylibcudf as plc + # Shared kwargs to pass to make_source _COMMON_ORC_SOURCE_KWARGS = {"format": "orc"} diff --git a/python/pylibcudf/pylibcudf/tests/io/test_parquet.py b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py index f6e843ccf66..41298601539 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_parquet.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py @@ -1,9 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from pyarrow.parquet import read_table +from utils import assert_table_and_meta_eq, make_source + +import pylibcudf as plc from pylibcudf.expressions import ( ASTOperator, ColumnNameReference, @@ -11,7 +13,6 @@ Literal, Operation, ) -from utils import assert_table_and_meta_eq, make_source # Shared kwargs to pass to make_source _COMMON_PARQUET_SOURCE_KWARGS = {"format": "parquet"} diff --git a/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py b/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py index 747f58ec8cf..0c43c363e55 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py @@ -2,9 +2,10 @@ import io -import pylibcudf as plc import pytest +import pylibcudf as plc + @pytest.fixture(params=[plc.io.SourceInfo, plc.io.SinkInfo]) def io_class(request): diff --git a/python/pylibcudf/pylibcudf/tests/io/test_timezone.py b/python/pylibcudf/pylibcudf/tests/io/test_timezone.py index 76b0424b2af..b3555013927 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_timezone.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_timezone.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import zoneinfo -import pylibcudf as plc import pytest +import pylibcudf as plc + def test_make_timezone_transition_table(): if len(zoneinfo.TZPATH) == 0: diff --git a/python/pylibcudf/pylibcudf/tests/test_binaryops.py b/python/pylibcudf/pylibcudf/tests/test_binaryops.py index f784cb3c191..bbb08e8b95a 100644 --- a/python/pylibcudf/pylibcudf/tests/test_binaryops.py +++ b/python/pylibcudf/pylibcudf/tests/test_binaryops.py @@ -4,10 +4,11 @@ import numpy as np import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + def idfn(param): ltype, rtype, outtype, plc_op, _ = param diff --git a/python/pylibcudf/pylibcudf/tests/test_column_factories.py b/python/pylibcudf/pylibcudf/tests/test_column_factories.py index 8cedbc6d42f..e317362a76b 100644 --- a/python/pylibcudf/pylibcudf/tests/test_column_factories.py +++ b/python/pylibcudf/pylibcudf/tests/test_column_factories.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import DEFAULT_STRUCT_TESTING_TYPE, assert_column_eq +import pylibcudf as plc + EMPTY_COL_SIZE = 3 NUMERIC_TYPES = [ diff --git a/python/pylibcudf/pylibcudf/tests/test_column_from_device.py b/python/pylibcudf/pylibcudf/tests/test_column_from_device.py index 0e129fdf0ef..24cd6b9e35f 100644 --- a/python/pylibcudf/pylibcudf/tests/test_column_from_device.py +++ b/python/pylibcudf/pylibcudf/tests/test_column_from_device.py @@ -1,12 +1,13 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq import rmm +import pylibcudf as plc + VALID_TYPES = [ pa.int8(), pa.int16(), diff --git a/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py index 7a5c1664eed..6d8b5993964 100644 --- a/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py +++ b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_table_eq +import pylibcudf as plc + param_pyarrow_tables = [ pa.table([]), pa.table({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}), diff --git a/python/pylibcudf/pylibcudf/tests/test_copying.py b/python/pylibcudf/pylibcudf/tests/test_copying.py index 628682d0a66..c0a41b96b1a 100644 --- a/python/pylibcudf/pylibcudf/tests/test_copying.py +++ b/python/pylibcudf/pylibcudf/tests/test_copying.py @@ -2,7 +2,6 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import ( DEFAULT_STRUCT_TESTING_TYPE, @@ -16,6 +15,8 @@ metadata_from_arrow_type, ) +import pylibcudf as plc + # TODO: consider moving this to conftest and "pairing" # it with pa_type, so that they don't get out of sync diff --git a/python/pylibcudf/pylibcudf/tests/test_datetime.py b/python/pylibcudf/pylibcudf/tests/test_datetime.py index 75930d59058..a80ab8d9f65 100644 --- a/python/pylibcudf/pylibcudf/tests/test_datetime.py +++ b/python/pylibcudf/pylibcudf/tests/test_datetime.py @@ -4,10 +4,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module", params=["s", "ms", "us", "ns"]) def datetime_column(has_nulls, request): diff --git a/python/pylibcudf/pylibcudf/tests/test_expressions.py 
b/python/pylibcudf/pylibcudf/tests/test_expressions.py index 5894ef4624c..6eabd6db617 100644 --- a/python/pylibcudf/pylibcudf/tests/test_expressions.py +++ b/python/pylibcudf/pylibcudf/tests/test_expressions.py @@ -1,8 +1,9 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest +import pylibcudf as plc + # We can't really evaluate these expressions, so just make sure # construction works properly diff --git a/python/pylibcudf/pylibcudf/tests/test_interop.py b/python/pylibcudf/pylibcudf/tests/test_interop.py index 01c998f16d4..af80b6e5978 100644 --- a/python/pylibcudf/pylibcudf/tests/test_interop.py +++ b/python/pylibcudf/pylibcudf/tests/test_interop.py @@ -1,8 +1,12 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +import cupy as cp +import numpy as np import pyarrow as pa -import pylibcudf as plc import pytest +from utils import assert_table_eq + +import pylibcudf as plc def test_list_dtype_roundtrip(): @@ -66,3 +70,31 @@ def test_decimal_other(data_type): arrow_type = plc.interop.to_arrow(data_type, precision=precision) assert arrow_type == pa.decimal128(precision, 0) + + +def test_round_trip_dlpack_plc_table(): + expected = pa.table({"a": [1, 2, 3], "b": [5, 6, 7]}) + plc_table = plc.interop.from_arrow(expected) + result = plc.interop.from_dlpack(plc.interop.to_dlpack(plc_table)) + assert_table_eq(expected, result) + + +@pytest.mark.parametrize("array", [np.array, cp.array]) +def test_round_trip_dlpack_array(array): + arr = array([1, 2, 3]) + result = plc.interop.from_dlpack(arr.__dlpack__()) + expected = pa.table({"a": [1, 2, 3]}) + assert_table_eq(expected, result) + + +def test_to_dlpack_error(): + plc_table = plc.interop.from_arrow( + pa.table({"a": [1, None, 3], "b": [5, 6, 7]}) + ) + with pytest.raises(ValueError, match="Cannot create a DLPack tensor"): + plc.interop.from_dlpack(plc.interop.to_dlpack(plc_table)) + + +def test_from_dlpack_error(): + with pytest.raises(ValueError, match="Invalid PyCapsule 
object"): + plc.interop.from_dlpack(1) diff --git a/python/pylibcudf/pylibcudf/tests/test_join.py b/python/pylibcudf/pylibcudf/tests/test_join.py index 61e02f4d28d..f43a56046a4 100644 --- a/python/pylibcudf/pylibcudf/tests/test_join.py +++ b/python/pylibcudf/pylibcudf/tests/test_join.py @@ -2,9 +2,10 @@ import numpy as np import pyarrow as pa -import pylibcudf as plc from utils import assert_table_eq +import pylibcudf as plc + def test_cross_join(): left = pa.Table.from_arrays([[0, 1, 2], [3, 4, 5]], names=["a", "b"]) diff --git a/python/pylibcudf/pylibcudf/tests/test_json.py b/python/pylibcudf/pylibcudf/tests/test_json.py index 3d2955211f8..486a9524e92 100644 --- a/python/pylibcudf/pylibcudf/tests/test_json.py +++ b/python/pylibcudf/pylibcudf/tests/test_json.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def plc_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_labeling.py b/python/pylibcudf/pylibcudf/tests/test_labeling.py index f7fb7463b50..beacfc63ce5 100644 --- a/python/pylibcudf/pylibcudf/tests/test_labeling.py +++ b/python/pylibcudf/pylibcudf/tests/test_labeling.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa -import pylibcudf as plc import pytest +import pylibcudf as plc + @pytest.mark.parametrize("left_inclusive", [True, False]) @pytest.mark.parametrize("right_inclusive", [True, False]) diff --git a/python/pylibcudf/pylibcudf/tests/test_lists.py b/python/pylibcudf/pylibcudf/tests/test_lists.py index 2353a6ff8f9..f3ef555f11d 100644 --- a/python/pylibcudf/pylibcudf/tests/test_lists.py +++ b/python/pylibcudf/pylibcudf/tests/test_lists.py @@ -3,10 +3,11 @@ import numpy as np import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture def test_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_null_mask.py b/python/pylibcudf/pylibcudf/tests/test_null_mask.py index 3edcae59edc..cd3da856de2 100644 --- a/python/pylibcudf/pylibcudf/tests/test_null_mask.py +++ b/python/pylibcudf/pylibcudf/tests/test_null_mask.py @@ -1,12 +1,13 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest -from pylibcudf.null_mask import MaskState import rmm +import pylibcudf as plc +from pylibcudf.null_mask import MaskState + @pytest.fixture(params=[False, True]) def nullable(request): diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py index 7d93c471cc4..8b14e0db576 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def edit_distance_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py index 5cf9874d595..fae4685f81b 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def input_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py index d5a168426b1..05fe7b53c16 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def input_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py index 4e389a63f90..ead9ee094af 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa -import pylibcudf as plc import pytest +import pylibcudf as plc + @pytest.fixture(scope="module", params=[pa.uint32(), pa.uint64()]) def minhash_input_data(request): diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py index 283a009288d..84748b5597e 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def input_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py index fe28b83c09a..25b6d1389ec 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def norm_spaces_input_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py new file mode 100644 index 00000000000..65687f31c85 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py @@ -0,0 +1,64 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture(scope="module") +def input_col(): + arr = ["the quick", "brown fox", "jumps*over the", "lazy dog"] + return pa.array(arr) + + +@pytest.fixture(scope="module") +def targets(): + arr = ["the quick", "brown fox", "jumps*over the", "lazy dog"] + return pa.array(arr) + + +@pytest.mark.parametrize("delim", ["*", None]) +def test_replace_tokens(input_col, targets, delim): + replacements = pa.array(["slow", "cat", "looked", "rat"]) + result = plc.nvtext.replace.replace_tokens( + plc.interop.from_arrow(input_col), + plc.interop.from_arrow(targets), + plc.interop.from_arrow(replacements), + plc.interop.from_arrow(pa.scalar(delim)) if delim else None, + ) + expected = pa.array(["slow", "cat", "jumps*over the", "rat"]) + if not delim: + expected = pa.array( + ["the quick", "brown fox", "jumps*over the", "lazy dog"] + ) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("min_token_length", [4, 5]) +@pytest.mark.parametrize("replace", ["---", None]) +@pytest.mark.parametrize("delim", ["*", None]) +def test_filter_tokens(input_col, min_token_length, replace, delim): + result = plc.nvtext.replace.filter_tokens( + plc.interop.from_arrow(input_col), + min_token_length, + plc.interop.from_arrow(pa.scalar(replace)) if replace else None, + plc.interop.from_arrow(pa.scalar(delim)) if delim else None, + ) + expected = pa.array( + ["the quick", "brown fox", "jumps*over the", "lazy dog"] + ) + if not delim and not replace and min_token_length == 4: + expected = pa.array([" quick", "brown ", "jumps*over ", "lazy "]) + if not delim and not replace and min_token_length == 5: + expected = pa.array([" quick", "brown ", "jumps*over ", " "]) + if not delim and replace == "---" and min_token_length == 4: + expected = pa.array( + ["--- quick", "brown ---", "jumps*over ---", "lazy ---"] + ) + if not delim and replace == "---" and min_token_length == 5: + expected 
= pa.array( + ["--- quick", "brown ---", "jumps*over ---", "--- ---"] + ) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py new file mode 100644 index 00000000000..e7f4a971f08 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py @@ -0,0 +1,48 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture(scope="module") +def input_col(): + arr = ["trouble", "toy", "syzygy"] + return pa.array(arr) + + +@pytest.mark.parametrize("check_vowels", [True, False]) +@pytest.mark.parametrize("indices", [[3, 1, 4], 1]) +def test_is_letter(input_col, check_vowels, indices): + def is_letter(s, i, check): + vowels = "aeiouy" + return (s[i] in vowels) == check + + result = plc.nvtext.stemmer.is_letter( + plc.interop.from_arrow(input_col), + check_vowels, + plc.interop.from_arrow(pa.array(indices)) + if isinstance(indices, list) + else indices, + ) + expected = pa.array( + [ + is_letter( + s, + indices[i] if isinstance(indices, list) else indices, + check_vowels, + ) + for i, s in enumerate(input_col.to_pylist()) + ] + ) + assert_column_eq(result, expected) + + +def test_porter_stemmer_measure(input_col): + result = plc.nvtext.stemmer.porter_stemmer_measure( + plc.interop.from_arrow(input_col), + ) + expected = pa.array([1, 1, 2], type=pa.int32()) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_partitioning.py b/python/pylibcudf/pylibcudf/tests/test_partitioning.py index 444d0089d2c..c55e54cebc6 100644 --- a/python/pylibcudf/pylibcudf/tests/test_partitioning.py +++ b/python/pylibcudf/pylibcudf/tests/test_partitioning.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_table_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def partitioning_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_quantiles.py b/python/pylibcudf/pylibcudf/tests/test_quantiles.py index bac56691306..e4a24fb1c98 100644 --- a/python/pylibcudf/pylibcudf/tests/test_quantiles.py +++ b/python/pylibcudf/pylibcudf/tests/test_quantiles.py @@ -3,10 +3,11 @@ import numpy as np import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq, assert_table_eq +import pylibcudf as plc + # Map pylibcudf interpolation options to pyarrow options interp_mapping = { plc.types.Interpolation.LINEAR: "linear", diff --git a/python/pylibcudf/pylibcudf/tests/test_regex_program.py b/python/pylibcudf/pylibcudf/tests/test_regex_program.py index 777315df538..52598f2c462 100644 --- a/python/pylibcudf/pylibcudf/tests/test_regex_program.py +++ b/python/pylibcudf/pylibcudf/tests/test_regex_program.py @@ -1,8 +1,9 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -import pylibcudf as plc import pytest +import pylibcudf as plc + @pytest.mark.parametrize("pat", ["(", "*", "\\"]) def test_regex_program_invalid(pat): diff --git a/python/pylibcudf/pylibcudf/tests/test_reshape.py b/python/pylibcudf/pylibcudf/tests/test_reshape.py index 01115bc363a..ef23e23766a 100644 --- a/python/pylibcudf/pylibcudf/tests/test_reshape.py +++ b/python/pylibcudf/pylibcudf/tests/test_reshape.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq, assert_table_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def reshape_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_round.py b/python/pylibcudf/pylibcudf/tests/test_round.py index 0b30316b9a0..2526580bc13 100644 --- a/python/pylibcudf/pylibcudf/tests/test_round.py +++ b/python/pylibcudf/pylibcudf/tests/test_round.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(params=["float32", "float64"]) def column(request, has_nulls): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_attributes.py b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py index a1820def0b1..f461657281a 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_attributes.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture() def str_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py b/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py index 176ccc55b96..3e31c75c38a 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def str_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_case.py b/python/pylibcudf/pylibcudf/tests/test_string_case.py index 233cc253b14..08ac371fd96 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_case.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_case.py @@ -2,10 
+2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def string_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_char_types.py b/python/pylibcudf/pylibcudf/tests/test_string_char_types.py index bcd030c019e..06b44210d74 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_char_types.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_char_types.py @@ -2,9 +2,10 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_all_characters_of_type(): pa_array = pa.array(["1", "A"]) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_combine.py b/python/pylibcudf/pylibcudf/tests/test_string_combine.py index 4a7007a0d6b..eea3ac68e84 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_combine.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_combine.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + def test_concatenate_scalar_seperator(): plc_table = plc.interop.from_arrow( diff --git a/python/pylibcudf/pylibcudf/tests/test_string_contains.py b/python/pylibcudf/pylibcudf/tests/test_string_contains.py index 4e4dd7cbb00..ba9a4a7d3b8 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_contains.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_contains.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def target_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert.py b/python/pylibcudf/pylibcudf/tests/test_string_convert.py index 69f7a0fdd33..3f3f452c4f6 100644 --- 
a/python/pylibcudf/pylibcudf/tests/test_string_convert.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture( scope="module", diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py index 117c59ff1b8..b391d2b290e 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_to_booleans(): pa_array = pa.array(["true", None, "True"]) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py index f3e84286a36..c9368d858a4 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py @@ -3,10 +3,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture def fmt(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py index 6d704309bfd..2d3578e4e71 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py @@ -3,10 +3,11 @@ from datetime import datetime, timedelta import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture( params=[ diff --git 
a/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py index b1c4d729604..012e722038e 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py @@ -2,9 +2,10 @@ import decimal import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_to_fixed_point(): typ = pa.decimal128(38, 2) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py index e9918fab559..8ee2b5075af 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_to_floats(): typ = pa.float32() diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py index 6d1d565af30..01192c2d1f8 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py @@ -1,8 +1,9 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_to_integers(): typ = pa.int8() diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py index 4dc3e512624..b533809f106 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py @@ -1,8 +1,9 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_ipv4_to_integers(): arr = pa.array(["123.45.67.890", None]) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py index 8591732b39e..737036a4f0f 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.mark.parametrize("na_rep", [None, pa.scalar("")]) @pytest.mark.parametrize("separators", [None, pa.array([",", "[", "]"])]) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py index fee8c3fb8f6..528736798c7 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py @@ -2,9 +2,10 @@ import urllib import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_url_encode(): data = ["/home/nfs", None] diff --git a/python/pylibcudf/pylibcudf/tests/test_string_extract.py b/python/pylibcudf/pylibcudf/tests/test_string_extract.py index 788b86423c4..e70edf4fb33 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_extract.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_extract.py @@ -2,6 +2,7 @@ import pyarrow as pa import pyarrow.compute as pc + import pylibcudf as plc diff --git a/python/pylibcudf/pylibcudf/tests/test_string_find.py b/python/pylibcudf/pylibcudf/tests/test_string_find.py index db3b13a5aae..82ec18832a9 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_find.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_find.py @@ -2,10 +2,11 @@ import pyarrow as pa 
import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def data_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py b/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py index d6b37a388f0..fa9eee3594b 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_find_multiple(): arr = pa.array(["abc", "def"]) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_findall.py b/python/pylibcudf/pylibcudf/tests/test_string_findall.py index debfad92d00..b73d812c898 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_findall.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_findall.py @@ -2,9 +2,10 @@ import re import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_findall(): arr = pa.array(["bunny", "rabbit", "hare", "dog"]) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_padding.py b/python/pylibcudf/pylibcudf/tests/test_string_padding.py index 2ba775d17ae..79498132097 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_padding.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_padding.py @@ -2,6 +2,7 @@ import pyarrow as pa import pyarrow.compute as pc + import pylibcudf as plc diff --git a/python/pylibcudf/pylibcudf/tests/test_string_repeat.py b/python/pylibcudf/pylibcudf/tests/test_string_repeat.py index 18b5d8bf4d0..c06c06be7c6 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_repeat.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_repeat.py @@ -2,9 +2,10 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest +import pylibcudf 
as plc + @pytest.mark.parametrize("repeats", [pa.array([2, 2]), 2]) def test_repeat_strings(repeats): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_replace.py b/python/pylibcudf/pylibcudf/tests/test_string_replace.py index 5a9c2007b73..2c7d25133de 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_replace.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_replace.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def data_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_replace_re.py b/python/pylibcudf/pylibcudf/tests/test_string_replace_re.py new file mode 100644 index 00000000000..511f826441a --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_replace_re.py @@ -0,0 +1,72 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pyarrow.compute as pc +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.mark.parametrize("max_replace_count", [-1, 1]) +def test_replace_re_regex_program_scalar(max_replace_count): + arr = pa.array(["foo", "fuz", None]) + pat = "f." 
+ repl = "ba" + result = plc.strings.replace_re.replace_re( + plc.interop.from_arrow(arr), + plc.strings.regex_program.RegexProgram.create( + pat, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + plc.interop.from_arrow(pa.scalar(repl)), + max_replace_count=max_replace_count, + ) + expected = pc.replace_substring_regex( + arr, + pat, + repl, + max_replacements=max_replace_count + if max_replace_count != -1 + else None, + ) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize( + "flags", + [ + plc.strings.regex_flags.RegexFlags.DEFAULT, + plc.strings.regex_flags.RegexFlags.DOTALL, + ], +) +def test_replace_re_list_str_columns(flags): + arr = pa.array(["foo", "fuz", None]) + pats = ["oo", "uz"] + repls = ["a", "b"] + result = plc.strings.replace_re.replace_re( + plc.interop.from_arrow(arr), + pats, + plc.interop.from_arrow(pa.array(repls)), + flags=flags, + ) + expected = arr + for pat, repl in zip(pats, repls): + expected = pc.replace_substring_regex( + expected, + pat, + repl, + ) + assert_column_eq(result, expected) + + +def test_replace_with_backrefs(): + arr = pa.array(["Z756", None]) + result = plc.strings.replace_re.replace_with_backrefs( + plc.interop.from_arrow(arr), + plc.strings.regex_program.RegexProgram.create( + "(\\d)(\\d)", plc.strings.regex_flags.RegexFlags.DEFAULT + ), + "V\\2\\1", + ) + expected = pa.array(["ZV576", None]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_slice.py b/python/pylibcudf/pylibcudf/tests/test_string_slice.py index d9ce5591b98..1759f739e31 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_slice.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_slice.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def pa_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py b/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py index 80cae8d1c6b..4e80f19b814 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_table_eq +import pylibcudf as plc + @pytest.fixture def data_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_split_split.py b/python/pylibcudf/pylibcudf/tests/test_string_split_split.py index 2aeffac8209..450b336ce65 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_split_split.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_split_split.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq, assert_table_eq +import pylibcudf as plc + @pytest.fixture def data_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_strip.py b/python/pylibcudf/pylibcudf/tests/test_string_strip.py index 005e5e4a405..5869e5f4920 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_strip.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_strip.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + data_strings = [ "AbC", "123abc", diff --git a/python/pylibcudf/pylibcudf/tests/test_string_translate.py b/python/pylibcudf/pylibcudf/tests/test_string_translate.py index 2ae893e69fb..84fd3354ac6 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_translate.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_translate.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture def data_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_wrap.py b/python/pylibcudf/pylibcudf/tests/test_string_wrap.py index a1c820cd586..00442d866e9 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_wrap.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_wrap.py @@ -2,9 +2,10 @@ import textwrap import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_wrap(): width = 12 diff --git a/python/pylibcudf/pylibcudf/tests/test_table.py b/python/pylibcudf/pylibcudf/tests/test_table.py index e822d6a97a8..ac39ef4c5c9 100644 --- a/python/pylibcudf/pylibcudf/tests/test_table.py +++ b/python/pylibcudf/pylibcudf/tests/test_table.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa -import pylibcudf as plc import pytest +import pylibcudf as plc + @pytest.mark.parametrize( "arrow_tbl", diff --git a/python/pylibcudf/pylibcudf/tests/test_transform.py b/python/pylibcudf/pylibcudf/tests/test_transform.py index d5c618f07e4..49802fe64ac 100644 --- a/python/pylibcudf/pylibcudf/tests/test_transform.py +++ b/python/pylibcudf/pylibcudf/tests/test_transform.py @@ -3,9 +3,10 @@ import math import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_nans_to_nulls(has_nans): if has_nans: diff --git a/python/pylibcudf/pylibcudf/tests/test_transpose.py b/python/pylibcudf/pylibcudf/tests/test_transpose.py index ac11123f680..b0c0bc72ead 100644 --- a/python/pylibcudf/pylibcudf/tests/test_transpose.py +++ b/python/pylibcudf/pylibcudf/tests/test_transpose.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from packaging.version import parse +import pylibcudf as plc + @pytest.mark.skipif( parse(pa.__version__) < parse("16.0.0"), diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index ea5b3065896..a80c85a1fa8 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -53,48 +53,20 @@ test = [ Homepage = "https://github.com/rapidsai/cudf" Documentation = "https://docs.rapids.ai/api/cudf/stable/" -[tool.isort] -line_length = 79 -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -combine_as_imports = true -order_by_type = true -known_dask = [ - "dask", - "distributed", - "dask_cuda", -] -known_rapids = [ - "rmm", -] -known_first_party = [ - "cudf", -] -default_section = "THIRDPARTY" -sections = [ - "FUTURE", - "STDLIB", - "THIRDPARTY", - "DASK", - "RAPIDS", - "FIRSTPARTY", - "LOCALFOLDER", -] -skip = [ - "thirdparty", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".tox", - ".venv", - "_build", - "buck-out", - "build", - "dist", - 
"__init__.py", -] +[tool.ruff] +extend = "../../pyproject.toml" + +[tool.ruff.lint.isort] +combine-as-imports = true +known-first-party = ["cudf"] +section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"] + +[tool.ruff.lint.isort.sections] +dask = ["dask", "distributed", "dask_cuda"] +rapids = ["rmm"] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["E402", "F401"] [tool.pytest.ini_options] # --import-mode=importlib because two test_json.py exists and tests directory is not a structured module