diff --git a/.devcontainer/cuda12.2-conda/devcontainer.json b/.devcontainer/cuda12.5-conda/devcontainer.json similarity index 91% rename from .devcontainer/cuda12.2-conda/devcontainer.json rename to .devcontainer/cuda12.5-conda/devcontainer.json index 05bf9173d25..fadce01d060 100644 --- a/.devcontainer/cuda12.2-conda/devcontainer.json +++ b/.devcontainer/cuda12.5-conda/devcontainer.json @@ -3,7 +3,7 @@ "context": "${localWorkspaceFolder}/.devcontainer", "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", "args": { - "CUDA": "12.2", + "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "conda", "BASE": "rapidsai/devcontainers:24.08-cpp-mambaforge-ubuntu22.04" } @@ -11,7 +11,7 @@ "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.2-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.5-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { @@ -20,7 +20,7 @@ "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.2-envs}"], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.5-envs}"], "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cudf,type=bind,consistency=consistent", @@ -29,7 +29,7 @@ "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.2-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.5-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.2-pip/devcontainer.json b/.devcontainer/cuda12.5-pip/devcontainer.json similarity index 87% rename from .devcontainer/cuda12.2-pip/devcontainer.json rename to .devcontainer/cuda12.5-pip/devcontainer.json index 74420214726..026eb540952 100644 --- a/.devcontainer/cuda12.2-pip/devcontainer.json +++ b/.devcontainer/cuda12.5-pip/devcontainer.json @@ -3,15 +3,15 @@ "context": "${localWorkspaceFolder}/.devcontainer", "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", "args": { - "CUDA": "12.2", + "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.08-cpp-cuda12.2-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.08-cpp-cuda12.5-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.2-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.5-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { @@ -20,7 +20,7 @@ "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - "initializeCommand": ["/bin/bash", "-c", 
"mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.2-venvs}"], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.5-venvs}"], "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cudf,type=bind,consistency=consistent", @@ -28,7 +28,7 @@ "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.2-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.5-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" ], "customizations": { "vscode": { diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index c5679cc5141..2e5959338b0 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -108,6 +108,28 @@ jobs: sha: ${{ inputs.sha }} date: ${{ inputs.date }} package-name: dask_cudf + wheel-build-cudf-polars: + needs: wheel-publish-cudf + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + with: + # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
+ matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + script: ci/build_wheel_cudf_polars.sh + wheel-publish-cudf-polars: + needs: wheel-build-cudf-polars + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08 + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + package-name: cudf_polars trigger-pandas-tests: if: inputs.build_type == 'nightly' needs: wheel-build-cudf diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml index a8643923a4d..5a937b2f362 100644 --- a/.github/workflows/pandas-tests.yaml +++ b/.github/workflows/pandas-tests.yaml @@ -19,7 +19,7 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: - matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and .CUDA_VER == "12.2.2" )) + matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) )) build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index a35802f2ab0..d5dfc9e1ff5 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -25,7 +25,8 @@ jobs: - docs-build - wheel-build-cudf - wheel-tests-cudf - - test-cudf-polars + - wheel-build-cudf-polars + - wheel-tests-cudf-polars - wheel-build-dask-cudf - wheel-tests-dask-cudf - devcontainer @@ -133,9 +134,18 @@ jobs: with: build_type: pull-request script: ci/test_wheel_cudf.sh - test-cudf-polars: + wheel-build-cudf-polars: needs: wheel-build-cudf secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08 + with: + # This selects "ARCH=amd64 + the latest supported Python + CUDA". + matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) + build_type: pull-request + script: "ci/build_wheel_cudf_polars.sh" + wheel-tests-cudf-polars: + needs: wheel-build-cudf-polars + secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
@@ -143,7 +153,7 @@ jobs: build_type: pull-request # This always runs, but only fails if this PR touches code in # pylibcudf or cudf_polars - script: "ci/test_cudf_polars.sh" + script: "ci/test_wheel_cudf_polars.sh" wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit @@ -167,7 +177,7 @@ jobs: uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.08 with: arch: '["amd64"]' - cuda: '["12.2"]' + cuda: '["12.5"]' build_command: | sccache -z; build-all -DBUILD_BENCHMARKS=ON --verbose; @@ -186,7 +196,7 @@ jobs: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08 with: - matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and .CUDA_VER == "12.2.2" )) + matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) )) build_type: pull-request script: ci/cudf_pandas_scripts/pandas-tests/run.sh pr # Hide test failures because they exceed the GITHUB_STEP_SUMMARY output limit. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4fbc28fa6e1..f9cdde7c2b7 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -104,7 +104,7 @@ Instructions for a minimal build environment without conda are included below. # create the conda environment (assuming in base `cudf` directory) # note: RAPIDS currently doesn't support `channel_priority: strict`; # use `channel_priority: flexible` instead -conda env create --name cudf_dev --file conda/environments/all_cuda-122_arch-x86_64.yaml +conda env create --name cudf_dev --file conda/environments/all_cuda-125_arch-x86_64.yaml # activate the environment conda activate cudf_dev ``` diff --git a/README.md b/README.md index 17d2df9a936..1ab6a2d7457 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ cuDF can be installed with conda (via [miniconda](https://docs.conda.io/projects ```bash conda install -c rapidsai -c conda-forge -c nvidia \ - cudf=24.08 python=3.11 cuda-version=12.2 + cudf=24.08 python=3.11 cuda-version=12.5 ``` We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD diff --git a/ci/build_wheel_cudf_polars.sh b/ci/build_wheel_cudf_polars.sh new file mode 100755 index 00000000000..9c945e11c00 --- /dev/null +++ b/ci/build_wheel_cudf_polars.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# Copyright (c) 2023-2024, NVIDIA CORPORATION. + +set -euo pipefail + +package_dir="python/cudf_polars" + +./ci/build_wheel.sh ${package_dir} + +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 ${package_dir}/dist diff --git a/ci/run_cudf_polars_pytests.sh b/ci/run_cudf_polars_pytests.sh new file mode 100755 index 00000000000..c10612a065a --- /dev/null +++ b/ci/run_cudf_polars_pytests.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +# It is essential to cd into python/cudf_polars as `pytest-xdist` + `coverage` seem to work only at this directory level. 
+ +# Support invoking run_cudf_polars_pytests.sh outside the script directory +cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cudf_polars/ + +python -m pytest --cache-clear "$@" tests diff --git a/ci/test_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh similarity index 67% rename from ci/test_cudf_polars.sh rename to ci/test_wheel_cudf_polars.sh index 95fb4b431bf..900acd5d473 100755 --- a/ci/test_cudf_polars.sh +++ b/ci/test_wheel_cudf_polars.sh @@ -18,19 +18,14 @@ else fi RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist +RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist -RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"} -RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/ -mkdir -p "${RAPIDS_TESTS_DIR}" - -rapids-logger "Install cudf wheel" -# echo to expand wildcard before adding `[extra]` requires for pip -python -m pip install $(echo ./dist/cudf*.whl)[test] +# Download the cudf built in the previous step +RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep +python -m pip install ./local-cudf-dep/cudf*.whl rapids-logger "Install cudf_polars" -python -m pip install 'polars>=1.0' -python -m pip install --no-deps python/cudf_polars +python -m pip install $(echo ./dist/cudf_polars*.whl)[test] rapids-logger "Run cudf_polars tests" @@ -42,13 +37,11 @@ EXITCODE=0 trap set_exitcode ERR set +e -python -m pytest \ - --cache-clear \ +./ci/run_cudf_polars_pytests.sh \ --cov cudf_polars \ --cov-fail-under=100 \ - --cov-config=python/cudf_polars/pyproject.toml \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf_polars.xml" \ - python/cudf_polars/tests + --cov-config=./pyproject.toml \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-polars.xml" trap ERR set -e diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index 2b20b9d9ce4..c3800d3cc25 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -8,7 +8,7 @@ RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE=" # Download the cudf built in the previous step RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep -python -m pip install --no-deps ./local-cudf-dep/cudf*.whl +python -m pip install ./local-cudf-dep/cudf*.whl # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install $(echo ./dist/dask_cudf*.whl)[test] diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index cc9238ab80a..b8d73a01f96 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -15,7 +15,7 @@ dependencies: - cachetools - clang-tools=16.0.6 - clang==16.0.6 -- cmake>=3.26.4 +- cmake>=3.26.4,!=3.30.0 - cramjam - cubinlinker - cuda-nvtx=11.8 diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml similarity index 96% rename from conda/environments/all_cuda-122_arch-x86_64.yaml rename to conda/environments/all_cuda-125_arch-x86_64.yaml index 9fecd452248..3f5fae49cbb 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -15,7 +15,7 @@ dependencies: - cachetools - clang-tools=16.0.6 - clang==16.0.6 -- cmake>=3.26.4 +- cmake>=3.26.4,!=3.30.0 - cramjam - 
cuda-cudart-dev - cuda-nvcc @@ -23,7 +23,7 @@ dependencies: - cuda-nvtx-dev - cuda-python>=12.0,<13.0a0 - cuda-sanitizer-api -- cuda-version=12.2 +- cuda-version=12.5 - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 @@ -96,4 +96,4 @@ dependencies: - zlib>=1.2.13 - pip: - git+https://github.com/python-streamz/streamz.git@master -name: all_cuda-122_arch-x86_64 +name: all_cuda-125_arch-x86_64 diff --git a/conda/recipes/cudf/conda_build_config.yaml b/conda/recipes/cudf/conda_build_config.yaml index d399e440edd..af894cccda0 100644 --- a/conda/recipes/cudf/conda_build_config.yaml +++ b/conda/recipes/cudf/conda_build_config.yaml @@ -11,7 +11,7 @@ c_stdlib_version: - "2.17" cmake_version: - - ">=3.26.4" + - ">=3.26.4,!=3.30.0" cuda_compiler: - cuda-nvcc diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 3cdc2050631..9137f099ad1 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -64,6 +64,7 @@ requirements: - rapids-build-backend >=0.3.0,<0.4.0.dev0 - scikit-build-core >=0.7.0 - dlpack >=0.8,<1.0 + # TODO: Change to `2.0` for NumPy 2 - numpy 1.23 - pyarrow ==16.1.0.* - libcudf ={{ version }} @@ -82,7 +83,8 @@ requirements: - pandas >=2.0,<2.2.3dev0 - cupy >=12.0.0 - numba >=0.57 - - {{ pin_compatible('numpy', max_pin='x') }} + # TODO: Update `numpy` in `host` when dropping `<2.0a0` + - numpy >=1.23,<2.0a0 - {{ pin_compatible('pyarrow', max_pin='x.x') }} - libcudf ={{ version }} - {{ pin_compatible('rmm', max_pin='x.x') }} diff --git a/conda/recipes/cudf_kafka/conda_build_config.yaml b/conda/recipes/cudf_kafka/conda_build_config.yaml index d399e440edd..af894cccda0 100644 --- a/conda/recipes/cudf_kafka/conda_build_config.yaml +++ b/conda/recipes/cudf_kafka/conda_build_config.yaml @@ -11,7 +11,7 @@ c_stdlib_version: - "2.17" cmake_version: - - ">=3.26.4" + - ">=3.26.4,!=3.30.0" cuda_compiler: - cuda-nvcc diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index c01178bf732..4f99411e978 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -17,7 +17,7 @@ c_stdlib_version: - "2.17" cmake_version: - - ">=3.26.4" + - ">=3.26.4,!=3.30.0" libarrow_version: - "==16.1.0" diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2811711d58c..65347bd6689 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -216,6 +216,8 @@ include(cmake/thirdparty/get_fmt.cmake) include(cmake/thirdparty/get_spdlog.cmake) # find nanoarrow include(cmake/thirdparty/get_nanoarrow.cmake) +# find thread_pool +include(cmake/thirdparty/get_thread_pool.cmake) # Workaround until https://github.com/rapidsai/rapids-cmake/issues/176 is resolved if(NOT BUILD_SHARED_LIBS) @@ -409,6 +411,7 @@ add_library( src/io/orc/stripe_init.cu src/datetime/timezone.cpp src/io/orc/writer_impl.cu + src/io/parquet/arrow_schema_writer.cpp src/io/parquet/compact_protocol_reader.cpp src/io/parquet/compact_protocol_writer.cpp src/io/parquet/decode_preprocess.cu @@ -425,6 +428,7 @@ add_library( src/io/parquet/reader_impl_helpers.cpp src/io/parquet/reader_impl_preprocess.cu src/io/parquet/writer_impl.cu + src/io/parquet/writer_impl_helpers.cpp src/io/parquet/decode_fixed.cu src/io/statistics/orc_column_statistics.cu src/io/statistics/parquet_column_statistics.cu @@ -802,7 +806,7 @@ add_dependencies(cudf jitify_preprocess_run) # Specify the target module library dependencies target_link_libraries( cudf - PUBLIC ${ARROW_LIBRARIES} CCCL::CCCL rmm::rmm + PUBLIC ${ARROW_LIBRARIES} 
CCCL::CCCL rmm::rmm $ PRIVATE $ cuco::cuco ZLIB::ZLIB nvcomp::nvcomp kvikio::kvikio $ nanoarrow ) @@ -925,6 +929,11 @@ if(CUDF_BUILD_STREAMS_TEST_UTIL) add_library( ${_tgt} SHARED src/utilities/stacktrace.cpp tests/utilities/identify_stream_usage.cpp ) + if(CUDF_USE_PER_THREAD_DEFAULT_STREAM) + target_compile_definitions( + ${_tgt} PUBLIC CUDA_API_PER_THREAD_DEFAULT_STREAM CUDF_USE_PER_THREAD_DEFAULT_STREAM + ) + endif() set_target_properties( ${_tgt} diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index a5b248135c1..ff431c7f260 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -231,8 +231,8 @@ ConfigureBench( ) ConfigureNVBench( - GROUPBY_NVBENCH groupby/group_max.cpp groupby/group_nunique.cpp groupby/group_rank.cpp - groupby/group_struct_keys.cpp + GROUPBY_NVBENCH groupby/group_max.cpp groupby/group_max_multithreaded.cpp + groupby/group_nunique.cpp groupby/group_rank.cpp groupby/group_struct_keys.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/copying/gather.cu b/cpp/benchmarks/copying/gather.cu index eeb0149fb3a..985166f7298 100644 --- a/cpp/benchmarks/copying/gather.cu +++ b/cpp/benchmarks/copying/gather.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -71,5 +71,5 @@ void BM_gather(benchmark::State& state) ->Ranges({{1 << 10, 1 << 26}, {1, 8}}) \ ->UseManualTime(); -GBM_BENCHMARK_DEFINE(double_coalesce_x, double, true); -GBM_BENCHMARK_DEFINE(double_coalesce_o, double, false); +GBM_BENCHMARK_DEFINE(double_coalesced, double, true); +GBM_BENCHMARK_DEFINE(double_shuffled, double, false); diff --git a/cpp/benchmarks/copying/scatter.cu b/cpp/benchmarks/copying/scatter.cu index a521dc82739..c27480b69f4 100644 --- a/cpp/benchmarks/copying/scatter.cu +++ b/cpp/benchmarks/copying/scatter.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -74,5 +74,5 @@ void BM_scatter(benchmark::State& state) ->Ranges({{1 << 10, 1 << 25}, {1, 8}}) \ ->UseManualTime(); -SBM_BENCHMARK_DEFINE(double_coalesce_x, double, true); -SBM_BENCHMARK_DEFINE(double_coalesce_o, double, false); +SBM_BENCHMARK_DEFINE(double_coalesced, double, true); +SBM_BENCHMARK_DEFINE(double_shuffled, double, false); diff --git a/cpp/benchmarks/decimal/convert_floating.cpp b/cpp/benchmarks/decimal/convert_floating.cpp index a367036c494..ac09c3400cb 100644 --- a/cpp/benchmarks/decimal/convert_floating.cpp +++ b/cpp/benchmarks/decimal/convert_floating.cpp @@ -32,8 +32,6 @@ void bench_cast_decimal(nvbench::state& state, nvbench::type_list || std::is_same_v; - static constexpr bool is_32bit = - std::is_same_v || std::is_same_v; static constexpr bool is_128bit = std::is_same_v || std::is_same_v; @@ -69,21 +67,6 @@ void bench_cast_decimal(nvbench::state& state, nvbench::type_list decimal conversion algorithm is limited - static constexpr bool is_64bit = !is_32bit && !is_128bit; - if (is_32bit && (exp_mode != 3)) { - state.skip("Decimal32 conversion only works up to scale factors of 10^9."); - return; - } - if (is_64bit && ((exp_mode < 2) || (exp_mode > 4))) { - state.skip("Decimal64 conversion only works up to scale factors of 10^18."); - return; - } - if (is_128bit && ((exp_mode == 0) || (exp_mode == 6))) { - state.skip("Decimal128 conversion only works up to scale factors of 10^38."); - return; - } - // Type IDs auto const input_id = cudf::type_to_id(); auto const output_id = cudf::type_to_id(); diff --git a/cpp/benchmarks/groupby/group_max.cpp b/cpp/benchmarks/groupby/group_max.cpp index 01ca23ebbf8..f41285008c4 100644 --- a/cpp/benchmarks/groupby/group_max.cpp +++ b/cpp/benchmarks/groupby/group_max.cpp @@ -48,20 +48,25 @@ void groupby_max_helper(nvbench::state& state, cudf::type_to_id(), row_count{num_rows}, data_profile{builder}); }(); + auto const num_aggregations = state.get_int64("num_aggregations"); + auto keys_view = keys->view(); auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys_view, keys_view, keys_view})); std::vector requests; - requests.emplace_back(cudf::groupby::aggregation_request()); - requests[0].values = vals->view(); - requests[0].aggregations.push_back(cudf::make_max_aggregation()); + for (int64_t i = 0; i < num_aggregations; i++) { + requests.emplace_back(cudf::groupby::aggregation_request()); + requests[i].values = vals->view(); + requests[i].aggregations.push_back(cudf::make_max_aggregation()); + } auto const mem_stats_logger = cudf::memory_stats_logger(); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); }); auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); - state.add_element_count(static_cast(num_rows) / elapsed_time / 1'000'000., "Mrows/s"); + state.add_element_count( + static_cast(num_rows * num_aggregations) / elapsed_time / 1'000'000., "Mrows/s"); state.add_buffer_size( mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); } @@ -91,7 +96,8 @@ NVBENCH_BENCH_TYPES(bench_groupby_max, .set_name("groupby_max") .add_int64_axis("cardinality", {0}) .add_int64_power_of_two_axis("num_rows", {12, 18, 24}) - .add_float64_axis("null_probability", {0, 0.1, 0.9}); + .add_float64_axis("null_probability", {0, 0.1, 0.9}) + .add_int64_axis("num_aggregations", {1, 2, 4, 8, 16, 32}); 
NVBENCH_BENCH_TYPES(bench_groupby_max_cardinality, NVBENCH_TYPE_AXES(nvbench::type_list)) .set_name("groupby_max_cardinality") diff --git a/cpp/benchmarks/groupby/group_max_multithreaded.cpp b/cpp/benchmarks/groupby/group_max_multithreaded.cpp new file mode 100644 index 00000000000..bf1a1a5fcf7 --- /dev/null +++ b/cpp/benchmarks/groupby/group_max_multithreaded.cpp @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include + +#include +#include + +template +void bench_groupby_max_multithreaded(nvbench::state& state, nvbench::type_list) +{ + auto const cardinality = static_cast(state.get_int64("cardinality")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const null_probability = state.get_float64("null_probability"); + auto const num_threads = state.get_int64("num_threads"); + auto const num_aggregations = state.get_int64("num_aggregations"); + + auto const keys = [&] { + data_profile const profile = + data_profile_builder() + .cardinality(cardinality) + .no_validity() + .distribution(cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); + return create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + }(); + + auto const vals = [&] { + auto builder = data_profile_builder().cardinality(0).distribution( + cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); + if (null_probability > 0) { + builder.null_probability(null_probability); + } else { + builder.no_validity(); + } + return create_random_column( + cudf::type_to_id(), row_count{num_rows}, data_profile{builder}); + }(); + + auto keys_view = keys->view(); + auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys_view, keys_view, keys_view})); + + auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); + BS::thread_pool threads(num_threads); + + std::vector> requests(num_threads); + for (auto& thread_requests : requests) { + for (int64_t j = 0; j < num_aggregations; j++) { + thread_requests.emplace_back(); + thread_requests.back().values = vals->view(); + thread_requests.back().aggregations.push_back( + cudf::make_max_aggregation()); + } + } + + auto const mem_stats_logger = cudf::memory_stats_logger(); + state.exec( + nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { + auto perform_agg = [&](int64_t index) { gb_obj.aggregate(requests[index], streams[index]); }; + timer.start(); + threads.detach_sequence(decltype(num_threads){0}, num_threads, perform_agg); + threads.wait(); + cudf::detail::join_streams(streams, cudf::get_default_stream()); + cudf::get_default_stream().synchronize(); + timer.stop(); + }); + + auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + state.add_element_count( + static_cast(num_rows * num_threads * num_aggregations) / elapsed_time / 1'000'000., + "Mrows/s"); + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), 
"peak_memory_usage", "peak_memory_usage"); +} + +NVBENCH_BENCH_TYPES(bench_groupby_max_multithreaded, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("groupby_max_multithreaded") + .add_int64_axis("cardinality", {0}) + .add_int64_power_of_two_axis("num_rows", {12, 18}) + .add_float64_axis("null_probability", {0, 0.1, 0.9}) + .add_int64_axis("num_aggregations", {1}) + .add_int64_axis("num_threads", {1, 2, 4, 8}); diff --git a/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp b/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp index aa0ee39a179..e91bf06fdfa 100644 --- a/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp @@ -24,8 +24,8 @@ #include #include #include -#include +#include #include #include @@ -90,7 +90,7 @@ void BM_orc_multithreaded_read_common(nvbench::state& state, auto const num_threads = state.get_int64("num_threads"); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); - cudf::detail::thread_pool threads(num_threads); + BS::thread_pool threads(num_threads); auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); std::vector source_info_vector; @@ -112,13 +112,11 @@ void BM_orc_multithreaded_read_common(nvbench::state& state, cudf::io::read_orc(read_opts, stream, rmm::mr::get_current_device_resource()); }; - threads.paused = true; - for (size_t i = 0; i < num_files; ++i) { - threads.submit(read_func, i); - } + threads.pause(); + threads.detach_sequence(decltype(num_files){0}, num_files, read_func); timer.start(); - threads.paused = false; - threads.wait_for_tasks(); + threads.unpause(); + threads.wait(); cudf::detail::join_streams(streams, cudf::get_default_stream()); timer.stop(); }); @@ -170,7 +168,7 @@ void BM_orc_multithreaded_read_chunked_common(nvbench::state& state, size_t const output_limit = state.get_int64("output_limit"); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); - cudf::detail::thread_pool threads(num_threads); + BS::thread_pool threads(num_threads); auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); std::vector source_info_vector; std::transform(source_sink_vector.begin(), @@ -203,13 +201,11 @@ void BM_orc_multithreaded_read_chunked_common(nvbench::state& state, } while (reader.has_next()); }; - threads.paused = true; - for (size_t i = 0; i < num_files; ++i) { - threads.submit(read_func, i); - } + threads.pause(); + threads.detach_sequence(decltype(num_files){0}, num_files, read_func); timer.start(); - threads.paused = false; - threads.wait_for_tasks(); + threads.unpause(); + threads.wait(); cudf::detail::join_streams(streams, cudf::get_default_stream()); timer.stop(); }); diff --git a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp index b4c8ed78ed8..9e76ebb71ab 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp @@ -23,10 +23,10 @@ #include #include #include -#include #include +#include #include #include @@ -93,7 +93,7 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state, auto const num_threads = state.get_int64("num_threads"); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); - cudf::detail::thread_pool threads(num_threads); + BS::thread_pool threads(num_threads); auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); 
std::vector source_info_vector; @@ -114,13 +114,11 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state, cudf::io::read_parquet(read_opts, stream, rmm::mr::get_current_device_resource()); }; - threads.paused = true; - for (size_t i = 0; i < num_files; ++i) { - threads.submit(read_func, i); - } + threads.pause(); + threads.detach_sequence(decltype(num_files){0}, num_files, read_func); timer.start(); - threads.paused = false; - threads.wait_for_tasks(); + threads.unpause(); + threads.wait(); cudf::detail::join_streams(streams, cudf::get_default_stream()); timer.stop(); }); @@ -176,7 +174,7 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state, size_t const output_limit = state.get_int64("output_limit"); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); - cudf::detail::thread_pool threads(num_threads); + BS::thread_pool threads(num_threads); auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); std::vector source_info_vector; std::transform(source_sink_vector.begin(), @@ -207,13 +205,11 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state, } while (reader.has_next()); }; - threads.paused = true; - for (size_t i = 0; i < num_files; ++i) { - threads.submit(read_func, i); - } + threads.pause(); + threads.detach_sequence(decltype(num_files){0}, num_files, read_func); timer.start(); - threads.paused = false; - threads.wait_for_tasks(); + threads.unpause(); + threads.wait(); cudf::detail::join_streams(streams, cudf::get_default_stream()); timer.stop(); }); diff --git a/cpp/benchmarks/lists/copying/scatter_lists.cu b/cpp/benchmarks/lists/copying/scatter_lists.cu index dbc3234dabf..570decf410f 100644 --- a/cpp/benchmarks/lists/copying/scatter_lists.cu +++ b/cpp/benchmarks/lists/copying/scatter_lists.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -143,5 +143,5 @@ void BM_lists_scatter(::benchmark::State& state) ->Ranges({{1 << 10, 1 << 25}, {64, 2048}}) /* 1K-1B rows, 64-2048 elements */ \ ->UseManualTime(); -SBM_BENCHMARK_DEFINE(double_type_colesce_o, double, true); -SBM_BENCHMARK_DEFINE(double_type_colesce_x, double, false); +SBM_BENCHMARK_DEFINE(double_coalesced, double, true); +SBM_BENCHMARK_DEFINE(double_shuffled, double, false); diff --git a/cpp/benchmarks/text/jaccard.cpp b/cpp/benchmarks/text/jaccard.cpp index d05c195d077..d5b74da6773 100644 --- a/cpp/benchmarks/text/jaccard.cpp +++ b/cpp/benchmarks/text/jaccard.cpp @@ -59,6 +59,6 @@ static void bench_jaccard(nvbench::state& state) NVBENCH_BENCH(bench_jaccard) .set_name("jaccard") - .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144}) - .add_int64_axis("row_width", {128, 512, 2048}) + .add_int64_axis("num_rows", {32768, 131072, 262144}) + .add_int64_axis("row_width", {128, 512, 1024, 2048}) .add_int64_axis("substring_width", {5, 10}); diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake index 6ec35ddcaf1..fb82b0f5ff3 100644 --- a/cpp/cmake/thirdparty/get_cucollections.cmake +++ b/cpp/cmake/thirdparty/get_cucollections.cmake @@ -15,10 +15,6 @@ # This function finds cuCollections and performs any additional configuration. 
function(find_and_configure_cucollections) include(${rapids-cmake-dir}/cpm/cuco.cmake) - include(${rapids-cmake-dir}/cpm/package_override.cmake) - - set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches") - rapids_cpm_package_override("${cudf_patch_dir}/cuco_override.json") if(BUILD_SHARED_LIBS) rapids_cpm_cuco(BUILD_EXPORT_SET cudf-exports) diff --git a/cpp/cmake/thirdparty/get_thread_pool.cmake b/cpp/cmake/thirdparty/get_thread_pool.cmake new file mode 100644 index 00000000000..264257c7199 --- /dev/null +++ b/cpp/cmake/thirdparty/get_thread_pool.cmake @@ -0,0 +1,31 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# This function finds BS::thread_pool and performs any additional configuration. +function(find_and_configure_thread_pool) + rapids_cpm_find( + BS_thread_pool 4.1.0 + CPM_ARGS + GIT_REPOSITORY https://github.com/bshoshany/thread-pool.git + GIT_TAG 097aa718f25d44315cadb80b407144ad455ee4f9 + GIT_SHALLOW TRUE + ) + if(NOT TARGET BS_thread_pool) + add_library(BS_thread_pool INTERFACE) + target_include_directories(BS_thread_pool INTERFACE ${BS_thread_pool_SOURCE_DIR}/include) + target_compile_definitions(BS_thread_pool INTERFACE "BS_THREAD_POOL_ENABLE_PAUSE=1") + endif() +endfunction() + +find_and_configure_thread_pool() diff --git a/cpp/cmake/thirdparty/patches/cccl_override.json b/cpp/cmake/thirdparty/patches/cccl_override.json index e61102dffac..2f29578f7ae 100644 --- a/cpp/cmake/thirdparty/patches/cccl_override.json +++ b/cpp/cmake/thirdparty/patches/cccl_override.json @@ -3,11 +3,6 @@ "packages" : { "CCCL" : { "patches" : [ - { - "file" : "cccl/revert_pr_211.diff", - "issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.", - "fixed_in" : "" - }, { "file" : "${current_json_dir}/thrust_disable_64bit_dispatching.diff", "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]", diff --git a/cpp/cmake/thirdparty/patches/cuco_noexcept.diff b/cpp/cmake/thirdparty/patches/cuco_noexcept.diff deleted file mode 100644 index 0f334c0e81f..00000000000 --- a/cpp/cmake/thirdparty/patches/cuco_noexcept.diff +++ /dev/null @@ -1,227 +0,0 @@ -diff --git a/include/cuco/aow_storage.cuh b/include/cuco/aow_storage.cuh -index 7f9de01..5228193 100644 ---- a/include/cuco/aow_storage.cuh -+++ b/include/cuco/aow_storage.cuh -@@ -81,7 +81,7 @@ class aow_storage : public detail::aow_storage_base { - * @param size Number of windows to (de)allocate - * @param allocator Allocator used for (de)allocating device storage - */ -- explicit constexpr aow_storage(Extent size, Allocator const& allocator = {}) noexcept; -+ explicit constexpr aow_storage(Extent size, Allocator const& allocator = {}); - - aow_storage(aow_storage&&) = default; ///< Move
constructor - /** -@@ -122,7 +122,7 @@ class aow_storage : public detail::aow_storage_base { - * @param key Key to which all keys in `slots` are initialized - * @param stream Stream used for executing the kernel - */ -- void initialize(value_type key, cuda_stream_ref stream = {}) noexcept; -+ void initialize(value_type key, cuda_stream_ref stream = {}); - - /** - * @brief Asynchronously initializes each slot in the AoW storage to contain `key`. -diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh -index c2c9c14..8ac4236 100644 ---- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh -+++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh -@@ -125,7 +125,7 @@ class open_addressing_impl { - KeyEqual const& pred, - ProbingScheme const& probing_scheme, - Allocator const& alloc, -- cuda_stream_ref stream) noexcept -+ cuda_stream_ref stream) - : empty_slot_sentinel_{empty_slot_sentinel}, - erased_key_sentinel_{this->extract_key(empty_slot_sentinel)}, - predicate_{pred}, -@@ -233,7 +233,7 @@ class open_addressing_impl { - * - * @param stream CUDA stream this operation is executed in - */ -- void clear(cuda_stream_ref stream) noexcept { storage_.initialize(empty_slot_sentinel_, stream); } -+ void clear(cuda_stream_ref stream) { storage_.initialize(empty_slot_sentinel_, stream); } - - /** - * @brief Asynchronously erases all elements from the container. After this call, `size()` returns -@@ -599,7 +599,7 @@ class open_addressing_impl { - * - * @return The number of elements in the container - */ -- [[nodiscard]] size_type size(cuda_stream_ref stream) const noexcept -+ [[nodiscard]] size_type size(cuda_stream_ref stream) const - { - auto counter = - detail::counter_storage{this->allocator()}; -diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl -index e17a145..3fa1d02 100644 ---- a/include/cuco/detail/static_map/static_map.inl -+++ b/include/cuco/detail/static_map/static_map.inl -@@ -123,7 +123,7 @@ template - void static_map::clear( -- cuda_stream_ref stream) noexcept -+ cuda_stream_ref stream) - { - impl_->clear(stream); - } -@@ -215,7 +215,7 @@ template - template - void static_map:: -- insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream) noexcept -+ insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream) - { - return this->insert_or_assign_async(first, last, stream); - stream.synchronize(); -@@ -465,7 +465,7 @@ template - static_map::size_type - static_map::size( -- cuda_stream_ref stream) const noexcept -+ cuda_stream_ref stream) const - { - return impl_->size(stream); - } -diff --git a/include/cuco/detail/static_multiset/static_multiset.inl b/include/cuco/detail/static_multiset/static_multiset.inl -index 174f9bc..582926b 100644 ---- a/include/cuco/detail/static_multiset/static_multiset.inl -+++ b/include/cuco/detail/static_multiset/static_multiset.inl -@@ -97,7 +97,7 @@ template - void static_multiset::clear( -- cuda_stream_ref stream) noexcept -+ cuda_stream_ref stream) - { - impl_->clear(stream); - } -@@ -183,7 +183,7 @@ template - static_multiset::size_type - static_multiset::size( -- cuda_stream_ref stream) const noexcept -+ cuda_stream_ref stream) const - { - return impl_->size(stream); - } -diff --git a/include/cuco/detail/static_set/static_set.inl b/include/cuco/detail/static_set/static_set.inl -index 645013f..d3cece0 100644 ---- a/include/cuco/detail/static_set/static_set.inl -+++ 
b/include/cuco/detail/static_set/static_set.inl -@@ -98,7 +98,7 @@ template - void static_set::clear( -- cuda_stream_ref stream) noexcept -+ cuda_stream_ref stream) - { - impl_->clear(stream); - } -@@ -429,7 +429,7 @@ template - static_set::size_type - static_set::size( -- cuda_stream_ref stream) const noexcept -+ cuda_stream_ref stream) const - { - return impl_->size(stream); - } -diff --git a/include/cuco/detail/storage/aow_storage.inl b/include/cuco/detail/storage/aow_storage.inl -index 3547f4c..94b7f98 100644 ---- a/include/cuco/detail/storage/aow_storage.inl -+++ b/include/cuco/detail/storage/aow_storage.inl -@@ -32,8 +32,8 @@ - namespace cuco { - - template --constexpr aow_storage::aow_storage( -- Extent size, Allocator const& allocator) noexcept -+constexpr aow_storage::aow_storage(Extent size, -+ Allocator const& allocator) - : detail::aow_storage_base{size}, - allocator_{allocator}, - window_deleter_{capacity(), allocator_}, -@@ -64,7 +64,7 @@ aow_storage::ref() const noexcept - - template - void aow_storage::initialize(value_type key, -- cuda_stream_ref stream) noexcept -+ cuda_stream_ref stream) - { - this->initialize_async(key, stream); - stream.synchronize(); -diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh -index c86e90c..95da423 100644 ---- a/include/cuco/static_map.cuh -+++ b/include/cuco/static_map.cuh -@@ -269,7 +269,7 @@ class static_map { - * - * @param stream CUDA stream this operation is executed in - */ -- void clear(cuda_stream_ref stream = {}) noexcept; -+ void clear(cuda_stream_ref stream = {}); - - /** - * @brief Asynchronously erases all elements from the container. After this call, `size()` returns -@@ -387,7 +387,7 @@ class static_map { - * @param stream CUDA stream used for insert - */ - template -- void insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept; -+ void insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream = {}); - - /** - * @brief For any key-value pair `{k, v}` in the range `[first, last)`, if a key equivalent to `k` -@@ -690,7 +690,7 @@ class static_map { - * @param stream CUDA stream used to get the number of inserted elements - * @return The number of elements in the container - */ -- [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const noexcept; -+ [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const; - - /** - * @brief Gets the maximum number of elements the hash map can hold. -diff --git a/include/cuco/static_multiset.cuh b/include/cuco/static_multiset.cuh -index 0daf103..fbcbc9c 100644 ---- a/include/cuco/static_multiset.cuh -+++ b/include/cuco/static_multiset.cuh -@@ -235,7 +235,7 @@ class static_multiset { - * - * @param stream CUDA stream this operation is executed in - */ -- void clear(cuda_stream_ref stream = {}) noexcept; -+ void clear(cuda_stream_ref stream = {}); - - /** - * @brief Asynchronously erases all elements from the container. After this call, `size()` returns -@@ -339,7 +339,7 @@ class static_multiset { - * @param stream CUDA stream used to get the number of inserted elements - * @return The number of elements in the container - */ -- [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const noexcept; -+ [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const; - - /** - * @brief Gets the maximum number of elements the multiset can hold. 
-diff --git a/include/cuco/static_set.cuh b/include/cuco/static_set.cuh -index a069939..3517f84 100644 ---- a/include/cuco/static_set.cuh -+++ b/include/cuco/static_set.cuh -@@ -240,7 +240,7 @@ class static_set { - * - * @param stream CUDA stream this operation is executed in - */ -- void clear(cuda_stream_ref stream = {}) noexcept; -+ void clear(cuda_stream_ref stream = {}); - - /** - * @brief Asynchronously erases all elements from the container. After this call, `size()` returns -@@ -687,7 +687,7 @@ class static_set { - * @param stream CUDA stream used to get the number of inserted elements - * @return The number of elements in the container - */ -- [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const noexcept; -+ [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const; - - /** - * @brief Gets the maximum number of elements the hash set can hold. diff --git a/cpp/cmake/thirdparty/patches/cuco_override.json b/cpp/cmake/thirdparty/patches/cuco_override.json deleted file mode 100644 index ae0a9a4b4f0..00000000000 --- a/cpp/cmake/thirdparty/patches/cuco_override.json +++ /dev/null @@ -1,14 +0,0 @@ - -{ - "packages" : { - "cuco" : { - "patches" : [ - { - "file" : "${current_json_dir}/cuco_noexcept.diff", - "issue" : "Remove erroneous noexcept clauses on cuco functions that may throw [https://github.com/rapidsai/cudf/issues/16059]", - "fixed_in" : "" - } - ] - } - } -} diff --git a/cpp/examples/build.sh b/cpp/examples/build.sh index bde6ef7d69c..dce81fb1677 100755 --- a/cpp/examples/build.sh +++ b/cpp/examples/build.sh @@ -57,6 +57,7 @@ build_example() { } build_example basic +build_example tpch build_example strings build_example nested_types build_example parquet_io diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index 8be17db3781..274a2599189 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -16,6 +16,8 @@ #include "parquet_io.hpp" +#include "../utilities/timer.hpp" + /** * @file parquet_io.cpp * @brief Demonstrates usage of the libcudf APIs to read and write @@ -140,7 +142,7 @@ int main(int argc, char const** argv) << page_stat_string << ".." << std::endl; // `timer` is automatically started here - Timer timer; + cudf::examples::timer timer; write_parquet(input->view(), metadata, output_filepath, encoding, compression, page_stats); timer.print_elapsed_millis(); diff --git a/cpp/examples/parquet_io/parquet_io.hpp b/cpp/examples/parquet_io/parquet_io.hpp index d2fc359a2fe..e27cbec4fce 100644 --- a/cpp/examples/parquet_io/parquet_io.hpp +++ b/cpp/examples/parquet_io/parquet_io.hpp @@ -124,34 +124,3 @@ std::shared_ptr create_memory_resource(bool is_ return std::nullopt; } - -/** - * @brief Light-weight timer for parquet reader and writer instrumentation - * - * Timer object constructed from std::chrono, instrumenting at microseconds - * precision. Can display elapsed durations at milli and micro second - * scales. Timer starts at object construction. 
- */ -class Timer { - public: - using micros = std::chrono::microseconds; - using millis = std::chrono::milliseconds; - - Timer() { reset(); } - void reset() { start_time = std::chrono::high_resolution_clock::now(); } - auto elapsed() { return (std::chrono::high_resolution_clock::now() - start_time); } - void print_elapsed_micros() - { - std::cout << "Elapsed Time: " << std::chrono::duration_cast(elapsed()).count() - << "us\n\n"; - } - void print_elapsed_millis() - { - std::cout << "Elapsed Time: " << std::chrono::duration_cast(elapsed()).count() - << "ms\n\n"; - } - - private: - using time_point_t = std::chrono::time_point; - time_point_t start_time; -}; diff --git a/cpp/examples/tpch/CMakeLists.txt b/cpp/examples/tpch/CMakeLists.txt new file mode 100644 index 00000000000..1b91d07e148 --- /dev/null +++ b/cpp/examples/tpch/CMakeLists.txt @@ -0,0 +1,32 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +cmake_minimum_required(VERSION 3.26.4) + +include(../set_cuda_architecture.cmake) + +rapids_cuda_init_architectures(tpch_example) +rapids_cuda_set_architectures(RAPIDS) + +project( + tpch_example + VERSION 0.0.1 + LANGUAGES CXX CUDA +) + +include(../fetch_dependencies.cmake) + +add_executable(tpch_q1 q1.cpp) +target_link_libraries(tpch_q1 PRIVATE cudf::cudf) +target_compile_features(tpch_q1 PRIVATE cxx_std_17) + +add_executable(tpch_q5 q5.cpp) +target_link_libraries(tpch_q5 PRIVATE cudf::cudf) +target_compile_features(tpch_q5 PRIVATE cxx_std_17) + +add_executable(tpch_q6 q6.cpp) +target_link_libraries(tpch_q6 PRIVATE cudf::cudf) +target_compile_features(tpch_q6 PRIVATE cxx_std_17) + +add_executable(tpch_q9 q9.cpp) +target_link_libraries(tpch_q9 PRIVATE cudf::cudf) +target_compile_features(tpch_q9 PRIVATE cxx_std_17) diff --git a/cpp/examples/tpch/README.md b/cpp/examples/tpch/README.md new file mode 100644 index 00000000000..1ea71ae9824 --- /dev/null +++ b/cpp/examples/tpch/README.md @@ -0,0 +1,38 @@ +# TPC-H Inspired Examples + +Implements TPC-H queries using `libcudf`. We leverage the data generator (a wrapper around the official TPC-H datagen) from [Apache Datafusion](https://github.com/apache/datafusion) to generate data in Parquet format. + +## Requirements + +- Rust + +## Generating the Dataset + +1. Clone the datafusion repository. +```bash +git clone git@github.com:apache/datafusion.git +``` + +2. Run the data generator. The data will be placed in a `data/` subdirectory. +```bash +cd datafusion/benchmarks/ +./bench.sh data tpch + +# for scale factor 10, +./bench.sh data tpch10 +``` + +## Running Queries + +1. Build the examples. +```bash +cd cpp/examples +./build.sh +``` +The TPC-H query binaries will be built in `examples/tpch/build`. + +2. Execute the queries. +```bash +./tpch/build/tpch_q1 +``` +A Parquet file named `q1.parquet` will be generated, holding the results of the query. diff --git a/cpp/examples/tpch/q1.cpp b/cpp/examples/tpch/q1.cpp new file mode 100644 index 00000000000..1bdf039da4a --- /dev/null +++ b/cpp/examples/tpch/q1.cpp @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../utilities/timer.hpp" +#include "utils.hpp" + +#include +#include +#include + +/** + * @file q1.cpp + * @brief Implement query 1 of the TPC-H benchmark. + * + * create view lineitem as select * from '/tables/scale-1/lineitem.parquet'; + * + * select + * l_returnflag, + * l_linestatus, + * sum(l_quantity) as sum_qty, + * sum(l_extendedprice) as sum_base_price, + * sum(l_extendedprice * (1 - l_discount)) as sum_disc_price, + * sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge, + * avg(l_quantity) as avg_qty, + * avg(l_extendedprice) as avg_price, + * avg(l_discount) as avg_disc, + * count(*) as count_order + * from + * lineitem + * where + * l_shipdate <= date '1998-09-02' + * group by + * l_returnflag, + * l_linestatus + * order by + * l_returnflag, + * l_linestatus; + */ + +/** + * @brief Calculate the discount price column + * + * @param discount The discount column + * @param extendedprice The extended price column + * @param stream The CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. + */ +[[nodiscard]] std::unique_ptr calc_disc_price( + cudf::column_view const& discount, + cudf::column_view const& extendedprice, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) +{ + auto const one = cudf::numeric_scalar(1); + auto const one_minus_discount = + cudf::binary_operation(one, discount, cudf::binary_operator::SUB, discount.type(), stream, mr); + auto const disc_price_type = cudf::data_type{cudf::type_id::FLOAT64}; + auto disc_price = cudf::binary_operation(extendedprice, + one_minus_discount->view(), + cudf::binary_operator::MUL, + disc_price_type, + stream, + mr); + return disc_price; +} + +/** + * @brief Calculate the charge column + * + * @param tax The tax column + * @param disc_price The discount price column + * @param stream The CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. 
+ */ +[[nodiscard]] std::unique_ptr calc_charge( + cudf::column_view const& tax, + cudf::column_view const& disc_price, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) +{ + auto const one = cudf::numeric_scalar(1); + auto const one_plus_tax = + cudf::binary_operation(one, tax, cudf::binary_operator::ADD, tax.type(), stream, mr); + auto const charge_type = cudf::data_type{cudf::type_id::FLOAT64}; + auto charge = cudf::binary_operation( + disc_price, one_plus_tax->view(), cudf::binary_operator::MUL, charge_type, stream, mr); + return charge; +} + +int main(int argc, char const** argv) +{ + auto const args = parse_args(argc, argv); + + // Use a memory pool + auto resource = create_memory_resource(args.memory_resource_type); + rmm::mr::set_current_device_resource(resource.get()); + + cudf::examples::timer timer; + + // Define the column projections and filter predicate for `lineitem` table + std::vector const lineitem_cols = {"l_returnflag", + "l_linestatus", + "l_quantity", + "l_extendedprice", + "l_discount", + "l_shipdate", + "l_orderkey", + "l_tax"}; + auto const shipdate_ref = cudf::ast::column_reference(std::distance( + lineitem_cols.begin(), std::find(lineitem_cols.begin(), lineitem_cols.end(), "l_shipdate"))); + auto shipdate_upper = + cudf::timestamp_scalar(days_since_epoch(1998, 9, 2), true); + auto const shipdate_upper_literal = cudf::ast::literal(shipdate_upper); + auto lineitem_pred = std::make_unique( + cudf::ast::ast_operator::LESS_EQUAL, shipdate_ref, shipdate_upper_literal); + + // Read out the `lineitem` table from parquet file + auto lineitem = + read_parquet(args.dataset_dir + "/lineitem.parquet", lineitem_cols, std::move(lineitem_pred)); + + // Calculate the discount price and charge columns and append to lineitem table + auto disc_price = + calc_disc_price(lineitem->column("l_discount"), lineitem->column("l_extendedprice")); + auto charge = calc_charge(lineitem->column("l_tax"), disc_price->view()); + (*lineitem).append(disc_price, "disc_price").append(charge, "charge"); + + // Perform the group by operation + auto const groupedby_table = apply_groupby( + lineitem, + groupby_context_t{ + {"l_returnflag", "l_linestatus"}, + { + {"l_extendedprice", + {{cudf::aggregation::Kind::SUM, "sum_base_price"}, + {cudf::aggregation::Kind::MEAN, "avg_price"}}}, + {"l_quantity", + {{cudf::aggregation::Kind::SUM, "sum_qty"}, {cudf::aggregation::Kind::MEAN, "avg_qty"}}}, + {"l_discount", + { + {cudf::aggregation::Kind::MEAN, "avg_disc"}, + }}, + {"disc_price", + { + {cudf::aggregation::Kind::SUM, "sum_disc_price"}, + }}, + {"charge", + {{cudf::aggregation::Kind::SUM, "sum_charge"}, + {cudf::aggregation::Kind::COUNT_ALL, "count_order"}}}, + }}); + + // Perform the order by operation + auto const orderedby_table = apply_orderby(groupedby_table, + {"l_returnflag", "l_linestatus"}, + {cudf::order::ASCENDING, cudf::order::ASCENDING}); + + timer.print_elapsed_millis(); + + // Write query result to a parquet file + orderedby_table->to_parquet("q1.parquet"); + return 0; +} diff --git a/cpp/examples/tpch/q5.cpp b/cpp/examples/tpch/q5.cpp new file mode 100644 index 00000000000..e56850b94d6 --- /dev/null +++ b/cpp/examples/tpch/q5.cpp @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../utilities/timer.hpp" +#include "utils.hpp" + +#include +#include +#include + +/** + * @file q5.cpp + * @brief Implement query 5 of the TPC-H benchmark. + * + * create view customer as select * from '/tables/scale-1/customer.parquet'; + * create view orders as select * from '/tables/scale-1/orders.parquet'; + * create view lineitem as select * from '/tables/scale-1/lineitem.parquet'; + * create view supplier as select * from '/tables/scale-1/supplier.parquet'; + * create view nation as select * from '/tables/scale-1/nation.parquet'; + * create view region as select * from '/tables/scale-1/region.parquet'; + * + * select + * n_name, + * sum(l_extendedprice * (1 - l_discount)) as revenue + * from + * customer, + * orders, + * lineitem, + * supplier, + * nation, + * region + * where + * c_custkey = o_custkey + * and l_orderkey = o_orderkey + * and l_suppkey = s_suppkey + * and c_nationkey = s_nationkey + * and s_nationkey = n_nationkey + * and n_regionkey = r_regionkey + * and r_name = 'ASIA' + * and o_orderdate >= date '1994-01-01' + * and o_orderdate < date '1995-01-01' + * group by + * n_name + * order by + * revenue desc; + */ + +/** + * @brief Calculate the revenue column + * + * @param extendedprice The extended price column + * @param discount The discount column + * @param stream The CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. 
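The o_orderdate window in main() below is expressed as whole days since the UNIX epoch via the days_since_epoch() helper from utils.hpp. As a sanity check of the two literals used there, here is a standalone sketch that mirrors that helper with <ctime> (the name days_since_epoch_ref is ad hoc, not part of the example code):

#include <cassert>
#include <ctime>

// Ad-hoc mirror of days_since_epoch() from utils.hpp: whole days from 1970-01-01.
int days_since_epoch_ref(int year, int month, int day)
{
  std::tm date{};
  date.tm_year = year - 1900;
  date.tm_mon  = month - 1;
  date.tm_mday = day;
  std::tm epoch{};
  epoch.tm_year = 70;
  epoch.tm_mon  = 0;
  epoch.tm_mday = 1;
  return static_cast<int>(std::difftime(std::mktime(&date), std::mktime(&epoch)) / 86400);
}

int main()
{
  // 1970..1993 contains 6 leap years (1972, 1976, ..., 1992): 24 * 365 + 6 = 8766
  assert(days_since_epoch_ref(1994, 1, 1) == 8766);
  // 1994 is not a leap year: 8766 + 365 = 9131
  assert(days_since_epoch_ref(1995, 1, 1) == 9131);
  return 0;
}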
+ */ +[[nodiscard]] std::unique_ptr calc_revenue( + cudf::column_view const& extendedprice, + cudf::column_view const& discount, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) +{ + auto const one = cudf::numeric_scalar(1); + auto const one_minus_discount = + cudf::binary_operation(one, discount, cudf::binary_operator::SUB, discount.type(), stream, mr); + auto const revenue_type = cudf::data_type{cudf::type_id::FLOAT64}; + auto revenue = cudf::binary_operation(extendedprice, + one_minus_discount->view(), + cudf::binary_operator::MUL, + revenue_type, + stream, + mr); + return revenue; +} + +int main(int argc, char const** argv) +{ + auto const args = parse_args(argc, argv); + + // Use a memory pool + auto resource = create_memory_resource(args.memory_resource_type); + rmm::mr::set_current_device_resource(resource.get()); + + cudf::examples::timer timer; + + // Define the column projection and filter predicate for the `orders` table + std::vector const orders_cols = {"o_custkey", "o_orderkey", "o_orderdate"}; + auto const o_orderdate_ref = cudf::ast::column_reference(std::distance( + orders_cols.begin(), std::find(orders_cols.begin(), orders_cols.end(), "o_orderdate"))); + auto o_orderdate_lower = + cudf::timestamp_scalar(days_since_epoch(1994, 1, 1), true); + auto const o_orderdate_lower_limit = cudf::ast::literal(o_orderdate_lower); + auto const o_orderdate_pred_lower = cudf::ast::operation( + cudf::ast::ast_operator::GREATER_EQUAL, o_orderdate_ref, o_orderdate_lower_limit); + auto o_orderdate_upper = + cudf::timestamp_scalar(days_since_epoch(1995, 1, 1), true); + auto const o_orderdate_upper_limit = cudf::ast::literal(o_orderdate_upper); + auto const o_orderdate_pred_upper = + cudf::ast::operation(cudf::ast::ast_operator::LESS, o_orderdate_ref, o_orderdate_upper_limit); + auto orders_pred = std::make_unique( + cudf::ast::ast_operator::LOGICAL_AND, o_orderdate_pred_lower, o_orderdate_pred_upper); + + // Define the column projection and filter predicate for the `region` table + std::vector const region_cols = {"r_regionkey", "r_name"}; + auto const r_name_ref = cudf::ast::column_reference(std::distance( + region_cols.begin(), std::find(region_cols.begin(), region_cols.end(), "r_name"))); + auto r_name_value = cudf::string_scalar("ASIA"); + auto const r_name_literal = cudf::ast::literal(r_name_value); + auto region_pred = std::make_unique( + cudf::ast::ast_operator::EQUAL, r_name_ref, r_name_literal); + + // Read out the tables from parquet files + // while pushing down the column projections and filter predicates + auto const customer = + read_parquet(args.dataset_dir + "/customer.parquet", {"c_custkey", "c_nationkey"}); + auto const orders = + read_parquet(args.dataset_dir + "/orders.parquet", orders_cols, std::move(orders_pred)); + auto const lineitem = read_parquet(args.dataset_dir + "/lineitem.parquet", + {"l_orderkey", "l_suppkey", "l_extendedprice", "l_discount"}); + auto const supplier = + read_parquet(args.dataset_dir + "/supplier.parquet", {"s_suppkey", "s_nationkey"}); + auto const nation = + read_parquet(args.dataset_dir + "/nation.parquet", {"n_nationkey", "n_regionkey", "n_name"}); + auto const region = + read_parquet(args.dataset_dir + "/region.parquet", region_cols, std::move(region_pred)); + + // Perform the joins + auto const join_a = apply_inner_join(region, nation, {"r_regionkey"}, {"n_regionkey"}); + auto const join_b = apply_inner_join(join_a, customer, {"n_nationkey"}, 
{"c_nationkey"}); + auto const join_c = apply_inner_join(join_b, orders, {"c_custkey"}, {"o_custkey"}); + auto const join_d = apply_inner_join(join_c, lineitem, {"o_orderkey"}, {"l_orderkey"}); + auto joined_table = + apply_inner_join(supplier, join_d, {"s_suppkey", "s_nationkey"}, {"l_suppkey", "n_nationkey"}); + + // Calculate and append the `revenue` column + auto revenue = + calc_revenue(joined_table->column("l_extendedprice"), joined_table->column("l_discount")); + (*joined_table).append(revenue, "revenue"); + + // Perform the groupby operation + auto const groupedby_table = + apply_groupby(joined_table, + groupby_context_t{{"n_name"}, + { + {"revenue", {{cudf::aggregation::Kind::SUM, "revenue"}}}, + }}); + + // Perform the order by operation + auto const orderedby_table = + apply_orderby(groupedby_table, {"revenue"}, {cudf::order::DESCENDING}); + + timer.print_elapsed_millis(); + + // Write query result to a parquet file + orderedby_table->to_parquet("q5.parquet"); + return 0; +} diff --git a/cpp/examples/tpch/q6.cpp b/cpp/examples/tpch/q6.cpp new file mode 100644 index 00000000000..f11b3d6ab3b --- /dev/null +++ b/cpp/examples/tpch/q6.cpp @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../utilities/timer.hpp" +#include "utils.hpp" + +#include +#include +#include + +/** + * @file q6.cpp + * @brief Implement query 6 of the TPC-H benchmark. + * + * create view lineitem as select * from '/tables/scale-1/lineitem.parquet'; + * + * select + * sum(l_extendedprice * l_discount) as revenue + * from + * lineitem + * where + * l_shipdate >= date '1994-01-01' + * and l_shipdate < date '1995-01-01' + * and l_discount >= 0.05 + * and l_discount <= 0.07 + * and l_quantity < 24; + */ + +/** + * @brief Calculate the revenue column + * + * @param extendedprice The extended price column + * @param discount The discount column + * @param stream The CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. 
+ */ +[[nodiscard]] std::unique_ptr calc_revenue( + cudf::column_view const& extendedprice, + cudf::column_view const& discount, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) +{ + auto const revenue_type = cudf::data_type{cudf::type_id::FLOAT64}; + auto revenue = cudf::binary_operation( + extendedprice, discount, cudf::binary_operator::MUL, revenue_type, stream, mr); + return revenue; +} + +int main(int argc, char const** argv) +{ + auto const args = parse_args(argc, argv); + + // Use a memory pool + auto resource = create_memory_resource(args.memory_resource_type); + rmm::mr::set_current_device_resource(resource.get()); + + cudf::examples::timer timer; + + // Read out the `lineitem` table from parquet file + std::vector const lineitem_cols = { + "l_extendedprice", "l_discount", "l_shipdate", "l_quantity"}; + auto const shipdate_ref = cudf::ast::column_reference(std::distance( + lineitem_cols.begin(), std::find(lineitem_cols.begin(), lineitem_cols.end(), "l_shipdate"))); + auto shipdate_lower = + cudf::timestamp_scalar(days_since_epoch(1994, 1, 1), true); + auto const shipdate_lower_literal = cudf::ast::literal(shipdate_lower); + auto shipdate_upper = + cudf::timestamp_scalar(days_since_epoch(1995, 1, 1), true); + auto const shipdate_upper_literal = cudf::ast::literal(shipdate_upper); + auto const shipdate_pred_a = cudf::ast::operation( + cudf::ast::ast_operator::GREATER_EQUAL, shipdate_ref, shipdate_lower_literal); + auto const shipdate_pred_b = + cudf::ast::operation(cudf::ast::ast_operator::LESS, shipdate_ref, shipdate_upper_literal); + auto lineitem_pred = std::make_unique( + cudf::ast::ast_operator::LOGICAL_AND, shipdate_pred_a, shipdate_pred_b); + auto lineitem = + read_parquet(args.dataset_dir + "/lineitem.parquet", lineitem_cols, std::move(lineitem_pred)); + + // Cast the discount and quantity columns to float32 and append to lineitem table + auto discout_float = + cudf::cast(lineitem->column("l_discount"), cudf::data_type{cudf::type_id::FLOAT32}); + auto quantity_float = + cudf::cast(lineitem->column("l_quantity"), cudf::data_type{cudf::type_id::FLOAT32}); + + (*lineitem).append(discout_float, "l_discount_float").append(quantity_float, "l_quantity_float"); + + // Apply the filters + auto const discount_ref = cudf::ast::column_reference(lineitem->col_id("l_discount_float")); + auto const quantity_ref = cudf::ast::column_reference(lineitem->col_id("l_quantity_float")); + + auto discount_lower = cudf::numeric_scalar(0.05); + auto const discount_lower_literal = cudf::ast::literal(discount_lower); + auto discount_upper = cudf::numeric_scalar(0.07); + auto const discount_upper_literal = cudf::ast::literal(discount_upper); + auto quantity_upper = cudf::numeric_scalar(24); + auto const quantity_upper_literal = cudf::ast::literal(quantity_upper); + + auto const discount_pred_a = cudf::ast::operation( + cudf::ast::ast_operator::GREATER_EQUAL, discount_ref, discount_lower_literal); + + auto const discount_pred_b = + cudf::ast::operation(cudf::ast::ast_operator::LESS_EQUAL, discount_ref, discount_upper_literal); + auto const discount_pred = + cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, discount_pred_a, discount_pred_b); + auto const quantity_pred = + cudf::ast::operation(cudf::ast::ast_operator::LESS, quantity_ref, quantity_upper_literal); + auto const discount_quantity_pred = + cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, discount_pred, quantity_pred); + auto const 
filtered_table = apply_filter(lineitem, discount_quantity_pred); + + // Calculate the `revenue` column + auto revenue = + calc_revenue(filtered_table->column("l_extendedprice"), filtered_table->column("l_discount")); + + // Sum the `revenue` column + auto const revenue_view = revenue->view(); + auto const result_table = apply_reduction(revenue_view, cudf::aggregation::Kind::SUM, "revenue"); + + timer.print_elapsed_millis(); + + // Write query result to a parquet file + result_table->to_parquet("q6.parquet"); + return 0; +} diff --git a/cpp/examples/tpch/q9.cpp b/cpp/examples/tpch/q9.cpp new file mode 100644 index 00000000000..d3c218253f9 --- /dev/null +++ b/cpp/examples/tpch/q9.cpp @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../utilities/timer.hpp" +#include "utils.hpp" + +#include +#include +#include +#include +#include + +/** + * @file q9.cpp + * @brief Implement query 9 of the TPC-H benchmark. + * + * create view part as select * from '/tables/scale-1/part.parquet'; + * create view supplier as select * from '/tables/scale-1/supplier.parquet'; + * create view lineitem as select * from '/tables/scale-1/lineitem.parquet'; + * create view partsupp as select * from '/tables/scale-1/partsupp.parquet'; + * create view orders as select * from '/tables/scale-1/orders.parquet'; + * create view nation as select * from '/tables/scale-1/nation.parquet'; + * + * select + * nation, + * o_year, + * sum(amount) as sum_profit + * from + * ( + * select + * n_name as nation, + * extract(year from o_orderdate) as o_year, + * l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount + * from + * part, + * supplier, + * lineitem, + * partsupp, + * orders, + * nation + * where + * s_suppkey = l_suppkey + * and ps_suppkey = l_suppkey + * and ps_partkey = l_partkey + * and p_partkey = l_partkey + * and o_orderkey = l_orderkey + * and s_nationkey = n_nationkey + * and p_name like '%green%' + * ) as profit + * group by + * nation, + * o_year + * order by + * nation, + * o_year desc; + */ + +/** + * @brief Calculate the amount column + * + * @param discount The discount column + * @param extendedprice The extended price column + * @param supplycost The supply cost column + * @param quantity The quantity column + * @param stream The CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. 
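calc_amount() below composes four cudf::binary_operation calls; the per-row arithmetic it implements is just the profit expression from the query text. A tiny host-side check (standard library only, illustrative values):

#include <cassert>
#include <cmath>

// amount = l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity
inline double q9_amount(double extendedprice, double discount, double supplycost, double quantity)
{
  return extendedprice * (1.0 - discount) - supplycost * quantity;
}

int main()
{
  // 1000 * (1 - 0.10) - 20 * 10 = 900 - 200 = 700
  assert(std::abs(q9_amount(1000.0, 0.10, 20.0, 10.0) - 700.0) < 1e-9);
  return 0;
}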
+ */ +[[nodiscard]] std::unique_ptr calc_amount( + cudf::column_view const& discount, + cudf::column_view const& extendedprice, + cudf::column_view const& supplycost, + cudf::column_view const& quantity, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) +{ + auto const one = cudf::numeric_scalar(1); + auto const one_minus_discount = + cudf::binary_operation(one, discount, cudf::binary_operator::SUB, discount.type()); + auto const extendedprice_discounted_type = cudf::data_type{cudf::type_id::FLOAT64}; + auto const extendedprice_discounted = cudf::binary_operation(extendedprice, + one_minus_discount->view(), + cudf::binary_operator::MUL, + extendedprice_discounted_type, + stream, + mr); + auto const supplycost_quantity_type = cudf::data_type{cudf::type_id::FLOAT64}; + auto const supplycost_quantity = cudf::binary_operation( + supplycost, quantity, cudf::binary_operator::MUL, supplycost_quantity_type); + auto amount = cudf::binary_operation(extendedprice_discounted->view(), + supplycost_quantity->view(), + cudf::binary_operator::SUB, + extendedprice_discounted->type(), + stream, + mr); + return amount; +} + +int main(int argc, char const** argv) +{ + auto const args = parse_args(argc, argv); + + // Use a memory pool + auto resource = create_memory_resource(args.memory_resource_type); + rmm::mr::set_current_device_resource(resource.get()); + + cudf::examples::timer timer; + + // Read out the table from parquet files + auto const lineitem = read_parquet( + args.dataset_dir + "/lineitem.parquet", + {"l_suppkey", "l_partkey", "l_orderkey", "l_extendedprice", "l_discount", "l_quantity"}); + auto const nation = read_parquet(args.dataset_dir + "/nation.parquet", {"n_nationkey", "n_name"}); + auto const orders = + read_parquet(args.dataset_dir + "/orders.parquet", {"o_orderkey", "o_orderdate"}); + auto const part = read_parquet(args.dataset_dir + "/part.parquet", {"p_partkey", "p_name"}); + auto const partsupp = read_parquet(args.dataset_dir + "/partsupp.parquet", + {"ps_suppkey", "ps_partkey", "ps_supplycost"}); + auto const supplier = + read_parquet(args.dataset_dir + "/supplier.parquet", {"s_suppkey", "s_nationkey"}); + + // Generating the `profit` table + // Filter the part table using `p_name like '%green%'` + auto const p_name = part->table().column(1); + auto const mask = + cudf::strings::like(cudf::strings_column_view(p_name), cudf::string_scalar("%green%")); + auto const part_filtered = apply_mask(part, mask); + + // Perform the joins + auto const join_a = apply_inner_join(supplier, nation, {"s_nationkey"}, {"n_nationkey"}); + auto const join_b = apply_inner_join(partsupp, join_a, {"ps_suppkey"}, {"s_suppkey"}); + auto const join_c = apply_inner_join(lineitem, part_filtered, {"l_partkey"}, {"p_partkey"}); + auto const join_d = apply_inner_join(orders, join_c, {"o_orderkey"}, {"l_orderkey"}); + auto const joined_table = + apply_inner_join(join_d, join_b, {"l_suppkey", "l_partkey"}, {"s_suppkey", "ps_partkey"}); + + // Calculate the `nation`, `o_year`, and `amount` columns + auto n_name = std::make_unique(joined_table->column("n_name")); + auto o_year = cudf::datetime::extract_year(joined_table->column("o_orderdate")); + auto amount = calc_amount(joined_table->column("l_discount"), + joined_table->column("l_extendedprice"), + joined_table->column("ps_supplycost"), + joined_table->column("l_quantity")); + + // Put together the `profit` table + std::vector> profit_columns; + 
profit_columns.push_back(std::move(n_name)); + profit_columns.push_back(std::move(o_year)); + profit_columns.push_back(std::move(amount)); + + auto profit_table = std::make_unique(std::move(profit_columns)); + auto const profit = std::make_unique( + std::move(profit_table), std::vector{"nation", "o_year", "amount"}); + + // Perform the groupby operation + auto const groupedby_table = apply_groupby( + profit, + groupby_context_t{{"nation", "o_year"}, + {{"amount", {{cudf::groupby_aggregation::SUM, "sum_profit"}}}}}); + + // Perform the orderby operation + auto const orderedby_table = apply_orderby( + groupedby_table, {"nation", "o_year"}, {cudf::order::ASCENDING, cudf::order::DESCENDING}); + + timer.print_elapsed_millis(); + + // Write query result to a parquet file + orderedby_table->to_parquet("q9.parquet"); + return 0; +} diff --git a/cpp/examples/tpch/utils.hpp b/cpp/examples/tpch/utils.hpp new file mode 100644 index 00000000000..e586da2c802 --- /dev/null +++ b/cpp/examples/tpch/utils.hpp @@ -0,0 +1,457 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +// RMM memory resource creation utilities +inline auto make_cuda() { return std::make_shared(); } +inline auto make_pool() +{ + return rmm::mr::make_owning_wrapper( + make_cuda(), rmm::percent_of_free_device_memory(50)); +} +inline auto make_managed() { return std::make_shared(); } +inline auto make_managed_pool() +{ + return rmm::mr::make_owning_wrapper( + make_managed(), rmm::percent_of_free_device_memory(50)); +} +inline std::shared_ptr create_memory_resource( + std::string const& mode) +{ + if (mode == "cuda") return make_cuda(); + if (mode == "pool") return make_pool(); + if (mode == "managed") return make_managed(); + if (mode == "managed_pool") return make_managed_pool(); + CUDF_FAIL("Unknown rmm_mode parameter: " + mode + + "\nExpecting: cuda, pool, managed, or managed_pool"); +} + +/** + * @brief A class to represent a table with column names attached + */ +class table_with_names { + public: + table_with_names(std::unique_ptr tbl, std::vector col_names) + : tbl(std::move(tbl)), col_names(col_names) + { + } + /** + * @brief Return the table view + */ + [[nodiscard]] cudf::table_view table() const { return tbl->view(); } + /** + * @brief Return the column view for a given column name + * + * @param col_name The name of the column + */ + [[nodiscard]] cudf::column_view column(std::string const& col_name) const + { + return tbl->view().column(col_id(col_name)); + } + /** + * @param Return the column names of the table + */ + [[nodiscard]] std::vector column_names() const { return col_names; } + /** + * @brief Translate a column name to a column index + * + * @param col_name The name of the column + */ + [[nodiscard]] cudf::size_type col_id(std::string const& col_name) 
const + { + CUDF_FUNC_RANGE(); + auto it = std::find(col_names.begin(), col_names.end(), col_name); + if (it == col_names.end()) { throw std::runtime_error("Column not found"); } + return std::distance(col_names.begin(), it); + } + /** + * @brief Append a column to the table + * + * @param col The column to append + * @param col_name The name of the appended column + */ + table_with_names& append(std::unique_ptr& col, std::string const& col_name) + { + CUDF_FUNC_RANGE(); + auto cols = tbl->release(); + cols.push_back(std::move(col)); + tbl = std::make_unique(std::move(cols)); + col_names.push_back(col_name); + return (*this); + } + /** + * @brief Select a subset of columns from the table + * + * @param col_names The names of the columns to select + */ + [[nodiscard]] cudf::table_view select(std::vector const& col_names) const + { + CUDF_FUNC_RANGE(); + std::vector col_indices; + for (auto const& col_name : col_names) { + col_indices.push_back(col_id(col_name)); + } + return tbl->select(col_indices); + } + /** + * @brief Write the table to a parquet file + * + * @param filepath The path to the parquet file + */ + void to_parquet(std::string const& filepath) const + { + CUDF_FUNC_RANGE(); + auto const sink_info = cudf::io::sink_info(filepath); + cudf::io::table_metadata metadata; + metadata.schema_info = + std::vector(col_names.begin(), col_names.end()); + auto const table_input_metadata = cudf::io::table_input_metadata{metadata}; + auto builder = cudf::io::parquet_writer_options::builder(sink_info, tbl->view()); + builder.metadata(table_input_metadata); + auto const options = builder.build(); + cudf::io::write_parquet(options); + } + + private: + std::unique_ptr tbl; + std::vector col_names; +}; + +/** + * @brief Concatenate two vectors + * + * @param lhs The left vector + * @param rhs The right vector + */ +template +std::vector concat(std::vector const& lhs, std::vector const& rhs) +{ + std::vector result; + result.reserve(lhs.size() + rhs.size()); + std::copy(lhs.begin(), lhs.end(), std::back_inserter(result)); + std::copy(rhs.begin(), rhs.end(), std::back_inserter(result)); + return result; +} + +/** + * @brief Inner join two tables and gather the result + * + * @param left_input The left input table + * @param right_input The right input table + * @param left_on The columns to join on in the left table + * @param right_on The columns to join on in the right table + * @param compare_nulls The null equality policy + */ +[[nodiscard]] std::unique_ptr join_and_gather( + cudf::table_view const& left_input, + cudf::table_view const& right_input, + std::vector const& left_on, + std::vector const& right_on, + cudf::null_equality compare_nulls) +{ + CUDF_FUNC_RANGE(); + constexpr auto oob_policy = cudf::out_of_bounds_policy::DONT_CHECK; + auto const left_selected = left_input.select(left_on); + auto const right_selected = right_input.select(right_on); + auto const [left_join_indices, right_join_indices] = cudf::inner_join( + left_selected, right_selected, compare_nulls, rmm::mr::get_current_device_resource()); + + auto const left_indices_span = cudf::device_span{*left_join_indices}; + auto const right_indices_span = cudf::device_span{*right_join_indices}; + + auto const left_indices_col = cudf::column_view{left_indices_span}; + auto const right_indices_col = cudf::column_view{right_indices_span}; + + auto const left_result = cudf::gather(left_input, left_indices_col, oob_policy); + auto const right_result = cudf::gather(right_input, right_indices_col, oob_policy); + + auto joined_cols = 
left_result->release(); + auto right_cols = right_result->release(); + joined_cols.insert(joined_cols.end(), + std::make_move_iterator(right_cols.begin()), + std::make_move_iterator(right_cols.end())); + return std::make_unique(std::move(joined_cols)); +} + +/** + * @brief Apply an inner join operation to two tables + * + * @param left_input The left input table + * @param right_input The right input table + * @param left_on The columns to join on in the left table + * @param right_on The columns to join on in the right table + * @param compare_nulls The null equality policy + */ +[[nodiscard]] std::unique_ptr apply_inner_join( + std::unique_ptr const& left_input, + std::unique_ptr const& right_input, + std::vector const& left_on, + std::vector const& right_on, + cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) +{ + CUDF_FUNC_RANGE(); + std::vector left_on_indices; + std::vector right_on_indices; + std::transform( + left_on.begin(), left_on.end(), std::back_inserter(left_on_indices), [&](auto const& col_name) { + return left_input->col_id(col_name); + }); + std::transform(right_on.begin(), + right_on.end(), + std::back_inserter(right_on_indices), + [&](auto const& col_name) { return right_input->col_id(col_name); }); + auto table = join_and_gather( + left_input->table(), right_input->table(), left_on_indices, right_on_indices, compare_nulls); + return std::make_unique( + std::move(table), concat(left_input->column_names(), right_input->column_names())); +} + +/** + * @brief Apply a filter predicated to a table + * + * @param table The input table + * @param predicate The filter predicate + */ +[[nodiscard]] std::unique_ptr apply_filter( + std::unique_ptr const& table, cudf::ast::operation const& predicate) +{ + CUDF_FUNC_RANGE(); + auto const boolean_mask = cudf::compute_column(table->table(), predicate); + auto result_table = cudf::apply_boolean_mask(table->table(), boolean_mask->view()); + return std::make_unique(std::move(result_table), table->column_names()); +} + +/** + * @brief Apply a boolean mask to a table + * + * @param table The input table + * @param mask The boolean mask + */ +[[nodiscard]] std::unique_ptr apply_mask( + std::unique_ptr const& table, std::unique_ptr const& mask) +{ + CUDF_FUNC_RANGE(); + auto result_table = cudf::apply_boolean_mask(table->table(), mask->view()); + return std::make_unique(std::move(result_table), table->column_names()); +} + +struct groupby_context_t { + std::vector keys; + std::unordered_map>> + values; +}; + +/** + * @brief Apply a groupby operation to a table + * + * @param table The input table + * @param ctx The groupby context + */ +[[nodiscard]] std::unique_ptr apply_groupby( + std::unique_ptr const& table, groupby_context_t const& ctx) +{ + CUDF_FUNC_RANGE(); + auto const keys = table->select(ctx.keys); + cudf::groupby::groupby groupby_obj(keys); + std::vector result_column_names; + result_column_names.insert(result_column_names.end(), ctx.keys.begin(), ctx.keys.end()); + std::vector requests; + for (auto& [value_col, aggregations] : ctx.values) { + requests.emplace_back(cudf::groupby::aggregation_request()); + for (auto& agg : aggregations) { + if (agg.first == cudf::aggregation::Kind::SUM) { + requests.back().aggregations.push_back( + cudf::make_sum_aggregation()); + } else if (agg.first == cudf::aggregation::Kind::MEAN) { + requests.back().aggregations.push_back( + cudf::make_mean_aggregation()); + } else if (agg.first == cudf::aggregation::Kind::COUNT_ALL) { + requests.back().aggregations.push_back( + 
cudf::make_count_aggregation()); + } else { + throw std::runtime_error("Unsupported aggregation"); + } + result_column_names.push_back(agg.second); + } + requests.back().values = table->column(value_col); + } + auto agg_results = groupby_obj.aggregate(requests); + std::vector> result_columns; + for (size_t i = 0; i < agg_results.first->num_columns(); i++) { + auto col = std::make_unique(agg_results.first->get_column(i)); + result_columns.push_back(std::move(col)); + } + for (size_t i = 0; i < agg_results.second.size(); i++) { + for (size_t j = 0; j < agg_results.second[i].results.size(); j++) { + result_columns.push_back(std::move(agg_results.second[i].results[j])); + } + } + auto result_table = std::make_unique(std::move(result_columns)); + return std::make_unique(std::move(result_table), result_column_names); +} + +/** + * @brief Apply an order by operation to a table + * + * @param table The input table + * @param sort_keys The sort keys + * @param sort_key_orders The sort key orders + */ +[[nodiscard]] std::unique_ptr apply_orderby( + std::unique_ptr const& table, + std::vector const& sort_keys, + std::vector const& sort_key_orders) +{ + CUDF_FUNC_RANGE(); + std::vector column_views; + for (auto& key : sort_keys) { + column_views.push_back(table->column(key)); + } + auto result_table = + cudf::sort_by_key(table->table(), cudf::table_view{column_views}, sort_key_orders); + return std::make_unique(std::move(result_table), table->column_names()); +} + +/** + * @brief Apply a reduction operation to a column + * + * @param column The input column + * @param agg_kind The aggregation kind + * @param col_name The name of the output column + */ +[[nodiscard]] std::unique_ptr apply_reduction( + cudf::column_view const& column, + cudf::aggregation::Kind const& agg_kind, + std::string const& col_name) +{ + CUDF_FUNC_RANGE(); + auto const agg = cudf::make_sum_aggregation(); + auto const result = cudf::reduce(column, *agg, column.type()); + cudf::size_type const len = 1; + auto col = cudf::make_column_from_scalar(*result, len); + std::vector> columns; + columns.push_back(std::move(col)); + auto result_table = std::make_unique(std::move(columns)); + std::vector col_names = {col_name}; + return std::make_unique(std::move(result_table), col_names); +} + +/** + * @brief Read a parquet file into a table + * + * @param filename The path to the parquet file + * @param columns The columns to read + * @param predicate The filter predicate to pushdown + */ +[[nodiscard]] std::unique_ptr read_parquet( + std::string const& filename, + std::vector const& columns = {}, + std::unique_ptr const& predicate = nullptr) +{ + CUDF_FUNC_RANGE(); + auto const source = cudf::io::source_info(filename); + auto builder = cudf::io::parquet_reader_options_builder(source); + if (!columns.empty()) { builder.columns(columns); } + if (predicate) { builder.filter(*predicate); } + auto const options = builder.build(); + auto table_with_metadata = cudf::io::read_parquet(options); + std::vector column_names; + for (auto const& col_info : table_with_metadata.metadata.schema_info) { + column_names.push_back(col_info.name); + } + return std::make_unique(std::move(table_with_metadata.tbl), column_names); +} + +/** + * @brief Generate the `std::tm` structure from year, month, and day + * + * @param year The year + * @param month The month + * @param day The day + */ +std::tm make_tm(int year, int month, int day) +{ + std::tm tm{}; + tm.tm_year = year - 1900; + tm.tm_mon = month - 1; + tm.tm_mday = day; + return tm; +} + +/** + * @brief 
Calculate the number of days since the UNIX epoch + * + * @param year The year + * @param month The month + * @param day The day + */ +int32_t days_since_epoch(int year, int month, int day) +{ + std::tm tm = make_tm(year, month, day); + std::tm epoch = make_tm(1970, 1, 1); + std::time_t time = std::mktime(&tm); + std::time_t epoch_time = std::mktime(&epoch); + double diff = std::difftime(time, epoch_time) / (60 * 60 * 24); + return static_cast(diff); +} + +struct tpch_example_args { + std::string dataset_dir; + std::string memory_resource_type; +}; + +/** + * @brief Parse command line arguments into a struct + * + * @param argc The number of command line arguments + * @param argv The command line arguments + */ +tpch_example_args parse_args(int argc, char const** argv) +{ + if (argc < 3) { + std::string usage_message = "Usage: " + std::string(argv[0]) + + " \n The query result will be " + "saved to a parquet file named q{query_no}.parquet in the current " + "working directory "; + throw std::runtime_error(usage_message); + } + tpch_example_args args; + args.dataset_dir = argv[1]; + args.memory_resource_type = argv[2]; + return args; +} diff --git a/cpp/examples/utilities/timer.hpp b/cpp/examples/utilities/timer.hpp new file mode 100644 index 00000000000..65fa92e74cf --- /dev/null +++ b/cpp/examples/utilities/timer.hpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +namespace cudf { +namespace examples { +/** + * @brief Light-weight timer for measuring elapsed time. + * + * A timer object constructed from std::chrono, instrumenting at microseconds + * precision. Can display elapsed durations at milli and micro second + * scales. The timer starts at object construction. + */ +class timer { + public: + using micros = std::chrono::microseconds; + using millis = std::chrono::milliseconds; + + timer() { reset(); } + void reset() { start_time = std::chrono::high_resolution_clock::now(); } + auto elapsed() const { return (std::chrono::high_resolution_clock::now() - start_time); } + void print_elapsed_micros() const + { + std::cout << "Elapsed Time: " << std::chrono::duration_cast(elapsed()).count() + << "us\n\n"; + } + void print_elapsed_millis() const + { + std::cout << "Elapsed Time: " << std::chrono::duration_cast(elapsed()).count() + << "ms\n\n"; + } + + private: + using time_point_t = std::chrono::time_point; + time_point_t start_time; +}; + +} // namespace examples +}; // namespace cudf diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp index 22dad11e109..c74c91e39c2 100644 --- a/cpp/include/cudf/binaryop.hpp +++ b/cpp/include/cudf/binaryop.hpp @@ -290,6 +290,17 @@ cudf::data_type binary_operation_fixed_point_output_type(binary_operator op, namespace binops { +/** + * @brief Returns true if the binary operator is supported for the given input types. 
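For example, a caller can use this predicate to guard a dispatch instead of relying on cudf::binary_operation to throw. A hedged usage sketch, assuming the declaration below is reachable as cudf::binops::is_supported_operation and using only types from the public cudf headers:

#include <cudf/binaryop.hpp>
#include <cudf/types.hpp>

// Returns true if lhs + rhs can be computed into a FLOAT64 column.
// Assumes the function declared below is exported as cudf::binops::is_supported_operation.
bool can_add_as_double(cudf::data_type lhs, cudf::data_type rhs)
{
  return cudf::binops::is_supported_operation(
    cudf::data_type{cudf::type_id::FLOAT64}, lhs, rhs, cudf::binary_operator::ADD);
}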
+ * + * @param out The output data type + * @param lhs The left-hand cudf::data_type + * @param rhs The right-hand cudf::data_type + * @param op The binary operator + * @return true if the binary operator is supported for the given input types + */ +bool is_supported_operation(data_type out, data_type lhs, data_type rhs, binary_operator op); + /** * @brief Computes output valid mask for op between a column and a scalar * diff --git a/cpp/include/cudf/detail/distinct_hash_join.cuh b/cpp/include/cudf/detail/distinct_hash_join.cuh index 1ef8b3b120a..c3bc3ad89fa 100644 --- a/cpp/include/cudf/detail/distinct_hash_join.cuh +++ b/cpp/include/cudf/detail/distinct_hash_join.cuh @@ -42,17 +42,6 @@ template struct comparator_adapter { comparator_adapter(Equal const& d_equal) : _d_equal{d_equal} {} - // suppress "function was declared but never referenced warning" -#pragma nv_diagnostic push -#pragma nv_diag_suppress 177 - __device__ constexpr auto operator()( - cuco::pair const&, - cuco::pair const&) const noexcept - { - // All build table keys are distinct thus `false` no matter what - return false; - } - __device__ constexpr auto operator()( cuco::pair const&, cuco::pair const&) const noexcept @@ -69,15 +58,6 @@ struct comparator_adapter { return _d_equal(lhs.second, rhs.second); } - __device__ constexpr auto operator()( - cuco::pair const& lhs, - cuco::pair const& rhs) const noexcept - { - if (lhs.first != rhs.first) { return false; } - return _d_equal(lhs.second, rhs.second); - } -#pragma nv_diagnostic pop - private: Equal _d_equal; }; diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh index f1775c6d6d7..5007af7f9f1 100644 --- a/cpp/include/cudf/detail/utilities/cuda.cuh +++ b/cpp/include/cudf/detail/utilities/cuda.cuh @@ -41,8 +41,8 @@ static constexpr size_type warp_size{32}; */ class grid_1d { public: - int const num_threads_per_block; - int const num_blocks; + thread_index_type const num_threads_per_block; + thread_index_type const num_blocks; /** * @param overall_num_elements The number of elements the kernel needs to * handle/process, in its main, one-dimensional/linear input (e.g. one or more @@ -55,9 +55,9 @@ class grid_1d { * than a single element; this affects the number of threads the grid must * contain */ - grid_1d(cudf::size_type overall_num_elements, - cudf::size_type num_threads_per_block, - cudf::size_type elements_per_thread = 1) + grid_1d(thread_index_type overall_num_elements, + thread_index_type num_threads_per_block, + thread_index_type elements_per_thread = 1) : num_threads_per_block(num_threads_per_block), num_blocks(util::div_rounding_up_safe(overall_num_elements, elements_per_thread * num_threads_per_block)) diff --git a/cpp/include/cudf/dictionary/detail/update_keys.hpp b/cpp/include/cudf/dictionary/detail/update_keys.hpp index e8486a80afc..9cdda773dbb 100644 --- a/cpp/include/cudf/dictionary/detail/update_keys.hpp +++ b/cpp/include/cudf/dictionary/detail/update_keys.hpp @@ -29,7 +29,7 @@ namespace dictionary { namespace detail { /** * @copydoc cudf::dictionary::add_keys(dictionary_column_view const&,column_view - * const&,mm::mr::device_memory_resource*) + * const&,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ @@ -40,7 +40,7 @@ std::unique_ptr add_keys(dictionary_column_view const& dictionary_column /** * @copydoc cudf::dictionary::remove_keys(dictionary_column_view const&,column_view - * const&,mm::mr::device_memory_resource*) + * const&,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -51,7 +51,7 @@ std::unique_ptr remove_keys(dictionary_column_view const& dictionary_col /** * @copydoc cudf::dictionary::remove_unused_keys(dictionary_column_view - * const&,mm::mr::device_memory_resource*) + * const&,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -61,7 +61,7 @@ std::unique_ptr remove_unused_keys(dictionary_column_view const& diction /** * @copydoc cudf::dictionary::set_keys(dictionary_column_view - * const&,mm::mr::device_memory_resource*) + * const&,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -72,7 +72,7 @@ std::unique_ptr set_keys(dictionary_column_view const& dictionary_column /** * @copydoc - * cudf::dictionary::match_dictionaries(std::vector,mm::mr::device_memory_resource*) + * cudf::dictionary::match_dictionaries(std::vector,rmm::device_async_resource_ref) * * @param stream CUDA stream used for device memory operations and kernel launches. */ diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp index 6c3c3b4da07..c9cbc603226 100644 --- a/cpp/include/cudf/fixed_point/fixed_point.hpp +++ b/cpp/include/cudf/fixed_point/fixed_point.hpp @@ -84,8 +84,8 @@ template && - is_supported_representation_type())>* = nullptr> -CUDF_HOST_DEVICE inline Rep ipow(T exponent) + cuda::std::is_integral_v)>* = nullptr> +CUDF_HOST_DEVICE inline constexpr Rep ipow(T exponent) { cudf_assert(exponent >= 0 && "integer exponentiation with negative exponent is not possible."); diff --git a/cpp/include/cudf/fixed_point/floating_conversion.hpp b/cpp/include/cudf/fixed_point/floating_conversion.hpp index 2c3a5c5629d..f12177c6a4b 100644 --- a/cpp/include/cudf/fixed_point/floating_conversion.hpp +++ b/cpp/include/cudf/fixed_point/floating_conversion.hpp @@ -18,6 +18,7 @@ #include +#include #include #include @@ -34,6 +35,49 @@ namespace numeric { namespace detail { +/** + * @brief Determine the number of significant bits in an integer + * + * @tparam T Type of input integer value. 
Must be either uint32_t, uint64_t, or __uint128_t + * @param value The integer whose bits are being counted + * @return The number of significant bits: the # of bits - # of leading zeroes + */ +template || std::is_same_v || + std::is_same_v)> +CUDF_HOST_DEVICE inline int count_significant_bits(T value) +{ +#ifdef __CUDA_ARCH__ + if constexpr (std::is_same_v) { + return 64 - __clzll(static_cast(value)); + } else if constexpr (std::is_same_v) { + return 32 - __clz(static_cast(value)); + } else if constexpr (std::is_same_v) { + // 128 bit type, must break up into high and low components + auto const high_bits = static_cast(value >> 64); + auto const low_bits = static_cast(value); + return 128 - (__clzll(high_bits) + static_cast(high_bits == 0) * __clzll(low_bits)); + } +#else + // Undefined behavior to call __builtin_clzll() with zero in gcc and clang + if (value == 0) { return 0; } + + if constexpr (std::is_same_v) { + return 64 - __builtin_clzll(value); + } else if constexpr (std::is_same_v) { + return 32 - __builtin_clz(value); + } else if constexpr (std::is_same_v) { + // 128 bit type, must break up into high and low components + auto const high_bits = static_cast(value >> 64); + if (high_bits == 0) { + return 64 - __builtin_clzll(static_cast(value)); + } else { + return 128 - __builtin_clzll(high_bits); + } + } +#endif +} + /** * @brief Helper struct for getting and setting the components of a floating-point value * @@ -62,27 +106,28 @@ struct floating_converter { // The low 23 / 52 bits (for float / double) are the mantissa. // The mantissa is normalized. There is an understood 1 bit to the left of the binary point. // The value of the mantissa is in the range [1, 2). - /// # mantissa bits (-1 for understood bit) - static constexpr int num_mantissa_bits = cuda::std::numeric_limits::digits - 1; + /// # significand bits (includes understood bit) + static constexpr int num_significand_bits = cuda::std::numeric_limits::digits; + /// # stored mantissa bits (-1 for understood bit) + static constexpr int num_stored_mantissa_bits = num_significand_bits - 1; /// The mask for the understood bit - static constexpr IntegralType understood_bit_mask = (IntegralType(1) << num_mantissa_bits); + static constexpr IntegralType understood_bit_mask = (IntegralType(1) << num_stored_mantissa_bits); /// The mask to select the mantissa static constexpr IntegralType mantissa_mask = understood_bit_mask - 1; // And in between are the bits used to store the biased power-of-2 exponent. /// # exponents bits (-1 for sign bit) - static constexpr int num_exponent_bits = num_floating_bits - num_mantissa_bits - 1; + static constexpr int num_exponent_bits = num_floating_bits - num_stored_mantissa_bits - 1; /// The mask for the exponents, unshifted static constexpr IntegralType unshifted_exponent_mask = (IntegralType(1) << num_exponent_bits) - 1; /// The mask to select the exponents - static constexpr IntegralType exponent_mask = unshifted_exponent_mask << num_mantissa_bits; + static constexpr IntegralType exponent_mask = unshifted_exponent_mask << num_stored_mantissa_bits; // To store positive and negative exponents as unsigned values, the stored value for // the power-of-2 is exponent + bias. The bias is 127 for floats and 1023 for doubles. 
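Concretely, for IEEE-754 binary32 and binary64 these constants come out to 24/53 significand bits, 23/52 stored mantissa bits, 8/11 exponent bits, and biases of 127/1023. A host-only sketch that double-checks those values against <limits> (it mirrors the definitions here rather than using the converter):

#include <limits>

// binary32: 24-bit significand (23 stored), 8 exponent bits, bias 127
static_assert(std::numeric_limits<float>::digits == 24);
static_assert(std::numeric_limits<float>::max_exponent - 1 == 127);
static_assert(sizeof(float) * 8 - (std::numeric_limits<float>::digits - 1) - 1 == 8);
// binary64: 53-bit significand (52 stored), 11 exponent bits, bias 1023
static_assert(std::numeric_limits<double>::digits == 53);
static_assert(std::numeric_limits<double>::max_exponent - 1 == 1023);
static_assert(sizeof(double) * 8 - (std::numeric_limits<double>::digits - 1) - 1 == 11);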
/// 127 / 1023 for float / double - static constexpr IntegralType exponent_bias = - cuda::std::numeric_limits::max_exponent - 1; + static constexpr int exponent_bias = cuda::std::numeric_limits::max_exponent - 1; /** * @brief Reinterpret the bits of a floating-point value as an integer @@ -113,15 +158,15 @@ struct floating_converter { } /** - * @brief Extracts the integral significand of a bit-casted floating-point number + * @brief Checks whether the bit-casted floating-point value is +/-0 * - * @param integer_rep The bit-casted floating value to extract the exponent from - * @return The integral significand, bit-shifted to a (large) whole number + * @param integer_rep The bit-casted floating value to check if is +/-0 + * @return True if is a zero, else false */ - CUDF_HOST_DEVICE inline static IntegralType get_base2_value(IntegralType integer_rep) + CUDF_HOST_DEVICE inline static bool is_zero(IntegralType integer_rep) { - // Extract the significand, setting the high bit for the understood 1/2 - return (integer_rep & mantissa_mask) | understood_bit_mask; + // It's a zero if every non-sign bit is zero + return ((integer_rep & ~sign_mask) == 0); } /** @@ -137,40 +182,59 @@ struct floating_converter { } /** - * @brief Extracts the exponent of a bit-casted floating-point number + * @brief Extracts the significand and exponent of a bit-casted floating-point number, + * shifted for denormals. * - * @note This returns INT_MIN for +/-0, +/-inf, NaN's, and denormals - * For all of these cases, the decimal fixed_point number should be set to zero + * @note Zeros/inf/NaN not handled. * * @param integer_rep The bit-casted floating value to extract the exponent from - * @return The stored base-2 exponent, or INT_MIN for special values + * @return The stored base-2 exponent and significand, shifted for denormals */ - CUDF_HOST_DEVICE inline static int get_exp2(IntegralType integer_rep) + CUDF_HOST_DEVICE inline static std::pair get_significand_and_pow2( + IntegralType integer_rep) { - // First extract the exponent bits and handle its special values. - // To minimize branching, all of these special cases will return INT_MIN. - // For all of these cases, the decimal fixed_point number should be set to zero. + // Extract the significand + auto significand = (integer_rep & mantissa_mask); + + // Extract the exponent bits. auto const exponent_bits = integer_rep & exponent_mask; + + // Notes on special values of exponent_bits: + // bits = exponent_mask is +/-inf or NaN, but those are handled prior to input. + // bits = 0 is either a denormal (handled below) or a zero (handled earlier by caller). + int floating_pow2; if (exponent_bits == 0) { - // Because of the understood set-bit not stored in the mantissa, it is not possible - // to store the value zero directly. Instead both +/-0 and denormals are represented with - // the exponent bits set to zero. - // Thus it's fastest to just floor (generally unwanted) denormals to zero. - return INT_MIN; - } else if (exponent_bits == exponent_mask) { - //+/-inf and NaN values are stored with all of the exponent bits set. - // As none of these are representable by integers, we'll return the same value for all cases. - return INT_MIN; + // Denormal values are 2^(1 - exponent_bias) * Sum_i(B_i * 2^-i) + // Where i is the i-th mantissa bit (counting from the LEFT, starting at 1), + // and B_i is the value of that bit (0 or 1) + // So e.g. 
for the minimum denormal, only the lowest bit is set: + // FLT_TRUE_MIN = 2^(1 - 127) * 2^-23 = 2^-149 + // DBL_TRUE_MIN = 2^(1 - 1023) * 2^-52 = 2^-1074 + floating_pow2 = 1 - exponent_bias; + + // Line-up denormal to same (understood) bit as normal numbers + // This is so bit-shifting starts at the same bit index + auto const lineup_shift = num_significand_bits - count_significant_bits(significand); + significand <<= lineup_shift; + floating_pow2 -= lineup_shift; + } else { + // Extract the exponent value: shift the bits down and subtract the bias. + auto const shifted_exponent_bits = exponent_bits >> num_stored_mantissa_bits; + floating_pow2 = static_cast(shifted_exponent_bits) - exponent_bias; + + // Set the high bit for the understood 1/2 + significand |= understood_bit_mask; } - // Extract the exponent value: shift the bits down and subtract the bias. - using SignedIntegralType = cuda::std::make_signed_t; - SignedIntegralType const shifted_exponent_bits = exponent_bits >> num_mantissa_bits; - return shifted_exponent_bits - static_cast(exponent_bias); + // To convert the mantissa to an integer, we effectively applied #-mantissa-bits + // powers of 2 to convert the fractional value to an integer, so subtract them off here + int const pow2 = floating_pow2 - num_stored_mantissa_bits; + + return {significand, pow2}; } /** - * @brief Sets the sign bit of a positive floating-point number + * @brief Sets the sign bit of a floating-point number * * @param floating The floating-point value to set the sign of. Must be positive. * @param is_negative The sign bit to set for the floating-point number @@ -192,83 +256,60 @@ struct floating_converter { /** * @brief Adds to the base-2 exponent of a floating-point number * + * @note The caller must guarantee that the input is a positive (> 0) whole number. + * * @param floating The floating value to add to the exponent of. Must be positive. - * @param exp2 The power-of-2 to add to the floating-point number - * @return The input floating-point value * 2^exp2 + * @param pow2 The power-of-2 to add to the floating-point number + * @return The input floating-point value * 2^pow2 */ - CUDF_HOST_DEVICE inline static FloatingType add_exp2(FloatingType floating, int exp2) + CUDF_HOST_DEVICE inline static FloatingType add_pow2(FloatingType floating, int pow2) { + // Note that the input floating-point number is positive (& whole), so we don't have to + // worry about the sign here; the sign will be set later in set_is_negative() + // Convert floating to integer auto integer_rep = bit_cast_to_integer(floating); // Extract the currently stored (biased) exponent + using SignedType = std::make_signed_t; auto exponent_bits = integer_rep & exponent_mask; - auto stored_exp2 = exponent_bits >> num_mantissa_bits; + auto stored_pow2 = static_cast(exponent_bits >> num_stored_mantissa_bits); // Add the additional power-of-2 - stored_exp2 += exp2; + stored_pow2 += pow2; // Check for exponent over/under-flow. - // Note that the input floating-point number is always positive, so we don't have to - // worry about the sign here; the sign will be set later in set_is_negative() - if (stored_exp2 <= 0) { - return 0.0; - } else if (stored_exp2 >= unshifted_exponent_mask) { + if (stored_pow2 <= 0) { + // Denormal (zero handled prior to input) + + // Early out if bit shift will zero it anyway. 
+ // Note: We must handle this explicitly, as too-large a bit-shift is UB + auto const bit_shift = -stored_pow2 + 1; //+1 due to understood bit set below + if (bit_shift > num_stored_mantissa_bits) { return 0.0; } + + // Clear the exponent bits (zero means 2^-126/2^-1022 w/ no understood bit) + integer_rep &= (~exponent_mask); + + // The input floating-point number has an "understood" bit that we need to set + // prior to bit-shifting. Set the understood bit. + integer_rep |= understood_bit_mask; + + // Convert to denormal: bit shift off the low bits + integer_rep >>= bit_shift; + } else if (stored_pow2 >= static_cast(unshifted_exponent_mask)) { + // Overflow: Set infinity return cuda::std::numeric_limits::infinity(); } else { - // Clear existing exponent bits and set new ones - exponent_bits = stored_exp2 << num_mantissa_bits; + // Normal number: Clear existing exponent bits and set new ones + exponent_bits = static_cast(stored_pow2) << num_stored_mantissa_bits; integer_rep &= (~exponent_mask); integer_rep |= exponent_bits; - - // Convert back to float - return bit_cast_to_floating(integer_rep); } - } -}; - -/** - * @brief Determine the number of significant bits in an integer - * - * @tparam T Type of input integer value. Must be either uint32_t, uint64_t, or __uint128_t - * @param value The integer whose bits are being counted - * @return The number of significant bits: the # of bits - # of leading zeroes - */ -template || std::is_same_v || - std::is_same_v)> -CUDF_HOST_DEVICE inline int count_significant_bits(T value) -{ -#ifdef __CUDA_ARCH__ - if constexpr (std::is_same_v) { - return 64 - __clzll(static_cast(value)); - } else if constexpr (std::is_same_v) { - return 32 - __clz(static_cast(value)); - } else if constexpr (std::is_same_v) { - // 128 bit type, must break up into high and low components - auto const high_bits = static_cast(value >> 64); - auto const low_bits = static_cast(value); - return 128 - (__clzll(high_bits) + static_cast(high_bits == 0) * __clzll(low_bits)); - } -#else - // Undefined behavior to call __builtin_clzll() with zero in gcc and clang - if (value == 0) { return 0; } - if constexpr (std::is_same_v) { - return 64 - __builtin_clzll(value); - } else if constexpr (std::is_same_v) { - return 32 - __builtin_clz(value); - } else if constexpr (std::is_same_v) { - // 128 bit type, must break up into high and low components - auto const high_bits = static_cast(value >> 64); - if (high_bits == 0) { - return 64 - __builtin_clzll(static_cast(value)); - } else { - return 128 - __builtin_clzll(high_bits); - } + // Convert back to float + return bit_cast_to_floating(integer_rep); } -#endif -} +}; /** * @brief Recursively calculate a signed large power of 10 (>= 10^19) that can only be stored in an @@ -276,18 +317,18 @@ CUDF_HOST_DEVICE inline int count_significant_bits(T value) * * @note Intended to be run at compile time. * - * @tparam Exp10 The power of 10 to calculate - * @return Returns 10^Exp10 + * @tparam Pow10 The power of 10 to calculate + * @return Returns 10^Pow10 */ -template +template constexpr __uint128_t large_power_of_10() { // Stop at 10^19 to speed up compilation; literals can be used for smaller powers of 10. 
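10^20, for example, no longer fits in 64 bits, which is why the recursion bottoms out at a 10^19 literal. A hedged compile-time check, assuming the numeric::detail namespace and the cudf/fixed_point/floating_conversion.hpp header path shown in this diff:

#include <cudf/fixed_point/floating_conversion.hpp>

#include <cstdint>
#include <limits>

// 10^20 is one recursion step on the 10^19 literal and exceeds the uint64_t range.
static_assert(numeric::detail::large_power_of_10<20>() ==
              __uint128_t{10'000'000'000'000'000'000ULL} * 10U);
static_assert(numeric::detail::large_power_of_10<20>() >
              std::numeric_limits<std::uint64_t>::max());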
- static_assert(Exp10 >= 19); - if constexpr (Exp10 == 19) + static_assert(Pow10 >= 19); + if constexpr (Pow10 == 19) return __uint128_t(10000000000000000000ULL); else - return large_power_of_10() * __uint128_t(10); + return large_power_of_10() * __uint128_t(10); } /** @@ -295,11 +336,11 @@ constexpr __uint128_t large_power_of_10() * * @tparam T Type of value to be divided-from. * @param value The number to be divided-from. - * @param exp10 The power-of-10 of the denominator, from 0 to 9 inclusive. - * @return Returns value / 10^exp10 + * @param pow10 The power-of-10 of the denominator, from 0 to 9 inclusive. + * @return Returns value / 10^pow10 */ -template >* = nullptr> -CUDF_HOST_DEVICE inline T divide_power10_32bit(T value, int exp10) +template )> +CUDF_HOST_DEVICE inline T divide_power10_32bit(T value, int pow10) { // Computing division this way is much faster than the alternatives. // Division is not implemented in GPU hardware, and the compiler will often implement it as a @@ -309,7 +350,7 @@ CUDF_HOST_DEVICE inline T divide_power10_32bit(T value, int exp10) // Instead, if the compiler can see exactly what number it is dividing by, it can // produce much more optimal assembly, doing bit shifting, multiplies by a constant, etc. - // For the compiler to see the value though, array lookup (with exp10 as the index) + // For the compiler to see the value though, array lookup (with pow10 as the index) // is not sufficient: We have to use a switch statement. Although this introduces a branch, // it is still much faster than doing the divide any other way. // Perhaps an array can be used in C++23 with the assume attribute? @@ -325,7 +366,7 @@ CUDF_HOST_DEVICE inline T divide_power10_32bit(T value, int exp10) // introduces too much pressure on the kernels that use this code, slowing down their benchmarks. // It also dramatically slows down the compile time. - switch (exp10) { + switch (pow10) { case 0: return value; case 1: return value / 10U; case 2: return value / 100U; @@ -345,36 +386,13 @@ CUDF_HOST_DEVICE inline T divide_power10_32bit(T value, int exp10) * * @tparam T Type of value to be divided-from. * @param value The number to be divided-from. - * @param exp10 The power-of-10 of the denominator, from 0 to 19 inclusive. - * @return Returns value / 10^exp10 + * @param pow10 The power-of-10 of the denominator, from 0 to 19 inclusive. + * @return Returns value / 10^pow10 */ -template >* = nullptr> -CUDF_HOST_DEVICE inline T divide_power10_64bit(T value, int exp10) +template )> +CUDF_HOST_DEVICE inline T divide_power10_64bit(T value, int pow10) { - // See comments in divide_power10_32bit() for discussion. 
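The rewrite that follows does not change observable behavior: the helper still performs truncating integer division by 10^pow10, now with the divisor computed by ipow(). A hedged usage sketch, under the same namespace and header assumptions as above:

#include <cudf/fixed_point/floating_conversion.hpp>

#include <cassert>
#include <cstdint>

int main()
{
  using numeric::detail::divide_power10_64bit;
  std::uint64_t const value = 123'456'789ULL;
  assert(divide_power10_64bit(value, 0) == 123'456'789ULL);  // 10^0: unchanged
  assert(divide_power10_64bit(value, 3) == 123'456ULL);      // truncating divide by 1000
  assert(divide_power10_64bit(value, 9) == 0ULL);            // value < 10^9
  return 0;
}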
- switch (exp10) { - case 0: return value; - case 1: return value / 10U; - case 2: return value / 100U; - case 3: return value / 1000U; - case 4: return value / 10000U; - case 5: return value / 100000U; - case 6: return value / 1000000U; - case 7: return value / 10000000U; - case 8: return value / 100000000U; - case 9: return value / 1000000000U; - case 10: return value / 10000000000ULL; - case 11: return value / 100000000000ULL; - case 12: return value / 1000000000000ULL; - case 13: return value / 10000000000000ULL; - case 14: return value / 100000000000000ULL; - case 15: return value / 1000000000000000ULL; - case 16: return value / 10000000000000000ULL; - case 17: return value / 100000000000000000ULL; - case 18: return value / 1000000000000000000ULL; - case 19: return value / 10000000000000000000ULL; - default: return 0; - } + return value / ipow(pow10); } /** @@ -382,55 +400,13 @@ CUDF_HOST_DEVICE inline T divide_power10_64bit(T value, int exp10) * * @tparam T Type of value to be divided-from. * @param value The number to be divided-from. - * @param exp10 The power-of-10 of the denominator, from 0 to 38 inclusive. - * @return Returns value / 10^exp10. + * @param pow10 The power-of-10 of the denominator, from 0 to 38 inclusive. + * @return Returns value / 10^pow10. */ -template >* = nullptr> -CUDF_HOST_DEVICE inline constexpr T divide_power10_128bit(T value, int exp10) +template )> +CUDF_HOST_DEVICE inline constexpr T divide_power10_128bit(T value, int pow10) { - // See comments in divide_power10_32bit() for an introduction. - switch (exp10) { - case 0: return value; - case 1: return value / 10U; - case 2: return value / 100U; - case 3: return value / 1000U; - case 4: return value / 10000U; - case 5: return value / 100000U; - case 6: return value / 1000000U; - case 7: return value / 10000000U; - case 8: return value / 100000000U; - case 9: return value / 1000000000U; - case 10: return value / 10000000000ULL; - case 11: return value / 100000000000ULL; - case 12: return value / 1000000000000ULL; - case 13: return value / 10000000000000ULL; - case 14: return value / 100000000000000ULL; - case 15: return value / 1000000000000000ULL; - case 16: return value / 10000000000000000ULL; - case 17: return value / 100000000000000000ULL; - case 18: return value / 1000000000000000000ULL; - case 19: return value / 10000000000000000000ULL; - case 20: return value / large_power_of_10<20>(); - case 21: return value / large_power_of_10<21>(); - case 22: return value / large_power_of_10<22>(); - case 23: return value / large_power_of_10<23>(); - case 24: return value / large_power_of_10<24>(); - case 25: return value / large_power_of_10<25>(); - case 26: return value / large_power_of_10<26>(); - case 27: return value / large_power_of_10<27>(); - case 28: return value / large_power_of_10<28>(); - case 29: return value / large_power_of_10<29>(); - case 30: return value / large_power_of_10<30>(); - case 31: return value / large_power_of_10<31>(); - case 32: return value / large_power_of_10<32>(); - case 33: return value / large_power_of_10<33>(); - case 34: return value / large_power_of_10<34>(); - case 35: return value / large_power_of_10<35>(); - case 36: return value / large_power_of_10<36>(); - case 37: return value / large_power_of_10<37>(); - case 38: return value / large_power_of_10<38>(); - default: return 0; - } + return value / ipow<__uint128_t, Radix::BASE_10>(pow10); } /** @@ -438,14 +414,14 @@ CUDF_HOST_DEVICE inline constexpr T divide_power10_128bit(T value, int exp10) * * @tparam T Type of value 
to be multiplied. * @param value The number to be multiplied. - * @param exp10 The power-of-10 of the multiplier, from 0 to 9 inclusive. - * @return Returns value * 10^exp10 + * @param pow10 The power-of-10 of the multiplier, from 0 to 9 inclusive. + * @return Returns value * 10^pow10 */ -template >* = nullptr> -CUDF_HOST_DEVICE inline constexpr T multiply_power10_32bit(T value, int exp10) +template )> +CUDF_HOST_DEVICE inline constexpr T multiply_power10_32bit(T value, int pow10) { // See comments in divide_power10_32bit() for discussion. - switch (exp10) { + switch (pow10) { case 0: return value; case 1: return value * 10U; case 2: return value * 100U; @@ -465,36 +441,13 @@ CUDF_HOST_DEVICE inline constexpr T multiply_power10_32bit(T value, int exp10) * * @tparam T Type of value to be multiplied. * @param value The number to be multiplied. - * @param exp10 The power-of-10 of the multiplier, from 0 to 19 inclusive. - * @return Returns value * 10^exp10 + * @param pow10 The power-of-10 of the multiplier, from 0 to 19 inclusive. + * @return Returns value * 10^pow10 */ -template >* = nullptr> -CUDF_HOST_DEVICE inline constexpr T multiply_power10_64bit(T value, int exp10) +template )> +CUDF_HOST_DEVICE inline constexpr T multiply_power10_64bit(T value, int pow10) { - // See comments in divide_power10_32bit() for discussion. - switch (exp10) { - case 0: return value; - case 1: return value * 10U; - case 2: return value * 100U; - case 3: return value * 1000U; - case 4: return value * 10000U; - case 5: return value * 100000U; - case 6: return value * 1000000U; - case 7: return value * 10000000U; - case 8: return value * 100000000U; - case 9: return value * 1000000000U; - case 10: return value * 10000000000ULL; - case 11: return value * 100000000000ULL; - case 12: return value * 1000000000000ULL; - case 13: return value * 10000000000000ULL; - case 14: return value * 100000000000000ULL; - case 15: return value * 1000000000000000ULL; - case 16: return value * 10000000000000000ULL; - case 17: return value * 100000000000000000ULL; - case 18: return value * 1000000000000000000ULL; - case 19: return value * 10000000000000000000ULL; - default: return 0; - } + return value * ipow(pow10); } /** @@ -502,113 +455,690 @@ CUDF_HOST_DEVICE inline constexpr T multiply_power10_64bit(T value, int exp10) * * @tparam T Type of value to be multiplied. * @param value The number to be multiplied. - * @param exp10 The power-of-10 of the multiplier, from 0 to 38 inclusive. - * @return Returns value * 10^exp10. + * @param pow10 The power-of-10 of the multiplier, from 0 to 38 inclusive. + * @return Returns value * 10^pow10. */ -template >* = nullptr> -CUDF_HOST_DEVICE inline constexpr T multiply_power10_128bit(T value, int exp10) +template )> +CUDF_HOST_DEVICE inline constexpr T multiply_power10_128bit(T value, int pow10) { - // See comments in divide_power10_128bit() for discussion. 
- switch (exp10) { - case 0: return value; - case 1: return value * 10U; - case 2: return value * 100U; - case 3: return value * 1000U; - case 4: return value * 10000U; - case 5: return value * 100000U; - case 6: return value * 1000000U; - case 7: return value * 10000000U; - case 8: return value * 100000000U; - case 9: return value * 1000000000U; - case 10: return value * 10000000000ULL; - case 11: return value * 100000000000ULL; - case 12: return value * 1000000000000ULL; - case 13: return value * 10000000000000ULL; - case 14: return value * 100000000000000ULL; - case 15: return value * 1000000000000000ULL; - case 16: return value * 10000000000000000ULL; - case 17: return value * 100000000000000000ULL; - case 18: return value * 1000000000000000000ULL; - case 19: return value * 10000000000000000000ULL; - case 20: return value * large_power_of_10<20>(); - case 21: return value * large_power_of_10<21>(); - case 22: return value * large_power_of_10<22>(); - case 23: return value * large_power_of_10<23>(); - case 24: return value * large_power_of_10<24>(); - case 25: return value * large_power_of_10<25>(); - case 26: return value * large_power_of_10<26>(); - case 27: return value * large_power_of_10<27>(); - case 28: return value * large_power_of_10<28>(); - case 29: return value * large_power_of_10<29>(); - case 30: return value * large_power_of_10<30>(); - case 31: return value * large_power_of_10<31>(); - case 32: return value * large_power_of_10<32>(); - case 33: return value * large_power_of_10<33>(); - case 34: return value * large_power_of_10<34>(); - case 35: return value * large_power_of_10<35>(); - case 36: return value * large_power_of_10<36>(); - case 37: return value * large_power_of_10<37>(); - case 38: return value * large_power_of_10<38>(); - default: return 0; - } + return value * ipow<__uint128_t, Radix::BASE_10>(pow10); } /** * @brief Multiply an integer by a power of 10. * - * @note Use this function if you have no a-priori knowledge of what exp10 might be. + * @note Use this function if you have no a-priori knowledge of what pow10 might be. * If you do, prefer calling the bit-size-specific versions * * @tparam Rep Representation type needed for integer exponentiation * @tparam T Integral type of value to be multiplied. * @param value The number to be multiplied. - * @param exp10 The power-of-10 of the multiplier. - * @return Returns value * 10^exp10 + * @param pow10 The power-of-10 of the multiplier. + * @return Returns value * 10^pow10 */ -template )>* = nullptr> -CUDF_HOST_DEVICE inline constexpr T multiply_power10(T value, int exp10) +template )> +CUDF_HOST_DEVICE inline constexpr T multiply_power10(T value, int pow10) { - // Use this function if you have no knowledge of what exp10 might be + // Use this function if you have no knowledge of what pow10 might be // If you do, prefer calling the bit-size-specific versions if constexpr (sizeof(Rep) <= 4) { - return multiply_power10_32bit(value, exp10); + return multiply_power10_32bit(value, pow10); } else if constexpr (sizeof(Rep) <= 8) { - return multiply_power10_64bit(value, exp10); + return multiply_power10_64bit(value, pow10); } else { - return multiply_power10_128bit(value, exp10); + return multiply_power10_128bit(value, pow10); } } /** * @brief Divide an integer by a power of 10. * - * @note Use this function if you have no a-priori knowledge of what exp10 might be. + * @note Use this function if you have no a-priori knowledge of what pow10 might be. 
* If you do, prefer calling the bit-size-specific versions * * @tparam Rep Representation type needed for integer exponentiation * @tparam T Integral type of value to be divided-from. * @param value The number to be divided-from. - * @param exp10 The power-of-10 of the denominator. - * @return Returns value / 10^exp10 + * @param pow10 The power-of-10 of the denominator. + * @return Returns value / 10^pow10 */ -template )>* = nullptr> -CUDF_HOST_DEVICE inline constexpr T divide_power10(T value, int exp10) +template )> +CUDF_HOST_DEVICE inline constexpr T divide_power10(T value, int pow10) { - // Use this function if you have no knowledge of what exp10 might be + // Use this function if you have no knowledge of what pow10 might be // If you do, prefer calling the bit-size-specific versions if constexpr (sizeof(Rep) <= 4) { - return divide_power10_32bit(value, exp10); + return divide_power10_32bit(value, pow10); } else if constexpr (sizeof(Rep) <= 8) { - return divide_power10_64bit(value, exp10); + return divide_power10_64bit(value, pow10); } else { - return divide_power10_128bit(value, exp10); + return divide_power10_128bit(value, pow10); } } +/** + * @brief Perform a bit-shift left, guarding against undefined behavior + * + * @tparam IntegerType Type of input unsigned integer value + * @param value The integer whose bits are being shifted + * @param bit_shift The number of bits to shift left + * @return The bit-shifted integer, except max value if UB would occur + */ +template )> +CUDF_HOST_DEVICE inline IntegerType guarded_left_shift(IntegerType value, int bit_shift) +{ + // Bit shifts larger than this are undefined behavior + constexpr int max_safe_bit_shift = cuda::std::numeric_limits::digits - 1; + return (bit_shift <= max_safe_bit_shift) ? value << bit_shift + : cuda::std::numeric_limits::max(); +} + +/** + * @brief Perform a bit-shift right, guarding against undefined behavior + * + * @tparam IntegerType Type of input unsigned integer value + * @param value The integer whose bits are being shifted + * @param bit_shift The number of bits to shift right + * @return The bit-shifted integer, which is zero on underflow + */ +template )> +CUDF_HOST_DEVICE inline IntegerType guarded_right_shift(IntegerType value, int bit_shift) +{ + // Bit shifts larger than this are undefined behavior + constexpr int max_safe_bit_shift = cuda::std::numeric_limits::digits - 1; + return (bit_shift <= max_safe_bit_shift) ? 
value >> bit_shift : 0; +} + +/** + * @brief Helper struct with common constants needed by the floating <--> decimal conversions + */ +template +struct shifting_constants { + /// Whether the type is double + static constexpr bool is_double = cuda::std::is_same_v; + + /// Integer type that can hold the value of the significand + using IntegerRep = std::conditional_t; + + /// Num bits needed to hold the significand + static constexpr auto num_significand_bits = cuda::std::numeric_limits::digits; + + /// Shift data back and forth in space of a type with 2x the starting bits, to give us enough room + using ShiftingRep = std::conditional_t; + + // The significand of a float / double is 24 / 53 bits + // However, to uniquely represent each double / float as different #'s in decimal + // you need 17 / 9 digits (from std::numeric_limits::max_digits10) + // To represent 10^17 / 10^9, you need 57 / 30 bits + // So we need to keep track of at least this # of bits during shifting to ensure no info is lost + + // We will be alternately shifting our data back and forth by powers of 2 and 10 to convert + // between floating and decimal (see shifting functions for details). + + // To iteratively shift back and forth, our 2's (bit-) and 10's (divide-/multiply-) shifts must + // be of nearly the same magnitude, or else we'll over-/under-flow our shifting integer + + // 2^10 is approximately 10^3, so the largest shifts will have a 10/3 ratio + // The difference between 2^10 and 10^3 is 1024/1000: 2.4% + // So every time we shift by 10 bits and 3 decimal places, the 2s shift is an extra 2.4% + + // This 2.4% error compounds each time we do an iteration. + // The min (normal) float is 2^-126. + // Min denormal: 2^-126 * 2^-23 (mantissa bits): 2^-149 = ~1.4E-45 + // With our 10/3 shifting ratio, 149 (bit-shifts) * (3 / 10) = 44.7 (10s-shifts) + // 10^(-44.7) = 2E-45, which is off by ~1.4x from 1.4E-45 + + // Similarly, the min (normal) double is 2^-1022. + // Min denormal: 2^-1022 * 2^-52 (mantissa bits): 2^-1074 = 4.94E-324 + // With our 10/3 shifting ratio, 1074 (bit-shifts) * (3 / 10) = 322.2 (10s-shifts) + // 10^(-322.2) = 6.4E-323, which is off by ~13.2x from 4.94E-324 + + // To account for this compounding error, we can either complicate our loop code (slow), + // or use extra bits (in the direction we're shifting the 2s!) to compensate: + // 4 extra bits for doubles (2^4 = 16 > 13.2x error), 1 extra for floats (2 > 1.4x error) + /// # buffer bits to account for shifting error + static constexpr int num_2s_shift_buffer_bits = is_double ? 4 : 1; + + // How much room do we have for shifting? + // Float: 64-bit ShiftingRep - 31 (rep + buffer) = 33 bits. 2^33 = 8.6E9 + // Double: 128-bit ShiftingRep - 61 (rep + buffer) = 67 bits. 2^67 = 1.5E20 + // Thus for double / float we can shift up to 20 / 9 decimal places at once + + // But, we need to stick to our 10-bits / 3-decimals shift ratio to not over/under-flow. + // To simplify our loop code, we'll keep to this ratio by instead shifting a max of + // 18 / 9 decimal places, for double / float (60 / 30 bits) + /// Max at-once decimal place shift + static constexpr int max_digits_shift = is_double ? 18 : 9; + /// Max at-once bit shift + static constexpr int max_bits_shift = max_digits_shift * 10 / 3; + + // Pre-calculate 10^max_digits_shift. 
Note that 10^18 / 10^9 fits within IntegerRep + /// 10^max_digits_shift + static constexpr auto max_digits_shift_pow = + multiply_power10(IntegerRep(1), max_digits_shift); +}; + +/** + * @brief Add half a bit to integer rep of floating point if conversion causes truncation + * + * @note This fixes problems like 1.2 (value = 1.1999...) at scale -1 -> 11 + * + * @tparam FloatingType Type of integer holding the floating-point significand + * @param floating The floating-point number to convert + * @param integer_rep The integer representation of the floating-point significand + * @param pow2 The power of 2 that needs to be applied to the significand + * @param pow10 The power of 10 that needs to be applied to the significand + * @return integer_rep, shifted 1 and ++'d if the conversion to decimal causes truncation + */ +template )> +CUDF_HOST_DEVICE cuda::std::pair::IntegralType, int> +add_half_if_truncates(FloatingType floating, + typename floating_converter::IntegralType integer_rep, + int pow2, + int pow10) +{ + // The user-supplied scale may truncate information, so we need to talk about rounding. + // We have chosen not to round, so we want 1.23456f with scale -4 to be decimal 12345 + + // But if we don't round at all, 1.2 (double) with scale -1 is 11 instead of 12! + // Why? Because 1.2 (double) is actually stored as 1.1999999... which we truncate to 1.1 + // While correct (given our choice to truncate), this is surprising and undesirable. + // This problem happens because 1.2 is not perfectly representable in floating point, + // and the value 1.199999... happened to be closer to 1.2 than the next value (1.2000...1...) + + // If the scale truncates information (we didn't choose to keep exactly 1.1999...), how + // do we make sure we store 1.2? We'll add half an ulp! (unit in the last place) + // Then 1.1999... becomes 1.2000...1... which truncates to 1.2. + // And if it had been 1.2000...1..., adding half an ulp still truncates to 1.2 + + // Why 1/2 an ulp? Because that's all that is needed. The reason we have this problem in the + // first place is because the compiler rounded (e.g.) 1.2 to the nearest floating point number. + // The distance of this rounding is at most 1/2 ulp, otherwise we'd have rounded the other way. + + // How do we add 1/2 an ulp? Just shift the bits left (updating pow2) and add 1. + // We'll always shift up so every input to the conversion algorithm is aligned the same way. + + // If we add a full ulp we run into issues where we add too much and get the wrong result. + // This is because (e.g.) 2^23 = 8.4E6 which is not quite 7 digits of precision. + // So if we want 7 digits, that may "barely" truncate information; adding a 1 ulp is overkill. + + // So when does the user-supplied scale truncate info? + // For powers > 0: When the 10s (scale) shift is larger than the corresponding bit-shift. + // For powers < 0: When the 10s shift is less than the corresponding bit-shift. 
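+  // (Worked example: double 1.2 at scale -1 arrives here with pow10 = -1 and pow2 = -52.
+  //  The one-decimal-place shift is only ~3.3 bits, far less than the 52-bit shift, so the
+  //  conversion truncates: 1.1999... * 10 = 11.999... -> 11 unless the half-ulp below is added.)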
+ + // Corresponding bit-shift: + // 2^10 is approximately 10^3, but this is off by 1.024% + // 1.024^30 is 2.03704, so this is high by one bit for every 30*3 = 90 powers of 10 + // So 10^N = 2^(10*N/3 - N/90) = 2^(299*N/90) + // Do comparison without dividing, which loses information: + // Note: if shift is "equal," still truncates if pow2 < 0 (shifting UP by 2s, 2^10 > 10^3) + int const pow2_term = 90 * pow2; + int const pow10_term = 299 * pow10; + bool const conversion_truncates = + (pow10_term > pow2_term) || ((pow2_term == pow10_term) && (pow2 < 0)); + + // However, don't add a half-bit if the input is a whole number! + // This is only for errors introduced by rounding decimal fractions! + bool const is_whole_number = (cuda::std::floor(floating) == floating); + bool const add_half_bit = conversion_truncates && !is_whole_number; + + // Add half a bit on truncation (shift to make room and update pow2) + integer_rep <<= 1; + --pow2; + integer_rep += static_cast(add_half_bit); + + return {integer_rep, pow2}; +} + +/** + * @brief Perform base-2 -> base-10 fixed-point conversion for pow10 > 0 + * + * @tparam Rep The type of the storage for the decimal value + * @tparam FloatingType The type of the original floating-point value we are converting from + * @param base2_value The base-2 fixed-point value we are converting from + * @param pow2 The number of powers of 2 to apply to convert from base-2 + * @param pow10 The number of powers of 10 to apply to reach the desired scale factor + * @return Magnitude of the converted-to decimal integer + */ +template )> +CUDF_HOST_DEVICE inline cuda::std::make_unsigned_t shift_to_decimal_pospow( + typename shifting_constants::IntegerRep const base2_value, int pow2, int pow10) +{ + // To convert to decimal, we need to apply the input powers of 2 and 10 + // The result will be (integer) base2_value * (2^pow2) / (10^pow10) + // Output type is ShiftingRep + + // Here pow10 > 0 and pow2 > 0, so we need to shift left by 2s and divide by 10s. + // We'll iterate back and forth between them, shifting up by 2s + // and down by 10s until all of the powers have been applied. + + // However the input base2_value type has virtually no spare room to shift our data + // without over- or under-flowing and losing precision. + // So we'll cast up to ShiftingRep: uint64 for float's, __uint128_t for double's + using Constants = shifting_constants; + using ShiftingRep = typename Constants::ShiftingRep; + auto shifting_rep = static_cast(base2_value); + + // We want to start with our significand bits at the top of the shifting range, + // so that we don't lose information we need on intermediary right-shifts. + // Note that since we're shifting 2s up, we need num_2s_shift_buffer_bits space on the high side, + // For all numbers this bit shift is a fixed distance, due to the understood 2^0 bit. + // Note that shift_from is +1 due to shift in add_half_if_truncates() + static constexpr int shift_up_to = sizeof(ShiftingRep) * 8 - Constants::num_2s_shift_buffer_bits; + static constexpr int shift_from = Constants::num_significand_bits + 1; + static constexpr int max_init_shift = shift_up_to - shift_from; + + // If our total bit shift is less than this, we don't need to iterate + using UnsignedRep = cuda::std::make_unsigned_t; + if (pow2 <= max_init_shift) { + // Shift bits left, divide by 10s to apply the scale factor, and we're done. + shifting_rep = divide_power10(shifting_rep << pow2, pow10); + // NOTE: Cast can overflow! 
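+    // (The cast below narrows ShiftingRep to the unsigned output Rep; both types are unsigned,
+    //  so an out-of-range result wraps modulo 2^N rather than being undefined behavior.)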
+ return static_cast(shifting_rep); + } + + // We need to iterate. Do the combined initial shift + shifting_rep <<= max_init_shift; + pow2 -= max_init_shift; + + // Iterate, dividing by 10s and shifting up by 2s until we're almost done + while (pow10 > Constants::max_digits_shift) { + // More decimal places to shift than we have room: Divide the max number of 10s + shifting_rep /= Constants::max_digits_shift_pow; + pow10 -= Constants::max_digits_shift; + + // If our remaining bit shift is less than the max, we're finished iterating + if (pow2 <= Constants::max_bits_shift) { + // Shift bits left, divide by 10s to apply the scale factor, and we're done. + shifting_rep = divide_power10(shifting_rep << pow2, pow10); + + // NOTE: Cast can overflow! + return static_cast(shifting_rep); + } + + // Shift the max number of bits left again + shifting_rep <<= Constants::max_bits_shift; + pow2 -= Constants::max_bits_shift; + } + + // Last 10s-shift: Divide all remaining decimal places, shift all remaining bits, then bail + // Note: This divide result may not fit in the low half of the bit range + // But the divisor is less than the max-shift, and thus fits within 64 / 32 bits + if constexpr (Constants::is_double) { + shifting_rep = divide_power10_64bit(shifting_rep, pow10); + } else { + shifting_rep = divide_power10_32bit(shifting_rep, pow10); + } + + // Final bit shift: Shift may be large, guard against UB + // NOTE: This can overflow (both cast and shift)! + return guarded_left_shift(static_cast(shifting_rep), pow2); +} + +/** + * @brief Perform base-2 -> base-10 fixed-point conversion for pow10 < 0 + * + * @tparam Rep The type of the storage for the decimal value + * @tparam FloatingType The type of the original floating-point value we are converting from + * @param base2_value The base-2 fixed-point value we are converting from + * @param pow2 The number of powers of 2 to apply to convert from base-2 + * @param pow10 The number of powers of 10 to apply to reach the desired scale factor + * @return Magnitude of the converted-to decimal integer + */ +template )> +CUDF_HOST_DEVICE inline cuda::std::make_unsigned_t shift_to_decimal_negpow( + typename shifting_constants::IntegerRep base2_value, int pow2, int pow10) +{ + // This is similar to shift_to_decimal_pospow(), except pow10 < 0 & pow2 < 0 + // See comments in that function for details. 
+  // Instead here we need to multiply by 10s and shift right by 2s
+
+  // ShiftingRep: uint64 for float's, __uint128_t for double's
+  using Constants = shifting_constants;
+  using ShiftingRep = typename Constants::ShiftingRep;
+  auto shifting_rep = static_cast(base2_value);
+
+  // Convert to using positive values so we don't have to keep negating
+  int pow10_mag = -pow10;
+  int pow2_mag = -pow2;
+
+  // For performing final 10s-shift
+  using UnsignedRep = cuda::std::make_unsigned_t;
+  auto final_shifts_low10s = [&]() {
+    // Last 10s-shift: multiply all remaining decimal places, shift all remaining bits, then bail
+    // The multiplier is less than the max-shift, and thus fits within 64 / 32 bits
+    if constexpr (Constants::is_double) {
+      shifting_rep = multiply_power10_64bit(shifting_rep, pow10_mag);
+    } else {
+      shifting_rep = multiply_power10_32bit(shifting_rep, pow10_mag);
+    }
+
+    // Final bit shifting: Shift may be large, guard against UB
+    return static_cast(guarded_right_shift(shifting_rep, pow2_mag));
+  };
+
+  // If our total decimal shift is less than the max, we don't need to iterate
+  if (pow10_mag <= Constants::max_digits_shift) { return final_shifts_low10s(); }
+
+  // We want to start by lining up our bits to the top of the shifting range,
+  // except our first operation is a multiply, so not quite that far
+  // We are bit-shifting down, so we need extra bits on the low-side, which this has.
+  // Note that shift_from is +1 due to shift in add_half_if_truncates()
+  static constexpr int shift_up_to = sizeof(ShiftingRep) * 8 - Constants::max_bits_shift;
+  static constexpr int shift_from = Constants::num_significand_bits + 1;
+  static constexpr int num_init_bit_shift = shift_up_to - shift_from;
+
+  // Perform initial shift
+  shifting_rep <<= num_init_bit_shift;
+  pow2_mag += num_init_bit_shift;
+
+  // Iterate, multiplying by 10s and shifting down by 2s until we're almost done
+  do {
+    // More decimal places to shift than we have room: Multiply the max number of 10s
+    shifting_rep *= Constants::max_digits_shift_pow;
+    pow10_mag -= Constants::max_digits_shift;
+
+    // If our remaining bit shift is less than the max, we're finished iterating
+    if (pow2_mag <= Constants::max_bits_shift) {
+      // Last bit-shift: Shift all remaining bits, apply the remaining scale, then bail
+      shifting_rep >>= pow2_mag;
+
+      // We need to convert to the output rep for the final scale-factor multiply, because if (e.g.)
+      // float -> dec128 and some large pow10_mag, it might overflow the 64bit shifting rep.
+      // It's not needed for pow10 > 0 because we're dividing by 10s there instead of multiplying.
+      // NOTE: This can overflow!
(Both multiply and cast) + return multiply_power10(static_cast(shifting_rep), pow10_mag); + } + + // More bits to shift than we have room: Shift the max number of 2s + shifting_rep >>= Constants::max_bits_shift; + pow2_mag -= Constants::max_bits_shift; + } while (pow10_mag > Constants::max_digits_shift); + + // Do our final shifts + return final_shifts_low10s(); +} + +/** + * @brief Perform base-2 -> base-10 fixed-point conversion + * + * @tparam Rep The type of integer we are converting to, to store the decimal value + * @tparam FloatingType The type of floating-point object we are converting from + * @param base2_value The base-2 fixed-point value we are converting from + * @param pow2 The number of powers of 2 to apply to convert from base-2 + * @param pow10 The number of powers of 10 to apply to reach the desired scale factor + * @return Integer representation of the floating-point value, given the desired scale + */ +template )> +CUDF_HOST_DEVICE inline cuda::std::make_unsigned_t convert_floating_to_integral_shifting( + typename floating_converter::IntegralType base2_value, int pow10, int pow2) +{ + // Apply the powers of 2 and 10 to convert to decimal. + // The result will be base2_value * (2^pow2) / (10^pow10) + + // Note that while this code is branchy, the decimal scale factor is part of the + // column type itself, so every thread will take the same branches on pow10. + // Also data within a column tends to be similar, so they will often take the + // same branches on pow2 as well. + + // NOTE: some returns here can overflow (e.g. ShiftingRep -> UnsignedRep) + using UnsignedRep = cuda::std::make_unsigned_t; + if (pow10 == 0) { + // NOTE: Left Bit-shift can overflow! As can cast! (e.g. double -> decimal32) + // Bit shifts may be large, guard against UB + if (pow2 >= 0) { + return guarded_left_shift(static_cast(base2_value), pow2); + } else { + return static_cast(guarded_right_shift(base2_value, -pow2)); + } + } else if (pow10 > 0) { + if (pow2 <= 0) { + // Power-2/10 shifts both downward: order doesn't matter, apply and bail. + // Guard against shift being undefined behavior + auto const shifted = guarded_right_shift(base2_value, -pow2); + return static_cast(divide_power10(shifted, pow10)); + } + return shift_to_decimal_pospow(base2_value, pow2, pow10); + } else { // pow10 < 0 + if (pow2 >= 0) { + // Power-2/10 shifts both upward: order doesn't matter, apply and bail. + // NOTE: Either shift, multiply, or cast (e.g. double -> decimal32) can overflow! 
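+      // (guarded_left_shift() saturates to the type's maximum instead of invoking undefined
+      //  behavior when the shift exceeds the type width; the multiply_power10() that follows
+      //  operates on an unsigned type and simply wraps if the result does not fit.)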
+ auto const shifted = guarded_left_shift(static_cast(base2_value), pow2); + return multiply_power10(shifted, -pow10); + } + return shift_to_decimal_negpow(base2_value, pow2, pow10); + } +} + +/** + * @brief Perform floating-point -> integer decimal conversion + * + * @tparam Rep The type of integer we are converting to, to store the decimal value + * @tparam FloatingType The type of floating-point object we are converting from + * @param floating The floating point value to convert + * @param scale The desired base-10 scale factor: decimal value = returned value * 10^scale + * @return Integer representation of the floating-point value, given the desired scale + */ +template )> +CUDF_HOST_DEVICE inline Rep convert_floating_to_integral(FloatingType const& floating, + scale_type const& scale) +{ + // Extract components of the floating point number + using converter = floating_converter; + auto const integer_rep = converter::bit_cast_to_integer(floating); + if (converter::is_zero(integer_rep)) { return 0; } + + // Note that the significand here is an unsigned integer with sizeof(FloatingType) + auto const is_negative = converter::get_is_negative(integer_rep); + auto const [significand, floating_pow2] = converter::get_significand_and_pow2(integer_rep); + + // Add half a bit if truncating to yield expected value, see function for discussion. + auto const pow10 = static_cast(scale); + auto const [base2_value, pow2] = + add_half_if_truncates(floating, significand, floating_pow2, pow10); + + // Apply the powers of 2 and 10 to convert to decimal. + auto const magnitude = + convert_floating_to_integral_shifting(base2_value, pow10, pow2); + + // Reapply the sign and return + // NOTE: Cast can overflow! + auto const signed_magnitude = static_cast(magnitude); + return is_negative ? -signed_magnitude : signed_magnitude; +} + +/** + * @brief Perform base-10 -> base-2 fixed-point conversion for pow10 > 0 + * + * @tparam DecimalRep The decimal integer type we are converting from + * @tparam FloatingType The type of floating point object we are converting to + * @param decimal_rep The decimal integer to convert + * @param pow10 The number of powers of 10 to apply to undo the scale factor + * @return A pair of the base-2 value and the remaining powers of 2 to be applied + */ +template )> +CUDF_HOST_DEVICE inline auto shift_to_binary_pospow(DecimalRep decimal_rep, int pow10) +{ + // This is the reverse of shift_to_decimal_pospow(), see that for more details. + + // ShiftingRep: uint64 for float's, __uint128_t for double's + using Constants = shifting_constants; + using ShiftingRep = typename Constants::ShiftingRep; + + // We want to start by lining up our bits to the top of the shifting range, + // except our first operation is a multiply, so not quite that far + // We are bit-shifting down, so we need extra bits on the low-side, which this has. + static constexpr int shift_up_to = sizeof(ShiftingRep) * 8 - Constants::max_bits_shift; + int const shift_from = count_significant_bits(decimal_rep); + int const num_init_bit_shift = shift_up_to - shift_from; + int pow2 = -num_init_bit_shift; + + // Perform the initial bit shift + ShiftingRep shifting_rep; + if constexpr (sizeof(ShiftingRep) < sizeof(DecimalRep)) { + // Shift within DecimalRep before dropping to the smaller ShiftingRep + decimal_rep = (pow2 >= 0) ? 
(decimal_rep >> pow2) : (decimal_rep << -pow2); + shifting_rep = static_cast(decimal_rep); + } else { + // Scale up to ShiftingRep before shifting + shifting_rep = static_cast(decimal_rep); + shifting_rep = (pow2 >= 0) ? (shifting_rep >> pow2) : (shifting_rep << -pow2); + } + + // Iterate, multiplying by 10s and shifting down by 2s until we're almost done + while (pow10 > Constants::max_digits_shift) { + // More decimal places to shift than we have room: Multiply the max number of 10s + shifting_rep *= Constants::max_digits_shift_pow; + pow10 -= Constants::max_digits_shift; + + // Then make more room by bit shifting down by the max # of 2s + shifting_rep >>= Constants::max_bits_shift; + pow2 += Constants::max_bits_shift; + } + + // Last 10s-shift: multiply all remaining decimal places + // The multiplier is less than the max-shift, and thus fits within 64 / 32 bits + if constexpr (Constants::is_double) { + shifting_rep = multiply_power10_64bit(shifting_rep, pow10); + } else { + shifting_rep = multiply_power10_32bit(shifting_rep, pow10); + } + + // Our shifting_rep is now the integer mantissa, return it and the powers of 2 + return std::pair{shifting_rep, pow2}; +} + +/** + * @brief Perform base-10 -> base-2 fixed-point conversion for pow10 < 0 + * + * @tparam DecimalRep The decimal integer type we are converting from + * @tparam FloatingType The type of floating point object we are converting to + * @param decimal_rep The decimal integer to convert + * @param pow10 The number of powers of 10 to apply to undo the scale factor + * @return A pair of the base-2 value and the remaining powers of 2 to be applied + */ +template )> +CUDF_HOST_DEVICE inline auto shift_to_binary_negpow(DecimalRep decimal_rep, int const pow10) +{ + // This is the reverse of shift_to_decimal_negpow(), see that for more details. + + // ShiftingRep: uint64 for float's, __uint128_t for double's + using Constants = shifting_constants; + using ShiftingRep = typename Constants::ShiftingRep; + + // We want to start with our significand bits at the top of the shifting range, + // so that we lose minimal information we need on intermediary right-shifts. + // Note that since we're shifting 2s up, we need num_2s_shift_buffer_bits space on the high side + static constexpr int shift_up_to = sizeof(ShiftingRep) * 8 - Constants::num_2s_shift_buffer_bits; + int const shift_from = count_significant_bits(decimal_rep); + int const num_init_bit_shift = shift_up_to - shift_from; + int pow2 = -num_init_bit_shift; + + // Perform the initial bit shift + ShiftingRep shifting_rep; + if constexpr (sizeof(ShiftingRep) < sizeof(DecimalRep)) { + // Shift within DecimalRep before dropping to the smaller ShiftingRep + decimal_rep = (pow2 >= 0) ? (decimal_rep >> pow2) : (decimal_rep << -pow2); + shifting_rep = static_cast(decimal_rep); + } else { + // Scale up to ShiftingRep before shifting + shifting_rep = static_cast(decimal_rep); + shifting_rep = (pow2 >= 0) ? (shifting_rep >> pow2) : (shifting_rep << -pow2); + } + + // Convert to using positive values upfront, simpler than doing later. 
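+  // (pow10 is negative on this path -- see the @brief above -- so pow10_mag is positive.)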
+  int pow10_mag = -pow10;
+
+  // Iterate, dividing by 10s and shifting up by 2s until we're almost done
+  while (pow10_mag > Constants::max_digits_shift) {
+    // More decimal places to shift than we have room: Divide the max number of 10s
+    shifting_rep /= Constants::max_digits_shift_pow;
+    pow10_mag -= Constants::max_digits_shift;
+
+    // Then make more room by bit shifting up by the max # of 2s
+    shifting_rep <<= Constants::max_bits_shift;
+    pow2 -= Constants::max_bits_shift;
+  }
+
+  // Last 10s-shift: Divide all remaining decimal places.
+  // This divide result may not fit in the low half of the bit range
+  // But the divisor is less than the max-shift, and thus fits within 64 / 32 bits
+  if constexpr (Constants::is_double) {
+    shifting_rep = divide_power10_64bit(shifting_rep, pow10_mag);
+  } else {
+    shifting_rep = divide_power10_32bit(shifting_rep, pow10_mag);
+  }
+
+  // Our shifting_rep is now the integer mantissa, return it and the powers of 2
+  return std::pair{shifting_rep, pow2};
+}
+
+/**
+ * @brief Perform integer decimal -> floating-point conversion
+ *
+ * @tparam FloatingType The type of floating-point object we are converting to
+ * @tparam Rep The decimal integer type we are converting from
+ * @param value The decimal integer to convert
+ * @param scale The base-10 scale factor for the input integer
+ * @return Floating-point representation of the scaled integral value
+ */
+template )>
+CUDF_HOST_DEVICE inline FloatingType convert_integral_to_floating(Rep const& value,
+                                                                  scale_type const& scale)
+{
+  // Check the sign of the input
+  bool const is_negative = (value < 0);
+
+  // Convert to unsigned for bit counting/shifting
+  using UnsignedType = cuda::std::make_unsigned_t;
+  auto const unsigned_value = [&]() -> UnsignedType {
+    // Must guard against minimum value, as we can't just negate it: not representable.
+    if (value == cuda::std::numeric_limits::min()) { return static_cast(value); }
+
+    // No abs function for 128bit types, so have to do it manually.
+    if constexpr (cuda::std::is_same_v) {
+      return static_cast(is_negative ?
-value : value); + } else { + return cuda::std::abs(value); + } + }(); + + // Shift by powers of 2 and 10 to get our integer mantissa + auto const [mantissa, pow2] = [&]() { + auto const pow10 = static_cast(scale); + if (pow10 >= 0) { + return shift_to_binary_pospow(unsigned_value, pow10); + } else { // pow10 < 0 + return shift_to_binary_negpow(unsigned_value, pow10); + } + }(); + + // Zero has special exponent bits, just handle it here + if (mantissa == 0) { return FloatingType(0.0f); } + + // Cast our integer mantissa to floating point + auto const floating = static_cast(mantissa); // IEEE-754 rounds to even + + // Apply the sign and the remaining powers of 2 + using converter = floating_converter; + auto const magnitude = converter::add_pow2(floating, pow2); + return converter::set_is_negative(magnitude, is_negative); +} + } // namespace detail /** @} */ // end of group diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index 502ffb9ba4f..11f6ce2bad7 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -39,6 +39,7 @@ #include #include +#include #include @@ -372,8 +373,8 @@ std::unique_ptr from_arrow( std::unique_ptr from_arrow( ArrowSchema const* schema, ArrowArray const* input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Create `cudf::column` from a given ArrowArray and ArrowSchema input @@ -391,8 +392,8 @@ std::unique_ptr from_arrow( std::unique_ptr from_arrow_column( ArrowSchema const* schema, ArrowArray const* input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Create `cudf::table` from given ArrowDeviceArray input @@ -415,8 +416,8 @@ std::unique_ptr from_arrow_column( std::unique_ptr from_arrow_host( ArrowSchema const* schema, ArrowDeviceArray const* input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Create `cudf::table` from given ArrowArrayStream input @@ -433,8 +434,8 @@ std::unique_ptr
from_arrow_host( */ std::unique_ptr
from_arrow_stream( ArrowArrayStream* input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Create `cudf::column` from given ArrowDeviceArray input @@ -456,8 +457,8 @@ std::unique_ptr
from_arrow_stream( std::unique_ptr from_arrow_host_column( ArrowSchema const* schema, ArrowDeviceArray const* input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief typedef for a vector of owning columns, used for conversion from ArrowDeviceArray @@ -537,8 +538,8 @@ using unique_table_view_t = unique_table_view_t from_arrow_device( ArrowSchema const* schema, ArrowDeviceArray const* input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief typedef for a unique_ptr to a `cudf::column_view` with custom deleter @@ -580,8 +581,8 @@ using unique_column_view_t = unique_column_view_t from_arrow_device_column( ArrowSchema const* schema, ArrowDeviceArray const* input, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/io/config_utils.hpp b/cpp/include/cudf/io/config_utils.hpp new file mode 100644 index 00000000000..1827ba0e3e6 --- /dev/null +++ b/cpp/include/cudf/io/config_utils.hpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +namespace CUDF_EXPORT cudf { +namespace io::cufile_integration { + +/** + * @brief Returns true if cuFile and its compatibility mode are enabled. + */ +bool is_always_enabled(); + +/** + * @brief Returns true if only direct IO through cuFile is enabled (compatibility mode is disabled). + */ +bool is_gds_enabled(); + +/** + * @brief Returns true if KvikIO is enabled. + */ +bool is_kvikio_enabled(); + +} // namespace io::cufile_integration + +namespace io::nvcomp_integration { + +/** + * @brief Returns true if all nvCOMP uses are enabled. + */ +bool is_all_enabled(); + +/** + * @brief Returns true if stable nvCOMP use is enabled. 
+ */ +bool is_stable_enabled(); + +} // namespace io::nvcomp_integration +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index 68bb7fba00e..cc361f0918e 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -1756,11 +1756,9 @@ class csv_writer_options_builder { * * @param options Settings for controlling writing behavior * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation */ void write_csv(csv_writer_options const& options, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** @} */ // end of group } // namespace io diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp index 50c1a7c163d..2a70fa888f4 100644 --- a/cpp/include/cudf/io/detail/csv.hpp +++ b/cpp/include/cudf/io/detail/csv.hpp @@ -49,14 +49,12 @@ table_with_metadata read_csv(std::unique_ptr&& source, * @param column_names Column names for the output CSV * @param options Settings for controlling behavior * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource to use for device memory allocation */ void write_csv(data_sink* sink, table_view const& table, host_span column_names, csv_writer_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); + rmm::cuda_stream_view stream); } // namespace csv } // namespace detail diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 540a584908d..6ff1c12831b 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -46,13 +46,11 @@ table_with_metadata read_json(host_span> sources, * @param table The set of columns * @param options Settings for controlling behavior * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource to use for device memory allocation */ void write_json(data_sink* sink, table_view const& table, json_writer_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); + rmm::cuda_stream_view stream); /** * @brief Normalize single quotes to double quotes using FST diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 8de690482f9..7af90766ad0 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -1018,11 +1018,9 @@ class json_writer_options_builder { * * @param options Settings for controlling writing behavior * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation */ void write_json(json_writer_options const& options, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** @} */ // end of group } // namespace io diff --git a/cpp/include/cudf/io/nvcomp_adapter.hpp b/cpp/include/cudf/io/nvcomp_adapter.hpp new file mode 100644 index 00000000000..f3260d0cb53 --- /dev/null +++ b/cpp/include/cudf/io/nvcomp_adapter.hpp @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include + +namespace CUDF_EXPORT cudf { +namespace io::nvcomp { + +enum class compression_type { SNAPPY, ZSTD, DEFLATE, LZ4 }; + +/** + * @brief Set of parameters that impact whether nvCOMP features are enabled. + * + */ +struct feature_status_parameters { + int lib_major_version; ///< major version + int lib_minor_version; ///< minor version + int lib_patch_version; ///< patch version + bool are_all_integrations_enabled; ///< all integrations + bool are_stable_integrations_enabled; ///< stable integrations + int compute_capability_major; ///< cuda compute major version + + /** + * @brief Default Constructor + */ + feature_status_parameters(); + + /** + * @brief feature_status_parameters Constructor + * + * @param major positive integer representing major value of nvcomp + * @param minor positive integer representing minor value of nvcomp + * @param patch positive integer representing patch value of nvcomp + * @param all_enabled if all integrations are enabled + * @param stable_enabled if stable integrations are enabled + * @param cc_major CUDA compute capability + */ + feature_status_parameters( + int major, int minor, int patch, bool all_enabled, bool stable_enabled, int cc_major) + : lib_major_version{major}, + lib_minor_version{minor}, + lib_patch_version{patch}, + are_all_integrations_enabled{all_enabled}, + are_stable_integrations_enabled{stable_enabled}, + compute_capability_major{cc_major} + { + } +}; + +/** + * @brief Equality operator overload. Required to use `feature_status_parameters` as a map key. + */ +inline bool operator==(feature_status_parameters const& lhs, feature_status_parameters const& rhs) +{ + return lhs.lib_major_version == rhs.lib_major_version and + lhs.lib_minor_version == rhs.lib_minor_version and + lhs.lib_patch_version == rhs.lib_patch_version and + lhs.are_all_integrations_enabled == rhs.are_all_integrations_enabled and + lhs.are_stable_integrations_enabled == rhs.are_stable_integrations_enabled and + lhs.compute_capability_major == rhs.compute_capability_major; +} + +/** + * @brief If a compression type is disabled through nvCOMP, returns the reason as a string. + * + * Result depends on nvCOMP version and environment variables. + * + * @param compression Compression type + * @param params Optional parameters to query status with different configurations + * @returns Reason for the feature disablement, `std::nullopt` if the feature is enabled + */ +[[nodiscard]] std::optional is_compression_disabled( + compression_type compression, feature_status_parameters params = feature_status_parameters()); + +/** + * @brief If a decompression type is disabled through nvCOMP, returns the reason as a string. + * + * Result depends on nvCOMP version and environment variables. 
+ * + * @param compression Compression type + * @param params Optional parameters to query status with different configurations + * @returns Reason for the feature disablement, `std::nullopt` if the feature is enabled + */ +[[nodiscard]] std::optional is_decompression_disabled( + compression_type compression, feature_status_parameters params = feature_status_parameters()); + +} // namespace io::nvcomp +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 431f14af522..4d98cae73a7 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -597,6 +597,8 @@ class parquet_writer_options_base { // Parquet writer can write timestamps as UTC // Defaults to true because libcudf timestamps are implicitly UTC bool _write_timestamps_as_UTC = true; + // Whether to write ARROW schema + bool _write_arrow_schema = false; // Maximum size of each row group (unless smaller than a single page) size_t _row_group_size_bytes = default_row_group_size_bytes; // Maximum number of rows in row group (unless smaller than a single page) @@ -689,6 +691,13 @@ class parquet_writer_options_base { */ [[nodiscard]] auto is_enabled_utc_timestamps() const { return _write_timestamps_as_UTC; } + /** + * @brief Returns `true` if arrow schema will be written + * + * @return `true` if arrow schema will be written + */ + [[nodiscard]] auto is_enabled_write_arrow_schema() const { return _write_arrow_schema; } + /** * @brief Returns maximum row group size, in bytes. * @@ -824,6 +833,13 @@ class parquet_writer_options_base { */ void enable_utc_timestamps(bool val); + /** + * @brief Sets preference for writing arrow schema. Write arrow schema if set to `true`. + * + * @param val Boolean value to enable/disable writing of arrow schema. + */ + void enable_write_arrow_schema(bool val); + /** * @brief Sets the maximum row group size, in bytes. * @@ -1084,6 +1100,15 @@ class parquet_writer_options_builder_base { * @return this for chaining */ BuilderT& utc_timestamps(bool enabled); + + /** + * @brief Set to true if arrow schema is to be written + * + * @param enabled Boolean value to enable/disable writing of arrow schema + * @return this for chaining + */ + BuilderT& write_arrow_schema(bool enabled); + /** * @brief Set to true if V2 page headers are to be written. * diff --git a/cpp/include/cudf/strings/replace.hpp b/cpp/include/cudf/strings/replace.hpp index a19aa9be0c0..a714f762a19 100644 --- a/cpp/include/cudf/strings/replace.hpp +++ b/cpp/include/cudf/strings/replace.hpp @@ -122,7 +122,7 @@ std::unique_ptr replace_slice( * If a target string is found, it is replaced by the corresponding entry in the repls column. * All occurrences found in each string are replaced. * - * This does not use regex to match targets in the string. + * This does not use regex to match targets in the string. Empty string targets are ignored. * * Null string entries will return null output string entries. 
* diff --git a/cpp/include/cudf/unary.hpp b/cpp/include/cudf/unary.hpp index 74c8bc67d3a..1609c72f175 100644 --- a/cpp/include/cudf/unary.hpp +++ b/cpp/include/cudf/unary.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include #include @@ -50,14 +51,19 @@ namespace cudf { */ template () && - cuda::std::is_floating_point_v>* = nullptr> + CUDF_ENABLE_IF(cuda::std::is_floating_point_v&& is_fixed_point())> CUDF_HOST_DEVICE Fixed convert_floating_to_fixed(Floating floating, numeric::scale_type scale) { - using Rep = typename Fixed::rep; - auto const shifted = numeric::detail::shift(floating, scale); - numeric::scaled_integer scaled{static_cast(shifted), scale}; - return Fixed(scaled); + using Rep = typename Fixed::rep; + auto const value = [&]() { + if constexpr (Fixed::rad == numeric::Radix::BASE_10) { + return numeric::detail::convert_floating_to_integral(floating, scale); + } else { + return static_cast(numeric::detail::shift(floating, scale)); + } + }(); + + return Fixed(numeric::scaled_integer{value, scale}); } /** @@ -75,14 +81,17 @@ CUDF_HOST_DEVICE Fixed convert_floating_to_fixed(Floating floating, numeric::sca */ template && - is_fixed_point()>* = nullptr> + CUDF_ENABLE_IF(cuda::std::is_floating_point_v&& is_fixed_point())> CUDF_HOST_DEVICE Floating convert_fixed_to_floating(Fixed fixed) { - using Rep = typename Fixed::rep; - auto const casted = static_cast(fixed.value()); - auto const scale = numeric::scale_type{-fixed.scale()}; - return numeric::detail::shift(casted, scale); + using Rep = typename Fixed::rep; + if constexpr (Fixed::rad == numeric::Radix::BASE_10) { + return numeric::detail::convert_integral_to_floating(fixed.value(), fixed.scale()); + } else { + auto const casted = static_cast(fixed.value()); + auto const scale = numeric::scale_type{-fixed.scale()}; + return numeric::detail::shift(casted, scale); + } } /** @@ -95,7 +104,7 @@ CUDF_HOST_DEVICE Floating convert_fixed_to_floating(Fixed fixed) */ template >* = nullptr> + CUDF_ENABLE_IF(cuda::std::is_floating_point_v)> CUDF_HOST_DEVICE Floating convert_to_floating(Input input) { if constexpr (is_fixed_point()) { @@ -202,6 +211,16 @@ std::unique_ptr cast( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); +/** + * @brief Check if a cast between two datatypes is supported. + * + * @param from source type + * @param to target type + * + * @returns true if the cast is supported. + */ +bool is_supported_cast(data_type from, data_type to) noexcept; + /** * @brief Creates a column of `type_id::BOOL8` elements indicating the presence of `NaN` values * in a column of floating point values. diff --git a/cpp/include/cudf/utilities/logger.hpp b/cpp/include/cudf/utilities/logger.hpp index a39df064f44..45d5d1b12e1 100644 --- a/cpp/include/cudf/utilities/logger.hpp +++ b/cpp/include/cudf/utilities/logger.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,9 +16,11 @@ #pragma once +#include + #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @brief Returns the global logger. 
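The reworked convert_floating_to_fixed / convert_fixed_to_floating helpers in cudf/unary.hpp above now route base-10 decimals through the new shifting-based conversion. A minimal usage sketch, assuming the template-parameter order implied by the signatures above and cudf's numeric::decimal64 fixed-point alias (illustrative only, not part of this patch):

#include <cudf/fixed_point/fixed_point.hpp>
#include <cudf/unary.hpp>

void round_trip_example()
{
  // Store 1.25 as a base-10 decimal with two fractional digits (scale -2): integer rep 125.
  auto const fixed =
    cudf::convert_floating_to_fixed<numeric::decimal64>(1.25, numeric::scale_type{-2});

  // Convert back; 125 * 10^-2 is exactly representable, so this recovers 1.25.
  auto const back = cudf::convert_fixed_to_floating<double>(fixed);
}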
@@ -43,4 +45,4 @@ namespace cudf { */ spdlog::logger& logger(); -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/utilities/thread_pool.hpp b/cpp/include/cudf/utilities/thread_pool.hpp deleted file mode 100644 index c8c3eb097c4..00000000000 --- a/cpp/include/cudf/utilities/thread_pool.hpp +++ /dev/null @@ -1,381 +0,0 @@ -/* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -/** - * Modified from https://github.com/bshoshany/thread-pool - * @copyright Copyright (c) 2021 Barak Shoshany. Licensed under the MIT license. - * See file LICENSE for detail or copy at https://opensource.org/licenses/MIT - */ - -#include // std::atomic -#include // std::chrono -#include // std::int_fast64_t, std::uint_fast32_t -#include // std::function -#include // std::future, std::promise -#include // std::shared_ptr, std::unique_ptr -#include // std::mutex, std::scoped_lock -#include // std::queue -#include // std::this_thread, std::thread -#include // std::decay_t, std::enable_if_t, std::is_void_v, std::invoke_result_t -#include // std::move, std::swap - -namespace cudf { -namespace detail { - -/** - * @brief A C++17 thread pool class. The user submits tasks to be executed into a queue. Whenever a - * thread becomes available, it pops a task from the queue and executes it. Each task is - * automatically assigned a future, which can be used to wait for the task to finish executing - * and/or obtain its eventual return value. - */ -class thread_pool { - using ui32 = int; - - public: - /** - * @brief Construct a new thread pool. - * - * @param _thread_count The number of threads to use. The default value is the total number of - * hardware threads available, as reported by the implementation. With a hyperthreaded CPU, this - * will be twice the number of CPU cores. If the argument is zero, the default value will be used - * instead. - */ - thread_pool(ui32 const& _thread_count = std::thread::hardware_concurrency()) - : thread_count(_thread_count ? _thread_count : std::thread::hardware_concurrency()), - threads(new std::thread[_thread_count ? _thread_count : std::thread::hardware_concurrency()]) - { - create_threads(); - } - - /** - * @brief Destruct the thread pool. Waits for all tasks to complete, then destroys all threads. - * Note that if the variable paused is set to true, then any tasks still in the queue will never - * be executed. - */ - ~thread_pool() - { - wait_for_tasks(); - running = false; - destroy_threads(); - } - - /** - * @brief Get the number of tasks currently waiting in the queue to be executed by the threads. - * - * @return The number of queued tasks. - */ - [[nodiscard]] size_t get_tasks_queued() const - { - std::scoped_lock const lock(queue_mutex); - return tasks.size(); - } - - /** - * @brief Get the number of tasks currently being executed by the threads. - * - * @return The number of running tasks. 
- */ - [[nodiscard]] ui32 get_tasks_running() const { return tasks_total - (ui32)get_tasks_queued(); } - - /** - * @brief Get the total number of unfinished tasks - either still in the queue, or running in a - * thread. - * - * @return The total number of tasks. - */ - [[nodiscard]] ui32 get_tasks_total() const { return tasks_total; } - - /** - * @brief Get the number of threads in the pool. - * - * @return The number of threads. - */ - [[nodiscard]] ui32 get_thread_count() const { return thread_count; } - - /** - * @brief Parallelize a loop by splitting it into blocks, submitting each block separately to the - * thread pool, and waiting for all blocks to finish executing. The loop will be equivalent to: - * for (T i = first_index; i <= last_index; i++) loop(i); - * - * @tparam T The type of the loop index. Should be a signed or unsigned integer. - * @tparam F The type of the function to loop through. - * @param first_index The first index in the loop (inclusive). - * @param last_index The last index in the loop (inclusive). - * @param loop The function to loop through. Should take exactly one argument, the loop index. - * @param num_tasks The maximum number of tasks to split the loop into. The default is to use the - * number of threads in the pool. - */ - template - void parallelize_loop(T first_index, T last_index, F const& loop, ui32 num_tasks = 0) - { - if (num_tasks == 0) num_tasks = thread_count; - if (last_index < first_index) std::swap(last_index, first_index); - size_t total_size = last_index - first_index + 1; - size_t block_size = total_size / num_tasks; - if (block_size == 0) { - block_size = 1; - num_tasks = (ui32)total_size > 1 ? (ui32)total_size : 1; - } - std::atomic blocks_running = 0; - for (ui32 t = 0; t < num_tasks; t++) { - T start = (T)(t * block_size + first_index); - T end = (t == num_tasks - 1) ? last_index : (T)((t + 1) * block_size + first_index - 1); - blocks_running++; - push_task([start, end, &loop, &blocks_running] { - for (T i = start; i <= end; i++) - loop(i); - blocks_running--; - }); - } - while (blocks_running != 0) { - sleep_or_yield(); - } - } - - /** - * @brief Push a function with no arguments or return value into the task queue. - * - * @tparam F The type of the function. - * @param task The function to push. - */ - template - void push_task(F const& task) - { - tasks_total++; - { - std::scoped_lock const lock(queue_mutex); - tasks.push(std::function(task)); - } - } - - /** - * @brief Push a function with arguments, but no return value, into the task queue. - * @details The function is wrapped inside a lambda in order to hide the arguments, as the tasks - * in the queue must be of type std::function, so they cannot have any arguments or return - * value. If no arguments are provided, the other overload will be used, in order to avoid the - * (slight) overhead of using a lambda. - * - * @tparam F The type of the function. - * @tparam A The types of the arguments. - * @param task The function to push. - * @param args The arguments to pass to the function. - */ - template - void push_task(F const& task, A const&... args) - { - push_task([task, args...] { task(args...); }); - } - - /** - * @brief Reset the number of threads in the pool. Waits for all currently running tasks to be - * completed, then destroys all threads in the pool and creates a new thread pool with the new - * number of threads. Any tasks that were waiting in the queue before the pool was reset will then - * be executed by the new threads. 
If the pool was paused before resetting it, the new pool will - * be paused as well. - * - * @param _thread_count The number of threads to use. The default value is the total number of - * hardware threads available, as reported by the implementation. With a hyperthreaded CPU, this - * will be twice the number of CPU cores. If the argument is zero, the default value will be used - * instead. - */ - void reset(ui32 const& _thread_count = std::thread::hardware_concurrency()) - { - bool was_paused = paused; - paused = true; - wait_for_tasks(); - running = false; - destroy_threads(); - thread_count = _thread_count ? _thread_count : std::thread::hardware_concurrency(); - threads = std::make_unique(thread_count); - paused = was_paused; - create_threads(); - running = true; - } - - /** - * @brief Submit a function with zero or more arguments and a return value into the task queue, - * and get a future for its eventual returned value. - * - * @tparam F The type of the function. - * @tparam A The types of the zero or more arguments to pass to the function. - * @tparam R The return type of the function. - * @param task The function to submit. - * @param args The zero or more arguments to pass to the function. - * @return A future to be used later to obtain the function's returned value, waiting for it to - * finish its execution if needed. - */ - template , std::decay_t...>> - std::future submit(F const& task, A const&... args) - { - std::shared_ptr> promise(new std::promise); - std::future future = promise->get_future(); - push_task([task, args..., promise] { - try { - if constexpr (std::is_void_v) { - task(args...); - promise->set_value(); - } else { - promise->set_value(task(args...)); - } - } catch (...) { - promise->set_exception(std::current_exception()); - }; - }); - return future; - } - - /** - * @brief Wait for tasks to be completed. Normally, this function waits for all tasks, both those - * that are currently running in the threads and those that are still waiting in the queue. - * However, if the variable paused is set to true, this function only waits for the currently - * running tasks (otherwise it would wait forever). To wait for a specific task, use submit() - * instead, and call the wait() member function of the generated future. - */ - void wait_for_tasks() - { - while (true) { - if (!paused) { - if (tasks_total == 0) break; - } else { - if (get_tasks_running() == 0) break; - } - sleep_or_yield(); - } - } - - /** - * @brief An atomic variable indicating to the workers to pause. When set to true, the workers - * temporarily stop popping new tasks out of the queue, although any tasks already executed will - * keep running until they are done. Set to false again to resume popping tasks. - */ - std::atomic paused = false; - - /** - * @brief The duration, in microseconds, that the worker function should sleep for when it cannot - * find any tasks in the queue. If set to 0, then instead of sleeping, the worker function will - * execute std::this_thread::yield() if there are no tasks in the queue. The default value is - * 1000. - */ - ui32 sleep_duration = 1000; - - private: - /** - * @brief Create the threads in the pool and assign a worker to each thread. - */ - void create_threads() - { - for (ui32 i = 0; i < thread_count; i++) { - threads[i] = std::thread(&thread_pool::worker, this); - } - } - - /** - * @brief Destroy the threads in the pool by joining them. 
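For reference, a sketch of the submit/parallelize pattern supported by the `cudf::detail::thread_pool` being deleted above; it compiles only against the removed header, and the values are illustrative.

#include <cudf/utilities/thread_pool.hpp>  // removed by this change

#include <cassert>

void removed_pool_example()
{
  cudf::detail::thread_pool pool(4);  // four worker threads
  // submit() returns a std::future for the task's result
  auto fut = pool.submit([](int a, int b) { return a + b; }, 2, 3);
  assert(fut.get() == 5);
  // parallelize_loop() splits the inclusive range [0, 99] into blocks and waits for them
  pool.parallelize_loop(0, 99, [](int i) { /* per-index work */ (void)i; });
}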
- */ - void destroy_threads() - { - for (ui32 i = 0; i < thread_count; i++) { - threads[i].join(); - } - } - - /** - * @brief Try to pop a new task out of the queue. - * - * @param task A reference to the task. Will be populated with a function if the queue is not - * empty. - * @return true if a task was found, false if the queue is empty. - */ - bool pop_task(std::function& task) - { - std::scoped_lock const lock(queue_mutex); - if (tasks.empty()) - return false; - else { - task = std::move(tasks.front()); - tasks.pop(); - return true; - } - } - - /** - * @brief Sleep for sleep_duration microseconds. If that variable is set to zero, yield instead. - * - */ - void sleep_or_yield() - { - if (sleep_duration) - std::this_thread::sleep_for(std::chrono::microseconds(sleep_duration)); - else - std::this_thread::yield(); - } - - /** - * @brief A worker function to be assigned to each thread in the pool. Continuously pops tasks out - * of the queue and executes them, as long as the atomic variable running is set to true. - */ - void worker() - { - while (running) { - std::function task; - if (!paused && pop_task(task)) { - task(); - tasks_total--; - } else { - sleep_or_yield(); - } - } - } - - /** - * @brief A mutex to synchronize access to the task queue by different threads. - */ - mutable std::mutex queue_mutex; - - /** - * @brief An atomic variable indicating to the workers to keep running. When set to false, the - * workers permanently stop working. - */ - std::atomic running = true; - - /** - * @brief A queue of tasks to be executed by the threads. - */ - std::queue> tasks; - - /** - * @brief The number of threads in the pool. - */ - ui32 thread_count; - - /** - * @brief A smart pointer to manage the memory allocated for the threads. - */ - std::unique_ptr threads; - - /** - * @brief An atomic variable to keep track of the total number of unfinished tasks - either still - * in the queue, or running in a thread. - */ - std::atomic tasks_total = 0; -}; - -} // namespace detail -} // namespace cudf diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index 7363f965af8..2abd6f0abac 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -226,6 +226,9 @@ rmm::device_buffer make_elements(InputIterator begin, InputIterator end) using namespace numeric; using RepType = typename ElementTo::rep; + CUDF_EXPECTS(std::all_of(begin, end, [](ElementFrom v) { return v.scale() == 0; }), + "Only zero-scale fixed-point values are supported"); + auto to_rep = [](ElementTo fp) { return fp.value(); }; auto transformer_begin = thrust::make_transform_iterator(begin, to_rep); auto const size = cudf::distance(begin, end); diff --git a/cpp/scripts/sort_ninja_log.py b/cpp/scripts/sort_ninja_log.py index 3fe503f749e..42f84e4d0c7 100755 --- a/cpp/scripts/sort_ninja_log.py +++ b/cpp/scripts/sort_ninja_log.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
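The column_wrapper.hpp hunk above now rejects fixed-point inputs whose scale is not zero; a small test-utility sketch that satisfies the new check (types and values are illustrative, using cudf's test wrappers).

#include <cudf_test/column_wrapper.hpp>

#include <cudf/fixed_point/fixed_point.hpp>

#include <vector>

void zero_scale_wrapper_example()
{
  using numeric::decimal32;
  using numeric::scale_type;
  // Every input value must carry scale 0 to pass the new CUDF_EXPECTS check.
  std::vector<decimal32> vals{decimal32{1, scale_type{0}}, decimal32{2, scale_type{0}}};
  cudf::test::fixed_width_column_wrapper<decimal32> col(vals.begin(), vals.end());
}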
# import argparse import os @@ -9,14 +9,12 @@ from xml.dom import minidom parser = argparse.ArgumentParser() -parser.add_argument( - "log_file", type=str, default=".ninja_log", help=".ninja_log file" -) +parser.add_argument("log_file", type=str, default=".ninja_log", help=".ninja_log file") parser.add_argument( "--fmt", type=str, default="csv", - choices=["csv", "xml", "html"], + choices=["csv", "html"], help="output format (to stdout)", ) parser.add_argument( @@ -37,6 +35,7 @@ output_fmt = args.fmt cmp_file = args.cmp_log + # build a map of the log entries def build_log_map(log_file): entries = {} @@ -68,37 +67,6 @@ def build_log_map(log_file): return entries -# output results in XML format -def output_xml(entries, sorted_list, args): - root = ET.Element("testsuites") - testsuite = ET.Element( - "testsuite", - attrib={ - "name": "build-time", - "tests": str(len(sorted_list)), - "failures": str(0), - "errors": str(0), - }, - ) - root.append(testsuite) - for name in sorted_list: - entry = entries[name] - build_time = float(entry[1] - entry[0]) / 1000 - item = ET.Element( - "testcase", - attrib={ - "classname": "BuildTime", - "name": name, - "time": str(build_time), - }, - ) - testsuite.append(item) - - tree = ET.ElementTree(root) - xmlstr = minidom.parseString(ET.tostring(root)).toprettyxml(indent=" ") - print(xmlstr) - - # utility converts a millisecond value to a column width in pixels def time_to_width(value, end): # map a value from (0,end) to (0,1000) @@ -282,9 +250,7 @@ def output_html(entries, sorted_list, cmp_entries, args): # output detail table in build-time descending order print("
") - print( - "", "", "", sep="" - ) + print("", "", "", sep="") if cmp_entries: print("", sep="") print("") @@ -303,9 +269,7 @@ def output_html(entries, sorted_list, cmp_entries, args): print("", sep="", end="") print("", sep="", end="") # output diff column - cmp_entry = ( - cmp_entries[name] if cmp_entries and name in cmp_entries else None - ) + cmp_entry = cmp_entries[name] if cmp_entries and name in cmp_entries else None if cmp_entry: diff_time = build_time - (cmp_entry[1] - cmp_entry[0]) diff_time_str = format_build_time(diff_time) @@ -353,7 +317,7 @@ def output_html(entries, sorted_list, cmp_entries, args): print( "time change < 20%% or build time < 1 minute", + ">time change < 20% or build time < 1 minute", ) print("
FileCompile timeSize
FileCompile timeSizet-cmp
", build_time_str, "", file_size_str, "
") @@ -370,9 +334,7 @@ def output_csv(entries, sorted_list, cmp_entries, args): entry = entries[name] build_time = entry[1] - entry[0] file_size = entry[2] - cmp_entry = ( - cmp_entries[name] if cmp_entries and name in cmp_entries else None - ) + cmp_entry = cmp_entries[name] if cmp_entries and name in cmp_entries else None print(build_time, file_size, name, sep=",", end="") if cmp_entry: diff_time = build_time - (cmp_entry[1] - cmp_entry[0]) @@ -396,9 +358,7 @@ def output_csv(entries, sorted_list, cmp_entries, args): # load the comparison build log if available cmp_entries = build_log_map(cmp_file) if cmp_file else None -if output_fmt == "xml": - output_xml(entries, sorted_list, args) -elif output_fmt == "html": +if output_fmt == "html": output_html(entries, sorted_list, cmp_entries, args) else: output_csv(entries, sorted_list, cmp_entries, args) diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index 8ac1491547d..3ac8547baad 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -50,6 +50,11 @@ namespace cudf { namespace binops { +bool is_supported_operation(data_type out, data_type lhs, data_type rhs, binary_operator op) +{ + return cudf::binops::compiled::is_supported_operation(out, lhs, rhs, op); +} + /** * @brief Computes output valid mask for op between a column and a scalar */ @@ -194,7 +199,7 @@ std::unique_ptr binary_operation(LhsType const& lhs, rmm::device_async_resource_ref mr) { if constexpr (std::is_same_v and std::is_same_v) - CUDF_EXPECTS(lhs.size() == rhs.size(), "Column sizes don't match"); + CUDF_EXPECTS(lhs.size() == rhs.size(), "Column sizes don't match", std::invalid_argument); if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING and output_type.id() == type_id::STRING and diff --git a/cpp/src/interop/from_arrow_device.cu b/cpp/src/interop/from_arrow_device.cu index 73c1a474310..e1d289e67a3 100644 --- a/cpp/src/interop/from_arrow_device.cu +++ b/cpp/src/interop/from_arrow_device.cu @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -56,7 +57,7 @@ struct dispatch_from_arrow_device { data_type, bool, rmm::cuda_stream_view, - rmm::mr::device_memory_resource*) + rmm::device_async_resource_ref) { CUDF_FAIL("Unsupported type in from_arrow_device", cudf::data_type_error); } @@ -68,7 +69,7 @@ struct dispatch_from_arrow_device { data_type type, bool skip_mask, rmm::cuda_stream_view, - rmm::mr::device_memory_resource*) + rmm::device_async_resource_ref mr) { size_type const num_rows = input->length; size_type const offset = input->offset; @@ -90,7 +91,7 @@ dispatch_tuple_t get_column(ArrowSchemaView* schema, data_type type, bool skip_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); template <> dispatch_tuple_t dispatch_from_arrow_device::operator()(ArrowSchemaView* schema, @@ -98,7 +99,7 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()(ArrowSchemaView* s data_type type, bool skip_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { if (input->length == 0) { return std::make_tuple( @@ -141,7 +142,7 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()( data_type type, bool skip_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(schema->type != NANOARROW_TYPE_LARGE_STRING, "Large strings are not yet supported in from_arrow_device", @@ -182,7 +183,7 @@ 
dispatch_tuple_t dispatch_from_arrow_device::operator()( data_type type, bool skip_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { ArrowSchemaView keys_schema_view; NANOARROW_THROW_NOT_OK( @@ -238,7 +239,7 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()( data_type type, bool skip_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { std::vector children; owned_columns_t out_owned_cols; @@ -283,7 +284,7 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()( data_type type, bool skip_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { size_type const num_rows = input->length; size_type const offset = input->offset; @@ -324,7 +325,7 @@ dispatch_tuple_t get_column(ArrowSchemaView* schema, data_type type, bool skip_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { return type.id() != type_id::EMPTY ? std::move(type_dispatcher( @@ -342,7 +343,7 @@ dispatch_tuple_t get_column(ArrowSchemaView* schema, unique_table_view_t from_arrow_device(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(schema != nullptr && input != nullptr, "input ArrowSchema and ArrowDeviceArray must not be NULL", @@ -397,7 +398,7 @@ unique_table_view_t from_arrow_device(ArrowSchema const* schema, unique_column_view_t from_arrow_device_column(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(schema != nullptr && input != nullptr, "input ArrowSchema and ArrowDeviceArray must not be NULL", @@ -429,7 +430,7 @@ unique_column_view_t from_arrow_device_column(ArrowSchema const* schema, unique_table_view_t from_arrow_device(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -439,7 +440,7 @@ unique_table_view_t from_arrow_device(ArrowSchema const* schema, unique_column_view_t from_arrow_device_column(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu index b7e07056686..b3087dedf98 100644 --- a/cpp/src/interop/from_arrow_host.cu +++ b/cpp/src/interop/from_arrow_host.cu @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -49,7 +50,7 @@ namespace { struct dispatch_copy_from_arrow_host { rmm::cuda_stream_view stream; - rmm::mr::device_memory_resource* mr; + rmm::device_async_resource_ref mr; std::unique_ptr get_mask_buffer(ArrowArray const* array) { @@ -131,7 +132,7 @@ std::unique_ptr get_column_copy(ArrowSchemaView* schema, data_type type, bool skip_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + rmm::device_async_resource_ref mr); template <> std::unique_ptr dispatch_copy_from_arrow_host::operator()(ArrowSchemaView* schema, @@ -388,7 +389,7 @@ std::unique_ptr get_column_copy(ArrowSchemaView* schema, data_type type, bool skip_mask, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + 
rmm::device_async_resource_ref mr) { return type.id() != type_id::EMPTY ? std::move(type_dispatcher( @@ -405,7 +406,7 @@ std::unique_ptr get_column_copy(ArrowSchemaView* schema, std::unique_ptr from_arrow_host(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(schema != nullptr && input != nullptr, "input ArrowSchema and ArrowDeviceArray must not be NULL", @@ -441,7 +442,7 @@ std::unique_ptr
from_arrow_host(ArrowSchema const* schema, std::unique_ptr from_arrow_host_column(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(schema != nullptr && input != nullptr, "input ArrowSchema and ArrowDeviceArray must not be NULL", @@ -462,7 +463,7 @@ std::unique_ptr from_arrow_host_column(ArrowSchema const* schema, std::unique_ptr
from_arrow_host(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -472,7 +473,7 @@ std::unique_ptr
from_arrow_host(ArrowSchema const* schema, std::unique_ptr from_arrow_host_column(ArrowSchema const* schema, ArrowDeviceArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -482,7 +483,7 @@ std::unique_ptr from_arrow_host_column(ArrowSchema const* schema, std::unique_ptr
from_arrow(ArrowSchema const* schema, ArrowArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -497,7 +498,7 @@ std::unique_ptr
from_arrow(ArrowSchema const* schema, std::unique_ptr from_arrow_column(ArrowSchema const* schema, ArrowArray const* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); diff --git a/cpp/src/interop/from_arrow_stream.cu b/cpp/src/interop/from_arrow_stream.cu index 0c85b561944..578105aa90a 100644 --- a/cpp/src/interop/from_arrow_stream.cu +++ b/cpp/src/interop/from_arrow_stream.cu @@ -41,7 +41,7 @@ namespace { std::unique_ptr make_empty_column_from_schema(ArrowSchema const* schema, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { ArrowSchemaView schema_view; NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&schema_view, schema, nullptr)); @@ -81,7 +81,7 @@ std::unique_ptr make_empty_column_from_schema(ArrowSchema const* schema, std::unique_ptr
from_arrow_stream(ArrowArrayStream* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(input != nullptr, "input ArrowArrayStream must not be NULL", std::invalid_argument); @@ -135,7 +135,7 @@ std::unique_ptr
from_arrow_stream(ArrowArrayStream* input, std::unique_ptr
from_arrow_stream(ArrowArrayStream* input, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::from_arrow_stream(input, stream, mr); diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 2b3aa2f08f1..622a3aba4bb 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -365,6 +365,9 @@ std::shared_ptr dispatch_to_arrow::operator()( arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { + CUDF_EXPECTS(metadata.children_meta.empty() || + metadata.children_meta.size() == static_cast(input.num_children()), + "Number of field names and number of children do not match\n"); std::unique_ptr tmp_column = nullptr; if ((input.offset() != 0) or ((input.num_children() == 2) and (input.child(0).size() - 1 != input.size()))) { @@ -375,8 +378,11 @@ std::shared_ptr dispatch_to_arrow::operator()( auto children_meta = metadata.children_meta.empty() ? std::vector{{}, {}} : metadata.children_meta; auto child_arrays = fetch_child_array(input_view, children_meta, ar_mr, stream); - if (child_arrays.empty()) { - return std::make_shared(arrow::list(arrow::null()), 0, nullptr, nullptr); + if (child_arrays.empty() || child_arrays[0]->data()->length == 0) { + auto element_type = child_arrays.empty() ? arrow::null() : child_arrays[1]->type(); + auto result = arrow::MakeEmptyArray(arrow::list(element_type), ar_mr); + CUDF_EXPECTS(result.ok(), "Failed to construct empty arrow list array\n"); + return result.ValueUnsafe(); } auto offset_buffer = child_arrays[0]->data()->buffers[1]; diff --git a/cpp/src/interop/to_arrow_device.cu b/cpp/src/interop/to_arrow_device.cu index ebfd6605977..b9d3a59e647 100644 --- a/cpp/src/interop/to_arrow_device.cu +++ b/cpp/src/interop/to_arrow_device.cu @@ -603,7 +603,7 @@ unique_device_array_t create_device_array(nanoarrow::UniqueArray&& out, }); result->device_id = rmm::get_current_cuda_device().value(); result->device_type = ARROW_DEVICE_CUDA; - result->sync_event = private_data->sync_event; + result->sync_event = &private_data->sync_event; result->array = private_data->parent; // makes a shallow copy result->array.private_data = private_data.release(); result->array.release = &detail::ArrowDeviceArrayRelease; diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index f8920bf82c2..5d0c6a8c83b 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -13,11 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. 
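The interop hunks above change the memory-resource parameter from a raw `device_memory_resource*` to `rmm::device_async_resource_ref`; a minimal host-copy call sketch (the populated `schema` and `device_array` inputs are assumed to come from an Arrow producer).

#include <cudf/interop.hpp>
#include <cudf/table/table.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <rmm/mr/device/per_device_resource.hpp>

#include <memory>

// Copy an Arrow host array into a cudf table; both trailing arguments have defaults and are
// shown explicitly only to highlight the new resource-ref parameter.
std::unique_ptr<cudf::table> copy_from_arrow_host(ArrowSchema const* schema,
                                                  ArrowDeviceArray const* device_array)
{
  return cudf::from_arrow_host(
    schema, device_array, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
}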
*/ + #include "nvcomp_adapter.hpp" -#include "io/utilities/config_utils.hpp" #include "nvcomp_adapter.cuh" +#include +#include #include #include @@ -35,6 +37,13 @@ #include NVCOMP_ZSTD_HEADER #endif +// When building with nvcomp 4.0 or newer, map the new version macros to the old ones +#ifndef NVCOMP_MAJOR_VERSION +#define NVCOMP_MAJOR_VERSION NVCOMP_VER_MAJOR +#define NVCOMP_MINOR_VERSION NVCOMP_VER_MINOR +#define NVCOMP_PATCH_VERSION NVCOMP_VER_PATCH +#endif + #define NVCOMP_HAS_ZSTD_DECOMP(MAJOR, MINOR, PATCH) (MAJOR > 2 or (MAJOR == 2 and MINOR >= 3)) #define NVCOMP_HAS_ZSTD_COMP(MAJOR, MINOR, PATCH) (MAJOR > 2 or (MAJOR == 2 and MINOR >= 4)) @@ -472,8 +481,8 @@ feature_status_parameters::feature_status_parameters() : lib_major_version{NVCOMP_MAJOR_VERSION}, lib_minor_version{NVCOMP_MINOR_VERSION}, lib_patch_version{NVCOMP_PATCH_VERSION}, - are_all_integrations_enabled{detail::nvcomp_integration::is_all_enabled()}, - are_stable_integrations_enabled{detail::nvcomp_integration::is_stable_enabled()} + are_all_integrations_enabled{nvcomp_integration::is_all_enabled()}, + are_stable_integrations_enabled{nvcomp_integration::is_stable_enabled()} { int device; CUDF_CUDA_TRY(cudaGetDevice(&device)); diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp b/cpp/src/io/comp/nvcomp_adapter.hpp index 1a680a050fd..43c79e32375 100644 --- a/cpp/src/io/comp/nvcomp_adapter.hpp +++ b/cpp/src/io/comp/nvcomp_adapter.hpp @@ -17,8 +17,9 @@ #pragma once #include "gpuinflate.hpp" -#include "io/utilities/config_utils.hpp" +#include +#include #include #include @@ -27,70 +28,6 @@ #include namespace cudf::io::nvcomp { - -enum class compression_type { SNAPPY, ZSTD, DEFLATE, LZ4 }; - -/** - * @brief Set of parameters that impact whether the use nvCOMP features is enabled. - */ -struct feature_status_parameters { - int lib_major_version; - int lib_minor_version; - int lib_patch_version; - bool are_all_integrations_enabled; - bool are_stable_integrations_enabled; - int compute_capability_major; - - feature_status_parameters(); - feature_status_parameters( - int major, int minor, int patch, bool all_enabled, bool stable_enabled, int cc_major) - : lib_major_version{major}, - lib_minor_version{minor}, - lib_patch_version{patch}, - are_all_integrations_enabled{all_enabled}, - are_stable_integrations_enabled{stable_enabled}, - compute_capability_major{cc_major} - { - } -}; - -/** - * @brief Equality operator overload. Required to use `feature_status_parameters` as a map key. - */ -inline bool operator==(feature_status_parameters const& lhs, feature_status_parameters const& rhs) -{ - return lhs.lib_major_version == rhs.lib_major_version and - lhs.lib_minor_version == rhs.lib_minor_version and - lhs.lib_patch_version == rhs.lib_patch_version and - lhs.are_all_integrations_enabled == rhs.are_all_integrations_enabled and - lhs.are_stable_integrations_enabled == rhs.are_stable_integrations_enabled and - lhs.compute_capability_major == rhs.compute_capability_major; -} - -/** - * @brief If a compression type is disabled through nvCOMP, returns the reason as a string. - * - * Result cab depend on nvCOMP version and environment variables. 
- * - * @param compression Compression type - * @param params Optional parameters to query status with different configurations - * @returns Reason for the feature disablement, `std::nullopt` if the feature is enabled - */ -[[nodiscard]] std::optional is_compression_disabled( - compression_type compression, feature_status_parameters params = feature_status_parameters()); - -/** - * @brief If a decompression type is disabled through nvCOMP, returns the reason as a string. - * - * Result can depend on nvCOMP version and environment variables. - * - * @param compression Compression type - * @param params Optional parameters to query status with different configurations - * @returns Reason for the feature disablement, `std::nullopt` if the feature is enabled - */ -[[nodiscard]] std::optional is_decompression_disabled( - compression_type compression, feature_status_parameters params = feature_status_parameters()); - /** * @brief Device batch decompression of given type. * diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index 7c4d5711281..00a6dcb2286 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -25,6 +25,7 @@ #include #include +#include #include #include #include @@ -372,15 +373,33 @@ void write_chunked(data_sink* out_sink, CUDF_EXPECTS(str_column_view.size() > 0, "Unexpected empty strings column."); cudf::string_scalar newline{options.get_line_terminator(), true, stream}; - auto p_str_col_w_nl = cudf::strings::detail::join_strings(str_column_view, - newline, - string_scalar{"", false, stream}, - stream, - rmm::mr::get_current_device_resource()); - strings_column_view strings_column{p_str_col_w_nl->view()}; - auto total_num_bytes = strings_column.chars_size(stream); - char const* ptr_all_bytes = strings_column.chars_begin(stream); + // use strings concatenate to build the final CSV output in device memory + auto contents_w_nl = [&] { + auto const total_size = + str_column_view.chars_size(stream) + (newline.size() * str_column_view.size()); + auto const empty_str = string_scalar("", true, stream); + // use join_strings when the output will be less than 2GB + if (total_size < static_cast(std::numeric_limits::max())) { + return cudf::strings::detail::join_strings(str_column_view, newline, empty_str, stream, mr) + ->release(); + } + auto nl_col = cudf::make_column_from_scalar(newline, str_column_view.size(), stream); + // convert the last element into an empty string by resetting the last offset value + auto& offsets = nl_col->child(strings_column_view::offsets_column_index); + auto offsets_view = offsets.mutable_view(); + cudf::fill_in_place(offsets_view, + offsets.size() - 1, // set the last element with + offsets.size(), // the value from 2nd to last element + *cudf::detail::get_element(offsets.view(), offsets.size() - 2, stream, mr), + stream); + auto const nl_tbl = cudf::table_view({str_column_view.parent(), nl_col->view()}); + return cudf::strings::detail::concatenate( + nl_tbl, empty_str, empty_str, strings::separator_on_nulls::NO, stream, mr) + ->release(); + }(); + auto const total_num_bytes = contents_w_nl.data->size(); + auto const ptr_all_bytes = static_cast(contents_w_nl.data->data()); if (out_sink->is_device_write_preferred(total_num_bytes)) { // Direct write from device memory @@ -411,13 +430,13 @@ void write_csv(data_sink* out_sink, table_view const& table, host_span user_column_names, csv_writer_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) + rmm::cuda_stream_view stream) { 
// write header: column names separated by delimiter: // (even for tables with no rows) // - write_chunked_begin(out_sink, table, user_column_names, options, stream, mr); + write_chunked_begin( + out_sink, table, user_column_names, options, stream, rmm::mr::get_current_device_resource()); if (table.num_rows() > 0) { // no need to check same-size columns constraint; auto-enforced by table_view @@ -491,7 +510,8 @@ void write_csv(data_sink* out_sink, str_table_view.column(0), options_narep, stream, rmm::mr::get_current_device_resource()); }(); - write_chunked(out_sink, str_concat_col->view(), options, stream, mr); + write_chunked( + out_sink, str_concat_col->view(), options, stream, rmm::mr::get_current_device_resource()); } } } diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 5daa55d4552..6d2834206d4 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -215,9 +215,7 @@ table_with_metadata read_json(json_reader_options options, return json::detail::read_json(datasources, options, stream, mr); } -void write_json(json_writer_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +void write_json(json_writer_options const& options, rmm::cuda_stream_view stream) { auto sinks = make_datasinks(options.get_sink()); CUDF_EXPECTS(sinks.size() == 1, "Multiple sinks not supported for JSON writing"); @@ -226,8 +224,7 @@ void write_json(json_writer_options const& options, sinks[0].get(), options.get_table(), options, - stream, - mr); + stream); } table_with_metadata read_csv(csv_reader_options options, @@ -252,9 +249,7 @@ table_with_metadata read_csv(csv_reader_options options, } // Freeform API wraps the detail writer class API -void write_csv(csv_writer_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +void write_csv(csv_writer_options const& options, rmm::cuda_stream_view stream) { using namespace cudf::io::detail; @@ -266,8 +261,7 @@ void write_csv(csv_writer_options const& options, options.get_table(), options.get_names(), options, - stream, - mr); + stream); } raw_orc_statistics read_raw_orc_statistics(source_info const& src_info, @@ -762,6 +756,9 @@ void parquet_writer_options_base::set_compression(compression_type compression) void parquet_writer_options_base::enable_int96_timestamps(bool req) { + CUDF_EXPECTS(not req or not is_enabled_write_arrow_schema(), + "INT96 timestamps and arrow schema cannot be simultaneously " + "enabled as INT96 timestamps are deprecated in Arrow."); _write_timestamps_as_int96 = req; } @@ -770,6 +767,14 @@ void parquet_writer_options_base::enable_utc_timestamps(bool val) _write_timestamps_as_UTC = val; } +void parquet_writer_options_base::enable_write_arrow_schema(bool val) +{ + CUDF_EXPECTS(not val or not is_enabled_int96_timestamps(), + "arrow schema and INT96 timestamps cannot be simultaneously " + "enabled as INT96 timestamps are deprecated in Arrow."); + _write_arrow_schema = val; +} + void parquet_writer_options_base::set_row_group_size_bytes(size_t size_bytes) { CUDF_EXPECTS( @@ -974,6 +979,13 @@ BuilderT& parquet_writer_options_builder_base::utc_timestamp return static_cast(*this); } +template +BuilderT& parquet_writer_options_builder_base::write_arrow_schema(bool enabled) +{ + _options.enable_write_arrow_schema(enabled); + return static_cast(*this); +} + template BuilderT& parquet_writer_options_builder_base::write_v2_headers(bool enabled) { diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 
74001e5e01a..9cd39038348 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -193,7 +193,8 @@ datasource::owning_buffer> get_record_range_raw_input( size_t chunk_size = reader_opts.get_byte_range_size(); CUDF_EXPECTS(total_source_size ? chunk_offset < total_source_size : !chunk_offset, - "Invalid offsetting"); + "Invalid offsetting", + std::invalid_argument); auto should_load_all_sources = !chunk_size || chunk_size >= total_source_size - chunk_offset; chunk_size = should_load_all_sources ? total_source_size - chunk_offset : chunk_size; diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index 997d6fd99f8..c688c809e04 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -805,8 +805,7 @@ void write_chunked(data_sink* out_sink, strings_column_view const& str_column_view, int const skip_last_chars, json_writer_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(str_column_view.size() > 0, "Unexpected empty strings column."); @@ -829,8 +828,7 @@ void write_chunked(data_sink* out_sink, void write_json(data_sink* out_sink, table_view const& table, json_writer_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); std::vector user_column_names = [&]() { @@ -912,7 +910,7 @@ void write_json(data_sink* out_sink, bool const include_line_terminator = (&sub_view != &vector_views.back()) or options.is_enabled_lines(); auto const skip_last_chars = (include_line_terminator ? 0 : line_terminator.size()); - write_chunked(out_sink, str_concat_col->view(), skip_last_chars, options, stream, mr); + write_chunked(out_sink, str_concat_col->view(), skip_last_chars, options, stream); } } else { if (options.is_enabled_lines()) { diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index 72eb41b1360..8e20505d3ff 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -19,13 +19,13 @@ #include "io/orc/reader_impl.hpp" #include "io/orc/reader_impl_chunking.hpp" #include "io/orc/reader_impl_helpers.hpp" -#include "io/utilities/config_utils.hpp" #include "io/utilities/hostdevice_span.hpp" #include #include #include #include +#include #include #include diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index b6fc4e3510f..805959327ac 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -16,12 +16,12 @@ #include "io/comp/nvcomp_adapter.hpp" #include "io/utilities/block_utils.cuh" -#include "io/utilities/config_utils.hpp" #include "io/utilities/time_utils.cuh" #include "orc_gpu.hpp" #include #include +#include #include #include #include diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index e9e031a407a..4cb20bb7518 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp new file mode 100644 index 00000000000..ddf65e9020f --- /dev/null +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -0,0 +1,388 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file arrow_schema_writer.cpp + * @brief Arrow IPC schema writer implementation + */ + +#include "arrow_schema_writer.hpp" + +#include "io/parquet/parquet_common.hpp" +#include "io/utilities/base64_utilities.hpp" +#include "ipc/Message_generated.h" +#include "ipc/Schema_generated.h" +#include "writer_impl_helpers.hpp" + +#include +#include +#include + +namespace cudf::io::parquet::detail { + +using namespace cudf::io::detail; + +namespace { + +// Copied over from arrow source for better code readability +namespace flatbuf = cudf::io::parquet::flatbuf; +using FlatBufferBuilder = flatbuffers::FlatBufferBuilder; +using DictionaryOffset = flatbuffers::Offset; +using FieldOffset = flatbuffers::Offset; +using Offset = flatbuffers::Offset; +using FBString = flatbuffers::Offset; + +/** + * @brief Recursively construct the arrow schema (fields) tree + * + * @param fbb The root flatbuffer builder object instance + * @param column A view of the column + * @param column_metadata Metadata of the column + * @param write_mode Flag to indicate that we are guaranteeing a single table write + * @param utc_timestamps Flag to indicate if timestamps are UTC + * + * @return Flatbuffer offset to the constructed field + */ +FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, + cudf::detail::LinkedColPtr const& column, + column_in_metadata const& column_metadata, + single_write_mode const write_mode, + bool const utc_timestamps); + +/** + * @brief Functor to convert cudf column metadata to arrow schema field metadata + */ +struct dispatch_to_flatbuf { + FlatBufferBuilder& fbb; + cudf::detail::LinkedColPtr const& col; + column_in_metadata const& col_meta; + single_write_mode const write_mode; + bool const utc_timestamps; + Offset& field_offset; + flatbuf::Type& field_type_id; + std::vector& children; + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Bool; + field_offset = flatbuf::CreateBool(fbb).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 8, std::numeric_limits::is_signed).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 16, std::numeric_limits::is_signed).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 32, std::numeric_limits::is_signed).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 64, std::numeric_limits::is_signed).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 8, std::numeric_limits::is_signed).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 16, std::numeric_limits::is_signed).Union(); + } + + template + std::enable_if_t, void> operator()() + 
{ + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 32, std::numeric_limits::is_signed).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Int; + field_offset = flatbuf::CreateInt(fbb, 64, std::numeric_limits::is_signed).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_FloatingPoint; + field_offset = flatbuf::CreateFloatingPoint(fbb, flatbuf::Precision::Precision_SINGLE).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_FloatingPoint; + field_offset = flatbuf::CreateFloatingPoint(fbb, flatbuf::Precision::Precision_DOUBLE).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Utf8View; + field_offset = flatbuf::CreateUtf8View(fbb).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Date; + // Date type (Set unit type to DAY for arrows's Date32) + field_offset = flatbuf::CreateDate(fbb, flatbuf::DateUnit_DAY).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Timestamp; + // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp + field_offset = flatbuf::CreateTimestamp( + fbb, flatbuf::TimeUnit_SECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0) + .Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Timestamp; + // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp + field_offset = + flatbuf::CreateTimestamp( + fbb, flatbuf::TimeUnit_MILLISECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0) + .Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Timestamp; + // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp + field_offset = + flatbuf::CreateTimestamp( + fbb, flatbuf::TimeUnit_MICROSECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0) + .Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Timestamp; + // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp + field_offset = + flatbuf::CreateTimestamp( + fbb, flatbuf::TimeUnit_NANOSECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0) + .Union(); + } + + template + std::enable_if_t, void> operator()() + { + // `duration_D` is written as TimeType as `duration_D` is not a valid arrow type. + // This also allows for easy and faithful roundtripping with cudf. 
+ field_type_id = flatbuf::Type_Time; + field_offset = flatbuf::CreateTime(fbb, flatbuf::TimeUnit_MILLISECOND).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Duration; + field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_SECOND).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Duration; + field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_MILLISECOND).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Duration; + field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_MICROSECOND).Union(); + } + + template + std::enable_if_t, void> operator()() + { + field_type_id = flatbuf::Type_Duration; + field_offset = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_NANOSECOND).Union(); + } + + template + std::enable_if_t(), void> operator()() + { + field_type_id = flatbuf::Type_Decimal; + field_offset = flatbuf::CreateDecimal(fbb, + (col_meta.is_decimal_precision_set()) + ? col_meta.get_decimal_precision() + : MAX_DECIMAL128_PRECISION, + col->type().scale(), + 128) + .Union(); + } + + template + std::enable_if_t(), void> operator()() + { + // Lists are represented differently in arrow and cuDF. + // cuDF representation: List: "col_name" : { "list", "element:int" } (2 children) + // arrow schema representation: List: "col_name" : { "list" } (1 child) + // Hence, we only need to process the second child of the list. + if constexpr (std::is_same_v) { + children.emplace_back(make_arrow_schema_fields( + fbb, col->children[1], col_meta.child(1), write_mode, utc_timestamps)); + field_type_id = flatbuf::Type_List; + field_offset = flatbuf::CreateList(fbb).Union(); + } + + // Traverse the struct in DFS manner and process children fields. + else if constexpr (std::is_same_v) { + std::transform(thrust::make_counting_iterator(0UL), + thrust::make_counting_iterator(col->children.size()), + std::back_inserter(children), + [&](auto const idx) { + return make_arrow_schema_fields( + fbb, col->children[idx], col_meta.child(idx), write_mode, utc_timestamps); + }); + field_type_id = flatbuf::Type_Struct_; + field_offset = flatbuf::CreateStruct_(fbb).Union(); + } + } + + template + std::enable_if_t(), void> operator()() + { + // `dictionary32` columns are not written to parquet by cudf. 
+ CUDF_FAIL("Dictionary columns are not supported for writing"); + } +}; + +FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb, + cudf::detail::LinkedColPtr const& column, + column_in_metadata const& column_metadata, + single_write_mode const write_mode, + bool const utc_timestamps) +{ + // Variables to be set by the dispatch_to_flatbuf functor + Offset field_offset = 0; + flatbuf::Type field_type_id = flatbuf::Type_NONE; + std::vector children; + + cudf::type_dispatcher(column->type(), + dispatch_to_flatbuf{fbb, + column, + column_metadata, + write_mode, + utc_timestamps, + field_offset, + field_type_id, + children}); + + // push to field offsets vector + return flatbuf::CreateField( + fbb, + fbb.CreateString(column_metadata.get_name()), // name + is_output_column_nullable(column, column_metadata, write_mode), // nullable + field_type_id, // type id + field_offset, // field offset + {0}, // DictionaryOffset + fbb.CreateVector(children.data(), children.size())); // children vector +} + +} // namespace + +std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, + table_input_metadata const& metadata, + single_write_mode const write_mode, + bool const utc_timestamps) +{ + // Lambda function to convert int32 to a string of uint8 bytes + auto const convert_int32_to_byte_string = [&](int32_t const value) { + std::array buffer; + std::memcpy(buffer.data(), &value, sizeof(int32_t)); + return std::string(reinterpret_cast(buffer.data()), buffer.size()); + }; + + // Instantiate a flatbuffer builder + FlatBufferBuilder fbb; + + // Create an empty field offset vector and reserve space for linked columns + std::vector field_offsets; + field_offsets.reserve(linked_columns.size()); + + // populate field offsets (aka schema fields) + std::transform(thrust::make_zip_iterator( + thrust::make_tuple(linked_columns.begin(), metadata.column_metadata.begin())), + thrust::make_zip_iterator( + thrust::make_tuple(linked_columns.end(), metadata.column_metadata.end())), + std::back_inserter(field_offsets), + [&](auto const& elem) { + return make_arrow_schema_fields( + fbb, thrust::get<0>(elem), thrust::get<1>(elem), write_mode, utc_timestamps); + }); + + // Build an arrow:schema flatbuffer using the field offset vector and use it as the header to + // create an ipc message flatbuffer + fbb.Finish(flatbuf::CreateMessage( + fbb, + flatbuf::MetadataVersion_V5, // Metadata version V5 (latest) + flatbuf::MessageHeader_Schema, // Schema type message header + flatbuf::CreateSchema(fbb, + flatbuf::Endianness::Endianness_Little, + fbb.CreateVector(field_offsets)) + .Union(), // arrow:schema built from the field vector + SCHEMA_HEADER_TYPE_IPC_MESSAGE_BODYLENGTH // Body length is zero for schema type ipc message + )); + + // Construct the final string and store it here to use its view in base64_encode + std::string const ipc_message = + convert_int32_to_byte_string(IPC_CONTINUATION_TOKEN) + + // Since the schema type ipc message doesn't have a body, the flatbuffer size is equal to the + // ipc message's metadata length + convert_int32_to_byte_string(fbb.GetSize()) + + std::string(reinterpret_cast(fbb.GetBufferPointer()), fbb.GetSize()); + + // Encode the final ipc message string to base64 and return + return cudf::io::detail::base64_encode(ipc_message); +} + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp new file mode 100644 index 00000000000..9bc435bf6c8 --- /dev/null +++ 
b/cpp/src/io/parquet/arrow_schema_writer.hpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file arrow_schema_writer.hpp + * @brief Arrow IPC schema writer implementation + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace cudf::io::parquet::detail { + +/** + * @brief Construct and return arrow schema from input parquet schema + * + * Recursively traverses through parquet schema to construct the arrow schema tree. + * Serializes the arrow schema tree and stores it as the header (or metadata) of + * an otherwise empty ipc message using flatbuffers. The ipc message is then prepended + * with header size (padded for 16 byte alignment) and a continuation string. The final + * string is base64 encoded and returned. + * + * @param linked_columns Vector of table column views + * @param metadata Metadata of the columns of the table + * @param write_mode Flag to indicate that we are guaranteeing a single table write + * @param utc_timestamps Flag to indicate if timestamps are UTC + * + * @return The constructed arrow ipc message string + */ +std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns, + table_input_metadata const& metadata, + cudf::io::detail::single_write_mode const write_mode, + bool const utc_timestamps); + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index ea80ae73c2f..8a866141c4b 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -792,7 +792,7 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, gpuDecodePageDataGeneric <<>>( @@ -801,7 +801,7 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, gpuDecodePageDataGeneric <<>>( @@ -812,7 +812,7 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, gpuDecodePageDataGeneric <<>>( @@ -821,7 +821,7 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span pages, gpuDecodePageDataGeneric <<>>( diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu index e49801e6172..62f1ee88036 100644 --- a/cpp/src/io/parquet/decode_preprocess.cu +++ b/cpp/src/io/parquet/decode_preprocess.cu @@ -26,6 +26,8 @@ namespace cudf::io::parquet::detail { +namespace cg = cooperative_groups; + namespace { // # of threads we're decoding with @@ -163,7 +165,8 @@ __device__ size_type gpuDecodeTotalPageStringSize(page_state_s* s, int t) // For V1, the choice is an overestimate (s->dict_size), or an exact number that's // expensive to compute. For now we're going with the latter. 
else { - str_len = gpuInitStringDescriptors(s, nullptr, target_pos, t); + str_len = gpuInitStringDescriptors( + s, nullptr, target_pos, cg::this_thread_block()); } break; diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 7207173b82f..e0d50d7ccf9 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -23,6 +23,8 @@ namespace cudf::io::parquet::detail { +namespace cg = cooperative_groups; + namespace { constexpr int decode_block_size = 128; @@ -277,6 +279,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) } // this needs to be here to prevent warp 3 modifying src_pos before all threads have read it __syncthreads(); + auto const tile_warp = cg::tiled_partition(cg::this_thread_block()); if (t < 32) { // decode repetition and definition levels. // - update validity vectors @@ -298,9 +301,9 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) src_target_pos = gpuDecodeRleBooleans(s, sb, src_target_pos, t & 0x1f); } else if (s->col.physical_type == BYTE_ARRAY or s->col.physical_type == FIXED_LEN_BYTE_ARRAY) { - gpuInitStringDescriptors(s, sb, src_target_pos, t & 0x1f); + gpuInitStringDescriptors(s, sb, src_target_pos, tile_warp); } - if (t == 32) { s->dict_pos = src_target_pos; } + if (tile_warp.thread_rank() == 0) { s->dict_pos = src_target_pos; } } else { // WARP1..WARP3: Decode values int const dtype = s->col.physical_type; diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index b1f8e6dd5fe..a3f91f6859b 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -21,6 +21,7 @@ #include "parquet_gpu.hpp" #include "rle_stream.cuh" +#include #include #include @@ -420,46 +421,62 @@ inline __device__ int gpuDecodeRleBooleans(page_state_s* s, state_buf* sb, int t * @param[in,out] s Page state input/output * @param[out] sb Page state buffer output * @param[in] target_pos Target output position - * @param[in] t Thread ID + * @param[in] g Cooperative group (thread block or tile) * @tparam sizes_only True if only sizes are to be calculated * @tparam state_buf Typename of the `state_buf` (usually inferred) + * @tparam thread_group Typename of the cooperative group (inferred) * * @return Total length of strings processed */ -template -__device__ size_type -gpuInitStringDescriptors(page_state_s* s, [[maybe_unused]] state_buf* sb, int target_pos, int t) +template +__device__ size_type gpuInitStringDescriptors(page_state_s* s, + [[maybe_unused]] state_buf* sb, + int target_pos, + thread_group const& g) { - int pos = s->dict_pos; - int total_len = 0; + int const t = g.thread_rank(); + int const dict_size = s->dict_size; + int k = s->dict_val; + int pos = s->dict_pos; + int total_len = 0; + + // All group threads can participate for fixed len byte arrays. + if (s->col.physical_type == FIXED_LEN_BYTE_ARRAY) { + int const dtype_len_in = s->dtype_len_in; + total_len = min((target_pos - pos) * dtype_len_in, dict_size - s->dict_val); + if constexpr (!sizes_only) { + for (pos += t, k += t * dtype_len_in; pos < target_pos; pos += g.size()) { + sb->str_len[rolling_index(pos)] = + (k < dict_size) ? dtype_len_in : 0; + // dict_idx is upperbounded by dict_size. + sb->dict_idx[rolling_index(pos)] = k; + // Increment k if needed. 
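// A reduced sketch of the cooperative-groups generalization applied to
// gpuInitStringDescriptors above: the same device helper can be handed either
// a whole thread block or a warp-sized tile, striding by g.size() and using
// g.thread_rank() instead of hand-rolled lane arithmetic. fill_fixed_len and
// example_kernel are made-up names for illustration only.
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

template <typename ThreadGroup>
__device__ void fill_fixed_len(int* lengths, int count, int fixed_len, ThreadGroup const& g)
{
  // Each participating thread handles indices rank, rank + g.size(), rank + 2*g.size(), ...
  for (int i = g.thread_rank(); i < count; i += g.size()) {
    lengths[i] = fixed_len;
  }
  // Single-writer updates (like s->dict_val in the real code) would be guarded
  // with: if (g.thread_rank() == 0) { ... }
}

__global__ void example_kernel(int* lengths, int count)
{
  // A warp-sized tile here; passing cg::this_thread_block() works just as well.
  auto const tile = cg::tiled_partition<32>(cg::this_thread_block());
  fill_fixed_len(lengths, count, 4, tile);
}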
+ if (k < dict_size) { k = min(k + (g.size() * dtype_len_in), dict_size); } + } + } + // Only thread_rank = 0 updates the s->dict_val + if (!t) { s->dict_val += total_len; } + } + // This step is purely serial for byte arrays + else { + if (!t) { + uint8_t const* cur = s->data_start; - // This step is purely serial - if (!t) { - uint8_t const* cur = s->data_start; - int dict_size = s->dict_size; - int k = s->dict_val; - - while (pos < target_pos) { - int len = 0; - if (s->col.physical_type == FIXED_LEN_BYTE_ARRAY) { - if (k < dict_size) { len = s->dtype_len_in; } - } else { + for (int len = 0; pos < target_pos; pos++, len = 0) { if (k + 4 <= dict_size) { len = (cur[k]) | (cur[k + 1] << 8) | (cur[k + 2] << 16) | (cur[k + 3] << 24); k += 4; if (k + len > dict_size) { len = 0; } } + if constexpr (!sizes_only) { + sb->dict_idx[rolling_index(pos)] = k; + sb->str_len[rolling_index(pos)] = len; + } + k += len; + total_len += len; } - if constexpr (!sizes_only) { - sb->dict_idx[rolling_index(pos)] = k; - sb->str_len[rolling_index(pos)] = len; - } - k += len; - total_len += len; - pos++; + s->dict_val = k; } - s->dict_val = k; - __threadfence_block(); } return total_len; diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 58e8a09d5b6..ca74a1c2ba0 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -31,6 +31,8 @@ namespace cudf::io::parquet::detail { +namespace cg = cooperative_groups; + namespace { constexpr int preprocess_block_size = 512; @@ -1006,6 +1008,10 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) } // this needs to be here to prevent warp 1/2 modifying src_pos before all threads have read it __syncthreads(); + + // Create a warp sized thread block tile + auto const tile_warp = cg::tiled_partition(cg::this_thread_block()); + if (t < 32) { // decode repetition and definition levels. 
// - update validity vectors @@ -1020,9 +1026,9 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) if (s->dict_base) { src_target_pos = gpuDecodeDictionaryIndices(s, sb, src_target_pos, lane_id).first; } else { - gpuInitStringDescriptors(s, sb, src_target_pos, lane_id); + gpuInitStringDescriptors(s, sb, src_target_pos, tile_warp); } - if (t == 32) { s->dict_pos = src_target_pos; } + if (tile_warp.thread_rank() == 0) { s->dict_pos = src_target_pos; } } else { int const me = t - out_thread0; diff --git a/cpp/src/io/parquet/parquet_common.hpp b/cpp/src/io/parquet/parquet_common.hpp index 8507eca047e..e42c259b1bf 100644 --- a/cpp/src/io/parquet/parquet_common.hpp +++ b/cpp/src/io/parquet/parquet_common.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include namespace cudf::io::parquet::detail { @@ -26,6 +27,15 @@ auto constexpr MAX_DECIMAL32_PRECISION = 9; auto constexpr MAX_DECIMAL64_PRECISION = 18; auto constexpr MAX_DECIMAL128_PRECISION = 38; // log10(2^(sizeof(int128_t) * 8 - 1) - 1) +// Constants copied from arrow source and renamed to match the case +int32_t constexpr MESSAGE_DECODER_NEXT_REQUIRED_SIZE_INITIAL = sizeof(int32_t); +int32_t constexpr MESSAGE_DECODER_NEXT_REQUIRED_SIZE_METADATA_LENGTH = sizeof(int32_t); +int32_t constexpr IPC_CONTINUATION_TOKEN = -1; +std::string const ARROW_SCHEMA_KEY = "ARROW:schema"; + +// Schema type ipc message has zero length body +int64_t constexpr SCHEMA_HEADER_TYPE_IPC_MESSAGE_BODYLENGTH = 0; + /** * @brief Basic data types in Parquet, determines how data is physically stored */ diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index d371ef5de93..3da303e6928 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -16,7 +16,6 @@ #include "compact_protocol_reader.hpp" #include "io/comp/nvcomp_adapter.hpp" -#include "io/utilities/config_utils.hpp" #include "io/utilities/time_utils.cuh" #include "reader_impl.hpp" #include "reader_impl_chunking.hpp" @@ -25,6 +24,7 @@ #include #include #include +#include #include @@ -862,7 +862,7 @@ std::vector compute_page_splits_by_row(device_span aggregate_reader_metadata::decode_ipc_message( std::string_view const serialized_message) const { - // Constants copied from arrow source and renamed to match the case - constexpr int32_t MESSAGE_DECODER_NEXT_REQUIRED_SIZE_INITIAL = sizeof(int32_t); - constexpr int32_t MESSAGE_DECODER_NEXT_REQUIRED_SIZE_METADATA_LENGTH = sizeof(int32_t); - constexpr int32_t IPC_CONTINUATION_TOKEN = -1; - // message buffer auto message_buf = serialized_message.data(); // current message (buffer) size diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 9aeb19a7723..6bfa8519c76 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -117,6 +117,9 @@ struct metadata : public FileMetaData { void sanitize_schema(); }; +/** + * @brief Class to extract data types from arrow schema tree + */ struct arrow_schema_data_types { std::vector children; data_type type{type_id::EMPTY}; @@ -142,7 +145,7 @@ class aggregate_reader_metadata { const; /** - * @brief Decodes and constructs the arrow schema from the "ARROW:schema" IPC message + * @brief Decodes and constructs the arrow schema from the ARROW_SCHEMA_KEY IPC message * in key value metadata section of Parquet file footer */ [[nodiscard]] arrow_schema_data_types collect_arrow_schema() const; diff --git a/cpp/src/io/parquet/writer_impl.cu 
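// A host-side sketch of how the reader-side decode_ipc_message (referenced in
// the reader changes above) consumes the prefix the writer emits: a 4-byte
// continuation token of -1 followed by a 4-byte metadata length, after which
// the flatbuffer Schema header begins. parse_ipc_prefix is an illustrative
// name; base64 decoding and flatbuffer verification are omitted.
#include <cstdint>
#include <cstring>
#include <optional>
#include <string_view>

inline std::optional<int32_t> parse_ipc_prefix(std::string_view message)
{
  if (message.size() < 2 * sizeof(int32_t)) { return std::nullopt; }
  int32_t token        = 0;
  int32_t metadata_len = 0;
  std::memcpy(&token, message.data(), sizeof(int32_t));
  std::memcpy(&metadata_len, message.data() + sizeof(int32_t), sizeof(int32_t));
  if (token != -1) { return std::nullopt; }  // expect the IPC continuation token
  return metadata_len;  // number of flatbuffer header bytes that follow
}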
b/cpp/src/io/parquet/writer_impl.cu index bed4dbc5a66..8413e716224 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -19,6 +19,7 @@ * @brief cuDF-IO parquet writer class implementation */ +#include "arrow_schema_writer.hpp" #include "compact_protocol_reader.hpp" #include "compact_protocol_writer.hpp" #include "io/comp/nvcomp_adapter.hpp" @@ -26,22 +27,20 @@ #include "io/parquet/parquet_gpu.hpp" #include "io/statistics/column_statistics.cuh" #include "io/utilities/column_utils.cuh" -#include "io/utilities/config_utils.hpp" #include "parquet_common.hpp" #include "parquet_gpu.cuh" #include "writer_impl.hpp" +#include "writer_impl_helpers.hpp" #include #include #include #include #include +#include #include #include #include -#include -#include -#include #include #include @@ -70,7 +69,8 @@ struct aggregate_writer_metadata { host_span const> kv_md, host_span tbl_schema, size_type num_columns, - statistics_freq stats_granularity) + statistics_freq stats_granularity, + std::string const arrow_schema_ipc_message) : version(1), schema(std::vector(tbl_schema.begin(), tbl_schema.end())), files(partitions.size()) @@ -92,6 +92,13 @@ struct aggregate_writer_metadata { return KeyValue{kv.first, kv.second}; }); } + + // Append arrow schema to the key-value metadata + if (not arrow_schema_ipc_message.empty()) { + std::for_each(this->files.begin(), this->files.end(), [&](auto& file) { + file.key_value_metadata.emplace_back(KeyValue{ARROW_SCHEMA_KEY, arrow_schema_ipc_message}); + }); + } } aggregate_writer_metadata(aggregate_writer_metadata const&) = default; @@ -182,26 +189,6 @@ struct aggregate_writer_metadata { namespace { -/** - * @brief Function that translates GDF compression to parquet compression. - * - * @param compression The compression type - * @return The supported Parquet compression - */ -Compression to_parquet_compression(compression_type compression) -{ - switch (compression) { - case compression_type::AUTO: - case compression_type::SNAPPY: return Compression::SNAPPY; - case compression_type::ZSTD: return Compression::ZSTD; - case compression_type::LZ4: - // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 - return Compression::LZ4_RAW; - case compression_type::NONE: return Compression::UNCOMPRESSED; - default: CUDF_FAIL("Unsupported compression type"); - } -} - /** * @brief Convert a mask of encodings to a vector. * @@ -326,6 +313,7 @@ struct leaf_schema_fn { column_in_metadata const& col_meta; bool timestamp_is_int96; bool timestamp_is_utc; + bool write_arrow_schema; template std::enable_if_t, void> operator()() @@ -493,10 +481,11 @@ struct leaf_schema_fn { } } - // unsupported outside cudf for parquet 1.0. template std::enable_if_t, void> operator()() { + // duration_D is based on int32_t and not a valid arrow duration type so simply convert to + // time32(ms). 
col_schema.type = Type::INT32; col_schema.converted_type = ConvertedType::TIME_MILLIS; col_schema.stats_dtype = statistics_dtype::dtype_int32; @@ -507,62 +496,86 @@ struct leaf_schema_fn { template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT32; - col_schema.converted_type = ConvertedType::TIME_MILLIS; - col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.ts_scale = 1000; - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + // If writing arrow schema, no logical type nor converted type is necessary + if (write_arrow_schema) { + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + } else { + // Write as Time32 logical type otherwise. Parquet TIME_MILLIS annotates INT32 + col_schema.type = Type::INT32; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + col_schema.ts_scale = 1000; + } } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT32; - col_schema.converted_type = ConvertedType::TIME_MILLIS; - col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + // If writing arrow schema, no logical type nor converted type is necessary + if (write_arrow_schema) { + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + } else { + // Write as Time32 logical type otherwise. Parquet TIME_MILLIS annotates INT32 + col_schema.type = Type::INT32; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; + } } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.converted_type = ConvertedType::TIME_MICROS; - col_schema.stats_dtype = statistics_dtype::dtype_int64; - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MICROS}}; + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + // Only write as time64 logical type if not writing arrow schema + if (not write_arrow_schema) { + col_schema.converted_type = ConvertedType::TIME_MICROS; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MICROS}}; + } } - // unsupported outside cudf for parquet 1.0. 
template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.stats_dtype = statistics_dtype::dtype_int64; - col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::NANOS}}; + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + // Only write as time64 logical type if not writing arrow schema + if (not write_arrow_schema) { + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::NANOS}}; + } } template std::enable_if_t(), void> operator()() { - if (std::is_same_v) { - col_schema.type = Type::INT32; - col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.decimal_precision = MAX_DECIMAL32_PRECISION; - col_schema.logical_type = LogicalType{DecimalType{0, MAX_DECIMAL32_PRECISION}}; - } else if (std::is_same_v) { - col_schema.type = Type::INT64; - col_schema.stats_dtype = statistics_dtype::dtype_decimal64; - col_schema.decimal_precision = MAX_DECIMAL64_PRECISION; - col_schema.logical_type = LogicalType{DecimalType{0, MAX_DECIMAL64_PRECISION}}; - } else if (std::is_same_v) { + // If writing arrow schema, then convert d32 and d64 to d128 + if (write_arrow_schema or std::is_same_v) { col_schema.type = Type::FIXED_LEN_BYTE_ARRAY; col_schema.type_length = sizeof(__int128_t); col_schema.stats_dtype = statistics_dtype::dtype_decimal128; col_schema.decimal_precision = MAX_DECIMAL128_PRECISION; col_schema.logical_type = LogicalType{DecimalType{0, MAX_DECIMAL128_PRECISION}}; } else { - CUDF_FAIL("Unsupported fixed point type for parquet writer"); + if (std::is_same_v) { + col_schema.type = Type::INT32; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.decimal_precision = MAX_DECIMAL32_PRECISION; + col_schema.logical_type = LogicalType{DecimalType{0, MAX_DECIMAL32_PRECISION}}; + } else if (std::is_same_v) { + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_decimal64; + col_schema.decimal_precision = MAX_DECIMAL64_PRECISION; + col_schema.logical_type = LogicalType{DecimalType{0, MAX_DECIMAL64_PRECISION}}; + } else { + CUDF_FAIL("Unsupported fixed point type for parquet writer"); + } } + + // Write logical and converted types, decimal scale and precision col_schema.converted_type = ConvertedType::DECIMAL; col_schema.decimal_scale = -col->type().scale(); // parquet and cudf disagree about scale signs col_schema.logical_type->decimal_type->scale = -col->type().scale(); @@ -590,33 +603,19 @@ struct leaf_schema_fn { } }; -inline bool is_col_nullable(cudf::detail::LinkedColPtr const& col, - column_in_metadata const& col_meta, - single_write_mode write_mode) -{ - if (col_meta.is_nullability_defined()) { - CUDF_EXPECTS(col_meta.nullable() or col->null_count() == 0, - "Mismatch in metadata prescribed nullability and input column. " - "Metadata for input column with nulls cannot prescribe nullability = false"); - return col_meta.nullable(); - } - // For chunked write, when not provided nullability, we assume the worst case scenario - // that all columns are nullable. - return write_mode == single_write_mode::NO or col->nullable(); -} - /** * @brief Construct schema from input columns and per-column input options * * Recursively traverses through linked_columns and corresponding metadata to construct schema tree. * The resulting schema tree is stored in a vector in pre-order traversal order. 
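// The MAX_DECIMAL32/64/128_PRECISION limits used in the fixed-point branch
// above follow the usual rule "largest power of ten representable by a signed
// integer of that width", i.e. floor(log10(2^(bits - 1) - 1)). A quick
// host-side check of that relationship; max_decimal_precision is an
// illustrative helper, not part of the patch.
#include <cmath>
#include <cstdint>

inline int32_t max_decimal_precision(int32_t bits)
{
  // 2^(bits - 1) - 1 is the largest positive value of a `bits`-wide signed integer
  return static_cast<int32_t>(std::floor((bits - 1) * std::log10(2.0)));
}
// max_decimal_precision(32) == 9, max_decimal_precision(64) == 18, max_decimal_precision(128) == 38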
*/ -std::vector construct_schema_tree( +std::vector construct_parquet_schema_tree( cudf::detail::LinkedColVector const& linked_columns, table_input_metadata& metadata, single_write_mode write_mode, bool int96_timestamps, - bool utc_timestamps) + bool utc_timestamps, + bool write_arrow_schema) { std::vector schema; schema_tree_node root{}; @@ -629,7 +628,7 @@ std::vector construct_schema_tree( std::function add_schema = [&](cudf::detail::LinkedColPtr const& col, column_in_metadata& col_meta, size_t parent_idx) { - bool const col_nullable = is_col_nullable(col, col_meta, write_mode); + bool const col_nullable = is_output_column_nullable(col, col_meta, write_mode); auto set_field_id = [&schema, parent_idx](schema_tree_node& s, column_in_metadata const& col_meta) { @@ -854,7 +853,7 @@ std::vector construct_schema_tree( right_child_meta.set_name("value"); // check the repetition type of key is required i.e. the col should be non-nullable auto key_col = col->children[lists_column_view::child_column_index]->children[0]; - CUDF_EXPECTS(!is_col_nullable(key_col, left_child_meta, write_mode), + CUDF_EXPECTS(!is_output_column_nullable(key_col, left_child_meta, write_mode), "key column cannot be nullable. For chunked writing, explicitly set the " "nullability to false in metadata"); // process key @@ -886,7 +885,8 @@ std::vector construct_schema_tree( cudf::type_dispatcher( col->type(), - leaf_schema_fn{col_schema, col, col_meta, timestamp_is_int96, utc_timestamps}); + leaf_schema_fn{ + col_schema, col, col_meta, timestamp_is_int96, utc_timestamps, write_arrow_schema}); col_schema.repetition_type = col_nullable ? OPTIONAL : REQUIRED; col_schema.name = (schema[parent_idx].name == "list") ? "element" : col_meta.get_name(); @@ -1148,7 +1148,6 @@ void calculate_page_fragments(device_span frag, * * @param frag_stats output statistics * @param frags Input page fragments - * @param int96_timestamps Flag to indicate if timestamps will be written as INT96 * @param stream CUDA stream used for device memory operations and kernel launches */ void gather_fragment_statistics(device_span frag_stats, @@ -1164,32 +1163,6 @@ void gather_fragment_statistics(device_span frag_stats, stream.synchronize(); } -auto to_nvcomp_compression_type(Compression codec) -{ - if (codec == Compression::SNAPPY) return nvcomp::compression_type::SNAPPY; - if (codec == Compression::ZSTD) return nvcomp::compression_type::ZSTD; - // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 - if (codec == Compression::LZ4_RAW) return nvcomp::compression_type::LZ4; - CUDF_FAIL("Unsupported compression type"); -} - -auto page_alignment(Compression codec) -{ - if (codec == Compression::UNCOMPRESSED or - nvcomp::is_compression_disabled(to_nvcomp_compression_type(codec))) { - return 1u; - } - - return 1u << nvcomp::compress_input_alignment_bits(to_nvcomp_compression_type(codec)); -} - -size_t max_compression_output_size(Compression codec, uint32_t compression_blocksize) -{ - if (codec == Compression::UNCOMPRESSED) return 0; - - return compress_max_output_chunk_size(to_nvcomp_compression_type(codec), compression_blocksize); -} - auto init_page_sizes(hostdevice_2dvector& chunks, device_span col_desc, uint32_t num_columns, @@ -1629,23 +1602,127 @@ size_t column_index_buffer_size(EncColumnChunk* ck, } /** - * @brief Fill the table metadata with default column names. 
+ * @brief Convert decimal32 and decimal64 data to decimal128 and return the device vector * - * @param table_meta The table metadata to fill + * @tparam DecimalType to convert from + * + * @param column A view of the input columns + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return A device vector containing the converted decimal128 data */ -void fill_table_meta(std::unique_ptr const& table_meta) +template +rmm::device_uvector<__int128_t> convert_data_to_decimal128(column_view const& column, + rmm::cuda_stream_view stream) { - // Fill unnamed columns' names in table_meta - std::function add_default_name = - [&](column_in_metadata& col_meta, std::string default_name) { - if (col_meta.get_name().empty()) col_meta.set_name(default_name); - for (size_type i = 0; i < col_meta.num_children(); ++i) { - add_default_name(col_meta.child(i), col_meta.get_name() + "_" + std::to_string(i)); - } - }; - for (size_t i = 0; i < table_meta->column_metadata.size(); ++i) { - add_default_name(table_meta->column_metadata[i], "_col" + std::to_string(i)); - } + size_type constexpr BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DecimalType); + + rmm::device_uvector<__int128_t> d128_buffer(column.size(), stream); + + thrust::for_each(rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(column.size()), + [in = column.begin(), + out = reinterpret_cast(d128_buffer.data()), + BIT_WIDTH_RATIO] __device__(auto in_idx) { + auto const out_idx = in_idx * BIT_WIDTH_RATIO; + // The lowest order bits are the value, the remainder + // simply matches the sign bit to satisfy the two's + // complement integer representation of negative numbers. + out[out_idx] = in[in_idx]; +#pragma unroll BIT_WIDTH_RATIO - 1 + for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) { + out[out_idx + i] = in[in_idx] < 0 ? -1 : 0; + } + }); + + return d128_buffer; +} + +/** + * @brief Function to convert decimal32 and decimal64 columns to decimal128 data, + * update the input table metadata, and return a new vector of column views. + * + * @param[in,out] table_meta The table metadata + * @param[in,out] d128_vectors Vector containing the computed decimal128 data buffers. + * @param input The input table + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return A device vector containing the converted decimal128 data + */ +std::vector convert_decimal_columns_and_metadata( + table_input_metadata& table_meta, + std::vector>& d128_vectors, + table_view const& table, + rmm::cuda_stream_view stream) +{ + // Lambda function to convert each decimal32/decimal64 column to decimal128. + std::function convert_column = + [&](column_view column, column_in_metadata& metadata) -> column_view { + // Vector of passable-by-reference children column views + std::vector converted_children; + + // Process children column views first + std::transform( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(column.num_children()), + std::back_inserter(converted_children), + [&](auto const idx) { return convert_column(column.child(idx), metadata.child(idx)); }); + + // Process this column view. Only convert if decimal32 and decimal64 column. 
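// A host-side sketch of the two's-complement widening rule implemented by
// convert_data_to_decimal128 above: the low-order word carries the value and
// every higher word is filled with the sign (all ones for negative values,
// zeros otherwise), assuming little-endian word order within the 128-bit
// result. widen_to_decimal128 is an illustrative name only.
#include <cstdint>

inline void widen_to_decimal128(int32_t value, int32_t out_words[4])
{
  out_words[0] = value;  // lowest-order 32 bits hold the decimal32 value
  for (int i = 1; i < 4; ++i) {
    out_words[i] = value < 0 ? -1 : 0;  // sign fill for the remaining 96 bits
  }
}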
+ switch (column.type().id()) { + case type_id::DECIMAL32: + // Convert data to decimal128 type + d128_vectors.emplace_back(convert_data_to_decimal128(column, stream)); + // Update metadata + metadata.set_decimal_precision(MAX_DECIMAL32_PRECISION); + metadata.set_type_length(size_of(data_type{type_id::DECIMAL128, column.type().scale()})); + // Create a new column view from the d128 data vector + return {data_type{type_id::DECIMAL128, column.type().scale()}, + column.size(), + d128_vectors.back().data(), + column.null_mask(), + column.null_count(), + column.offset(), + converted_children}; + case type_id::DECIMAL64: + // Convert data to decimal128 type + d128_vectors.emplace_back(convert_data_to_decimal128(column, stream)); + // Update metadata + metadata.set_decimal_precision(MAX_DECIMAL64_PRECISION); + metadata.set_type_length(size_of(data_type{type_id::DECIMAL128, column.type().scale()})); + // Create a new column view from the d128 data vector + return {data_type{type_id::DECIMAL128, column.type().scale()}, + column.size(), + d128_vectors.back().data(), + column.null_mask(), + column.null_count(), + column.offset(), + converted_children}; + default: + // Update the children vector keeping everything else the same + return {column.type(), + column.size(), + column.head(), + column.null_mask(), + column.null_count(), + column.offset(), + converted_children}; + } + }; + + // Vector of converted column views + std::vector converted_column_views; + + // Convert each column view + std::transform( + thrust::make_zip_iterator( + thrust::make_tuple(table.begin(), table_meta.column_metadata.begin())), + thrust::make_zip_iterator(thrust::make_tuple(table.end(), table_meta.column_metadata.end())), + std::back_inserter(converted_column_views), + [&](auto elem) { return convert_column(thrust::get<0>(elem), thrust::get<1>(elem)); }); + + return converted_column_views; } /** @@ -1698,12 +1775,22 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, bool int96_timestamps, bool utc_timestamps, bool write_v2_headers, + bool write_arrow_schema, host_span const> out_sink, rmm::cuda_stream_view stream) { - auto vec = table_to_linked_columns(input); - auto schema_tree = - construct_schema_tree(vec, table_meta, write_mode, int96_timestamps, utc_timestamps); + // Container to store decimal128 converted data if needed + std::vector> d128_vectors; + + // Convert decimal32/decimal64 data to decimal128 if writing arrow schema + // and initialize LinkedColVector + auto vec = table_to_linked_columns( + (write_arrow_schema) + ? table_view({convert_decimal_columns_and_metadata(table_meta, d128_vectors, input, stream)}) + : input); + + auto schema_tree = construct_parquet_schema_tree( + vec, table_meta, write_mode, int96_timestamps, utc_timestamps, write_arrow_schema); // Construct parquet_column_views from the schema tree leaf nodes. std::vector parquet_columns; @@ -1826,7 +1913,14 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, std::unique_ptr agg_meta; if (!curr_agg_meta) { agg_meta = std::make_unique( - partitions, kv_meta, this_table_schema, num_columns, stats_granularity); + partitions, + kv_meta, + this_table_schema, + num_columns, + stats_granularity, + (write_arrow_schema) + ? 
construct_arrow_schema_ipc_message(vec, table_meta, write_mode, utc_timestamps) + : ""); } else { agg_meta = std::make_unique(*curr_agg_meta); @@ -2307,6 +2401,7 @@ writer::impl::impl(std::vector> sinks, _int96_timestamps(options.is_enabled_int96_timestamps()), _utc_timestamps(options.is_enabled_utc_timestamps()), _write_v2_headers(options.is_enabled_write_v2_headers()), + _write_arrow_schema(options.is_enabled_write_arrow_schema()), _sorting_columns(options.get_sorting_columns()), _column_index_truncate_length(options.get_column_index_truncate_length()), _kv_meta(options.get_key_value_metadata()), @@ -2337,6 +2432,7 @@ writer::impl::impl(std::vector> sinks, _int96_timestamps(options.is_enabled_int96_timestamps()), _utc_timestamps(options.is_enabled_utc_timestamps()), _write_v2_headers(options.is_enabled_write_v2_headers()), + _write_arrow_schema(options.is_enabled_write_arrow_schema()), _sorting_columns(options.get_sorting_columns()), _column_index_truncate_length(options.get_column_index_truncate_length()), _kv_meta(options.get_key_value_metadata()), @@ -2378,7 +2474,7 @@ void writer::impl::write(table_view const& input, std::vector co CUDF_EXPECTS(not _closed, "Data has already been flushed to out and closed"); if (not _table_meta) { _table_meta = std::make_unique(input); } - fill_table_meta(_table_meta); + fill_table_meta(*_table_meta); // All kinds of memory allocation and data compressions/encoding are performed here. // If any error occurs, such as out-of-memory exception, the internal state of the current @@ -2415,6 +2511,7 @@ void writer::impl::write(table_view const& input, std::vector co _int96_timestamps, _utc_timestamps, _write_v2_headers, + _write_arrow_schema, _out_sink, _stream); } catch (...) { // catch any exception type diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 784f78f06d5..63128faf993 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -156,6 +156,7 @@ class writer::impl { bool const _int96_timestamps; bool const _utc_timestamps; bool const _write_v2_headers; + bool const _write_arrow_schema; std::optional> _sorting_columns; int32_t const _column_index_truncate_length; std::vector> const _kv_meta; // Optional user metadata. diff --git a/cpp/src/io/parquet/writer_impl_helpers.cpp b/cpp/src/io/parquet/writer_impl_helpers.cpp new file mode 100644 index 00000000000..e2f09f872d3 --- /dev/null +++ b/cpp/src/io/parquet/writer_impl_helpers.cpp @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file writer_impl_helpers.cpp + * @brief Helper function implementation for Parquet writer + */ + +#include "writer_impl_helpers.hpp" + +#include +#include +#include +#include + +namespace cudf::io::parquet::detail { + +using namespace cudf::io::detail; + +Compression to_parquet_compression(compression_type compression) +{ + switch (compression) { + case compression_type::AUTO: + case compression_type::SNAPPY: return Compression::SNAPPY; + case compression_type::ZSTD: return Compression::ZSTD; + case compression_type::LZ4: + // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 + return Compression::LZ4_RAW; + case compression_type::NONE: return Compression::UNCOMPRESSED; + default: CUDF_FAIL("Unsupported compression type"); + } +} + +nvcomp::compression_type to_nvcomp_compression_type(Compression codec) +{ + switch (codec) { + case Compression::SNAPPY: return nvcomp::compression_type::SNAPPY; + case Compression::ZSTD: return nvcomp::compression_type::ZSTD; + // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 + case Compression::LZ4_RAW: return nvcomp::compression_type::LZ4; + default: CUDF_FAIL("Unsupported compression type"); + } +} + +uint32_t page_alignment(Compression codec) +{ + if (codec == Compression::UNCOMPRESSED or + nvcomp::is_compression_disabled(to_nvcomp_compression_type(codec))) { + return 1u; + } + + return 1u << nvcomp::compress_input_alignment_bits(to_nvcomp_compression_type(codec)); +} + +size_t max_compression_output_size(Compression codec, uint32_t compression_blocksize) +{ + if (codec == Compression::UNCOMPRESSED) return 0; + + return compress_max_output_chunk_size(to_nvcomp_compression_type(codec), compression_blocksize); +} + +void fill_table_meta(table_input_metadata& table_meta) +{ + // Fill unnamed columns' names in table_meta + std::function add_default_name = + [&](column_in_metadata& col_meta, std::string default_name) { + if (col_meta.get_name().empty()) col_meta.set_name(default_name); + for (size_type i = 0; i < col_meta.num_children(); ++i) { + add_default_name(col_meta.child(i), col_meta.get_name() + "_" + std::to_string(i)); + } + }; + for (size_t i = 0; i < table_meta.column_metadata.size(); ++i) { + add_default_name(table_meta.column_metadata[i], "_col" + std::to_string(i)); + } +} + +[[nodiscard]] size_t column_size(column_view const& column, rmm::cuda_stream_view stream) +{ + if (column.is_empty()) { return 0; } + + if (is_fixed_width(column.type())) { + return size_of(column.type()) * column.size(); + } else if (column.type().id() == type_id::STRING) { + auto const scol = strings_column_view(column); + return cudf::strings::detail::get_offset_value( + scol.offsets(), column.size() + column.offset(), stream) - + cudf::strings::detail::get_offset_value(scol.offsets(), column.offset(), stream); + } else if (column.type().id() == type_id::STRUCT) { + auto const scol = structs_column_view(column); + size_t ret = 0; + for (int i = 0; i < scol.num_children(); i++) { + ret += column_size(scol.get_sliced_child(i, stream), stream); + } + return ret; + } else if (column.type().id() == type_id::LIST) { + auto const lcol = lists_column_view(column); + return column_size(lcol.get_sliced_child(stream), stream); + } + + CUDF_FAIL("Unexpected compound type"); +} + +[[nodiscard]] bool is_output_column_nullable(cudf::detail::LinkedColPtr const& column, + column_in_metadata const& column_metadata, + single_write_mode write_mode) +{ + if (column_metadata.is_nullability_defined()) { + 
CUDF_EXPECTS(column_metadata.nullable() or column->null_count() == 0, + "Mismatch in metadata prescribed nullability and input column. " + "Metadata for input column with nulls cannot prescribe nullability = false"); + return column_metadata.nullable(); + } + // For chunked write, when not provided nullability, we assume the worst case scenario + // that all columns are nullable. + return write_mode == single_write_mode::NO or column->nullable(); +} + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/writer_impl_helpers.hpp b/cpp/src/io/parquet/writer_impl_helpers.hpp new file mode 100644 index 00000000000..a85411594e9 --- /dev/null +++ b/cpp/src/io/parquet/writer_impl_helpers.hpp @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file writer_impl_helpers.hpp + * @brief Helper function implementation for Parquet writer + */ + +#pragma once +#include "io/comp/nvcomp_adapter.hpp" +#include "parquet_common.hpp" + +#include +#include + +namespace cudf::io::parquet::detail { + +/** + * @brief Function that translates GDF compression to parquet compression. + * + * @param compression The compression type + * @return The supported Parquet compression + */ +Compression to_parquet_compression(compression_type compression); + +/** + * @brief Function that translates the given compression codec to nvcomp compression type. + * + * @param codec Compression codec + * @return Translated nvcomp compression type + */ +nvcomp::compression_type to_nvcomp_compression_type(Compression codec); + +/** + * @brief Function that computes input alignment requirements for the given compression type. + * + * @param codec Compression codec + * @return Required alignment + */ +uint32_t page_alignment(Compression codec); + +/** + * @brief Gets the maximum compressed chunk size for the largest chunk uncompressed chunk in the + * batch. + * + * @param codec Compression codec + * @param compression_blocksize Size of the largest uncompressed chunk in the batch + * @return Maximum compressed chunk size + */ +size_t max_compression_output_size(Compression codec, uint32_t compression_blocksize); + +/** + * @brief Fill the table metadata with default column names. + * + * @param table_meta The table metadata to fill + */ +void fill_table_meta(table_input_metadata& table_meta); + +/** + * @brief Compute size (in bytes) of the data stored in the given column. + * + * @param column The input column + * @param stream CUDA stream used for device memory operations and kernel launches + * @return The data size of the input + */ +[[nodiscard]] size_t column_size(column_view const& column, rmm::cuda_stream_view stream); + +/** + * @brief Indicates if the column should be marked as nullable in the output schema + * + * Returns `true` if the input column is nullable or if the write mode is not set to + * write the table all at once instead of chunked. 
+ * + * @param column A view of the (linked) column + * @param column_metadata Metadata of the column + * @param write_mode Flag to indicate that we are guaranteeing a single table write + * + * @return Whether the column is nullable. + */ +[[nodiscard]] bool is_output_column_nullable(cudf::detail::LinkedColPtr const& column, + column_in_metadata const& column_metadata, + ::cudf::io::detail::single_write_mode write_mode); + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/text/bgzip_data_chunk_source.cu b/cpp/src/io/text/bgzip_data_chunk_source.cu index 0e3ce779089..badcd3f58f9 100644 --- a/cpp/src/io/text/bgzip_data_chunk_source.cu +++ b/cpp/src/io/text/bgzip_data_chunk_source.cu @@ -16,12 +16,12 @@ #include "io/comp/nvcomp_adapter.hpp" #include "io/text/device_data_chunks.hpp" -#include "io/utilities/config_utils.hpp" #include #include #include #include +#include #include #include #include diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 51dc0ca90af..be2e2b9a79c 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -55,6 +55,8 @@ #include #include +namespace cudf::io::text { +namespace detail { namespace { using cudf::io::text::detail::multistate; @@ -299,11 +301,6 @@ CUDF_KERNEL __launch_bounds__(THREADS_PER_TILE) void byte_split_kernel( } // namespace -namespace cudf { -namespace io { -namespace text { -namespace detail { - std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, std::string const& delimiter, byte_range_info byte_range, @@ -336,173 +333,181 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source CUDF_EXPECTS(delimiter.size() < multistate::max_segment_value, "delimiter contains too many total tokens to produce a deterministic result."); - auto const concurrency = 2; - - // must be at least 32 when using warp-reduce on partials - // must be at least 1 more than max possible concurrent tiles - // best when at least 32 more than max possible concurrent tiles, due to rolling `invalid`s - auto num_tile_states = std::max(32, TILES_PER_CHUNK * concurrency + 32); - auto tile_multistates = - scan_tile_state(num_tile_states, stream, rmm::mr::get_current_device_resource()); - auto tile_offsets = - scan_tile_state(num_tile_states, stream, rmm::mr::get_current_device_resource()); - - multibyte_split_init_kernel<<>>( // - -TILES_PER_CHUNK, - TILES_PER_CHUNK, - tile_multistates, - tile_offsets, - cudf::io::text::detail::scan_tile_status::oob); - - auto multistate_seed = multistate(); - multistate_seed.enqueue(0, 0); // this represents the first state in the pattern. - - // Seeding the tile state with an identity value allows the 0th tile to follow the same logic as - // the Nth tile, assuming it can look up an inclusive prefix. Without this seed, the 0th block - // would have to follow separate logic. - cudf::detail::device_single_thread( - [tm = scan_tile_state_view(tile_multistates), - to = scan_tile_state_view(tile_offsets), - multistate_seed] __device__() mutable { - tm.set_inclusive_prefix(-1, multistate_seed); - to.set_inclusive_prefix(-1, 0); - }, - stream); - - auto reader = source.create_reader(); - auto chunk_offset = std::max(0, byte_range.offset() - delimiter.size()); - auto const byte_range_end = byte_range.offset() + byte_range.size(); - reader->skip_bytes(chunk_offset); - // amortize output chunk allocations over 8 worst-case outputs. 
This limits the overallocation - constexpr auto max_growth = 8; - output_builder row_offset_storage(ITEMS_PER_CHUNK, max_growth, stream); - output_builder char_storage(ITEMS_PER_CHUNK, max_growth, stream); - - auto streams = cudf::detail::fork_streams(stream, concurrency); - - cudaEvent_t last_launch_event; - CUDF_CUDA_TRY(cudaEventCreate(&last_launch_event)); - - auto& read_stream = streams[0]; - auto& scan_stream = streams[1]; - auto chunk = reader->get_next_chunk(ITEMS_PER_CHUNK, read_stream); - int64_t base_tile_idx = 0; + auto chunk_offset = std::max(0, byte_range.offset() - delimiter.size()); std::optional first_row_offset; - std::optional last_row_offset; - bool found_last_offset = false; if (byte_range.offset() == 0) { first_row_offset = 0; } - std::swap(read_stream, scan_stream); - - while (chunk->size() > 0) { - // if we found the last delimiter, or didn't find delimiters inside the byte range at all: abort - if (last_row_offset.has_value() or - (not first_row_offset.has_value() and chunk_offset >= byte_range_end)) { - break; - } - - auto tiles_in_launch = - cudf::util::div_rounding_up_safe(chunk->size(), static_cast(ITEMS_PER_TILE)); - - auto row_offsets = row_offset_storage.next_output(scan_stream); + std::optional last_row_offset; - // reset the next chunk of tile state - multibyte_split_init_kernel<<(num_tile_states, stream, rmm::mr::get_current_device_resource()); + auto tile_offsets = scan_tile_state( + num_tile_states, stream, rmm::mr::get_current_device_resource()); + + multibyte_split_init_kernel<<>>( // - base_tile_idx, - tiles_in_launch, + stream.value()>>>( // + -TILES_PER_CHUNK, + TILES_PER_CHUNK, tile_multistates, - tile_offsets); + tile_offsets, + cudf::io::text::detail::scan_tile_status::oob); - CUDF_CUDA_TRY(cudaStreamWaitEvent(scan_stream.value(), last_launch_event)); + auto multistate_seed = multistate(); + multistate_seed.enqueue(0, 0); // this represents the first state in the pattern. - if (delimiter.size() == 1) { - // the single-byte case allows for a much more efficient kernel, so we special-case it - byte_split_kernel<<>>( // - base_tile_idx, - chunk_offset, - row_offset_storage.size(), - tile_offsets, - delimiter[0], - *chunk, - row_offsets); - } else { - multibyte_split_kernel<<>>( // + // Seeding the tile state with an identity value allows the 0th tile to follow the same logic as + // the Nth tile, assuming it can look up an inclusive prefix. Without this seed, the 0th block + // would have to follow separate logic. + cudf::detail::device_single_thread( + [tm = scan_tile_state_view(tile_multistates), + to = scan_tile_state_view(tile_offsets), + multistate_seed] __device__() mutable { + tm.set_inclusive_prefix(-1, multistate_seed); + to.set_inclusive_prefix(-1, 0); + }, + stream); + + auto reader = source.create_reader(); + auto const byte_range_end = byte_range.offset() + byte_range.size(); + reader->skip_bytes(chunk_offset); + // amortize output chunk allocations over 8 worst-case outputs. 
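// The chunk loop being reorganized here overlaps I/O and decoding with two
// streams: one stream reads the next chunk while the other scans the current
// one, and an event serializes successive scan launches. A stripped-down
// sketch of that double-buffering shape follows; names are placeholders and
// error checking plus the real kernel launches are omitted.
#include <cuda_runtime.h>
#include <utility>

inline void pipelined_scan_sketch()
{
  cudaStream_t read_stream{};
  cudaStream_t scan_stream{};
  cudaStreamCreate(&read_stream);
  cudaStreamCreate(&scan_stream);
  cudaEvent_t last_launch{};
  cudaEventCreate(&last_launch);

  bool have_chunk = true;  // stands in for "chunk->size() > 0"
  while (have_chunk) {
    cudaStreamWaitEvent(scan_stream, last_launch);  // order this scan after the previous one
    // ... launch scan kernels for the current chunk on scan_stream ...
    // ... kick off the read of the next chunk on read_stream ...
    cudaEventRecord(last_launch, scan_stream);
    std::swap(read_stream, scan_stream);  // the two streams alternate roles each iteration
    have_chunk = false;                   // placeholder termination
  }

  cudaEventDestroy(last_launch);
  cudaStreamDestroy(read_stream);
  cudaStreamDestroy(scan_stream);
}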
This limits the overallocation + constexpr auto max_growth = 8; + output_builder row_offset_storage(ITEMS_PER_CHUNK, max_growth, stream); + output_builder char_storage(ITEMS_PER_CHUNK, max_growth, stream); + + auto streams = cudf::detail::fork_streams(stream, concurrency); + + cudaEvent_t last_launch_event; + CUDF_CUDA_TRY(cudaEventCreate(&last_launch_event)); + + auto& read_stream = streams[0]; + auto& scan_stream = streams[1]; + auto chunk = reader->get_next_chunk(ITEMS_PER_CHUNK, read_stream); + int64_t base_tile_idx = 0; + bool found_last_offset = false; + std::swap(read_stream, scan_stream); + + while (chunk->size() > 0) { + // if we found the last delimiter, or didn't find delimiters inside the byte range at all: + // abort + if (last_row_offset.has_value() or + (not first_row_offset.has_value() and chunk_offset >= byte_range_end)) { + break; + } + + auto tiles_in_launch = + cudf::util::div_rounding_up_safe(chunk->size(), static_cast(ITEMS_PER_TILE)); + + auto row_offsets = row_offset_storage.next_output(scan_stream); + + // reset the next chunk of tile state + multibyte_split_init_kernel<<>>( // base_tile_idx, - chunk_offset, - row_offset_storage.size(), + tiles_in_launch, tile_multistates, - tile_offsets, - {device_delim.data(), static_cast(device_delim.size())}, - *chunk, - row_offsets); - } + tile_offsets); + + CUDF_CUDA_TRY(cudaStreamWaitEvent(scan_stream.value(), last_launch_event)); + + if (delimiter.size() == 1) { + // the single-byte case allows for a much more efficient kernel, so we special-case it + byte_split_kernel<<>>( // + base_tile_idx, + chunk_offset, + row_offset_storage.size(), + tile_offsets, + delimiter[0], + *chunk, + row_offsets); + } else { + multibyte_split_kernel<<>>( // + base_tile_idx, + chunk_offset, + row_offset_storage.size(), + tile_multistates, + tile_offsets, + {device_delim.data(), static_cast(device_delim.size())}, + *chunk, + row_offsets); + } - // load the next chunk - auto next_chunk = reader->get_next_chunk(ITEMS_PER_CHUNK, read_stream); - // while that is running, determine how many offsets we output (synchronizes) - auto const new_offsets = [&] { - auto const new_offsets_unclamped = - tile_offsets.get_inclusive_prefix(base_tile_idx + tiles_in_launch - 1, scan_stream) - - static_cast(row_offset_storage.size()); - // if we are not in the last chunk, we can use all offsets - if (chunk_offset + static_cast(chunk->size()) < byte_range_end) { - return new_offsets_unclamped; + // load the next chunk + auto next_chunk = reader->get_next_chunk(ITEMS_PER_CHUNK, read_stream); + // while that is running, determine how many offsets we output (synchronizes) + auto const new_offsets = [&] { + auto const new_offsets_unclamped = + tile_offsets.get_inclusive_prefix(base_tile_idx + tiles_in_launch - 1, scan_stream) - + static_cast(row_offset_storage.size()); + // if we are not in the last chunk, we can use all offsets + if (chunk_offset + static_cast(chunk->size()) < byte_range_end) { + return new_offsets_unclamped; + } + // if we are in the last chunk, we need to find the first out-of-bounds offset + auto const it = thrust::make_counting_iterator(output_offset{}); + auto const end_loc = + *thrust::find_if(rmm::exec_policy_nosync(scan_stream), + it, + it + new_offsets_unclamped, + [row_offsets, byte_range_end] __device__(output_offset i) { + return row_offsets[i] >= byte_range_end; + }); + // if we had no out-of-bounds offset, we copy all offsets + if (end_loc == new_offsets_unclamped) { return end_loc; } + // otherwise we copy only up to (including) the first 
out-of-bounds delimiter + found_last_offset = true; + return end_loc + 1; + }(); + row_offset_storage.advance_output(new_offsets, scan_stream); + // determine if we found the first or last field offset for the byte range + if (new_offsets > 0 and not first_row_offset) { + first_row_offset = row_offset_storage.front_element(scan_stream); + } + if (found_last_offset) { last_row_offset = row_offset_storage.back_element(scan_stream); } + // copy over the characters we need, if we already encountered the first field delimiter + if (first_row_offset.has_value()) { + auto const begin = + chunk->data() + std::max(0, *first_row_offset - chunk_offset); + auto const sentinel = last_row_offset.value_or(std::numeric_limits::max()); + auto const end = + chunk->data() + std::min(sentinel - chunk_offset, chunk->size()); + auto const output_size = end - begin; + auto char_output = char_storage.next_output(scan_stream); + thrust::copy(rmm::exec_policy_nosync(scan_stream), begin, end, char_output.begin()); + char_storage.advance_output(output_size, scan_stream); } - // if we are in the last chunk, we need to find the first out-of-bounds offset - auto const it = thrust::make_counting_iterator(output_offset{}); - auto const end_loc = - *thrust::find_if(rmm::exec_policy_nosync(scan_stream), - it, - it + new_offsets_unclamped, - [row_offsets, byte_range_end] __device__(output_offset i) { - return row_offsets[i] >= byte_range_end; - }); - // if we had no out-of-bounds offset, we copy all offsets - if (end_loc == new_offsets_unclamped) { return end_loc; } - // otherwise we copy only up to (including) the first out-of-bounds delimiter - found_last_offset = true; - return end_loc + 1; - }(); - row_offset_storage.advance_output(new_offsets, scan_stream); - // determine if we found the first or last field offset for the byte range - if (new_offsets > 0 and not first_row_offset) { - first_row_offset = row_offset_storage.front_element(scan_stream); - } - if (found_last_offset) { last_row_offset = row_offset_storage.back_element(scan_stream); } - // copy over the characters we need, if we already encountered the first field delimiter - if (first_row_offset.has_value()) { - auto const begin = chunk->data() + std::max(0, *first_row_offset - chunk_offset); - auto const sentinel = last_row_offset.value_or(std::numeric_limits::max()); - auto const end = - chunk->data() + std::min(sentinel - chunk_offset, chunk->size()); - auto const output_size = end - begin; - auto char_output = char_storage.next_output(scan_stream); - thrust::copy(rmm::exec_policy_nosync(scan_stream), begin, end, char_output.begin()); - char_storage.advance_output(output_size, scan_stream); - } - CUDF_CUDA_TRY(cudaEventRecord(last_launch_event, scan_stream.value())); + CUDF_CUDA_TRY(cudaEventRecord(last_launch_event, scan_stream.value())); - std::swap(read_stream, scan_stream); - base_tile_idx += tiles_in_launch; - chunk_offset += chunk->size(); - chunk = std::move(next_chunk); - } + std::swap(read_stream, scan_stream); + base_tile_idx += tiles_in_launch; + chunk_offset += chunk->size(); + chunk = std::move(next_chunk); + } + + CUDF_CUDA_TRY(cudaEventDestroy(last_launch_event)); - CUDF_CUDA_TRY(cudaEventDestroy(last_launch_event)); + cudf::detail::join_streams(streams, stream); - cudf::detail::join_streams(streams, stream); + auto chars = char_storage.gather(stream, mr); + auto global_offsets = row_offset_storage.gather(stream, mr); + return std::pair{std::move(global_offsets), std::move(chars)}; + }(); // if the input was empty, we didn't find a delimiter 
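// The scan loop above is now wrapped in an immediately invoked lambda so that
// its temporaries (streams, tile state, output builders) are destroyed before
// the column is assembled, and only the gathered offsets and characters
// escape. A generic sketch of that pattern, with illustrative names only:
#include <string>
#include <utility>
#include <vector>

inline std::pair<std::vector<int>, std::string> produce()
{
  auto [offsets, chars] = [] {
    std::vector<int> local_offsets{0, 3};  // temporaries live only inside the lambda
    std::string local_chars = "abc";
    return std::pair{std::move(local_offsets), std::move(local_chars)};
  }();
  // only the returned values are visible from here on
  return {std::move(offsets), std::move(chars)};
}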
at all, // or the first delimiter was also the last: empty output @@ -511,9 +516,6 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source return make_empty_column(type_id::STRING); } - auto chars = char_storage.gather(stream, mr); - auto global_offsets = row_offset_storage.gather(stream, mr); - // insert an offset at the beginning if we started at the beginning of the input bool const insert_begin = first_row_offset.value_or(0) == 0; // insert an offset at the end if we have not terminated the last row @@ -591,6 +593,4 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source return result; } -} // namespace text -} // namespace io -} // namespace cudf +} // namespace cudf::io::text diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp index 20ac89b4d53..a3afbd52896 100644 --- a/cpp/src/io/utilities/config_utils.cpp +++ b/cpp/src/io/utilities/config_utils.cpp @@ -14,14 +14,16 @@ * limitations under the License. */ -#include "config_utils.hpp" +#include "getenv_or.hpp" +#include #include #include +#include #include -namespace cudf::io::detail { +namespace cudf::io { namespace cufile_integration { @@ -80,4 +82,4 @@ bool is_stable_enabled() { return is_all_enabled() or get_env_policy() == usage_ } // namespace nvcomp_integration -} // namespace cudf::io::detail +} // namespace cudf::io diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index a6cbbcd84a6..1dbb9369115 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -15,8 +15,9 @@ */ #include "file_io_utilities.hpp" -#include "io/utilities/config_utils.hpp" +#include +#include #include #include @@ -40,7 +41,7 @@ class file_sink : public data_sink { _output_stream.open(filepath, std::ios::out | std::ios::binary | std::ios::trunc); if (!_output_stream.is_open()) { detail::throw_on_file_open_failure(filepath, true); } - if (detail::cufile_integration::is_kvikio_enabled()) { + if (cufile_integration::is_kvikio_enabled()) { _kvikio_file = kvikio::FileHandle(filepath, "w"); CUDF_LOG_INFO("Writing a file using kvikIO, with compatibility mode {}.", _kvikio_file.is_compat_mode_on() ? "on" : "off"); diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index ca8932322bf..91be154e09d 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -15,9 +15,10 @@ */ #include "file_io_utilities.hpp" -#include "io/utilities/config_utils.hpp" +#include #include +#include #include #include #include @@ -44,7 +45,7 @@ class file_source : public datasource { explicit file_source(char const* filepath) : _file(filepath, O_RDONLY) { detail::force_init_cuda_context(); - if (detail::cufile_integration::is_kvikio_enabled()) { + if (cufile_integration::is_kvikio_enabled()) { _kvikio_file = kvikio::FileHandle(filepath); CUDF_LOG_INFO("Reading a file using kvikIO, with compatibility mode {}.", _kvikio_file.is_compat_mode_on() ? 
"on" : "off"); @@ -216,7 +217,7 @@ class memory_mapped_source : public file_source { void map(int fd, size_t offset, size_t size) { - CUDF_EXPECTS(offset < _file.size(), "Offset is past end of file"); + CUDF_EXPECTS(offset < _file.size(), "Offset is past end of file", std::overflow_error); // Offset for `mmap()` must be page aligned _map_offset = offset & ~(sysconf(_SC_PAGESIZE) - 1); @@ -433,7 +434,7 @@ std::unique_ptr datasource::create(std::string const& filepath, size_t size) { #ifdef CUFILE_FOUND - if (detail::cufile_integration::is_always_enabled()) { + if (cufile_integration::is_always_enabled()) { // avoid mmap as GDS is expected to be used for most reads return std::make_unique(filepath.c_str()); } diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index a9d4f19c848..d7b54399f8d 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -16,9 +16,11 @@ #include "file_io_utilities.hpp" -#include "io/utilities/config_utils.hpp" +#include "getenv_or.hpp" #include +#include +#include #include @@ -221,7 +223,6 @@ cufile_input_impl::cufile_input_impl(std::string const& filepath) // The benefit from multithreaded read plateaus around 16 threads pool(getenv_or("LIBCUDF_CUFILE_THREAD_COUNT", 16)) { - pool.sleep_duration = 10; } namespace { @@ -230,14 +231,15 @@ template > std::vector> make_sliced_tasks( - F function, DataT* ptr, size_t offset, size_t size, cudf::detail::thread_pool& pool) + F function, DataT* ptr, size_t offset, size_t size, BS::thread_pool& pool) { constexpr size_t default_max_slice_size = 4 * 1024 * 1024; static auto const max_slice_size = getenv_or("LIBCUDF_CUFILE_SLICE_SIZE", default_max_slice_size); auto const slices = make_file_io_slices(size, max_slice_size); std::vector> slice_tasks; std::transform(slices.cbegin(), slices.cend(), std::back_inserter(slice_tasks), [&](auto& slice) { - return pool.submit(function, ptr + slice.offset, slice.size, offset + slice.offset); + return pool.submit_task( + [&] { return function(ptr + slice.offset, slice.size, offset + slice.offset); }); }); return slice_tasks; } diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index 91ef41fba6e..441bede200d 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -19,8 +19,7 @@ #ifdef CUFILE_FOUND #include -#include - +#include #include #endif @@ -150,7 +149,7 @@ class cufile_input_impl final : public cufile_input { private: cufile_shim const* shim = nullptr; cufile_registered_file const cf_file; - cudf::detail::thread_pool pool; + BS::thread_pool pool; }; /** @@ -167,7 +166,7 @@ class cufile_output_impl final : public cufile_output { private: cufile_shim const* shim = nullptr; cufile_registered_file const cf_file; - cudf::detail::thread_pool pool; + BS::thread_pool pool; }; #else diff --git a/cpp/src/io/utilities/config_utils.hpp b/cpp/src/io/utilities/getenv_or.hpp similarity index 63% rename from cpp/src/io/utilities/config_utils.hpp rename to cpp/src/io/utilities/getenv_or.hpp index 74df1375e6f..3fd97a00b61 100644 --- a/cpp/src/io/utilities/config_utils.hpp +++ b/cpp/src/io/utilities/getenv_or.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -13,15 +13,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + #pragma once #include +#include #include #include -namespace cudf::io::detail { - +namespace { /** * @brief Returns the value of the environment variable, or a default value if the variable is not * present. @@ -45,37 +46,4 @@ T getenv_or(std::string_view env_var_name, T default_val) return converted_val; } -namespace cufile_integration { - -/** - * @brief Returns true if cuFile and its compatibility mode are enabled. - */ -bool is_always_enabled(); - -/** - * @brief Returns true if only direct IO through cuFile is enabled (compatibility mode is disabled). - */ -bool is_gds_enabled(); - -/** - * @brief Returns true if KvikIO is enabled. - */ -bool is_kvikio_enabled(); - -} // namespace cufile_integration - -namespace nvcomp_integration { - -/** - * @brief Returns true if all nvCOMP uses are enabled. - */ -bool is_all_enabled(); - -/** - * @brief Returns true if stable nvCOMP use is enabled. - */ -bool is_stable_enabled(); - -} // namespace nvcomp_integration - -} // namespace cudf::io::detail +} // namespace diff --git a/cpp/src/join/mixed_join_kernel.cuh b/cpp/src/join/mixed_join_kernel.cuh index 0fc1c3718b1..ea59f23c77f 100644 --- a/cpp/src/join/mixed_join_kernel.cuh +++ b/cpp/src/join/mixed_join_kernel.cuh @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -38,7 +39,7 @@ namespace cg = cooperative_groups; #pragma GCC diagnostic ignored "-Wattributes" template -__attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__ +CUDF_HIDDEN __launch_bounds__(block_size) __global__ void mixed_join(table_device_view left_table, table_device_view right_table, table_device_view probe, diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu index 01e3fe09b38..1f31eaa7878 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cu +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -34,7 +35,7 @@ namespace cg = cooperative_groups; #pragma GCC diagnostic ignored "-Wattributes" template -__attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__ +CUDF_HIDDEN __launch_bounds__(block_size) __global__ void mixed_join_semi(table_device_view left_table, table_device_view right_table, table_device_view probe, diff --git a/cpp/src/join/mixed_join_size_kernel.cuh b/cpp/src/join/mixed_join_size_kernel.cuh index 618e7a9082e..00a90f8273f 100644 --- a/cpp/src/join/mixed_join_size_kernel.cuh +++ b/cpp/src/join/mixed_join_size_kernel.cuh @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -35,20 +36,19 @@ namespace cg = cooperative_groups; #pragma GCC diagnostic ignored "-Wattributes" template -__attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__ - void compute_mixed_join_output_size( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, - ast::detail::expression_device_view device_expression_data, - bool const swap_tables, - std::size_t* output_size, - cudf::device_span matches_per_row) +CUDF_HIDDEN __launch_bounds__(block_size) __global__ void compute_mixed_join_output_size( + table_device_view left_table, + table_device_view right_table, + 
table_device_view probe, + table_device_view build, + row_hash const hash_probe, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::mixed_multimap_type::device_view hash_table_view, + ast::detail::expression_device_view device_expression_data, + bool const swap_tables, + std::size_t* output_size, + cudf::device_span matches_per_row) { // The (required) extern storage of the shared memory array leads to // conflicting declarations between different templates. The easiest diff --git a/cpp/src/search/contains_table.cu b/cpp/src/search/contains_table.cu index fbb0f6cb0f5..4fb983dc5a6 100644 --- a/cpp/src/search/contains_table.cu +++ b/cpp/src/search/contains_table.cu @@ -76,18 +76,6 @@ struct comparator_adapter { { } - // suppress "function was declared but never referenced warning" -#pragma nv_diagnostic push -#pragma nv_diag_suppress 177 - __device__ constexpr auto operator()(lhs_index_type lhs_index, - lhs_index_type rhs_index) const noexcept - { - auto const lhs = static_cast(lhs_index); - auto const rhs = static_cast(rhs_index); - - return _self_equal(lhs, rhs); - } - __device__ constexpr auto operator()(rhs_index_type lhs_index, rhs_index_type rhs_index) const noexcept { @@ -103,13 +91,6 @@ struct comparator_adapter { return _two_table_equal(lhs_index, rhs_index); } - __device__ constexpr auto operator()(rhs_index_type lhs_index, - lhs_index_type rhs_index) const noexcept - { - return _two_table_equal(lhs_index, rhs_index); - } -#pragma nv_diagnostic pop - private: SelfEqual const _self_equal; TwoTableEqual const _two_table_equal; diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index 43a3d69091a..2ca22f0e017 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -451,8 +451,8 @@ struct replace_multi_fn { while (spos < d_str.size_bytes()) { for (int tgt_idx = 0; tgt_idx < d_targets.size(); ++tgt_idx) { auto const d_tgt = d_targets.element(tgt_idx); - if ((d_tgt.size_bytes() <= (d_str.size_bytes() - spos)) && // check fit - (d_tgt.compare(in_ptr + spos, d_tgt.size_bytes()) == 0)) // and match + if (!d_tgt.empty() && (d_tgt.size_bytes() <= (d_str.size_bytes() - spos)) && // check fit + (d_tgt.compare(in_ptr + spos, d_tgt.size_bytes()) == 0)) // and match { auto const d_repl = (d_repls.size() == 1) ? 
d_repls.element(0) : d_repls.element(tgt_idx); @@ -468,9 +468,8 @@ struct replace_multi_fn { } ++spos; } - if (out_ptr) // copy remainder - { - memcpy(out_ptr, in_ptr + lpos, d_str.size_bytes() - lpos); + if (out_ptr) { + memcpy(out_ptr, in_ptr + lpos, d_str.size_bytes() - lpos); // copy remainder } else { d_sizes[idx] = bytes; } diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh index 23614ac0733..4d7096c02ca 100644 --- a/cpp/src/strings/split/split.cuh +++ b/cpp/src/strings/split/split.cuh @@ -357,6 +357,12 @@ std::pair, rmm::device_uvector> split auto const chars_bytes = get_offset_value(input.offsets(), input.offset() + strings_count, stream) - get_offset_value(input.offsets(), input.offset(), stream); + if (chars_bytes == 0) { + auto offsets = cudf::make_column_from_scalar( + numeric_scalar(0, true, stream), strings_count + 1, stream, mr); + auto tokens = rmm::device_uvector(0, stream); + return std::pair{std::move(offsets), std::move(tokens)}; + } auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); diff --git a/cpp/src/text/bpe/byte_pair_encoding.cuh b/cpp/src/text/bpe/byte_pair_encoding.cuh index 3bb574748b6..a2e441c3284 100644 --- a/cpp/src/text/bpe/byte_pair_encoding.cuh +++ b/cpp/src/text/bpe/byte_pair_encoding.cuh @@ -89,14 +89,6 @@ struct bpe_equal { return lhs == rhs; // all rows are unique } // used by find - __device__ bool operator()(cudf::size_type lhs, merge_pair_type const& rhs) const noexcept - { - lhs *= 2; - auto const left = d_strings.element(lhs); - auto const right = d_strings.element(lhs + 1); - return (left == rhs.first) && (right == rhs.second); - } - // used by find __device__ bool operator()(merge_pair_type const& lhs, cudf::size_type rhs) const noexcept { rhs *= 2; @@ -157,11 +149,6 @@ struct mp_equal { return left == right; } // used by find - __device__ bool operator()(cudf::size_type lhs, cudf::string_view const& rhs) const noexcept - { - auto const left = d_strings.element(lhs); - return left == rhs; - } __device__ bool operator()(cudf::string_view const& lhs, cudf::size_type rhs) const noexcept { auto const right = d_strings.element(rhs); diff --git a/cpp/src/text/jaccard.cu b/cpp/src/text/jaccard.cu index 9cf934165f6..e465fb79c89 100644 --- a/cpp/src/text/jaccard.cu +++ b/cpp/src/text/jaccard.cu @@ -19,16 +19,19 @@ #include #include #include +#include #include -#include +#include +#include +#include #include #include #include -#include #include #include +#include #include #include @@ -36,127 +39,375 @@ #include #include #include +#include +#include +#include #include namespace nvtext { namespace detail { namespace { +constexpr cudf::thread_index_type block_size = 256; +constexpr cudf::thread_index_type bytes_per_thread = 4; + /** * @brief Retrieve the row data (span) for the given column/row-index * - * @param d_input Input lists column + * @param values Flat vector of all values + * @param offsets Offsets identifying rows within values * @param idx Row index to retrieve * @return A device-span of the row values */ -__device__ auto get_row(cudf::column_device_view const& d_input, cudf::size_type idx) +__device__ auto get_row(uint32_t const* values, int64_t const* offsets, cudf::size_type row_idx) { - auto const offsets = - d_input.child(cudf::lists_column_view::offsets_column_index).data(); - auto const offset = offsets[idx]; - auto const size = offsets[idx + 1] - offset; - auto const begin = - d_input.child(cudf::lists_column_view::child_column_index).data() + 
offset; + auto const offset = offsets[row_idx]; + auto const size = offsets[row_idx + 1] - offset; + auto const begin = values + offset; return cudf::device_span(begin, size); } /** - * @brief Count the unique values within each row of the input column + * @brief Kernel to count the unique values within each row of the input column + * + * This is called with a warp per row. * - * This is called with a warp per row + * @param d_values Sorted hash values to count uniqueness + * @param d_offsets Offsets to each set of row elements in d_values + * @param rows Number of rows in the output + * @param d_results Number of unique values in each row */ -struct sorted_unique_fn { - cudf::column_device_view const d_input; - cudf::size_type* d_results; +CUDF_KERNEL void sorted_unique_fn(uint32_t const* d_values, + int64_t const* d_offsets, + cudf::size_type rows, + cudf::size_type* d_results) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + if (idx >= (static_cast(rows) * cudf::detail::warp_size)) { return; } - // warp per row - __device__ void operator()(cudf::size_type idx) const - { - using warp_reduce = cub::WarpReduce; - __shared__ typename warp_reduce::TempStorage temp_storage; + using warp_reduce = cub::WarpReduce; + __shared__ typename warp_reduce::TempStorage temp_storage; - auto const row_idx = idx / cudf::detail::warp_size; - auto const lane_idx = idx % cudf::detail::warp_size; - auto const row = get_row(d_input, row_idx); - auto const begin = row.begin(); + auto const row_idx = idx / cudf::detail::warp_size; + auto const lane_idx = idx % cudf::detail::warp_size; + auto const row = get_row(d_values, d_offsets, row_idx); + auto const begin = row.begin(); - cudf::size_type count = 0; - for (auto itr = begin + lane_idx; itr < row.end(); itr += cudf::detail::warp_size) { - count += (itr == begin || *itr != *(itr - 1)); - } - auto const result = warp_reduce(temp_storage).Sum(count); - if (lane_idx == 0) { d_results[row_idx] = result; } + cudf::size_type count = 0; + for (auto itr = begin + lane_idx; itr < row.end(); itr += cudf::detail::warp_size) { + count += (itr == begin || *itr != *(itr - 1)); } -}; + auto const result = warp_reduce(temp_storage).Sum(count); + if (lane_idx == 0) { d_results[row_idx] = result; } +} -rmm::device_uvector compute_unique_counts(cudf::column_view const& input, +/** + * @brief Count the unique values within each row of the input column + * + * @param values Sorted hash values to count uniqueness + * @param offsets Offsets to each set of row elements in d_values + * @param rows Number of rows in the output + * @param stream CUDA stream used for device memory operations and kernel launches + * @return Number of unique values + */ +rmm::device_uvector compute_unique_counts(uint32_t const* values, + int64_t const* offsets, + cudf::size_type rows, rmm::cuda_stream_view stream) { - auto const d_input = cudf::column_device_view::create(input, stream); - auto d_results = rmm::device_uvector(input.size(), stream); - sorted_unique_fn fn{*d_input, d_results.data()}; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::counting_iterator(0), - input.size() * cudf::detail::warp_size, - fn); + auto d_results = rmm::device_uvector(rows, stream); + auto const num_blocks = cudf::util::div_rounding_up_safe( + static_cast(rows) * cudf::detail::warp_size, block_size); + sorted_unique_fn<<>>( + values, offsets, rows, d_results.data()); return d_results; } +/** + * @brief Kernel to count the number of common values within each row of the 2 input columns + * + * This 
is called with a warp per row. + * + * @param d_values1 Sorted hash values to check against d_values2 + * @param d_offsets1 Offsets to each set of row elements in d_values1 + * @param d_values2 Sorted hash values to check against d_values1 + * @param d_offsets2 Offsets to each set of row elements in d_values2 + * @param rows Number of rows in the output + * @param d_results Number of common values in each row + */ +CUDF_KERNEL void sorted_intersect_fn(uint32_t const* d_values1, + int64_t const* d_offsets1, + uint32_t const* d_values2, + int64_t const* d_offsets2, + cudf::size_type rows, + cudf::size_type* d_results) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + if (idx >= (static_cast(rows) * cudf::detail::warp_size)) { return; } + + using warp_reduce = cub::WarpReduce; + __shared__ typename warp_reduce::TempStorage temp_storage; + + auto const row_idx = idx / cudf::detail::warp_size; + auto const lane_idx = idx % cudf::detail::warp_size; + + auto const needles = get_row(d_values1, d_offsets1, row_idx); + auto const haystack = get_row(d_values2, d_offsets2, row_idx); + + auto begin = haystack.begin(); + auto const end = haystack.end(); + + cudf::size_type count = 0; + for (auto itr = needles.begin() + lane_idx; itr < needles.end() && begin < end; + itr += cudf::detail::warp_size) { + if (itr != needles.begin() && *itr == *(itr - 1)) { continue; } // skip duplicates + // search haystack for this needle (*itr) + auto const found = thrust::lower_bound(thrust::seq, begin, end, *itr); + count += (found != end) && (*found == *itr); // increment if found; + begin = found; // shorten the next lower-bound range + } + // sum up the counts across this warp + auto const result = warp_reduce(temp_storage).Sum(count); + if (lane_idx == 0) { d_results[row_idx] = result; } +} + /** * @brief Count the number of common values within each row of the 2 input columns * - * This is called with a warp per row + * @param d_values1 Sorted hash values to check against d_values2 + * @param d_offsets1 Offsets to each set of row elements in d_values1 + * @param d_values2 Sorted hash values to check against d_values1 + * @param d_offsets2 Offsets to each set of row elements in d_values2 + * @param rows Number of rows in the output + * @param stream CUDA stream used for device memory operations and kernel launches + * @return Number of common values */ -struct sorted_intersect_fn { - cudf::column_device_view const d_input1; - cudf::column_device_view const d_input2; - cudf::size_type* d_results; +rmm::device_uvector compute_intersect_counts(uint32_t const* values1, + int64_t const* offsets1, + uint32_t const* values2, + int64_t const* offsets2, + cudf::size_type rows, + rmm::cuda_stream_view stream) +{ + auto d_results = rmm::device_uvector(rows, stream); + auto const num_blocks = cudf::util::div_rounding_up_safe( + static_cast(rows) * cudf::detail::warp_size, block_size); + sorted_intersect_fn<<>>( + values1, offsets1, values2, offsets2, rows, d_results.data()); + return d_results; +} - // warp per row - __device__ void operator()(cudf::size_type idx) const - { - using warp_reduce = cub::WarpReduce; - __shared__ typename warp_reduce::TempStorage temp_storage; +/** + * @brief Counts the number of substrings in each row of the given strings column + * + * Each warp processes a single string. + * Formula is `count = max(1, str.length() - width + 1)` + * If a string has less than width characters (but not empty), the count is 1 + * since the entire string is still hashed. 
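Note: a minimal host-side sketch of the counting rule described in this comment, count = max(1, character_count - width + 1), with empty strings counted as 0. The UTF-8 begin-byte test below is a standalone re-implementation for illustration only, not the cudf::strings::detail helper used by the kernel.

#include <algorithm>
#include <cstdio>
#include <string>

// a byte begins a UTF-8 character unless it is a continuation byte (10xxxxxx)
bool is_utf8_char_begin(unsigned char b) { return (b & 0xC0) != 0x80; }

int substring_count(std::string const& s, int width)
{
  if (s.empty()) { return 0; }
  int chars = 0;
  for (unsigned char b : s) { chars += is_utf8_char_begin(b); }
  return std::max(1, chars - width + 1);  // formula from the comment above
}

int main()
{
  std::printf("%d\n", substring_count("hello world", 5));  // 7
  std::printf("%d\n", substring_count("hi", 5));           // 1 (shorter than width)
  std::printf("%d\n", substring_count("", 5));             // 0
}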
+ * + * @param d_strings Input column of strings + * @param width Substring size in characters + * @param d_counts Output number of substring per row of input + */ +CUDF_KERNEL void count_substrings_kernel(cudf::column_device_view const d_strings, + cudf::size_type width, + int64_t* d_counts) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + if (idx >= (static_cast(d_strings.size()) * cudf::detail::warp_size)) { + return; + } - auto const row_idx = idx / cudf::detail::warp_size; - auto const lane_idx = idx % cudf::detail::warp_size; + auto const str_idx = static_cast(idx / cudf::detail::warp_size); + if (d_strings.is_null(str_idx)) { + d_counts[str_idx] = 0; + return; + } - auto const needles = get_row(d_input1, row_idx); - auto const haystack = get_row(d_input2, row_idx); + auto const d_str = d_strings.element(str_idx); + if (d_str.empty()) { + d_counts[str_idx] = 0; + return; + } - auto begin = haystack.begin(); - auto const end = haystack.end(); + using warp_reduce = cub::WarpReduce; + __shared__ typename warp_reduce::TempStorage temp_storage; - // TODO: investigate cuCollections device-side static-map to match row values + auto const end = d_str.data() + d_str.size_bytes(); + auto const lane_idx = idx % cudf::detail::warp_size; + cudf::size_type count = 0; + for (auto itr = d_str.data() + (lane_idx * bytes_per_thread); itr < end; + itr += cudf::detail::warp_size * bytes_per_thread) { + for (auto s = itr; (s < (itr + bytes_per_thread)) && (s < end); ++s) { + count += static_cast(cudf::strings::detail::is_begin_utf8_char(*s)); + } + } + auto const char_count = warp_reduce(temp_storage).Sum(count); + if (lane_idx == 0) { d_counts[str_idx] = std::max(1, char_count - width + 1); } +} + +/** + * @brief Kernel to hash the substrings for each input row + * + * Each warp processes a single string. 
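Note: sorted_unique_fn, sorted_intersect_fn, and count_substrings_kernel above all use the same warp-per-row pattern: each lane accumulates a partial count, cub::WarpReduce sums it, and lane 0 writes the result. A minimal standalone CUDA sketch of that pattern follows; the demo kernel, data, and sizes are invented for illustration and error checking is omitted.

#include <cub/cub.cuh>
#include <cuda_runtime.h>
#include <cstdio>

// one warp sums partial counts; only lane 0 holds the valid total
__global__ void warp_sum_demo(int const* values, int n, int* result)
{
  using warp_reduce = cub::WarpReduce<int>;
  __shared__ typename warp_reduce::TempStorage temp_storage;

  int count = 0;
  for (int i = threadIdx.x; i < n; i += 32) { count += values[i]; }
  int const total = warp_reduce(temp_storage).Sum(count);
  if (threadIdx.x == 0) { *result = total; }
}

int main()
{
  int h_values[64];
  for (int& v : h_values) { v = 1; }
  int *d_values = nullptr, *d_result = nullptr;
  cudaMalloc(&d_values, sizeof(h_values));
  cudaMalloc(&d_result, sizeof(int));
  cudaMemcpy(d_values, h_values, sizeof(h_values), cudaMemcpyHostToDevice);
  warp_sum_demo<<<1, 32>>>(d_values, 64, d_result);
  int h_result = 0;
  cudaMemcpy(&h_result, d_result, sizeof(int), cudaMemcpyDeviceToHost);
  std::printf("warp sum = %d\n", h_result);  // 64
  cudaFree(d_values);
  cudaFree(d_result);
}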
+ * Substrings of string "hello world" with width=4 produce: + * "hell", "ello", "llo ", "lo w", "o wo", " wor", "worl", "orld" + * Each of these substrings is hashed and the hash stored in d_results + * + * @param d_strings Input column of strings + * @param width Substring size in characters + * @param d_output_offsets Offsets into d_results + * @param d_results Hash values for each substring + */ +CUDF_KERNEL void substring_hash_kernel(cudf::column_device_view const d_strings, + cudf::size_type width, + int64_t const* d_output_offsets, + uint32_t* d_results) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + if (idx >= (static_cast(d_strings.size()) * cudf::detail::warp_size)) { + return; + } - cudf::size_type count = 0; - for (auto itr = needles.begin() + lane_idx; itr < needles.end() && begin < end; - itr += cudf::detail::warp_size) { - if (itr != needles.begin() && *itr == *(itr - 1)) { continue; } // skip duplicates - // search haystack for this needle (*itr) - auto const found = thrust::lower_bound(thrust::seq, begin, end, *itr); - count += (found != end) && (*found == *itr); // increment if found; - begin = found; // shorten the next lower-bound range + auto const str_idx = idx / cudf::detail::warp_size; + auto const lane_idx = idx % cudf::detail::warp_size; + + if (d_strings.is_null(str_idx)) { return; } + auto const d_str = d_strings.element(str_idx); + if (d_str.empty()) { return; } + + __shared__ uint32_t hvs[block_size]; // temp store for hash values + + auto const hasher = cudf::hashing::detail::MurmurHash3_x86_32{0}; + auto const end = d_str.data() + d_str.size_bytes(); + auto const warp_count = (d_str.size_bytes() / cudf::detail::warp_size) + 1; + + auto d_hashes = d_results + d_output_offsets[str_idx]; + auto itr = d_str.data() + lane_idx; + for (auto i = 0; i < warp_count; ++i) { + uint32_t hash = 0; + if (itr < end && cudf::strings::detail::is_begin_utf8_char(*itr)) { + // resolve substring + auto const sub_str = + cudf::string_view(itr, static_cast(thrust::distance(itr, end))); + auto const [bytes, left] = cudf::strings::detail::bytes_to_character_position(sub_str, width); + // hash only if we have the full width of characters or this is the beginning of the string + if ((left == 0) || (itr == d_str.data())) { hash = hasher(cudf::string_view(itr, bytes)); } } - // sum up the counts across this warp - auto const result = warp_reduce(temp_storage).Sum(count); - if (lane_idx == 0) { d_results[row_idx] = result; } + hvs[threadIdx.x] = hash; // store hash into shared memory + __syncwarp(); + if (lane_idx == 0) { + // copy valid hash values for this warp into d_hashes + auto const hashes = &hvs[threadIdx.x]; + auto const hashes_end = hashes + cudf::detail::warp_size; + d_hashes = + thrust::copy_if(thrust::seq, hashes, hashes_end, d_hashes, [](auto h) { return h != 0; }); + } + __syncwarp(); + itr += cudf::detail::warp_size; } -}; +} -rmm::device_uvector compute_intersect_counts(cudf::column_view const& input1, - cudf::column_view const& input2, - rmm::cuda_stream_view stream) +void segmented_sort(uint32_t const* input, + uint32_t* output, + int64_t items, + cudf::size_type segments, + int64_t const* offsets, + rmm::cuda_stream_view stream) { - auto const d_input1 = cudf::column_device_view::create(input1, stream); - auto const d_input2 = cudf::column_device_view::create(input2, stream); - auto d_results = rmm::device_uvector(input1.size(), stream); - sorted_intersect_fn fn{*d_input1, *d_input2, d_results.data()}; - thrust::for_each_n(rmm::exec_policy(stream), 
- thrust::counting_iterator(0), - input1.size() * cudf::detail::warp_size, - fn); - return d_results; + rmm::device_buffer temp; + std::size_t temp_bytes = 0; + cub::DeviceSegmentedSort::SortKeys( + temp.data(), temp_bytes, input, output, items, segments, offsets, offsets + 1, stream.value()); + temp = rmm::device_buffer(temp_bytes, stream); + cub::DeviceSegmentedSort::SortKeys( + temp.data(), temp_bytes, input, output, items, segments, offsets, offsets + 1, stream.value()); +} + +/** + * @brief Create hashes for each substring + * + * The hashes are sorted using a segmented-sort as setup to + * perform the unique and intersect operations. + * + * @param input Input strings column to hash + * @param width Substring width in characters + * @param stream CUDA stream used for device memory operations and kernel launches + * @return The sorted hash values and offsets to each row + */ +std::pair, rmm::device_uvector> hash_substrings( + cudf::strings_column_view const& input, cudf::size_type width, rmm::cuda_stream_view stream) +{ + auto const d_strings = cudf::column_device_view::create(input.parent(), stream); + + // count substrings + auto offsets = rmm::device_uvector(input.size() + 1, stream); + auto const num_blocks = cudf::util::div_rounding_up_safe( + static_cast(input.size()) * cudf::detail::warp_size, block_size); + count_substrings_kernel<<>>( + *d_strings, width, offsets.data()); + auto const total_hashes = + cudf::detail::sizes_to_offsets(offsets.begin(), offsets.end(), offsets.begin(), stream); + + // hash substrings + rmm::device_uvector hashes(total_hashes, stream); + substring_hash_kernel<<>>( + *d_strings, width, offsets.data(), hashes.data()); + + // sort hashes + rmm::device_uvector sorted(total_hashes, stream); + if (total_hashes < static_cast(std::numeric_limits::max())) { + segmented_sort( + hashes.begin(), sorted.begin(), sorted.size(), input.size(), offsets.begin(), stream); + } else { + // The CUB segmented sort can only handle max total values + // so this code calls it in sections. + auto const section_size = std::numeric_limits::max() / 2L; + auto const sort_sections = cudf::util::div_rounding_up_safe(total_hashes, section_size); + auto const offset_indices = [&] { + // build a set of indices that point to offsets subsections + auto sub_offsets = rmm::device_uvector(sort_sections + 1, stream); + thrust::sequence( + rmm::exec_policy(stream), sub_offsets.begin(), sub_offsets.end(), 0L, section_size); + auto indices = rmm::device_uvector(sub_offsets.size(), stream); + thrust::lower_bound(rmm::exec_policy(stream), + offsets.begin(), + offsets.end(), + sub_offsets.begin(), + sub_offsets.end(), + indices.begin()); + return cudf::detail::make_std_vector_sync(indices, stream); + }(); + + // Call segmented sort with the sort sections + for (auto i = 0L; i < sort_sections; ++i) { + auto const index1 = offset_indices[i]; + auto const index2 = std::min(offset_indices[i + 1], static_cast(offsets.size() - 1)); + auto const offset1 = offsets.element(index1, stream); + auto const offset2 = offsets.element(index2, stream); + + auto const num_items = offset2 - offset1; + auto const num_segments = index2 - index1; + + // There is a bug in the CUB segmented sort and the workaround is to + // shift the offset values so the first offset is 0. + // This transform can be removed once the bug is fixed. 
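Note: a small host-side illustration of the offset shift described in the comment above: the section-local offsets are rebased so the first one is zero before being handed to the segmented sort, which is what the device-side thrust::transform that follows does. The offset values here are made up.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
  // absolute offsets for one section of segments
  std::vector<int64_t> offsets{100, 140, 190, 260};
  std::vector<int64_t> sort_offsets(offsets.size());
  int64_t const first = offsets.front();
  // shift so the section starts at offset 0
  std::transform(offsets.begin(), offsets.end(), sort_offsets.begin(),
                 [first](int64_t o) { return o - first; });
  for (auto o : sort_offsets) { std::printf("%lld ", static_cast<long long>(o)); }  // 0 40 90 160
  std::printf("\n");
}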
+ auto sort_offsets = rmm::device_uvector(num_segments + 1, stream); + thrust::transform(rmm::exec_policy(stream), + offsets.begin() + index1, + offsets.begin() + index2 + 1, + sort_offsets.begin(), + [offset1] __device__(auto const o) { return o - offset1; }); + + segmented_sort(hashes.begin() + offset1, + sorted.begin() + offset1, + num_items, + num_segments, + sort_offsets.begin(), + stream); + } + } + return std::make_pair(std::move(sorted), std::move(offsets)); } /** @@ -186,62 +437,6 @@ struct jaccard_fn { } }; -/** - * @brief Create hashes for each substring - * - * Uses the hash_character_ngrams to hash substrings of the input column. - * This returns a lists column where each row is the hashes for the substrings - * of the corresponding input string row. - * - * The hashes are then sorted using a segmented-sort as setup to - * perform the unique and intersect operations. - */ -std::unique_ptr hash_substrings(cudf::strings_column_view const& col, - cudf::size_type width, - rmm::cuda_stream_view stream) -{ - auto hashes = hash_character_ngrams(col, width, stream, rmm::mr::get_current_device_resource()); - auto const input = cudf::lists_column_view(hashes->view()); - auto const offsets = input.offsets_begin(); - auto const data = input.child().data(); - - rmm::device_uvector sorted(input.child().size(), stream); - - // this is wicked fast and much faster than using cudf::lists::detail::sort_list - rmm::device_buffer d_temp_storage; - size_t temp_storage_bytes = 0; - cub::DeviceSegmentedSort::SortKeys(d_temp_storage.data(), - temp_storage_bytes, - data, - sorted.data(), - sorted.size(), - input.size(), - offsets, - offsets + 1, - stream.value()); - d_temp_storage = rmm::device_buffer{temp_storage_bytes, stream}; - cub::DeviceSegmentedSort::SortKeys(d_temp_storage.data(), - temp_storage_bytes, - data, - sorted.data(), - sorted.size(), - input.size(), - offsets, - offsets + 1, - stream.value()); - - auto contents = hashes->release(); - // the offsets are taken from the hashes column since they are the same - // before and after the segmented-sort - return cudf::make_lists_column( - col.size(), - std::move(contents.children.front()), - std::make_unique(std::move(sorted), rmm::device_buffer{}, 0), - 0, - rmm::device_buffer{}, - stream, - rmm::mr::get_current_device_resource()); -} } // namespace std::unique_ptr jaccard_index(cudf::strings_column_view const& input1, @@ -261,13 +456,14 @@ std::unique_ptr jaccard_index(cudf::strings_column_view const& inp auto const [d_uniques1, d_uniques2, d_intersects] = [&] { // build hashes of the substrings - auto const hash1 = hash_substrings(input1, width, stream); - auto const hash2 = hash_substrings(input2, width, stream); + auto const [hash1, offsets1] = hash_substrings(input1, width, stream); + auto const [hash2, offsets2] = hash_substrings(input2, width, stream); // compute the unique counts in each set and the intersection counts - auto d_uniques1 = compute_unique_counts(hash1->view(), stream); - auto d_uniques2 = compute_unique_counts(hash2->view(), stream); - auto d_intersects = compute_intersect_counts(hash1->view(), hash2->view(), stream); + auto d_uniques1 = compute_unique_counts(hash1.data(), offsets1.data(), input1.size(), stream); + auto d_uniques2 = compute_unique_counts(hash2.data(), offsets2.data(), input2.size(), stream); + auto d_intersects = compute_intersect_counts( + hash1.data(), offsets1.data(), hash2.data(), offsets2.data(), input1.size(), stream); return std::tuple{std::move(d_uniques1), std::move(d_uniques2), 
std::move(d_intersects)}; }(); diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu index ea09f5d17af..97abb1487d8 100644 --- a/cpp/src/text/vocabulary_tokenize.cu +++ b/cpp/src/text/vocabulary_tokenize.cu @@ -86,18 +86,10 @@ struct vocab_equal { return lhs == rhs; // all rows are expected to be unique } // used by find - // suppress "function was declared but never referenced warning" -#pragma nv_diagnostic push -#pragma nv_diag_suppress 177 - __device__ bool operator()(cudf::size_type lhs, cudf::string_view const& rhs) const noexcept - { - return d_strings.element(lhs) == rhs; - } __device__ bool operator()(cudf::string_view const& lhs, cudf::size_type rhs) const noexcept { return d_strings.element(rhs) == lhs; } -#pragma nv_diagnostic pop }; using probe_scheme = cuco::linear_probing<1, vocab_hasher>; diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu index 64427326d87..ec21813705a 100644 --- a/cpp/src/unary/cast_ops.cu +++ b/cpp/src/unary/cast_ops.cu @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -459,6 +460,14 @@ std::unique_ptr cast(column_view const& input, return type_dispatcher(input.type(), detail::dispatch_unary_cast_from{input}, type, stream, mr); } +struct is_supported_cast_impl { + template + bool operator()() const + { + return is_supported_cast(); + } +}; + } // namespace detail std::unique_ptr cast(column_view const& input, @@ -470,4 +479,11 @@ std::unique_ptr cast(column_view const& input, return detail::cast(input, type, stream, mr); } +bool is_supported_cast(data_type from, data_type to) noexcept +{ + // No matching detail API call/nvtx annotation, since this doesn't + // launch a kernel. + return double_type_dispatcher(from, to, detail::is_supported_cast_impl{}); +} + } // namespace cudf diff --git a/cpp/tests/binaryop/binop-verify-input-test.cpp b/cpp/tests/binaryop/binop-verify-input-test.cpp index 1346dcd4666..def6e94452e 100644 --- a/cpp/tests/binaryop/binop-verify-input-test.cpp +++ b/cpp/tests/binaryop/binop-verify-input-test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Copyright 2018-2019 BlazingDB, Inc. 
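Note: the new public cudf::is_supported_cast(from, to) above dispatches over both types without launching a kernel, so callers can check castability up front. A minimal usage sketch follows; the include path is an assumption (the same header that declares cudf::cast, cudf/unary.hpp).

#include <cudf/types.hpp>
#include <cudf/unary.hpp>
#include <cstdio>

int main()
{
  using cudf::data_type;
  using cudf::type_id;
  // query whether a cast between two types is supported before attempting it
  bool const ok = cudf::is_supported_cast(data_type{type_id::INT32}, data_type{type_id::FLOAT64});
  std::printf("INT32 -> FLOAT64 cast supported: %s\n", ok ? "yes" : "no");
}

Because the call is noexcept and launches no kernel, the diff skips the usual detail-API/nvtx wrapper, as its comment notes.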
* Copyright 2018 Christian Noboa Mardini @@ -42,5 +42,5 @@ TEST_F(BinopVerifyInputTest, Vector_Vector_ErrorSecondOperandVectorZeroSize) EXPECT_THROW(cudf::binary_operation( lhs, rhs, cudf::binary_operator::ADD, cudf::data_type(cudf::type_id::INT64)), - cudf::logic_error); + std::invalid_argument); } diff --git a/cpp/tests/fixed_point/fixed_point_tests.cpp b/cpp/tests/fixed_point/fixed_point_tests.cpp index ab7984d4b03..a222289216d 100644 --- a/cpp/tests/fixed_point/fixed_point_tests.cpp +++ b/cpp/tests/fixed_point/fixed_point_tests.cpp @@ -38,7 +38,7 @@ struct FixedPointTest : public cudf::test::BaseFixture {}; template struct FixedPointTestAllReps : public cudf::test::BaseFixture {}; -using RepresentationTypes = ::testing::Types; +using RepresentationTypes = ::testing::Types; TYPED_TEST_SUITE(FixedPointTestAllReps, RepresentationTypes); @@ -53,6 +53,7 @@ TYPED_TEST(FixedPointTestAllReps, SimpleDecimalXXConstruction) auto num4 = cudf::convert_floating_to_fixed(1.234567, scale_type(-4)); auto num5 = cudf::convert_floating_to_fixed(1.234567, scale_type(-5)); auto num6 = cudf::convert_floating_to_fixed(1.234567, scale_type(-6)); + auto num7 = cudf::convert_floating_to_fixed(0.0, scale_type(-4)); EXPECT_EQ(1, cudf::convert_fixed_to_floating(num0)); EXPECT_EQ(1.2, cudf::convert_fixed_to_floating(num1)); @@ -61,6 +62,7 @@ TYPED_TEST(FixedPointTestAllReps, SimpleDecimalXXConstruction) EXPECT_EQ(1.2345, cudf::convert_fixed_to_floating(num4)); EXPECT_EQ(1.23456, cudf::convert_fixed_to_floating(num5)); EXPECT_EQ(1.234567, cudf::convert_fixed_to_floating(num6)); + EXPECT_EQ(0.0, cudf::convert_fixed_to_floating(num7)); } TYPED_TEST(FixedPointTestAllReps, SimpleNegativeDecimalXXConstruction) @@ -74,6 +76,7 @@ TYPED_TEST(FixedPointTestAllReps, SimpleNegativeDecimalXXConstruction) auto num4 = cudf::convert_floating_to_fixed(-1.234567, scale_type(-4)); auto num5 = cudf::convert_floating_to_fixed(-1.234567, scale_type(-5)); auto num6 = cudf::convert_floating_to_fixed(-1.234567, scale_type(-6)); + auto num7 = cudf::convert_floating_to_fixed(-0.0, scale_type(-4)); EXPECT_EQ(-1, cudf::convert_fixed_to_floating(num0)); EXPECT_EQ(-1.2, cudf::convert_fixed_to_floating(num1)); @@ -82,6 +85,7 @@ TYPED_TEST(FixedPointTestAllReps, SimpleNegativeDecimalXXConstruction) EXPECT_EQ(-1.2345, cudf::convert_fixed_to_floating(num4)); EXPECT_EQ(-1.23456, cudf::convert_fixed_to_floating(num5)); EXPECT_EQ(-1.234567, cudf::convert_fixed_to_floating(num6)); + EXPECT_EQ(-0.0, cudf::convert_fixed_to_floating(num7)); } TYPED_TEST(FixedPointTestAllReps, PaddedDecimalXXConstruction) @@ -99,14 +103,10 @@ TYPED_TEST(FixedPointTestAllReps, PaddedDecimalXXConstruction) EXPECT_EQ(1.1, cudf::convert_fixed_to_floating(a)); EXPECT_EQ(1.01, cudf::convert_fixed_to_floating(b)); - EXPECT_EQ(1, - cudf::convert_fixed_to_floating( - c)); // intentional (inherited problem from floating point) + EXPECT_EQ(1.001, cudf::convert_fixed_to_floating(c)); EXPECT_EQ(1.0001, cudf::convert_fixed_to_floating(d)); EXPECT_EQ(1.00001, cudf::convert_fixed_to_floating(e)); - EXPECT_EQ(1, - cudf::convert_fixed_to_floating( - f)); // intentional (inherited problem from floating point) + EXPECT_EQ(1.000001, cudf::convert_fixed_to_floating(f)); EXPECT_TRUE(1.000123 - cudf::convert_fixed_to_floating(x) < std::numeric_limits::epsilon()); @@ -153,6 +153,119 @@ TYPED_TEST(FixedPointTestAllReps, MoreSimpleBinaryFPConstruction) EXPECT_EQ(2.0625, cudf::convert_fixed_to_floating(num1)); } +TEST_F(FixedPointTest, PreciseFloatDecimal64Construction) +{ + // Need 9 decimal digits 
to uniquely represent all floats (numeric_limits::max_digits10()). + // Precise conversion: set the scale factor to 9 less than the order-of-magnitude. + // But with -9 scale factor decimal32 can overflow: use decimal64 instead. + + // Positive Exponent + { + auto num0 = cudf::convert_floating_to_fixed(3.141593E7f, scale_type(-2)); + auto num1 = cudf::convert_floating_to_fixed(3.141593E12f, scale_type(3)); + auto num2 = cudf::convert_floating_to_fixed(3.141593E17f, scale_type(8)); + auto num3 = cudf::convert_floating_to_fixed(3.141593E22f, scale_type(13)); + auto num4 = cudf::convert_floating_to_fixed(3.141593E27f, scale_type(18)); + auto num5 = cudf::convert_floating_to_fixed(3.141593E32f, scale_type(23)); + auto num6 = cudf::convert_floating_to_fixed(3.141593E37f, scale_type(28)); + + EXPECT_EQ(3.141593E7f, cudf::convert_fixed_to_floating(num0)); + EXPECT_EQ(3.141593E12f, cudf::convert_fixed_to_floating(num1)); + EXPECT_EQ(3.141593E17f, cudf::convert_fixed_to_floating(num2)); + EXPECT_EQ(3.141593E22f, cudf::convert_fixed_to_floating(num3)); + EXPECT_EQ(3.141593E27f, cudf::convert_fixed_to_floating(num4)); + EXPECT_EQ(3.141593E32f, cudf::convert_fixed_to_floating(num5)); + EXPECT_EQ(3.141593E37f, cudf::convert_fixed_to_floating(num6)); + } + + // Negative Exponent + { + auto num0 = cudf::convert_floating_to_fixed(3.141593E-7f, scale_type(-16)); + auto num1 = cudf::convert_floating_to_fixed(3.141593E-12f, scale_type(-21)); + auto num2 = cudf::convert_floating_to_fixed(3.141593E-17f, scale_type(-26)); + auto num3 = cudf::convert_floating_to_fixed(3.141593E-22f, scale_type(-31)); + auto num4 = cudf::convert_floating_to_fixed(3.141593E-27f, scale_type(-36)); + auto num5 = cudf::convert_floating_to_fixed(3.141593E-32f, scale_type(-41)); + auto num6 = cudf::convert_floating_to_fixed(3.141593E-37f, scale_type(-47)); + + EXPECT_EQ(3.141593E-7f, cudf::convert_fixed_to_floating(num0)); + EXPECT_EQ(3.141593E-12f, cudf::convert_fixed_to_floating(num1)); + EXPECT_EQ(3.141593E-17f, cudf::convert_fixed_to_floating(num2)); + EXPECT_EQ(3.141593E-22f, cudf::convert_fixed_to_floating(num3)); + EXPECT_EQ(3.141593E-27f, cudf::convert_fixed_to_floating(num4)); + EXPECT_EQ(3.141593E-32f, cudf::convert_fixed_to_floating(num5)); + EXPECT_EQ(3.141593E-37f, cudf::convert_fixed_to_floating(num6)); + + // Denormals + auto num7 = cudf::convert_floating_to_fixed(3.141593E-39f, scale_type(-48)); + auto num8 = cudf::convert_floating_to_fixed(3.141593E-41f, scale_type(-50)); + auto num9 = cudf::convert_floating_to_fixed(3.141593E-43f, scale_type(-52)); + auto num10 = cudf::convert_floating_to_fixed(FLT_TRUE_MIN, scale_type(-54)); + + EXPECT_EQ(3.141593E-39f, cudf::convert_fixed_to_floating(num7)); + EXPECT_EQ(3.141593E-41f, cudf::convert_fixed_to_floating(num8)); + EXPECT_EQ(3.141593E-43f, cudf::convert_fixed_to_floating(num9)); + EXPECT_EQ(FLT_TRUE_MIN, cudf::convert_fixed_to_floating(num10)); + } +} + +TEST_F(FixedPointTest, PreciseDoubleDecimal64Construction) +{ + // Need 17 decimal digits to uniquely represent all doubles (numeric_limits::max_digits10()). + // Precise conversion: set the scale factor to 17 less than the order-of-magnitude. 
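Note: a host-side arithmetic sketch of the scale-selection rule described in these comments: the scale is the value's order of magnitude minus max_digits10 (9 for float, 17 for double). pick_scale is an illustrative helper, not a cudf API; the printed scales match the scale_type arguments in the float test above, and the same rule with 17 yields the double scales.

#include <cmath>
#include <cstdio>
#include <limits>

// order-of-magnitude minus max_digits10 gives a scale that round-trips exactly
int pick_scale(float x)
{
  int const magnitude = static_cast<int>(std::floor(std::log10(std::fabs(x))));
  return magnitude - std::numeric_limits<float>::max_digits10;
}

int main()
{
  std::printf("%d\n", pick_scale(3.141593E7f));   // -2, matching scale_type(-2) above
  std::printf("%d\n", pick_scale(3.141593E12f));  //  3, matching scale_type(3) above
  std::printf("%d\n", pick_scale(3.141593E-7f));  // -16, matching scale_type(-16) above
}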
+ + using decimal64 = fixed_point; + + // Positive Exponent + { + auto num0 = cudf::convert_floating_to_fixed(3.141593E8, scale_type(-9)); + auto num1 = cudf::convert_floating_to_fixed(3.141593E58, scale_type(41)); + auto num2 = cudf::convert_floating_to_fixed(3.141593E108, scale_type(91)); + auto num3 = cudf::convert_floating_to_fixed(3.141593E158, scale_type(141)); + auto num4 = cudf::convert_floating_to_fixed(3.141593E208, scale_type(191)); + auto num5 = cudf::convert_floating_to_fixed(3.141593E258, scale_type(241)); + auto num6 = cudf::convert_floating_to_fixed(3.141593E307, scale_type(290)); + + EXPECT_EQ(3.141593E8, cudf::convert_fixed_to_floating(num0)); + EXPECT_EQ(3.141593E58, cudf::convert_fixed_to_floating(num1)); + EXPECT_EQ(3.141593E108, cudf::convert_fixed_to_floating(num2)); + EXPECT_EQ(3.141593E158, cudf::convert_fixed_to_floating(num3)); + EXPECT_EQ(3.141593E208, cudf::convert_fixed_to_floating(num4)); + EXPECT_EQ(3.141593E258, cudf::convert_fixed_to_floating(num5)); + EXPECT_EQ(3.141593E307, cudf::convert_fixed_to_floating(num6)); + } + + // Negative Exponent + { + auto num0 = cudf::convert_floating_to_fixed(3.141593E-8, scale_type(-25)); + auto num1 = cudf::convert_floating_to_fixed(3.141593E-58, scale_type(-75)); + auto num2 = cudf::convert_floating_to_fixed(3.141593E-108, scale_type(-125)); + auto num3 = cudf::convert_floating_to_fixed(3.141593E-158, scale_type(-175)); + auto num4 = cudf::convert_floating_to_fixed(3.141593E-208, scale_type(-225)); + auto num5 = cudf::convert_floating_to_fixed(3.141593E-258, scale_type(-275)); + auto num6 = cudf::convert_floating_to_fixed(3.141593E-308, scale_type(-325)); + + EXPECT_EQ(3.141593E-8, cudf::convert_fixed_to_floating(num0)); + EXPECT_EQ(3.141593E-58, cudf::convert_fixed_to_floating(num1)); + EXPECT_EQ(3.141593E-108, cudf::convert_fixed_to_floating(num2)); + EXPECT_EQ(3.141593E-158, cudf::convert_fixed_to_floating(num3)); + EXPECT_EQ(3.141593E-208, cudf::convert_fixed_to_floating(num4)); + EXPECT_EQ(3.141593E-258, cudf::convert_fixed_to_floating(num5)); + EXPECT_EQ(3.141593E-308, cudf::convert_fixed_to_floating(num6)); + + // Denormals + auto num7 = cudf::convert_floating_to_fixed(3.141593E-309, scale_type(-326)); + auto num8 = cudf::convert_floating_to_fixed(3.141593E-314, scale_type(-331)); + auto num9 = cudf::convert_floating_to_fixed(3.141593E-319, scale_type(-336)); + auto num10 = cudf::convert_floating_to_fixed(DBL_TRUE_MIN, scale_type(-341)); + + EXPECT_EQ(3.141593E-309, cudf::convert_fixed_to_floating(num7)); + EXPECT_EQ(3.141593E-314, cudf::convert_fixed_to_floating(num8)); + EXPECT_EQ(3.141593E-319, cudf::convert_fixed_to_floating(num9)); + EXPECT_EQ(DBL_TRUE_MIN, cudf::convert_fixed_to_floating(num10)); + } +} + TYPED_TEST(FixedPointTestAllReps, SimpleDecimalXXMath) { using decimalXX = fixed_point; @@ -442,8 +555,6 @@ void float_vector_test(ValueType const initial_value, int32_t const scale, Binop binop) { - using decimal32 = fixed_point; - std::vector vec1(size); std::vector vec2(size); diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp index 860544b8606..8903f09b82b 100644 --- a/cpp/tests/interop/to_arrow_device_test.cpp +++ b/cpp/tests/interop/to_arrow_device_test.cpp @@ -352,11 +352,15 @@ TEST_F(ToArrowDeviceTest, EmptyTable) auto got_arrow_device = cudf::to_arrow_device(table->view()); EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_device->device_id); EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_device->device_type); + ASSERT_CUDA_SUCCEEDED( + 
cudaEventSynchronize(*reinterpret_cast(got_arrow_device->sync_event))); compare_arrays(schema.get(), arr.get(), &got_arrow_device->array); got_arrow_device = cudf::to_arrow_device(std::move(*table)); EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_device->device_id); EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_device->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_device->sync_event))); compare_arrays(schema.get(), arr.get(), &got_arrow_device->array); } @@ -386,6 +390,8 @@ TEST_F(ToArrowDeviceTest, DateTimeTable) auto got_arrow_array = cudf::to_arrow_device(input.view()); EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); EXPECT_EQ(data.size(), got_arrow_array->array.length); EXPECT_EQ(0, got_arrow_array->array.null_count); @@ -402,6 +408,8 @@ TEST_F(ToArrowDeviceTest, DateTimeTable) got_arrow_array = cudf::to_arrow_device(std::move(input)); EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); EXPECT_EQ(data.size(), got_arrow_array->array.length); EXPECT_EQ(0, got_arrow_array->array.null_count); @@ -456,6 +464,8 @@ TYPED_TEST(ToArrowDeviceTestDurationsTest, DurationTable) auto got_arrow_array = cudf::to_arrow_device(input.view()); EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); EXPECT_EQ(data.size(), got_arrow_array->array.length); EXPECT_EQ(0, got_arrow_array->array.null_count); @@ -472,6 +482,8 @@ TYPED_TEST(ToArrowDeviceTestDurationsTest, DurationTable) got_arrow_array = cudf::to_arrow_device(std::move(input)); EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); EXPECT_EQ(data.size(), got_arrow_array->array.length); EXPECT_EQ(0, got_arrow_array->array.null_count); @@ -538,6 +550,8 @@ TEST_F(ToArrowDeviceTest, NestedList) auto got_arrow_array = cudf::to_arrow_device(input.view()); EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); got_arrow_array = cudf::to_arrow_device(std::move(input)); @@ -682,11 +696,15 @@ TEST_F(ToArrowDeviceTest, StructColumn) auto got_arrow_array = cudf::to_arrow_device(input.view()); EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); got_arrow_array = cudf::to_arrow_device(std::move(input)); EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + 
ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); } @@ -755,11 +773,15 @@ TEST_F(ToArrowDeviceTest, FixedPoint64Table) auto got_arrow_array = cudf::to_arrow_device(input.view()); ASSERT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); ASSERT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); got_arrow_array = cudf::to_arrow_device(std::move(input)); ASSERT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); ASSERT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); } } @@ -802,11 +824,15 @@ TEST_F(ToArrowDeviceTest, FixedPoint128Table) auto got_arrow_array = cudf::to_arrow_device(input.view()); EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); got_arrow_array = cudf::to_arrow_device(std::move(input)); EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); } } diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 9c76c344157..993ab82f423 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -1400,9 +1400,7 @@ TEST_F(JsonReaderTest, JsonLongString) .lines(true) .na_rep("null"); - cudf::io::write_json(options_builder.build(), - cudf::test::get_default_stream(), - rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); cudf::column_view int16_with_mask(repeat_times); cudf::column_view int16( diff --git a/cpp/tests/io/json_writer.cpp b/cpp/tests/io/json_writer.cpp index 946b939f456..2c4e29a01b9 100644 --- a/cpp/tests/io/json_writer.cpp +++ b/cpp/tests/io/json_writer.cpp @@ -51,16 +51,14 @@ TEST_F(JsonWriterTest, EmptyInput) .build(); // Empty columns in table - cudf::io::write_json( - out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(out_options, cudf::test::get_default_stream()); std::string const expected = R"([])"; EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size())); // Empty columns in table - JSON Lines out_buffer.clear(); out_options.enable_lines(true); - cudf::io::write_json( - out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(out_options, cudf::test::get_default_stream()); std::string const expected_lines = "\n"; EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size())); @@ -68,8 +66,7 @@ TEST_F(JsonWriterTest, EmptyInput) cudf::table_view tbl_view2{}; out_options.set_table(tbl_view2); out_buffer.clear(); - cudf::io::write_json( - out_options, 
cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(out_options, cudf::test::get_default_stream()); EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size())); } @@ -94,22 +91,17 @@ TEST_F(JsonWriterTest, ErrorCases) .build(); // not enough column names - EXPECT_THROW( - cudf::io::write_json( - out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), - cudf::logic_error); + EXPECT_THROW(cudf::io::write_json(out_options, cudf::test::get_default_stream()), + cudf::logic_error); mt.schema_info.emplace_back("int16"); out_options.set_metadata(mt); - EXPECT_NO_THROW(cudf::io::write_json( - out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource())); + EXPECT_NO_THROW(cudf::io::write_json(out_options, cudf::test::get_default_stream())); // chunk_rows must be at least 8 out_options.set_rows_per_chunk(0); - EXPECT_THROW( - cudf::io::write_json( - out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), - cudf::logic_error); + EXPECT_THROW(cudf::io::write_json(out_options, cudf::test::get_default_stream()), + cudf::logic_error); } TEST_F(JsonWriterTest, PlainTable) @@ -131,9 +123,7 @@ TEST_F(JsonWriterTest, PlainTable) .lines(false) .na_rep("null"); - cudf::io::write_json(options_builder.build(), - cudf::test::get_default_stream(), - rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); std::string const expected = R"([{"col1":"a","col2":"d","int":1,"float":1.5,"int16":null},{"col1":"b","col2":"e","int":2,"float":2.5,"int16":2},{"col1":"c","col2":"f","int":3,"float":3.5,"int16":null}])"; @@ -163,9 +153,7 @@ TEST_F(JsonWriterTest, SimpleNested) .lines(true) .na_rep("null"); - cudf::io::write_json(options_builder.build(), - cudf::test::get_default_stream(), - rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); std::string const expected = R"({"a":1,"b":2,"c":{"d":3},"f":5.5,"g":[1]} {"a":6,"b":7,"c":{"d":8},"f":10.5} {"a":1,"b":2,"c":{"e":4},"f":5.5,"g":[2,null]} @@ -197,9 +185,7 @@ TEST_F(JsonWriterTest, MixedNested) .lines(false) .na_rep("null"); - cudf::io::write_json(options_builder.build(), - cudf::test::get_default_stream(), - rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); std::string const expected = R"([{"a":1,"b":2,"c":{"d":[3]},"f":5.5,"g":[{"h":1}]},)" R"({"a":6,"b":7,"c":{"d":[8]},"f":10.5},)" @@ -232,8 +218,7 @@ TEST_F(JsonWriterTest, WriteReadNested) .na_rep("null") .build(); - cudf::io::write_json( - out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(out_options, cudf::test::get_default_stream()); std::string const expected = R"({"a":1,"b":2,"c":{"d":3},"f":5.5,"g":[1]} {"a":6,"b":7,"c":{"d":8},"f":10.5} {"a":1,"b":2,"c":{"e":4},"f":5.5,"g":[2,null]} @@ -308,8 +293,7 @@ TEST_F(JsonWriterTest, WriteReadNested) mt.schema_info[2].children.clear(); out_options.set_metadata(mt); out_buffer.clear(); - cudf::io::write_json( - out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(out_options, cudf::test::get_default_stream()); in_options = cudf::io::json_reader_options::builder( cudf::io::source_info{out_buffer.data(), out_buffer.size()}) @@ -332,8 +316,7 @@ TEST_F(JsonWriterTest, WriteReadNested) // without 
column names out_options.set_metadata(cudf::io::table_metadata{}); out_buffer.clear(); - cudf::io::write_json( - out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(out_options, cudf::test::get_default_stream()); in_options = cudf::io::json_reader_options::builder( cudf::io::source_info{out_buffer.data(), out_buffer.size()}) .lines(true) @@ -371,8 +354,7 @@ TEST_F(JsonWriterTest, SpecialChars) .na_rep("null") .build(); - cudf::io::write_json( - out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(out_options, cudf::test::get_default_stream()); std::string const expected = R"({"\"a\"":1,"'b'":"abcd"} {"\"a\"":6,"'b'":"b\b\f\n\r\t"} {"\"a\"":1,"'b'":"\"c\""} @@ -405,9 +387,7 @@ TEST_F(JsonWriterTest, NullList) .lines(true) .na_rep("null"); - cudf::io::write_json(options_builder.build(), - cudf::test::get_default_stream(), - rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); std::string const expected = R"({"a":[null],"b":[[1,2,3],[null],[null,null,null],[4,null,5]]} {"a":[2,null,null,3],"b":null} {"a":[null,null,4],"b":[[2,null],null]} @@ -446,9 +426,7 @@ TEST_F(JsonWriterTest, ChunkedNested) .na_rep("null") .rows_per_chunk(8); - cudf::io::write_json(options_builder.build(), - cudf::test::get_default_stream(), - rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); std::string const expected = R"({"a":1,"b":-2,"c":{},"e":[{"f":1}]} {"a":2,"b":-2,"c":{}} @@ -504,9 +482,7 @@ TEST_F(JsonWriterTest, StructAllNullCombinations) .lines(true) .na_rep("null"); - cudf::io::write_json(options_builder.build(), - cudf::test::get_default_stream(), - rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); std::string const expected = R"({} {"e":1} {"d":1} @@ -568,9 +544,7 @@ TEST_F(JsonWriterTest, Unicode) .lines(true) .na_rep("null"); - cudf::io::write_json(options_builder.build(), - cudf::test::get_default_stream(), - rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); std::string const expected = R"({"col1":"\"\\\/\b\f\n\r\t","col2":"C\u10ae\u226a\u31f3\u434f\u51f9\u6ca6\u738b\u8fbf\u9fb8\ua057\ubbdc\uc2a4\ud3f6\ue4fe\ufd20","int16":null} diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index b5e080f3cc5..39ba62952b4 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -54,9 +54,9 @@ using int32_col = column_wrapper; using int64_col = column_wrapper; using float32_col = column_wrapper; using float64_col = column_wrapper; -using dec32_col = column_wrapper; -using dec64_col = column_wrapper; -using dec128_col = column_wrapper; +using dec32_col = cudf::test::fixed_point_column_wrapper; +using dec64_col = cudf::test::fixed_point_column_wrapper; +using dec128_col = cudf::test::fixed_point_column_wrapper; using struct_col = cudf::test::structs_column_wrapper; template using list_col = cudf::test::lists_column_wrapper; @@ -355,12 +355,6 @@ TEST_F(OrcWriterTest, MultiColumn) auto col4_data = random_values(num_rows); auto col5_data = random_values(num_rows); auto col6_vals = random_values(num_rows); - auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{col6_vals[i], numeric::scale_type{12}}; - }); - auto col7_data = 
cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{col6_vals[i], numeric::scale_type{-12}}; - }); bool_col col0(col0_data.begin(), col0_data.end()); int8_col col1(col1_data.begin(), col1_data.end()); @@ -368,8 +362,8 @@ TEST_F(OrcWriterTest, MultiColumn) int32_col col3(col3_data.begin(), col3_data.end()); float32_col col4(col4_data.begin(), col4_data.end()); float64_col col5(col5_data.begin(), col5_data.end()); - dec128_col col6(col6_data, col6_data + num_rows); - dec128_col col7(col7_data, col7_data + num_rows); + dec128_col col6{col6_vals.begin(), col6_vals.end(), numeric::scale_type{12}}; + dec128_col col7{col6_vals.begin(), col6_vals.end(), numeric::scale_type{-12}}; list_col col8{ {9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}, {}, {-1, -2}}; @@ -416,9 +410,6 @@ TEST_F(OrcWriterTest, MultiColumnWithNulls) auto col4_data = random_values(num_rows); auto col5_data = random_values(num_rows); auto col6_vals = random_values(num_rows); - auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal64{col6_vals[i], numeric::scale_type{2}}; - }); auto col0_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i % 2); }); auto col1_mask = @@ -438,7 +429,7 @@ TEST_F(OrcWriterTest, MultiColumnWithNulls) int32_col col3{col3_data.begin(), col3_data.end(), col3_mask}; float32_col col4{col4_data.begin(), col4_data.end(), col4_mask}; float64_col col5{col5_data.begin(), col5_data.end(), col5_mask}; - dec64_col col6{col6_data, col6_data + num_rows, col6_mask}; + dec64_col col6{col6_vals.begin(), col6_vals.end(), col6_mask, numeric::scale_type{2}}; list_col col7{ {{9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}, {}, {-1, -2}}, col0_mask}; @@ -541,14 +532,11 @@ TEST_F(OrcWriterTest, SlicedTable) auto seq_col0 = random_values(num_rows); auto seq_col2 = random_values(num_rows); auto vals_col3 = random_values(num_rows); - auto seq_col3 = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal64{vals_col3[i], numeric::scale_type{2}}; - }); int32_col col0(seq_col0.begin(), seq_col0.end()); str_col col1(strings.begin(), strings.end()); float32_col col2(seq_col2.begin(), seq_col2.end()); - dec64_col col3(seq_col3, seq_col3 + num_rows); + dec64_col col3{vals_col3.begin(), vals_col3.end(), numeric::scale_type{2}}; list_col col4{ {9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}}; @@ -1213,11 +1201,8 @@ TEST_P(OrcWriterTestDecimal, Decimal64) // Using int16_t because scale causes values to overflow if they already require 32 bits auto const vals = random_values(num_rows); - auto data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal64{vals[i], numeric::scale_type{scale}}; - }); auto mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 7 == 0; }); - dec64_col col{data, data + num_rows, mask}; + dec64_col col{vals.begin(), vals.end(), mask, numeric::scale_type{scale}}; cudf::table_view tbl({static_cast(col)}); auto filepath = temp_env->get_temp_filepath("Decimal64.orc"); @@ -1244,11 +1229,8 @@ TEST_F(OrcWriterTest, Decimal32) // Using int16_t because scale causes values to overflow if they already require 32 bits auto const vals = random_values(num_rows); - auto data = cudf::detail::make_counting_transform_iterator(0, [&vals](auto i) { - return numeric::decimal32{vals[i], numeric::scale_type{2}}; - }); auto mask = 
cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 13; }); - dec32_col col{data, data + num_rows, mask}; + dec32_col col{vals.begin(), vals.end(), mask, numeric::scale_type{2}}; cudf::table_view expected({col}); auto filepath = temp_env->get_temp_filepath("Decimal32.orc"); @@ -1527,12 +1509,9 @@ TEST_F(OrcReaderTest, DecimalOptions) { constexpr auto num_rows = 10; auto col_vals = random_values(num_rows); - auto col_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{col_vals[i], numeric::scale_type{2}}; - }); auto mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3 == 0; }); - dec128_col col{col_data, col_data + num_rows, mask}; + dec128_col col{col_vals.begin(), col_vals.end(), mask, numeric::scale_type{2}}; table_view expected({col}); cudf::io::table_input_metadata expected_metadata(expected); @@ -1555,15 +1534,9 @@ TEST_F(OrcWriterTest, DecimalOptionsNested) { auto const num_rows = 100; - auto dec_vals = random_values(num_rows); - auto dec1_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal64{dec_vals[i], numeric::scale_type{2}}; - }); - auto dec2_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{dec_vals[i], numeric::scale_type{2}}; - }); - dec64_col dec1_col(dec1_data, dec1_data + num_rows); - dec128_col dec2_col(dec2_data, dec2_data + num_rows); + auto dec_vals = random_values(num_rows); + dec64_col dec1_col{dec_vals.begin(), dec_vals.end(), numeric::scale_type{2}}; + dec128_col dec2_col{dec_vals.begin(), dec_vals.end(), numeric::scale_type{2}}; auto child_struct_col = cudf::test::structs_column_wrapper{dec1_col, dec2_col}; auto int_vals = random_values(num_rows); @@ -1974,7 +1947,7 @@ TEST_F(OrcStatisticsTest, Empty) int32_col col0{}; float64_col col1{}; str_col col2{}; - dec64_col col3{}; + dec64_col col3{{}, numeric::scale_type{0}}; column_wrapper col4; bool_col col5{}; table_view expected({col0, col1, col2, col3, col4, col5}); diff --git a/cpp/tests/io/parquet_v2_test.cpp b/cpp/tests/io/parquet_v2_test.cpp index f106fd5a487..9e66fc9409f 100644 --- a/cpp/tests/io/parquet_v2_test.cpp +++ b/cpp/tests/io/parquet_v2_test.cpp @@ -47,15 +47,6 @@ TEST_P(ParquetV2Test, MultiColumn) auto col6_vals = random_values(num_rows); auto col7_vals = random_values(num_rows); auto col8_vals = random_values(num_rows); - auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&col6_vals](auto i) { - return numeric::decimal32{col6_vals[i], numeric::scale_type{5}}; - }); - auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&col7_vals](auto i) { - return numeric::decimal64{col7_vals[i], numeric::scale_type{-5}}; - }); - auto col8_data = cudf::detail::make_counting_transform_iterator(0, [&col8_vals](auto i) { - return numeric::decimal128{col8_vals[i], numeric::scale_type{-6}}; - }); // column_wrapper col0{col0_data.begin(), col0_data.end(), no_nulls()}; column_wrapper col1{col1_data.begin(), col1_data.end(), no_nulls()}; @@ -63,9 +54,13 @@ TEST_P(ParquetV2Test, MultiColumn) column_wrapper col3{col3_data.begin(), col3_data.end(), no_nulls()}; column_wrapper col4{col4_data.begin(), col4_data.end(), no_nulls()}; column_wrapper col5{col5_data.begin(), col5_data.end(), no_nulls()}; - column_wrapper col6{col6_data, col6_data + num_rows, no_nulls()}; - column_wrapper col7{col7_data, col7_data + num_rows, no_nulls()}; - column_wrapper col8{col8_data, col8_data + num_rows, no_nulls()}; + + 
cudf::test::fixed_point_column_wrapper col6( + col6_vals.begin(), col6_vals.end(), no_nulls(), numeric::scale_type{5}); + cudf::test::fixed_point_column_wrapper col7( + col7_vals.begin(), col7_vals.end(), no_nulls(), numeric::scale_type{-5}); + cudf::test::fixed_point_column_wrapper col8( + col8_vals.begin(), col8_vals.end(), no_nulls(), numeric::scale_type{-6}); auto expected = table_view{{col1, col2, col3, col4, col5, col6, col7, col8}}; @@ -109,14 +104,6 @@ TEST_P(ParquetV2Test, MultiColumnWithNulls) auto col5_data = random_values(num_rows); auto col6_vals = random_values(num_rows); auto col7_vals = random_values(num_rows); - auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&col6_vals](auto i) { - return numeric::decimal32{col6_vals[i], numeric::scale_type{-2}}; - }); - auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&col7_vals](auto i) { - return numeric::decimal64{col7_vals[i], numeric::scale_type{-8}}; - }); - // auto col0_mask = cudf::detail::make_counting_transform_iterator( - // 0, [](auto i) { return (i % 2); }); auto col1_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i < 10); }); auto col2_mask = no_nulls(); @@ -138,8 +125,11 @@ TEST_P(ParquetV2Test, MultiColumnWithNulls) column_wrapper col3{col3_data.begin(), col3_data.end(), col3_mask}; column_wrapper col4{col4_data.begin(), col4_data.end(), col4_mask}; column_wrapper col5{col5_data.begin(), col5_data.end(), col5_mask}; - column_wrapper col6{col6_data, col6_data + num_rows, col6_mask}; - column_wrapper col7{col7_data, col7_data + num_rows, col7_mask}; + + cudf::test::fixed_point_column_wrapper col6( + col6_vals.begin(), col6_vals.end(), col6_mask, numeric::scale_type{-2}); + cudf::test::fixed_point_column_wrapper col7( + col7_vals.begin(), col7_vals.end(), col7_mask, numeric::scale_type{-8}); auto expected = table_view{{/*col0, */ col1, col2, col3, col4, col5, col6, col7}}; diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index a1f4c7b81d8..e07ebe25322 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -35,7 +35,7 @@ using cudf::test::iterators::no_nulls; template -void test_durations(mask_op_t mask_op, bool use_byte_stream_split) +void test_durations(mask_op_t mask_op, bool use_byte_stream_split, bool arrow_schema) { std::default_random_engine generator; std::uniform_int_distribution distribution_d(0, 30); @@ -76,20 +76,27 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split) auto filepath = temp_env->get_temp_filepath("Durations.parquet"); cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected); + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .write_arrow_schema(arrow_schema); + cudf::io::write_parquet(out_opts); cudf::io::parquet_reader_options in_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) + .use_arrow_schema(arrow_schema); auto result = cudf::io::read_parquet(in_opts); auto durations_d_got = cudf::cast(result.tbl->view().column(0), cudf::data_type{cudf::type_id::DURATION_DAYS}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_d, durations_d_got->view()); - auto durations_s_got = - cudf::cast(result.tbl->view().column(1), cudf::data_type{cudf::type_id::DURATION_SECONDS}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, 
durations_s_got->view()); + if (arrow_schema) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, result.tbl->view().column(1)); + } else { + auto durations_s_got = + cudf::cast(result.tbl->view().column(1), cudf::data_type{cudf::type_id::DURATION_SECONDS}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, durations_s_got->view()); + } CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_ms, result.tbl->view().column(2)); CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_us, result.tbl->view().column(3)); @@ -98,10 +105,15 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split) TEST_F(ParquetWriterTest, Durations) { - test_durations([](auto i) { return true; }, false); - test_durations([](auto i) { return (i % 2) != 0; }, false); - test_durations([](auto i) { return (i % 3) != 0; }, false); - test_durations([](auto i) { return false; }, false); + test_durations([](auto i) { return true; }, false, false); + test_durations([](auto i) { return (i % 2) != 0; }, false, false); + test_durations([](auto i) { return (i % 3) != 0; }, false, false); + test_durations([](auto i) { return false; }, false, false); + + test_durations([](auto i) { return true; }, false, true); + test_durations([](auto i) { return (i % 2) != 0; }, false, true); + test_durations([](auto i) { return (i % 3) != 0; }, false, true); + test_durations([](auto i) { return false; }, false, true); } TEST_F(ParquetWriterTest, MultiIndex) @@ -493,6 +505,50 @@ TEST_F(ParquetWriterTest, DecimalWrite) CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, table); } +TEST_F(ParquetWriterTest, DecimalWriteWithArrowSchema) +{ + constexpr cudf::size_type num_rows = 500; + auto seq_col0 = random_values(num_rows); + auto seq_col1 = random_values(num_rows); + + auto valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); + + auto col0 = cudf::test::fixed_point_column_wrapper{ + seq_col0.begin(), seq_col0.end(), valids, numeric::scale_type{5}}; + auto col1 = cudf::test::fixed_point_column_wrapper{ + seq_col1.begin(), seq_col1.end(), valids, numeric::scale_type{-9}}; + + auto table = table_view({col0, col1}); + + auto filepath = temp_env->get_temp_filepath("DecimalWriteWithArrowSchema.parquet"); + cudf::io::parquet_writer_options args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, table) + .write_arrow_schema(true); + + cudf::io::table_input_metadata expected_metadata(table); + // verify success if equal precision is given + expected_metadata.column_metadata[0].set_decimal_precision( + cudf::io::parquet::detail::MAX_DECIMAL32_PRECISION); + expected_metadata.column_metadata[1].set_decimal_precision( + cudf::io::parquet::detail::MAX_DECIMAL64_PRECISION); + args.set_metadata(std::move(expected_metadata)); + cudf::io::write_parquet(args); + + auto expected_col0 = cudf::test::fixed_point_column_wrapper<__int128_t>{ + seq_col0.begin(), seq_col0.end(), valids, numeric::scale_type{5}}; + auto expected_col1 = cudf::test::fixed_point_column_wrapper<__int128_t>{ + seq_col1.begin(), seq_col1.end(), valids, numeric::scale_type{-9}}; + + auto expected_table = table_view({expected_col0, expected_col1}); + + cudf::io::parquet_reader_options read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(read_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, expected_table); +} + TEST_F(ParquetWriterTest, RowGroupSizeInvalid) { auto const unused_table = std::make_unique
(); @@ -1935,10 +1991,15 @@ TEST_F(ParquetWriterTest, DecimalByteStreamSplit) TEST_F(ParquetWriterTest, DurationByteStreamSplit) { - test_durations([](auto i) { return true; }, true); - test_durations([](auto i) { return (i % 2) != 0; }, true); - test_durations([](auto i) { return (i % 3) != 0; }, true); - test_durations([](auto i) { return false; }, true); + test_durations([](auto i) { return true; }, true, false); + test_durations([](auto i) { return (i % 2) != 0; }, true, false); + test_durations([](auto i) { return (i % 3) != 0; }, true, false); + test_durations([](auto i) { return false; }, true, false); + + test_durations([](auto i) { return true; }, true, true); + test_durations([](auto i) { return (i % 2) != 0; }, true, true); + test_durations([](auto i) { return (i % 3) != 0; }, true, true); + test_durations([](auto i) { return false; }, true, true); } TEST_F(ParquetWriterTest, WriteFixedLenByteArray) diff --git a/cpp/tests/lists/contains_tests.cpp b/cpp/tests/lists/contains_tests.cpp index 718ee83cf09..8fb2b403051 100644 --- a/cpp/tests/lists/contains_tests.cpp +++ b/cpp/tests/lists/contains_tests.cpp @@ -224,9 +224,8 @@ TYPED_TEST(TypedContainsTest, SlicedLists) { // First Slice. - auto sliced_column_1 = - cudf::detail::slice(search_space, {1, 8}, cudf::get_default_stream()).front(); - auto search_key_one = create_scalar_search_key(1); + auto sliced_column_1 = cudf::slice(search_space, {1, 8}, cudf::get_default_stream()).front(); + auto search_key_one = create_scalar_search_key(1); { // CONTAINS auto result = cudf::lists::contains(sliced_column_1, *search_key_one); @@ -257,9 +256,8 @@ TYPED_TEST(TypedContainsTest, SlicedLists) { // Second Slice. - auto sliced_column_2 = - cudf::detail::slice(search_space, {3, 10}, cudf::get_default_stream()).front(); - auto search_key_one = create_scalar_search_key(1); + auto sliced_column_2 = cudf::slice(search_space, {3, 10}, cudf::get_default_stream()).front(); + auto search_key_one = create_scalar_search_key(1); { // CONTAINS auto result = cudf::lists::contains(sliced_column_2, *search_key_one); diff --git a/cpp/tests/reshape/interleave_columns_tests.cpp b/cpp/tests/reshape/interleave_columns_tests.cpp index bc7488bbf9e..de155c35a5e 100644 --- a/cpp/tests/reshape/interleave_columns_tests.cpp +++ b/cpp/tests/reshape/interleave_columns_tests.cpp @@ -363,19 +363,16 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointInterleave) { using namespace numeric; using decimalXX = TypeParam; + using RepType = typename decimalXX::rep; for (int i = 0; i > -4; --i) { - auto const ONE = decimalXX{1, scale_type{i}}; - auto const TWO = decimalXX{2, scale_type{i}}; - auto const FOUR = decimalXX{4, scale_type{i}}; - auto const FIVE = decimalXX{5, scale_type{i}}; + auto const a = cudf::test::fixed_point_column_wrapper({1, 4}, scale_type{i}); + auto const b = cudf::test::fixed_point_column_wrapper({2, 5}, scale_type{i}); - auto const a = cudf::test::fixed_width_column_wrapper({ONE, FOUR}); - auto const b = cudf::test::fixed_width_column_wrapper({TWO, FIVE}); - - auto const input = cudf::table_view{std::vector{a, b}}; - auto const expected = cudf::test::fixed_width_column_wrapper({ONE, TWO, FOUR, FIVE}); - auto const actual = cudf::interleave_columns(input); + auto const input = cudf::table_view{std::vector{a, b}}; + auto const expected = + cudf::test::fixed_point_column_wrapper({1, 2, 4, 5}, scale_type{i}); + auto const actual = cudf::interleave_columns(input); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, actual->view()); } diff --git a/cpp/tests/streams/io/csv_test.cpp 
b/cpp/tests/streams/io/csv_test.cpp index 6e27db02d56..42894a0ebcb 100644 --- a/cpp/tests/streams/io/csv_test.cpp +++ b/cpp/tests/streams/io/csv_test.cpp @@ -39,12 +39,6 @@ TEST_F(CSVTest, CSVWriter) std::vector zeros(num_rows, 0); std::vector ones(num_rows, 1); - auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{ones[i], numeric::scale_type{12}}; - }); - auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{ones[i], numeric::scale_type{-12}}; - }); cudf::test::fixed_width_column_wrapper col0(zeros.begin(), zeros.end()); cudf::test::fixed_width_column_wrapper col1(zeros.begin(), zeros.end()); @@ -52,8 +46,10 @@ TEST_F(CSVTest, CSVWriter) cudf::test::fixed_width_column_wrapper col3(zeros.begin(), zeros.end()); cudf::test::fixed_width_column_wrapper col4(zeros.begin(), zeros.end()); cudf::test::fixed_width_column_wrapper col5(zeros.begin(), zeros.end()); - cudf::test::fixed_width_column_wrapper col6(col6_data, col6_data + num_rows); - cudf::test::fixed_width_column_wrapper col7(col7_data, col7_data + num_rows); + cudf::test::fixed_point_column_wrapper col6( + ones.begin(), ones.end(), numeric::scale_type{12}); + cudf::test::fixed_point_column_wrapper col7( + ones.begin(), ones.end(), numeric::scale_type{-12}); std::vector col8_data(num_rows, "rapids"); cudf::test::strings_column_wrapper col8(col8_data.begin(), col8_data.end()); @@ -72,12 +68,6 @@ TEST_F(CSVTest, CSVReader) std::vector zeros(num_rows, 0); std::vector ones(num_rows, 1); - auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{ones[i], numeric::scale_type{12}}; - }); - auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{ones[i], numeric::scale_type{-12}}; - }); cudf::test::fixed_width_column_wrapper col0(zeros.begin(), zeros.end()); cudf::test::fixed_width_column_wrapper col1(zeros.begin(), zeros.end()); @@ -85,8 +75,10 @@ TEST_F(CSVTest, CSVReader) cudf::test::fixed_width_column_wrapper col3(zeros.begin(), zeros.end()); cudf::test::fixed_width_column_wrapper col4(zeros.begin(), zeros.end()); cudf::test::fixed_width_column_wrapper col5(zeros.begin(), zeros.end()); - cudf::test::fixed_width_column_wrapper col6(col6_data, col6_data + num_rows); - cudf::test::fixed_width_column_wrapper col7(col7_data, col7_data + num_rows); + cudf::test::fixed_point_column_wrapper col6( + ones.begin(), ones.end(), numeric::scale_type{12}); + cudf::test::fixed_point_column_wrapper col7( + ones.begin(), ones.end(), numeric::scale_type{-12}); std::vector col8_data(num_rows, "rapids"); cudf::test::strings_column_wrapper col8(col8_data.begin(), col8_data.end()); diff --git a/cpp/tests/streams/io/orc_test.cpp b/cpp/tests/streams/io/orc_test.cpp index 401c7049381..cc43bf15b5d 100644 --- a/cpp/tests/streams/io/orc_test.cpp +++ b/cpp/tests/streams/io/orc_test.cpp @@ -59,22 +59,10 @@ cudf::table construct_table() cudf::test::fixed_width_column_wrapper col3(zeros_iterator, zeros_iterator + num_rows); cudf::test::fixed_width_column_wrapper col4(zeros_iterator, zeros_iterator + num_rows); cudf::test::fixed_width_column_wrapper col5(zeros_iterator, zeros_iterator + num_rows); - - cudf::test::fixed_width_column_wrapper col6 = [&ones_iterator] { - auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{ones_iterator[i], numeric::scale_type{12}}; - }); - return 
cudf::test::fixed_width_column_wrapper(col6_data, - col6_data + num_rows); - }(); - - cudf::test::fixed_width_column_wrapper col7 = [&ones_iterator] { - auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{ones_iterator[i], numeric::scale_type{-12}}; - }); - return cudf::test::fixed_width_column_wrapper(col7_data, - col7_data + num_rows); - }(); + cudf::test::fixed_point_column_wrapper col6( + ones_iterator, ones_iterator + num_rows, numeric::scale_type{12}); + cudf::test::fixed_point_column_wrapper col7( + ones_iterator, ones_iterator + num_rows, numeric::scale_type{-12}); cudf::test::lists_column_wrapper col8 = [] { auto col8_mask = diff --git a/cpp/tests/streams/io/parquet_test.cpp b/cpp/tests/streams/io/parquet_test.cpp index b277d184e3a..9d2dec2d697 100644 --- a/cpp/tests/streams/io/parquet_test.cpp +++ b/cpp/tests/streams/io/parquet_test.cpp @@ -55,20 +55,10 @@ cudf::table construct_table() cudf::test::fixed_width_column_wrapper col3(zeros.begin(), zeros.end()); cudf::test::fixed_width_column_wrapper col4(zeros.begin(), zeros.end()); cudf::test::fixed_width_column_wrapper col5(zeros.begin(), zeros.end()); - cudf::test::fixed_width_column_wrapper col6 = [&ones] { - auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{ones[i], numeric::scale_type{12}}; - }); - return cudf::test::fixed_width_column_wrapper(col6_data, - col6_data + num_rows); - }(); - cudf::test::fixed_width_column_wrapper col7 = [&ones] { - auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { - return numeric::decimal128{ones[i], numeric::scale_type{-12}}; - }); - return cudf::test::fixed_width_column_wrapper(col7_data, - col7_data + num_rows); - }(); + cudf::test::fixed_point_column_wrapper col6( + ones.begin(), ones.end(), numeric::scale_type{12}); + cudf::test::fixed_point_column_wrapper col7( + ones.begin(), ones.end(), numeric::scale_type{-12}); cudf::test::lists_column_wrapper col8{ {1, 1}, {1, 1, 1}, {}, {1}, {1, 1, 1, 1}, {1, 1, 1, 1, 1}, {}, {1, -1}, {}, {-1, -1}}; diff --git a/cpp/tests/strings/replace_tests.cpp b/cpp/tests/strings/replace_tests.cpp index 3aa7467d156..6c4afbb435a 100644 --- a/cpp/tests/strings/replace_tests.cpp +++ b/cpp/tests/strings/replace_tests.cpp @@ -532,6 +532,23 @@ TEST_F(StringsReplaceTest, ReplaceMultiLong) } } +TEST_F(StringsReplaceTest, EmptyTarget) +{ + auto const input = cudf::test::strings_column_wrapper({"hello", "world", "", "accénted"}); + auto const sv = cudf::strings_column_view(input); + + auto const targets = cudf::test::strings_column_wrapper({"e", "", "d"}); + auto const tv = cudf::strings_column_view(targets); + + auto const repls = cudf::test::strings_column_wrapper({"E", "_", "D"}); + auto const rv = cudf::strings_column_view(repls); + + // empty target should be ignored + auto results = cudf::strings::replace_multiple(sv, tv, rv); + auto expected = cudf::test::strings_column_wrapper({"hEllo", "worlD", "", "accéntED"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); +} + TEST_F(StringsReplaceTest, EmptyStringsColumn) { auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp index d53c64ed539..4c020cb4c29 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -307,6 +307,26 @@ TEST_F(StringsSplitTest, SplitRecordWhitespaceWithMaxSplit) 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); } +TEST_F(StringsSplitTest, SplitRecordAllEmpty) +{ + auto input = cudf::test::strings_column_wrapper({"", "", "", ""}); + auto sv = cudf::strings_column_view(input); + auto delimiter = cudf::string_scalar("s"); + auto empty = cudf::string_scalar(""); + + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{}, LCW{}, LCW{}, LCW{}}); + auto result = cudf::strings::split_record(sv, delimiter); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); + result = cudf::strings::split_record(sv, empty); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); + + result = cudf::strings::rsplit_record(sv, delimiter); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); + result = cudf::strings::rsplit_record(sv, empty); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); +} + TEST_F(StringsSplitTest, MultiByteDelimiters) { // Overlapping delimiters diff --git a/dependencies.yaml b/dependencies.yaml index e3f8a72e76c..a19574b7658 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -3,7 +3,7 @@ files: all: output: conda matrix: - cuda: ["11.8", "12.2"] + cuda: ["11.8", "12.5"] arch: [x86_64] includes: - build_base @@ -243,7 +243,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - &cmake_ver cmake>=3.26.4 + - &cmake_ver cmake>=3.26.4,!=3.30.0 - &ninja ninja build_all: common: @@ -323,6 +323,7 @@ dependencies: packages: # Hard pin the patch version used during the build. # Sync with conda build constraint & wheel run constraint. + # TODO: Change to `2.0.*` for NumPy 2 - numpy==1.23.* build_python_cudf: common: @@ -402,6 +403,10 @@ dependencies: cuda: "12.2" packages: - cuda-version=12.2 + - matrix: + cuda: "12.5" + packages: + - cuda-version=12.5 cuda: specific: - output_types: conda @@ -547,6 +552,7 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - fsspec>=0.6.0 + # TODO: Update `numpy` in `build_python_common` when dropping `<2.0a0` - numpy>=1.23,<2.0a0 - pandas>=2.0,<2.2.3dev0 run_cudf: @@ -755,7 +761,7 @@ dependencies: - {matrix: null, packages: *cupy_packages_cu11} test_python_pandas_cudf: common: - - output_types: pyproject + - output_types: [requirements, pyproject] packages: # dependencies to run pandas tests # https://github.com/pandas-dev/pandas/blob/main/environment.yml @@ -766,7 +772,7 @@ dependencies: - pytest-reportlog test_python_cudf_pandas: common: - - output_types: pyproject + - output_types: [requirements, pyproject] packages: - ipython - openpyxl diff --git a/docs/cudf/source/developer_guide/documentation.md b/docs/cudf/source/developer_guide/documentation.md index c8da689479c..4f5a57fec02 100644 --- a/docs/cudf/source/developer_guide/documentation.md +++ b/docs/cudf/source/developer_guide/documentation.md @@ -164,7 +164,7 @@ The directive should be used inside docstrings like so: Docstring body .. pandas-compat:: - **$API_NAME** + :meth:`pandas.DataFrame.METHOD` Explanation of differences ``` diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst index ebf5fab3052..558268ea495 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst @@ -1,6 +1,6 @@ -======= -copying -======= +======== +datetime +======== .. 
automodule:: cudf._lib.pylibcudf.datetime :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/expressions.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/expressions.rst new file mode 100644 index 00000000000..03f769ee861 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/expressions.rst @@ -0,0 +1,6 @@ +=========== +expressions +=========== + +.. automodule:: cudf._lib.pylibcudf.expressions + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index e9dad705cbf..505765bba0f 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -15,25 +15,27 @@ This page provides API documentation for pylibcudf. concatenate copying datetime + expressions filling gpumemoryview groupby - io/index.rst interop join lists merge quantiles reduce + replace reshape rolling round scalar search - stream_compaction sorting - replace + stream_compaction table + traits + transform types unary @@ -41,4 +43,5 @@ This page provides API documentation for pylibcudf. :maxdepth: 2 :caption: Subpackages + io/index.rst strings/index.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/csv.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/csv.rst new file mode 100644 index 00000000000..5a2276f8b2d --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/csv.rst @@ -0,0 +1,6 @@ +=== +CSV +=== + +.. automodule:: cudf._lib.pylibcudf.io.csv + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst index bde6d8094ce..697bce739de 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst @@ -16,4 +16,5 @@ I/O Functions :maxdepth: 1 avro + csv json diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst new file mode 100644 index 00000000000..294ca8dc78c --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst @@ -0,0 +1,6 @@ +====== +traits +====== + +.. automodule:: cudf._lib.pylibcudf.traits + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/transform.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/transform.rst new file mode 100644 index 00000000000..ef04bbad7e6 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/transform.rst @@ -0,0 +1,6 @@ +========= +transform +========= + +.. automodule:: cudf._lib.pylibcudf.transform + :members: diff --git a/java/README.md b/java/README.md index 2d8e2190fee..0d9e060b7cd 100644 --- a/java/README.md +++ b/java/README.md @@ -51,9 +51,13 @@ CUDA 11.0: ## Build From Source Build [libcudf](../cpp) first, and make sure the JDK is installed and available. Specify -the cmake option `-DCUDF_USE_ARROW_STATIC=ON -DCUDF_ENABLE_ARROW_S3=OFF` when building so -that Apache Arrow is linked statically to libcudf, as this will help create a jar that -does not require Arrow and its dependencies to be available in the runtime environment. +the following cmake options to the libcudf build: +``` +-DCUDF_LARGE_STRINGS_DISABLED=ON -DCUDF_USE_ARROW_STATIC=ON -DCUDF_ENABLE_ARROW_S3=OFF +``` +These options: +- Disable large string support, see https://github.com/rapidsai/cudf/issues/16215 +- Statically link Arrow to libcudf to remove Arrow as a runtime dependency. 
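For reference, the same three flags are wired into the Java CI build script (`java/ci/build-in-docker.sh`) later in this patch. As a rough sketch only, a local libcudf configure step that picks them up might look like the following, where the generator, build directory, and install prefix are illustrative assumptions rather than values taken from this repository:
```
# Sketch only: the generator, build dir, and install prefix are assumptions;
# the three -DCUDF_* flags are the ones listed above.
cd cpp && mkdir -p build && cd build
cmake .. -GNinja \
      -DCMAKE_INSTALL_PREFIX=$HOME/cudf-install \
      -DCUDF_LARGE_STRINGS_DISABLED=ON \
      -DCUDF_USE_ARROW_STATIC=ON \
      -DCUDF_ENABLE_ARROW_S3=OFF
cmake --build . --target install
```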
After building libcudf, the Java bindings can be built via Maven, e.g.: ``` diff --git a/java/ci/build-in-docker.sh b/java/ci/build-in-docker.sh index 72b1742f7cb..5a429bdc739 100755 --- a/java/ci/build-in-docker.sh +++ b/java/ci/build-in-docker.sh @@ -1,7 +1,7 @@ #!/bin/bash # -# Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -58,6 +58,7 @@ cmake .. -G"${CMAKE_GENERATOR}" \ -DCMAKE_INSTALL_PREFIX=$INSTALL_PREFIX \ -DCUDA_STATIC_RUNTIME=$ENABLE_CUDA_STATIC_RUNTIME \ -DUSE_NVTX=$ENABLE_NVTX \ + -DCUDF_LARGE_STRINGS_DISABLED=ON \ -DCUDF_USE_ARROW_STATIC=ON \ -DCUDF_ENABLE_ARROW_S3=OFF \ -DBUILD_TESTS=$BUILD_CPP_TESTS \ diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 1d6a3b3304a..7136b162c13 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -3509,9 +3509,9 @@ void testCastFloatToDecimal() { @Test void testCastDoubleToDecimal() { testCastNumericToDecimalsAndBack(DType.FLOAT64, false, 0, - () -> ColumnVector.fromBoxedDoubles(1.0, 2.1, -3.23, null, 2.41281, (double) Long.MAX_VALUE), - () -> ColumnVector.fromBoxedDoubles(1.0, 2.0, -3.0, null, 2.0, (double) Long.MAX_VALUE), - new Long[]{1L, 2L, -3L, null, 2L, Long.MAX_VALUE} + () -> ColumnVector.fromBoxedDoubles(1.0, 2.1, -3.23, null, 2.41281, (double) Integer.MAX_VALUE), + () -> ColumnVector.fromBoxedDoubles(1.0, 2.0, -3.0, null, 2.0, (double) Integer.MAX_VALUE), + new Long[]{1L, 2L, -3L, null, 2L, (long) Integer.MAX_VALUE} ); testCastNumericToDecimalsAndBack(DType.FLOAT64, false, -2, () -> ColumnVector.fromBoxedDoubles(1.0, 2.1, -3.23, null, 2.41281, -55.01999), diff --git a/pyproject.toml b/pyproject.toml index 2f59864894b..e15cb7b3cdd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,69 @@ quiet-level = 3 line-length = 79 [tool.ruff.lint] -select = ["E", "F", "W", "D201", "D204", "D206", "D207", "D208", "D209", "D210", "D211", "D214", "D215", "D300", "D301", "D403", "D405", "D406", "D407", "D408", "D409", "D410", "D411", "D412", "D414", "D418", "TCH", "FA", "UP006", "UP007"] +typing-modules = ["cudf._typing"] +select = [ + # pycodestyle Error + "E", + # Pyflakes + "F", + # pycodestyle Warning + "W", + # no-blank-line-before-function + "D201", + # one-blank-line-after-class + "D204", + # indent-with-spaces + "D206", + # under-indentation + "D207", + # over-indentation + "D208", + # new-line-after-last-paragraph + "D209", + # surrounding-whitespace + "D210", + # blank-line-before-class + "D211", + # section-not-over-indented + "D214", + # section-underline-not-over-indented + "D215", + # triple-single-quotes + "D300", + # escape-sequence-in-docstring + "D301", + # first-line-capitalized + "D403", + # capitalize-section-name + "D405", + # new-line-after-section-name + "D406", + # dashed-underline-after-section + "D407", + # section-underline-after-name + "D408", + # section-underline-matches-section-length + "D409", + # no-blank-line-after-section + "D410", + # no-blank-line-before-section + "D411", + # blank-lines-between-header-and-content + "D412", + # empty-docstring-section + "D414", + # overload-with-docstring + "D418", + # flake8-type-checking + "TCH", + # flake8-future-annotations + "FA", + # non-pep585-annotation + "UP006", + # 
non-pep604-annotation + "UP007" +] ignore = [ # whitespace before : "E203", diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index e6dfe2eae62..8ce92e1c0f6 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -192,8 +192,7 @@ def convert_nulls_to_none(records, df): col for col in df.columns if df[col].dtype in pandas_dtypes_to_np_dtypes - or pd.api.types.is_datetime64_dtype(df[col].dtype) - or pd.api.types.is_timedelta64_dtype(df[col].dtype) + or df[col].dtype.kind in "mM" ] for record in records: diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 5a067e84f56..38b7e9ebe04 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -21,7 +21,6 @@ set(cython_sources copying.pyx csv.pyx datetime.pyx - expressions.pyx filling.pyx groupby.pyx hash.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 18b95f5f2e1..34c0e29d0b1 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import numpy as np from . import ( @@ -8,7 +8,6 @@ copying, csv, datetime, - expressions, filling, groupby, hash, diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 7155017b7af..e030147fdd3 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -202,11 +202,13 @@ cdef class Column: def _clear_cache(self): self._distinct_count = {} - try: - del self.memory_usage - except AttributeError: - # `self.memory_usage` was never called before, So ignore. - pass + attrs = ("memory_usage", "is_monotonic_increasing", "is_monotonic_decreasing") + for attr in attrs: + try: + delattr(self, attr) + except AttributeError: + # attr was not called yet, so ignore. + pass self._null_count = None def set_mask(self, value): diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index c706351a683..099b61d62ae 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -1,7 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from libcpp cimport bool -from libcpp.map cimport map from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move @@ -9,8 +8,12 @@ from libcpp.vector cimport vector cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types from cudf._lib.pylibcudf.io.datasource cimport Datasource, NativeFileDatasource -from cudf._lib.pylibcudf.libcudf.types cimport data_type -from cudf._lib.types cimport dtype_to_data_type +from cudf._lib.types cimport dtype_to_pylibcudf_type + +import errno +import os +from collections import abc +from io import BytesIO, StringIO import numpy as np import pandas as pd @@ -18,65 +21,24 @@ import pandas as pd import cudf from cudf.core.buffer import acquire_spill_lock -from cudf._lib.pylibcudf.libcudf.types cimport size_type - -import errno -import os -from collections import abc -from enum import IntEnum -from io import BytesIO, StringIO - -from libc.stdint cimport int32_t from libcpp cimport bool -from cudf._lib.io.utils cimport make_sink_info, make_source_info +from cudf._lib.io.utils cimport make_sink_info from cudf._lib.pylibcudf.libcudf.io.csv cimport ( - csv_reader_options, csv_writer_options, - read_csv as cpp_read_csv, write_csv as cpp_write_csv, ) from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink -from cudf._lib.pylibcudf.libcudf.io.types cimport ( - compression_type, - quote_style, - sink_info, - source_info, - table_with_metadata, -) +from cudf._lib.pylibcudf.libcudf.io.types cimport compression_type, sink_info from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport data_from_pylibcudf_io, table_view_from_table from pyarrow.lib import NativeFile +import cudf._lib.pylibcudf as plc from cudf.api.types import is_hashable -ctypedef int32_t underlying_type_t_compression - - -class Compression(IntEnum): - INFER = ( - compression_type.AUTO - ) - SNAPPY = ( - compression_type.SNAPPY - ) - GZIP = ( - compression_type.GZIP - ) - BZ2 = ( - compression_type.BZIP2 - ) - BROTLI = ( - compression_type.BROTLI - ) - ZIP = ( - compression_type.ZIP - ) - XZ = ( - compression_type.XZ - ) - +from cudf._lib.pylibcudf.types cimport DataType CSV_HEX_TYPE_MAP = { "hex": np.dtype("int64"), @@ -84,234 +46,6 @@ CSV_HEX_TYPE_MAP = { "hex32": np.dtype("int32") } -cdef csv_reader_options make_csv_reader_options( - object datasource, - object lineterminator, - object quotechar, - int quoting, - bool doublequote, - object header, - bool mangle_dupe_cols, - object usecols, - object delimiter, - bool delim_whitespace, - bool skipinitialspace, - object names, - object dtype, - int skipfooter, - int skiprows, - bool dayfirst, - object compression, - object thousands, - object decimal, - object true_values, - object false_values, - object nrows, - object byte_range, - bool skip_blank_lines, - object parse_dates, - object comment, - object na_values, - bool keep_default_na, - bool na_filter, - object prefix, - object index_col, -) except *: - cdef source_info c_source_info = make_source_info([datasource]) - cdef compression_type c_compression - cdef vector[string] c_names - cdef size_t c_byte_range_offset = ( - byte_range[0] if byte_range is not None else 0 - ) - cdef size_t c_byte_range_size = ( - byte_range[1] if byte_range is not None else 0 - ) - cdef vector[int] c_use_cols_indexes - cdef vector[string] c_use_cols_names - cdef size_type c_nrows = nrows if nrows is not None else -1 - cdef quote_style c_quoting - cdef 
vector[string] c_parse_dates_names - cdef vector[int] c_parse_dates_indexes - cdef vector[string] c_hex_col_names - cdef vector[data_type] c_dtypes_list - cdef map[string, data_type] c_dtypes_map - cdef vector[int] c_hex_col_indexes - cdef vector[string] c_true_values - cdef vector[string] c_false_values - cdef vector[string] c_na_values - - # Reader settings - if compression is None: - c_compression = compression_type.NONE - else: - compression = str(compression) - compression = Compression[compression.upper()] - c_compression = ( - compression - ) - - if quoting == 1: - c_quoting = quote_style.ALL - elif quoting == 2: - c_quoting = quote_style.NONNUMERIC - elif quoting == 3: - c_quoting = quote_style.NONE - else: - # Default value - c_quoting = quote_style.MINIMAL - - cdef csv_reader_options csv_reader_options_c = move( - csv_reader_options.builder(c_source_info) - .compression(c_compression) - .mangle_dupe_cols(mangle_dupe_cols) - .byte_range_offset(c_byte_range_offset) - .byte_range_size(c_byte_range_size) - .nrows(c_nrows) - .skiprows(skiprows) - .skipfooter(skipfooter) - .quoting(c_quoting) - .lineterminator(ord(lineterminator)) - .quotechar(ord(quotechar)) - .decimal(ord(decimal)) - .delim_whitespace(delim_whitespace) - .skipinitialspace(skipinitialspace) - .skip_blank_lines(skip_blank_lines) - .doublequote(doublequote) - .keep_default_na(keep_default_na) - .na_filter(na_filter) - .dayfirst(dayfirst) - .build() - ) - - if names is not None: - # explicitly mentioned name, so don't check header - if header is None or header == 'infer': - csv_reader_options_c.set_header(-1) - else: - csv_reader_options_c.set_header(header) - - c_names.reserve(len(names)) - for name in names: - c_names.push_back(str(name).encode()) - csv_reader_options_c.set_names(c_names) - else: - if header is None: - csv_reader_options_c.set_header(-1) - elif header == 'infer': - csv_reader_options_c.set_header(0) - else: - csv_reader_options_c.set_header(header) - - if prefix is not None: - csv_reader_options_c.set_prefix(prefix.encode()) - - if usecols is not None: - all_int = all(isinstance(col, int) for col in usecols) - if all_int: - c_use_cols_indexes.reserve(len(usecols)) - c_use_cols_indexes = usecols - csv_reader_options_c.set_use_cols_indexes(c_use_cols_indexes) - else: - c_use_cols_names.reserve(len(usecols)) - for col_name in usecols: - c_use_cols_names.push_back( - str(col_name).encode() - ) - csv_reader_options_c.set_use_cols_names(c_use_cols_names) - - if delimiter is not None: - csv_reader_options_c.set_delimiter(ord(delimiter)) - - if thousands is not None: - csv_reader_options_c.set_thousands(ord(thousands)) - - if comment is not None: - csv_reader_options_c.set_comment(ord(comment)) - - if parse_dates is not None: - if isinstance(parse_dates, abc.Mapping): - raise NotImplementedError( - "`parse_dates`: dictionaries are unsupported") - if not isinstance(parse_dates, abc.Iterable): - raise NotImplementedError( - "`parse_dates`: an iterable is required") - for col in parse_dates: - if isinstance(col, str): - c_parse_dates_names.push_back(str(col).encode()) - elif isinstance(col, int): - c_parse_dates_indexes.push_back(col) - else: - raise NotImplementedError( - "`parse_dates`: Nesting is unsupported") - csv_reader_options_c.set_parse_dates(c_parse_dates_names) - csv_reader_options_c.set_parse_dates(c_parse_dates_indexes) - - if dtype is not None: - if isinstance(dtype, abc.Mapping): - for k, v in dtype.items(): - col_type = v - if is_hashable(v) and v in CSV_HEX_TYPE_MAP: - col_type = 
CSV_HEX_TYPE_MAP[v] - c_hex_col_names.push_back(str(k).encode()) - - c_dtypes_map[str(k).encode()] = \ - _get_cudf_data_type_from_dtype( - cudf.dtype(col_type)) - csv_reader_options_c.set_dtypes(c_dtypes_map) - csv_reader_options_c.set_parse_hex(c_hex_col_names) - elif ( - cudf.api.types.is_scalar(dtype) or - isinstance(dtype, ( - np.dtype, pd.api.extensions.ExtensionDtype, type - )) - ): - c_dtypes_list.reserve(1) - if is_hashable(dtype) and dtype in CSV_HEX_TYPE_MAP: - dtype = CSV_HEX_TYPE_MAP[dtype] - c_hex_col_indexes.push_back(0) - - c_dtypes_list.push_back( - _get_cudf_data_type_from_dtype(dtype) - ) - csv_reader_options_c.set_dtypes(c_dtypes_list) - csv_reader_options_c.set_parse_hex(c_hex_col_indexes) - elif isinstance(dtype, abc.Collection): - c_dtypes_list.reserve(len(dtype)) - for index, col_dtype in enumerate(dtype): - if is_hashable(col_dtype) and col_dtype in CSV_HEX_TYPE_MAP: - col_dtype = CSV_HEX_TYPE_MAP[col_dtype] - c_hex_col_indexes.push_back(index) - - c_dtypes_list.push_back( - _get_cudf_data_type_from_dtype(col_dtype) - ) - csv_reader_options_c.set_dtypes(c_dtypes_list) - csv_reader_options_c.set_parse_hex(c_hex_col_indexes) - else: - raise ValueError( - "dtype should be a scalar/str/list-like/dict-like" - ) - - if true_values is not None: - c_true_values.reserve(len(true_values)) - for tv in true_values: - c_true_values.push_back(tv.encode()) - csv_reader_options_c.set_true_values(c_true_values) - - if false_values is not None: - c_false_values.reserve(len(false_values)) - for fv in false_values: - c_false_values.push_back(fv.encode()) - csv_reader_options_c.set_false_values(c_false_values) - - if na_values is not None: - c_na_values.reserve(len(na_values)) - for nv in na_values: - c_na_values.push_back(nv.encode()) - csv_reader_options_c.set_na_values(c_na_values) - - return csv_reader_options_c - def validate_args( object delimiter, @@ -381,7 +115,6 @@ def read_csv( bool na_filter=True, object prefix=None, object index_col=None, - **kwargs, ): """ Cython function to call into libcudf API, see `read_csv`. 
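The hunk that follows replaces the hand-built `csv_reader_options` plumbing above with a single call into pylibcudf. As a minimal sketch of that new entry point, usable outside of cudf's own `read_csv`, something like the block below should suffice; the file name and option values are illustrative assumptions, only keyword arguments that appear in the call added by this patch are used, and the remaining options are assumed to have usable defaults in pylibcudf:
```
# Minimal sketch of the pylibcudf CSV reader used by the new read_csv path.
# "example.csv" and the option values are hypothetical; the keyword names
# match the call introduced in the hunk that follows.
import cudf._lib.pylibcudf as plc

source = plc.io.SourceInfo(["example.csv"])  # list of paths or buffers
tbl_w_meta = plc.io.csv.read_csv(
    source,
    delimiter=",",
    header=0,          # read column names from the first row
    na_filter=True,    # detect NA values while parsing
)
# The returned object carries both the parsed columns and their names, which
# cudf then turns into a DataFrame via data_from_pylibcudf_io (see below).
```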
@@ -413,23 +146,120 @@ def read_csv( if delimiter is None: delimiter = sep - cdef csv_reader_options read_csv_options_c = make_csv_reader_options( - datasource, lineterminator, quotechar, quoting, doublequote, - header, mangle_dupe_cols, usecols, delimiter, delim_whitespace, - skipinitialspace, names, dtype, skipfooter, skiprows, dayfirst, - compression, thousands, decimal, true_values, false_values, nrows, - byte_range, skip_blank_lines, parse_dates, comment, na_values, - keep_default_na, na_filter, prefix, index_col) + delimiter = str(delimiter) + + if byte_range is None: + byte_range = (0, 0) + + if compression is None: + c_compression = compression_type.NONE + else: + compression_map = { + "infer": compression_type.AUTO, + "gzip": compression_type.GZIP, + "bz2": compression_type.BZIP2, + "zip": compression_type.ZIP, + } + c_compression = compression_map[compression] - cdef table_with_metadata c_result - with nogil: - c_result = move(cpp_read_csv(read_csv_options_c)) + # We need this later when setting index cols + orig_header = header + + if names is not None: + # explicitly mentioned name, so don't check header + if header is None or header == 'infer': + header = -1 + else: + header = header + names = list(names) + else: + if header is None: + header = -1 + elif header == 'infer': + header = 0 - meta_names = [info.name.decode() for info in c_result.metadata.schema_info] - df = cudf.DataFrame._from_data(*data_from_unique_ptr( - move(c_result.tbl), - column_names=meta_names - )) + hex_cols = [] + + new_dtypes = [] + if dtype is not None: + if isinstance(dtype, abc.Mapping): + new_dtypes = dict() + for k, v in dtype.items(): + col_type = v + if is_hashable(v) and v in CSV_HEX_TYPE_MAP: + col_type = CSV_HEX_TYPE_MAP[v] + hex_cols.append(str(k)) + + new_dtypes[k] = _get_plc_data_type_from_dtype( + cudf.dtype(col_type) + ) + elif ( + cudf.api.types.is_scalar(dtype) or + isinstance(dtype, ( + np.dtype, pd.api.extensions.ExtensionDtype, type + )) + ): + if is_hashable(dtype) and dtype in CSV_HEX_TYPE_MAP: + dtype = CSV_HEX_TYPE_MAP[dtype] + hex_cols.append(0) + + new_dtypes.append( + _get_plc_data_type_from_dtype(dtype) + ) + elif isinstance(dtype, abc.Collection): + for index, col_dtype in enumerate(dtype): + if is_hashable(col_dtype) and col_dtype in CSV_HEX_TYPE_MAP: + col_dtype = CSV_HEX_TYPE_MAP[col_dtype] + hex_cols.append(index) + + new_dtypes.append( + _get_plc_data_type_from_dtype(col_dtype) + ) + else: + raise ValueError( + "dtype should be a scalar/str/list-like/dict-like" + ) + + lineterminator = str(lineterminator) + + df = cudf.DataFrame._from_data( + *data_from_pylibcudf_io( + plc.io.csv.read_csv( + plc.io.SourceInfo([datasource]), + lineterminator=lineterminator, + quotechar = quotechar, + quoting = quoting, + doublequote = doublequote, + header = header, + mangle_dupe_cols = mangle_dupe_cols, + usecols = usecols, + delimiter = delimiter, + delim_whitespace = delim_whitespace, + skipinitialspace = skipinitialspace, + col_names = names, + dtypes = new_dtypes, + skipfooter = skipfooter, + skiprows = skiprows, + dayfirst = dayfirst, + compression = c_compression, + thousands = thousands, + decimal = decimal, + true_values = true_values, + false_values = false_values, + nrows = nrows if nrows is not None else -1, + byte_range_offset = byte_range[0], + byte_range_size = byte_range[1], + skip_blank_lines = skip_blank_lines, + parse_dates = parse_dates, + parse_hex = hex_cols, + comment = comment, + na_values = na_values, + keep_default_na = keep_default_na, + na_filter = na_filter, 
+ prefix = prefix, + ) + ) + ) if dtype is not None: if isinstance(dtype, abc.Mapping): @@ -450,7 +280,7 @@ def read_csv( col_name = df._data.names[index] df._data[col_name] = df._data[col_name].astype(col_dtype) - if names is not None and isinstance(names[0], (int)): + if names is not None and len(names) and isinstance(names[0], (int)): df.columns = [int(x) for x in df._data] # Set index if the index_col parameter is passed @@ -459,7 +289,7 @@ def read_csv( index_col_name = df._data.select_by_index(index_col).names[0] df = df.set_index(index_col_name) if isinstance(index_col_name, str) and \ - names is None and header in ("infer",): + names is None and orig_header == "infer": if index_col_name.startswith("Unnamed:"): # TODO: Try to upstream it to libcudf # csv reader in future @@ -550,7 +380,7 @@ def write_csv( ) -cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *: +cdef DataType _get_plc_data_type_from_dtype(object dtype) except *: # TODO: Remove this work-around Dictionary types # in libcudf are fully mapped to categorical columns: # https://github.com/rapidsai/cudf/issues/3960 @@ -561,36 +391,36 @@ cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *: if isinstance(dtype, str): if str(dtype) == "date32": - return libcudf_types.data_type( + return DataType( libcudf_types.type_id.TIMESTAMP_DAYS ) elif str(dtype) in ("date", "date64"): - return libcudf_types.data_type( + return DataType( libcudf_types.type_id.TIMESTAMP_MILLISECONDS ) elif str(dtype) == "timestamp": - return libcudf_types.data_type( + return DataType( libcudf_types.type_id.TIMESTAMP_MILLISECONDS ) elif str(dtype) == "timestamp[us]": - return libcudf_types.data_type( + return DataType( libcudf_types.type_id.TIMESTAMP_MICROSECONDS ) elif str(dtype) == "timestamp[s]": - return libcudf_types.data_type( + return DataType( libcudf_types.type_id.TIMESTAMP_SECONDS ) elif str(dtype) == "timestamp[ms]": - return libcudf_types.data_type( + return DataType( libcudf_types.type_id.TIMESTAMP_MILLISECONDS ) elif str(dtype) == "timestamp[ns]": - return libcudf_types.data_type( + return DataType( libcudf_types.type_id.TIMESTAMP_NANOSECONDS ) dtype = cudf.dtype(dtype) - return dtype_to_data_type(dtype) + return dtype_to_pylibcudf_type(dtype) def columns_apply_na_rep(column_names, na_rep): diff --git a/python/cudf/cudf/_lib/expressions.pyx b/python/cudf/cudf/_lib/expressions.pyx deleted file mode 100644 index 3fb29279ed7..00000000000 --- a/python/cudf/cudf/_lib/expressions.pyx +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. - -from enum import Enum - -import numpy as np - -from cython.operator cimport dereference -from libc.stdint cimport int64_t -from libcpp.memory cimport make_unique, unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - -from cudf._lib.pylibcudf.libcudf cimport expressions as libcudf_exp -from cudf._lib.pylibcudf.libcudf.types cimport size_type -from cudf._lib.pylibcudf.libcudf.wrappers.timestamps cimport ( - timestamp_ms, - timestamp_us, -) - -# Necessary for proper casting, see below. 
-ctypedef int32_t underlying_type_ast_operator - - -# Aliases for simplicity -ctypedef unique_ptr[libcudf_exp.expression] expression_ptr - - -class ASTOperator(Enum): - ADD = libcudf_exp.ast_operator.ADD - SUB = libcudf_exp.ast_operator.SUB - MUL = libcudf_exp.ast_operator.MUL - DIV = libcudf_exp.ast_operator.DIV - TRUE_DIV = libcudf_exp.ast_operator.TRUE_DIV - FLOOR_DIV = libcudf_exp.ast_operator.FLOOR_DIV - MOD = libcudf_exp.ast_operator.MOD - PYMOD = libcudf_exp.ast_operator.PYMOD - POW = libcudf_exp.ast_operator.POW - EQUAL = libcudf_exp.ast_operator.EQUAL - NULL_EQUAL = libcudf_exp.ast_operator.NULL_EQUAL - NOT_EQUAL = libcudf_exp.ast_operator.NOT_EQUAL - LESS = libcudf_exp.ast_operator.LESS - GREATER = libcudf_exp.ast_operator.GREATER - LESS_EQUAL = libcudf_exp.ast_operator.LESS_EQUAL - GREATER_EQUAL = libcudf_exp.ast_operator.GREATER_EQUAL - BITWISE_AND = libcudf_exp.ast_operator.BITWISE_AND - BITWISE_OR = libcudf_exp.ast_operator.BITWISE_OR - BITWISE_XOR = libcudf_exp.ast_operator.BITWISE_XOR - LOGICAL_AND = libcudf_exp.ast_operator.LOGICAL_AND - NULL_LOGICAL_AND = libcudf_exp.ast_operator.NULL_LOGICAL_AND - LOGICAL_OR = libcudf_exp.ast_operator.LOGICAL_OR - NULL_LOGICAL_OR = libcudf_exp.ast_operator.NULL_LOGICAL_OR - # Unary operators - IDENTITY = libcudf_exp.ast_operator.IDENTITY - IS_NULL = libcudf_exp.ast_operator.IS_NULL - SIN = libcudf_exp.ast_operator.SIN - COS = libcudf_exp.ast_operator.COS - TAN = libcudf_exp.ast_operator.TAN - ARCSIN = libcudf_exp.ast_operator.ARCSIN - ARCCOS = libcudf_exp.ast_operator.ARCCOS - ARCTAN = libcudf_exp.ast_operator.ARCTAN - SINH = libcudf_exp.ast_operator.SINH - COSH = libcudf_exp.ast_operator.COSH - TANH = libcudf_exp.ast_operator.TANH - ARCSINH = libcudf_exp.ast_operator.ARCSINH - ARCCOSH = libcudf_exp.ast_operator.ARCCOSH - ARCTANH = libcudf_exp.ast_operator.ARCTANH - EXP = libcudf_exp.ast_operator.EXP - LOG = libcudf_exp.ast_operator.LOG - SQRT = libcudf_exp.ast_operator.SQRT - CBRT = libcudf_exp.ast_operator.CBRT - CEIL = libcudf_exp.ast_operator.CEIL - FLOOR = libcudf_exp.ast_operator.FLOOR - ABS = libcudf_exp.ast_operator.ABS - RINT = libcudf_exp.ast_operator.RINT - BIT_INVERT = libcudf_exp.ast_operator.BIT_INVERT - NOT = libcudf_exp.ast_operator.NOT - - -class TableReference(Enum): - LEFT = libcudf_exp.table_reference.LEFT - RIGHT = libcudf_exp.table_reference.RIGHT - - -# Note that this function only currently supports numeric literals. libcudf -# expressions don't really support other types yet though, so this isn't -# restrictive at the moment. 
-cdef class Literal(Expression): - def __cinit__(self, value): - if isinstance(value, int): - self.c_scalar.reset(new numeric_scalar[int64_t](value, True)) - self.c_obj = move(make_unique[libcudf_exp.literal]( - dereference(self.c_scalar) - )) - elif isinstance(value, float): - self.c_scalar.reset(new numeric_scalar[double](value, True)) - self.c_obj = move(make_unique[libcudf_exp.literal]( - dereference(self.c_scalar) - )) - elif isinstance(value, str): - self.c_scalar.reset(new string_scalar(value.encode(), True)) - self.c_obj = move(make_unique[libcudf_exp.literal]( - dereference(self.c_scalar) - )) - elif isinstance(value, np.datetime64): - scale, _ = np.datetime_data(value.dtype) - int_value = value.astype(np.int64) - if scale == "ms": - self.c_scalar.reset(new timestamp_scalar[timestamp_ms]( - int_value, True) - ) - self.c_obj = move(make_unique[libcudf_exp.literal]( - dereference(self.c_scalar) - )) - elif scale == "us": - self.c_scalar.reset(new timestamp_scalar[timestamp_us]( - int_value, True) - ) - self.c_obj = move(make_unique[libcudf_exp.literal]( - dereference(self.c_scalar) - )) - else: - raise NotImplementedError( - f"Unhandled datetime scale {scale=}" - ) - else: - raise NotImplementedError( - f"Don't know how to make literal with type {type(value)}" - ) - - -cdef class ColumnReference(Expression): - def __cinit__(self, size_type index): - self.c_obj = move(make_unique[libcudf_exp.column_reference]( - index - )) - - -cdef class Operation(Expression): - def __cinit__(self, op, Expression left, Expression right=None): - cdef libcudf_exp.ast_operator op_value = ( - op.value - ) - - if right is None: - self.c_obj = move(make_unique[libcudf_exp.operation]( - op_value, dereference(left.c_obj) - )) - else: - self.c_obj = move(make_unique[libcudf_exp.operation]( - op_value, dereference(left.c_obj), dereference(right.c_obj) - )) - -cdef class ColumnNameReference(Expression): - def __cinit__(self, string name): - self.c_obj = \ - move(make_unique[libcudf_exp.column_name_reference](name)) diff --git a/python/cudf/cudf/_lib/io/utils.pxd b/python/cudf/cudf/_lib/io/utils.pxd index 252d986843a..680a87c789e 100644 --- a/python/cudf/cudf/_lib/io/utils.pxd +++ b/python/cudf/cudf/_lib/io/utils.pxd @@ -16,6 +16,10 @@ cdef source_info make_source_info(list src) except* cdef sink_info make_sinks_info( list src, vector[unique_ptr[data_sink]] & data) except* cdef sink_info make_sink_info(src, unique_ptr[data_sink] & data) except* +cdef add_df_col_struct_names( + df, + child_names_dict +) cdef update_struct_field_names( table, vector[column_name_info]& schema_info) diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx index 1d7c56888d9..58956b9e9b7 100644 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ b/python/cudf/cudf/_lib/io/utils.pyx @@ -147,10 +147,37 @@ cdef cppclass iobase_data_sink(data_sink): return buf.tell() +cdef add_df_col_struct_names(df, child_names_dict): + for name, child_names in child_names_dict.items(): + col = df._data[name] + + df._data[name] = update_col_struct_field_names(col, child_names) + + +cdef update_col_struct_field_names(Column col, child_names): + if col.children: + children = list(col.children) + for i, (child, names) in enumerate(zip(children, child_names.values())): + children[i] = update_col_struct_field_names( + child, + names + ) + col.set_base_children(tuple(children)) + + if isinstance(col.dtype, StructDtype): + col = col._rename_fields( + child_names.keys() + ) + + return col + + cdef update_struct_field_names( table, 
vector[column_name_info]& schema_info ): + # Deprecated, remove in favor of add_col_struct_names + # when a reader is ported to pylibcudf for i, (name, col) in enumerate(table._data.items()): table._data[name] = update_column_struct_field_names( col, schema_info[i] diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 22e34feb547..03bf9ed8b75 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -8,26 +8,17 @@ import cudf from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -from libcpp.map cimport map -from libcpp.string cimport string -from libcpp.utility cimport move -from libcpp.vector cimport vector cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types -from cudf._lib.io.utils cimport make_source_info, update_struct_field_names -from cudf._lib.pylibcudf.libcudf.io.json cimport ( - json_reader_options, - json_recovery_mode_t, - read_json as libcudf_read_json, - schema_element, -) -from cudf._lib.pylibcudf.libcudf.io.types cimport ( - compression_type, - table_with_metadata, -) -from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type +from cudf._lib.column cimport Column +from cudf._lib.io.utils cimport add_df_col_struct_names +from cudf._lib.pylibcudf.io.types cimport compression_type +from cudf._lib.pylibcudf.libcudf.io.json cimport json_recovery_mode_t +from cudf._lib.pylibcudf.libcudf.io.types cimport compression_type +from cudf._lib.pylibcudf.libcudf.types cimport data_type, type_id +from cudf._lib.pylibcudf.types cimport DataType from cudf._lib.types cimport dtype_to_data_type -from cudf._lib.utils cimport data_from_unique_ptr +from cudf._lib.utils cimport _data_from_columns, data_from_pylibcudf_io import cudf._lib.pylibcudf as plc @@ -62,6 +53,7 @@ cpdef read_json(object filepaths_or_buffers, # If input data is a JSON string (or StringIO), hold a reference to # the encoded memoryview externally to ensure the encoded buffer # isn't destroyed before calling libcudf `read_json()` + for idx in range(len(filepaths_or_buffers)): if isinstance(filepaths_or_buffers[idx], io.StringIO): filepaths_or_buffers[idx] = \ @@ -71,17 +63,7 @@ cpdef read_json(object filepaths_or_buffers, filepaths_or_buffers[idx] = filepaths_or_buffers[idx].encode() # Setup arguments - cdef vector[data_type] c_dtypes_list - cdef map[string, schema_element] c_dtypes_schema_map cdef cudf_io_types.compression_type c_compression - # Determine byte read offsets if applicable - cdef size_type c_range_offset = ( - byte_range[0] if byte_range is not None else 0 - ) - cdef size_type c_range_size = ( - byte_range[1] if byte_range is not None else 0 - ) - cdef bool c_lines = lines if compression is not None: if compression == 'gzip': @@ -94,57 +76,71 @@ cpdef read_json(object filepaths_or_buffers, c_compression = cudf_io_types.compression_type.AUTO else: c_compression = cudf_io_types.compression_type.NONE - is_list_like_dtypes = False + + processed_dtypes = None + if dtype is False: raise ValueError("False value is unsupported for `dtype`") elif dtype is not True: + processed_dtypes = [] if isinstance(dtype, abc.Mapping): for k, v in dtype.items(): - c_dtypes_schema_map[str(k).encode()] = \ - _get_cudf_schema_element_from_dtype(v) + # Make sure keys are string + k = str(k) + lib_type, child_types = _get_cudf_schema_element_from_dtype(v) + processed_dtypes.append((k, lib_type, child_types)) elif isinstance(dtype, abc.Collection): - is_list_like_dtypes = True - c_dtypes_list.reserve(len(dtype)) for col_dtype in dtype: - 
c_dtypes_list.push_back( - _get_cudf_data_type_from_dtype( - col_dtype)) + processed_dtypes.append( + # Ignore child columns since we cannot specify their dtypes + # when passing a list + _get_cudf_schema_element_from_dtype(col_dtype)[0] + ) else: raise TypeError("`dtype` must be 'list like' or 'dict'") - cdef json_reader_options opts = move( - json_reader_options.builder(make_source_info(filepaths_or_buffers)) - .compression(c_compression) - .lines(c_lines) - .byte_range_offset(c_range_offset) - .byte_range_size(c_range_size) - .recovery_mode(_get_json_recovery_mode(on_bad_lines)) - .build() - ) - if is_list_like_dtypes: - opts.set_dtypes(c_dtypes_list) + if cudf.get_option("io.json.low_memory") and lines: + res_cols, res_col_names, res_child_names = plc.io.json.chunked_read_json( + plc.io.SourceInfo(filepaths_or_buffers), + processed_dtypes, + c_compression, + keep_quotes = keep_quotes, + mixed_types_as_string = mixed_types_as_string, + prune_columns = prune_columns, + recovery_mode = _get_json_recovery_mode(on_bad_lines) + ) + df = cudf.DataFrame._from_data( + *_data_from_columns( + columns=[Column.from_pylibcudf(plc) for plc in res_cols], + column_names=res_col_names, + index_names=None + ) + ) + add_df_col_struct_names(df, res_child_names) + return df else: - opts.set_dtypes(c_dtypes_schema_map) - - opts.enable_keep_quotes(keep_quotes) - opts.enable_mixed_types_as_string(mixed_types_as_string) - opts.enable_prune_columns(prune_columns) - - # Read JSON - cdef cudf_io_types.table_with_metadata c_result - - with nogil: - c_result = move(libcudf_read_json(opts)) - - meta_names = [info.name.decode() for info in c_result.metadata.schema_info] - df = cudf.DataFrame._from_data(*data_from_unique_ptr( - move(c_result.tbl), - column_names=meta_names - )) + table_w_meta = plc.io.json.read_json( + plc.io.SourceInfo(filepaths_or_buffers), + processed_dtypes, + c_compression, + lines, + byte_range_offset = byte_range[0] if byte_range is not None else 0, + byte_range_size = byte_range[1] if byte_range is not None else 0, + keep_quotes = keep_quotes, + mixed_types_as_string = mixed_types_as_string, + prune_columns = prune_columns, + recovery_mode = _get_json_recovery_mode(on_bad_lines) + ) - update_struct_field_names(df, c_result.metadata.schema_info) + df = cudf.DataFrame._from_data( + *data_from_pylibcudf_io( + table_w_meta + ) + ) - return df + # Post-processing to add in struct column names + add_df_col_struct_names(df, table_w_meta.child_names) + return df @acquire_spill_lock() @@ -192,28 +188,32 @@ def write_json( ) -cdef schema_element _get_cudf_schema_element_from_dtype(object dtype) except *: - cdef schema_element s_element - cdef data_type lib_type +cdef _get_cudf_schema_element_from_dtype(object dtype) except *: dtype = cudf.dtype(dtype) if isinstance(dtype, cudf.CategoricalDtype): raise NotImplementedError( "CategoricalDtype as dtype is not yet " "supported in JSON reader" ) - lib_type = dtype_to_data_type(dtype) - s_element.type = lib_type + + lib_type = DataType.from_libcudf(dtype_to_data_type(dtype)) + child_types = [] + if isinstance(dtype, cudf.StructDtype): for name, child_type in dtype.fields.items(): - s_element.child_types[name.encode()] = \ + child_lib_type, grandchild_types = \ _get_cudf_schema_element_from_dtype(child_type) + child_types.append((name, child_lib_type, grandchild_types)) elif isinstance(dtype, cudf.ListDtype): - s_element.child_types["offsets".encode()] = \ - _get_cudf_schema_element_from_dtype(cudf.dtype("int32")) - s_element.child_types["element".encode()] = \ 
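Editorial note: with the hunk above, the classic cudf reader dispatches to `plc.io.json.chunked_read_json` when the `io.json.low_memory` option is set and `lines=True`. A minimal sketch from the public API (the option name comes from the code above; the input file name is hypothetical):

```python
import cudf

cudf.set_option("io.json.low_memory", True)  # routes read_json through chunked_read_json
df = cudf.read_json("records.jsonl", lines=True)  # hypothetical JSON-lines file
```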
+ child_lib_type, grandchild_types = \ _get_cudf_schema_element_from_dtype(dtype.element_type) - return s_element + child_types = [ + ("offsets", DataType.from_libcudf(data_type(type_id.INT32)), []), + ("element", child_lib_type, grandchild_types) + ] + + return lib_type, child_types cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *: diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 0ad09dba717..76f37c3b845 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -8,11 +8,6 @@ from libcpp.utility cimport move from cudf._lib.column cimport Column from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view -from cudf._lib.pylibcudf.libcudf.lists.count_elements cimport ( - count_elements as cpp_count_elements, -) -from cudf._lib.pylibcudf.libcudf.lists.extract cimport extract_list_element from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( lists_column_view, ) @@ -38,19 +33,10 @@ from cudf._lib.pylibcudf cimport Scalar @acquire_spill_lock() def count_elements(Column col): - - # shared_ptr required because lists_column_view has no default - # ctor - cdef shared_ptr[lists_column_view] list_view = ( - make_shared[lists_column_view](col.view()) + return Column.from_pylibcudf( + pylibcudf.lists.count_elements( + col.to_pylibcudf(mode="read")) ) - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_count_elements(list_view.get()[0])) - - result = Column.from_unique_ptr(move(c_result)) - return result @acquire_spill_lock() @@ -116,37 +102,23 @@ def sort_lists(Column col, bool ascending, str na_position): @acquire_spill_lock() def extract_element_scalar(Column col, size_type index): - # shared_ptr required because lists_column_view has no default - # ctor - cdef shared_ptr[lists_column_view] list_view = ( - make_shared[lists_column_view](col.view()) + return Column.from_pylibcudf( + pylibcudf.lists.extract_list_element( + col.to_pylibcudf(mode="read"), + index, + ) ) - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(extract_list_element(list_view.get()[0], index)) - - result = Column.from_unique_ptr(move(c_result)) - return result - @acquire_spill_lock() def extract_element_column(Column col, Column index): - cdef shared_ptr[lists_column_view] list_view = ( - make_shared[lists_column_view](col.view()) + return Column.from_pylibcudf( + pylibcudf.lists.extract_list_element( + col.to_pylibcudf(mode="read"), + index.to_pylibcudf(mode="read"), + ) ) - cdef column_view index_view = index.view() - - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(extract_list_element(list_view.get()[0], index_view)) - - result = Column.from_unique_ptr(move(c_result)) - return result - @acquire_spill_lock() def contains_scalar(Column col, py_search_key): diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index d1ec5be9e62..e7959d21e01 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -37,12 +37,12 @@ cimport cudf._lib.pylibcudf.libcudf.io.data_sink as cudf_io_data_sink cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types cimport cudf._lib.pylibcudf.libcudf.types as cudf_types from cudf._lib.column cimport Column -from cudf._lib.expressions cimport Expression from cudf._lib.io.utils cimport ( make_sinks_info, make_source_info, update_struct_field_names, ) +from cudf._lib.pylibcudf.expressions cimport Expression from 
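Editorial note: the lists routines above now round-trip through pylibcudf. A minimal sketch of the underlying calls, assuming `plc.interop.from_arrow` accepts a pyarrow list array:

```python
import pyarrow as pa
import cudf._lib.pylibcudf as plc

col = plc.interop.from_arrow(pa.array([[1, 2, 3], [4], []]))  # assumed interop path
lengths = plc.lists.count_elements(col)          # 3, 1, 0
firsts = plc.lists.extract_list_element(col, 0)  # 1, 4, <null> (out of bounds -> null)
```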
cudf._lib.pylibcudf.io.datasource cimport NativeFileDatasource from cudf._lib.pylibcudf.libcudf.expressions cimport expression from cudf._lib.pylibcudf.libcudf.io.parquet cimport ( @@ -440,6 +440,7 @@ def write_parquet( object column_encoding=None, object column_type_length=None, object output_as_binary=None, + write_arrow_schema=False, ): """ Cython function to call into libcudf API, see `write_parquet`. @@ -544,6 +545,7 @@ def write_parquet( .write_v2_headers(header_version == "2.0") .dictionary_policy(dict_policy) .utc_timestamps(False) + .write_arrow_schema(write_arrow_schema) .build() ) if partitions_info is not None: @@ -623,6 +625,9 @@ cdef class ParquetWriter: If ``True``, enable dictionary encoding for Parquet page data subject to ``max_dictionary_size`` constraints. If ``False``, disable dictionary encoding for Parquet page data. + store_schema : bool, default False + If ``True``, enable computing and writing arrow schema to Parquet + file footer's key-value metadata section for faithful round-tripping. See Also -------- cudf.io.parquet.write_parquet @@ -641,6 +646,7 @@ cdef class ParquetWriter: cdef size_type max_page_size_rows cdef size_t max_dictionary_size cdef cudf_io_types.dictionary_policy dict_policy + cdef bool write_arrow_schema def __cinit__(self, object filepath_or_buffer, object index=None, object compression="snappy", str statistics="ROWGROUP", @@ -649,7 +655,8 @@ cdef class ParquetWriter: int max_page_size_bytes=524288, int max_page_size_rows=20000, int max_dictionary_size=1048576, - bool use_dictionary=True): + bool use_dictionary=True, + bool store_schema=False): filepaths_or_buffers = ( list(filepath_or_buffer) if is_list_like(filepath_or_buffer) @@ -670,6 +677,7 @@ cdef class ParquetWriter: if use_dictionary else cudf_io_types.dictionary_policy.NEVER ) + self.write_arrow_schema = store_schema def write_table(self, table, object partitions_info=None): """ Writes a single table to the file """ @@ -788,6 +796,7 @@ cdef class ParquetWriter: .max_page_size_bytes(self.max_page_size_bytes) .max_page_size_rows(self.max_page_size_rows) .max_dictionary_size(self.max_dictionary_size) + .write_arrow_schema(self.write_arrow_schema) .build() ) args.set_dictionary_policy(self.dict_policy) diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 0a198f431a7..0800fa18e94 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -20,6 +20,7 @@ set(cython_sources concatenate.pyx copying.pyx datetime.pyx + expressions.pyx filling.pyx gpumemoryview.pyx groupby.pyx @@ -38,6 +39,8 @@ set(cython_sources stream_compaction.pyx sorting.pyx table.pyx + traits.pyx + transform.pyx types.pyx unary.pyx utils.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index 5131df9a5cd..26e89b818d3 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -8,6 +8,7 @@ from . cimport ( concatenate, copying, datetime, + expressions, filling, groupby, join, @@ -23,6 +24,8 @@ from . 
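Editorial note: a minimal sketch of the new `store_schema` flag on the Cython `ParquetWriter` patched above (output path and data are hypothetical; `close()` is assumed to behave as in the existing writer API):

```python
import cudf
from cudf._lib.parquet import ParquetWriter

df = cudf.DataFrame({"a": [1, 2, 3]})
writer = ParquetWriter("out.parquet", store_schema=True)  # embed the arrow schema in the footer
writer.write_table(df)
writer.close()
```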
cimport ( sorting, stream_compaction, strings, + traits, + transform, types, unary, ) @@ -54,12 +57,15 @@ __all__ = [ "quantiles", "reduce", "replace", + "reshape", "rolling", "round", "search", "stream_compaction", "strings", "sorting", + "traits", + "transform", "types", "unary", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 43a9e2aca31..e89a5ed9f96 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -7,6 +7,7 @@ concatenate, copying, datetime, + expressions, filling, groupby, interop, @@ -23,6 +24,8 @@ sorting, stream_compaction, strings, + traits, + transform, types, unary, ) @@ -35,6 +38,7 @@ __all__ = [ "Column", "DataType", + "MaskState", "Scalar", "Table", "TypeId", @@ -54,12 +58,15 @@ "quantiles", "reduce", "replace", + "reshape", "rolling", "round", "search", "stream_compaction", "strings", "sorting", + "traits", + "transform", "types", "unary", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/binaryop.pxd b/python/cudf/cudf/_lib/pylibcudf/binaryop.pxd index 9a8c8e49dcf..2411e28ac66 100644 --- a/python/cudf/cudf/_lib/pylibcudf/binaryop.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/binaryop.pxd @@ -1,5 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp cimport bool + from cudf._lib.pylibcudf.libcudf.binaryop cimport binary_operator from .column cimport Column @@ -22,3 +24,10 @@ cpdef Column binary_operation( binary_operator op, DataType output_type ) + +cpdef bool is_supported_operation( + DataType out, + DataType lhs, + DataType rhs, + binary_operator op +) diff --git a/python/cudf/cudf/_lib/pylibcudf/binaryop.pyx b/python/cudf/cudf/_lib/pylibcudf/binaryop.pyx index c1d669c3c1c..44d9f4ad04a 100644 --- a/python/cudf/cudf/_lib/pylibcudf/binaryop.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/binaryop.pyx @@ -2,6 +2,7 @@ from cython.operator import dereference +from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -84,3 +85,37 @@ cpdef Column binary_operation( raise ValueError(f"Invalid arguments {lhs} and {rhs}") return Column.from_libcudf(move(result)) + + +cpdef bool is_supported_operation( + DataType out, + DataType lhs, + DataType rhs, + binary_operator op +): + """Check if an operation is supported for the given data types. + + For details, see :cpp:func::is_supported_operation`. + + Parameters + ---------- + out : DataType + The output data type. + lhs : DataType + The left hand side data type. + rhs : DataType + The right hand side data type. + op : BinaryOperator + The operation to check. 
+ Returns + ------- + bool + True if the operation is supported, False otherwise + """ + + return cpp_binaryop.is_supported_operation( + out.c_obj, + lhs.c_obj, + rhs.c_obj, + op + ) diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pxd b/python/cudf/cudf/_lib/pylibcudf/column.pxd index d13791d95cf..13ee0a70681 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/column.pxd @@ -50,6 +50,7 @@ cdef class Column: cpdef gpumemoryview null_mask(self) cpdef list children(self) cpdef Column copy(self) + cpdef Column with_mask(self, gpumemoryview, size_type) cpdef ListColumnView list_view(self) diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx index e0cf8b7ee32..cb96c1d9fce 100644 --- a/python/cudf/cudf/_lib/pylibcudf/column.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/column.pyx @@ -175,6 +175,32 @@ cdef class Column: children, ) + cpdef Column with_mask(self, gpumemoryview mask, size_type null_count): + """Augment this column with a new null mask. + + Parameters + ---------- + mask : gpumemoryview + New mask (or None to unset the mask) + null_count : int + New null count. If this is incorrect, bad things happen. + + Returns + ------- + New Column object sharing data with self (except for the mask which is new). + """ + if mask is None and null_count > 0: + raise ValueError("Empty mask must have null count of zero") + return Column( + self._data_type, + self._size, + self._data, + mask, + null_count, + self._offset, + self._children, + ) + @staticmethod cdef Column from_column_view(const column_view& cv, Column owner): """Create a Column from a libcudf column_view. @@ -250,7 +276,7 @@ cdef class Column: column is in use. """ data = gpumemoryview(obj) - iface = data.__cuda_array_interface__() + iface = data.__cuda_array_interface__ if iface.get('mask') is not None: raise ValueError("mask not yet supported.") @@ -400,8 +426,8 @@ def is_c_contiguous( itemsize : int Size of an element in bytes. - Return - ------ + Returns + ------- bool The boolean answer. """ diff --git a/python/cudf/cudf/_lib/expressions.pxd b/python/cudf/cudf/_lib/pylibcudf/expressions.pxd similarity index 50% rename from python/cudf/cudf/_lib/expressions.pxd rename to python/cudf/cudf/_lib/pylibcudf/expressions.pxd index 4a20c5fc545..64825b89d9f 100644 --- a/python/cudf/cudf/_lib/expressions.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/expressions.pxd @@ -1,36 +1,31 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. - -from libc.stdint cimport int32_t, int64_t +# Copyright (c) 2024, NVIDIA CORPORATION. 
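Editorial note: a minimal sketch of the `is_supported_operation` binding added above; `BinaryOperator` is assumed to be the Python-level re-export of `binary_operator`:

```python
import cudf._lib.pylibcudf as plc

ok = plc.binaryop.is_supported_operation(
    plc.DataType(plc.TypeId.FLOAT64),  # output type
    plc.DataType(plc.TypeId.INT32),    # lhs type
    plc.DataType(plc.TypeId.FLOAT64),  # rhs type
    plc.binaryop.BinaryOperator.ADD,   # assumed enum name
)
```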
from libcpp.memory cimport unique_ptr +from libcpp.string cimport string from cudf._lib.pylibcudf.libcudf.expressions cimport ( - column_reference, + ast_operator, expression, - literal, - operation, -) -from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport ( - numeric_scalar, - scalar, - string_scalar, - timestamp_scalar, + table_reference, ) +from .scalar cimport Scalar + cdef class Expression: cdef unique_ptr[expression] c_obj - cdef class Literal(Expression): - cdef unique_ptr[scalar] c_scalar - + # Hold on to input scalar so it doesn't get gc'ed + cdef Scalar scalar cdef class ColumnReference(Expression): pass - cdef class Operation(Expression): - pass + # Hold on to the input expressions so + # they don't get gc'ed + cdef Expression right + cdef Expression left cdef class ColumnNameReference(Expression): pass diff --git a/python/cudf/cudf/_lib/pylibcudf/expressions.pyx b/python/cudf/cudf/_lib/pylibcudf/expressions.pyx new file mode 100644 index 00000000000..38de11406ad --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/expressions.pyx @@ -0,0 +1,195 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from cudf._lib.pylibcudf.libcudf.expressions import \ + ast_operator as ASTOperator # no-cython-lint +from cudf._lib.pylibcudf.libcudf.expressions import \ + table_reference as TableReference # no-cython-lint + +from cython.operator cimport dereference +from libc.stdint cimport int32_t, int64_t +from libcpp.memory cimport make_unique, unique_ptr +from libcpp.string cimport string +from libcpp.utility cimport move + +from cudf._lib.pylibcudf.libcudf cimport expressions as libcudf_exp +from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport ( + duration_scalar, + numeric_scalar, + string_scalar, + timestamp_scalar, +) +from cudf._lib.pylibcudf.libcudf.types cimport size_type, type_id +from cudf._lib.pylibcudf.libcudf.wrappers.durations cimport ( + duration_ms, + duration_ns, + duration_s, + duration_us, +) +from cudf._lib.pylibcudf.libcudf.wrappers.timestamps cimport ( + timestamp_ms, + timestamp_ns, + timestamp_s, + timestamp_us, +) + +from .scalar cimport Scalar +from .traits cimport is_chrono, is_numeric +from .types cimport DataType + +# Aliases for simplicity +ctypedef unique_ptr[libcudf_exp.expression] expression_ptr + +cdef class Literal(Expression): + """ + A literal value used in an abstract syntax tree. + + For details, see :cpp:class:`cudf::ast::literal`. + + Parameters + ---------- + value : Scalar + The Scalar value of the Literal. + Must be either numeric, string, or a timestamp/duration scalar. 
+ """ + def __cinit__(self, Scalar value): + self.scalar = value + cdef DataType typ = value.type() + cdef type_id tid = value.type().id() + if not (is_numeric(typ) or is_chrono(typ) or tid == type_id.STRING): + raise ValueError( + "Only numeric, string, or timestamp/duration scalars are accepted" + ) + # TODO: Accept type-erased scalar in AST C++ code + # Then a lot of this code can be deleted + if tid == type_id.INT64: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.INT32: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.FLOAT64: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.FLOAT32: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.STRING: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.TIMESTAMP_NANOSECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.TIMESTAMP_MICROSECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.TIMESTAMP_MILLISECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.TIMESTAMP_MILLISECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.TIMESTAMP_SECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.DURATION_NANOSECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.DURATION_MICROSECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.DURATION_MILLISECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.DURATION_MILLISECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + elif tid == type_id.DURATION_SECONDS: + self.c_obj = move(make_unique[libcudf_exp.literal]( + dereference(self.scalar.c_obj) + )) + else: + raise NotImplementedError( + f"Don't know how to make literal with type id {tid}" + ) + +cdef class ColumnReference(Expression): + """ + An expression referring to data from a column in a table. + + For details, see :cpp:class:`cudf::ast::column_reference`. + + Parameters + ---------- + index : size_type + The index of this column in the table + (provided when the expression is evaluated). + table_source : TableReference, default TableReferenece.LEFT + Which table to use in cases with two tables (e.g. joins) + """ + def __cinit__( + self, + size_type index, + table_reference table_source=table_reference.LEFT + ): + self.c_obj = move(make_unique[libcudf_exp.column_reference]( + index, table_source + )) + + +cdef class Operation(Expression): + """ + An operation expression holds an operator and zero or more operands. + + For details, see :cpp:class:`cudf::ast::operation`. + + Parameters + ---------- + op : Operator + left : Expression + Left input expression (left operand) + right: Expression, default None + Right input expression (right operand). + You should only pass this if the input expression is a binary operation. 
+ """ + def __cinit__(self, ast_operator op, Expression left, Expression right=None): + self.left = left + self.right = right + if right is None: + self.c_obj = move(make_unique[libcudf_exp.operation]( + op, dereference(left.c_obj) + )) + else: + self.c_obj = move(make_unique[libcudf_exp.operation]( + op, dereference(left.c_obj), dereference(right.c_obj) + )) + +cdef class ColumnNameReference(Expression): + """ + An expression referring to data from a column in a table. + + For details, see :cpp:class:`cudf::ast::column_name_reference`. + + Parameters + ---------- + column_name : str + Name of this column in the table metadata + (provided when the expression is evaluated). + """ + def __cinit__(self, str name): + self.c_obj = \ + move(make_unique[libcudf_exp.column_name_reference]( + (name.encode("utf-8")) + )) diff --git a/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx b/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx index a2f5b2ac387..0904022a944 100644 --- a/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx @@ -22,5 +22,6 @@ cdef class gpumemoryview: # TODO: Need to respect readonly self.ptr = cai["data"][0] + @property def __cuda_array_interface__(self): return self.obj.__cuda_array_interface__ diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pxd b/python/cudf/cudf/_lib/pylibcudf/groupby.pxd index c6c146b0445..eaa05c26986 100644 --- a/python/cudf/cudf/_lib/pylibcudf/groupby.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pxd @@ -16,6 +16,7 @@ from cudf._lib.pylibcudf.libcudf.groupby cimport ( scan_request, ) from cudf._lib.pylibcudf.libcudf.table.table cimport table +from cudf._lib.pylibcudf.libcudf.types cimport null_order, order from .column cimport Column from .table cimport Table @@ -38,6 +39,9 @@ cdef class GroupByRequest: cdef class GroupBy: cdef unique_ptr[groupby] c_obj cdef Table _keys + cdef unique_ptr[vector[order]] _column_order + cdef unique_ptr[vector[null_order]] _null_precedence + cpdef tuple aggregate(self, list requests) cpdef tuple scan(self, list requests) cpdef tuple shift(self, Table values, list offset, list fill_values) diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx index 46fe61025ce..f5bb46ca6a2 100644 --- a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx @@ -2,7 +2,7 @@ from cython.operator cimport dereference from libcpp.functional cimport reference_wrapper -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.pair cimport pair from libcpp.utility cimport move from libcpp.vector cimport vector @@ -22,7 +22,7 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type from .aggregation cimport Aggregation from .column cimport Column from .table cimport Table -from .types cimport null_policy, sorted +from .types cimport null_order, null_policy, order, sorted from .utils cimport _as_vector @@ -87,17 +87,43 @@ cdef class GroupBy: keys : Table The columns to group by. null_handling : null_policy, optional - Whether or not to include null rows in ``keys``. Default is null_policy.EXCLUDE. + Whether or not to include null rows in `keys`. + Default is ``null_policy.EXCLUDE``. keys_are_sorted : sorted, optional - Whether the keys are already sorted. Default is sorted.NO. + Whether the keys are already sorted. Default is ``sorted.NO``. + column_order : list[order] + Indicates the order of each column. Default is ``order.ASCENDING``. 
+ Ignored if `keys_are_sorted` is ``sorted.NO``. + null_precedence : list[null_order] + Indicates the ordering of null values in each column. + Default is ``null_order.AFTER``. Ignored if `keys_are_sorted` is ``sorted.NO``. """ def __init__( self, Table keys, null_policy null_handling=null_policy.EXCLUDE, - sorted keys_are_sorted=sorted.NO + sorted keys_are_sorted=sorted.NO, + list column_order=None, + list null_precedence=None, ): - self.c_obj.reset(new groupby(keys.view(), null_handling, keys_are_sorted)) + self._column_order = make_unique[vector[order]]() + self._null_precedence = make_unique[vector[null_order]]() + if column_order is not None: + for o in column_order: + dereference(self._column_order).push_back(o) + if null_precedence is not None: + for o in null_precedence: + dereference(self._null_precedence).push_back(o) + + self.c_obj.reset( + new groupby( + keys.view(), + null_handling, + keys_are_sorted, + dereference(self._column_order.get()), + dereference(self._null_precedence.get()), + ) + ) # keep a reference to the keys table so it doesn't get # deallocated from under us: self._keys = keys diff --git a/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt index 084b341ec48..8dd08d11dc8 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources avro.pyx datasource.pyx json.pyx types.pyx) +set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx types.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( @@ -21,7 +21,7 @@ rapids_cython_create_modules( LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_io_ ASSOCIATED_TARGETS cudf ) -set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_datasource pylibcudf_io_json - pylibcudf_io_types +set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_csv pylibcudf_io_datasource + pylibcudf_io_json pylibcudf_io_types ) link_to_pyarrow_headers("${targets_using_arrow_headers}") diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd index ef4c65b277e..5b3272d60e0 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd @@ -1,4 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +# CSV is removed since it is def not cpdef (to force kw-only arguments) from . cimport avro, datasource, json, types from .types cimport SourceInfo, TableWithMetadata diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py index fb4e4c7e4bb..e17deaa4663 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py @@ -1,4 +1,4 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import avro, datasource, json, types +from . import avro, csv, datasource, json, types from .types import SinkInfo, SourceInfo, TableWithMetadata diff --git a/python/cudf/cudf/_lib/pylibcudf/io/csv.pyx b/python/cudf/cudf/_lib/pylibcudf/io/csv.pyx new file mode 100644 index 00000000000..e9efb5befee --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/csv.pyx @@ -0,0 +1,264 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
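Editorial note: a minimal sketch of the extended `GroupBy` constructor from the groupby hunk above, for keys that are already sorted (the enum attributes on `plc.types` and the interop call are assumptions):

```python
import pyarrow as pa
import cudf._lib.pylibcudf as plc

keys = plc.interop.from_arrow(pa.table({"k": [1, 1, 2]}))  # assumed interop path
gb = plc.groupby.GroupBy(
    keys,
    null_handling=plc.types.NullPolicy.EXCLUDE,
    keys_are_sorted=plc.types.Sorted.YES,
    column_order=[plc.types.Order.ASCENDING],     # only consulted for pre-sorted keys
    null_precedence=[plc.types.NullOrder.AFTER],
)
```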
+ +from libcpp cimport bool +from libcpp.map cimport map +from libcpp.string cimport string +from libcpp.utility cimport move +from libcpp.vector cimport vector + +from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata +from cudf._lib.pylibcudf.libcudf.io.csv cimport ( + csv_reader_options, + read_csv as cpp_read_csv, +) +from cudf._lib.pylibcudf.libcudf.io.types cimport ( + compression_type, + quote_style, + table_with_metadata, +) +from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type +from cudf._lib.pylibcudf.types cimport DataType + + +cdef tuple _process_parse_dates_hex(list cols): + cdef vector[string] str_cols + cdef vector[int] int_cols + for col in cols: + if isinstance(col, str): + str_cols.push_back(col.encode()) + else: + int_cols.push_back(col) + return str_cols, int_cols + +cdef vector[string] _make_str_vector(list vals): + cdef vector[string] res + for val in vals: + res.push_back((val).encode()) + return res + + +def read_csv( + SourceInfo source_info, + *, + compression_type compression = compression_type.AUTO, + size_t byte_range_offset = 0, + size_t byte_range_size = 0, + list col_names = None, + str prefix = "", + bool mangle_dupe_cols = True, + list usecols = None, + size_type nrows = -1, + size_type skiprows = 0, + size_type skipfooter = 0, + size_type header = 0, + str lineterminator = "\n", + str delimiter = None, + str thousands = None, + str decimal = ".", + str comment = None, + bool delim_whitespace = False, + bool skipinitialspace = False, + bool skip_blank_lines = True, + quote_style quoting = quote_style.MINIMAL, + str quotechar = '"', + bool doublequote = True, + list parse_dates = None, + list parse_hex = None, + # Technically this should be dict/list + # but using a fused type prevents using None as default + object dtypes = None, + list true_values = None, + list false_values = None, + list na_values = None, + bool keep_default_na = True, + bool na_filter = True, + bool dayfirst = False, + # Note: These options are supported by the libcudf reader + # but are not exposed here since there is no demand for them + # on the Python side yet. + # bool detect_whitespace_around_quotes = False, + # DataType timestamp_type = DataType(type_id.EMPTY), +): + """Reads a CSV file into a :py:class:`~.types.TableWithMetadata`. + + Parameters + ---------- + source_info : SourceInfo + The SourceInfo to read the CSV file from. + compression : compression_type, default CompressionType.AUTO + The compression format of the CSV source. + byte_range_offset : size_type, default 0 + Number of bytes to skip from source start. + byte_range_size : size_type, default 0 + Number of bytes to read. By default, will read all bytes. + col_names : list, default None + The column names to use. + prefix : string, default '' + The prefix to apply to the column names. + mangle_dupe_cols : bool, default True + If True, rename duplicate column names. + usecols : list, default None + Specify the string column names/integer column indices of columns to be read. + nrows : size_type, default -1 + The number of rows to read. + skiprows : size_type, default 0 + The number of rows to skip from the start before reading + skipfooter : size_type, default 0 + The number of rows to skip from the end + header : size_type, default 0 + The index of the row that will be used for header names. + Pass -1 to use default column names. + lineterminator : str, default '\\n' + The character used to determine the end of a line. 
+ delimiter : str, default "," + The character used to separate fields in a row. + thousands : str, default None + The character used as the thousands separator. + Cannot match delimiter. + decimal : str, default '.' + The character used as the decimal separator. + Cannot match delimiter. + comment : str, default None + The character used to identify the start of a comment line. + (which will be skipped by the reader) + delim_whitespace : bool, default False + If True, treat whitespace as the field delimiter. + skipinitialspace : bool, default False + If True, skip whitespace after the delimiter. + skip_blank_lines : bool, default True + If True, ignore empty lines (otherwise line values are parsed as null). + quoting : QuoteStyle, default QuoteStyle.MINIMAL + The quoting style used in the input CSV data. One of + { QuoteStyle.MINIMAL, QuoteStyle.ALL, QuoteStyle.NONNUMERIC, QuoteStyle.NONE } + quotechar : str, default '"' + The character used to indicate quoting. + doublequote : bool, default True + If True, a quote inside a value is double-quoted. + parse_dates : list, default None + A list of integer column indices/string column names + of columns to read as datetime. + parse_hex : list, default None + A list of integer column indices/string column names + of columns to read as hexadecimal. + dtypes : Union[Dict[str, DataType], List[DataType]], default None + A list of data types or a dictionary mapping column names + to a DataType. + true_values : List[str], default None + A list of additional values to recognize as True. + false_values : List[str], default None + A list of additional values to recognize as False. + na_values : List[str], default None + A list of additional values to recognize as null. + keep_default_na : bool, default True + Whether to keep the built-in default N/A values. + na_filter : bool, default True + Whether to detect missing values. If False, can + improve performance. + dayfirst : bool, default False + If True, interpret dates as being in the DD/MM format. + + Returns + ------- + TableWithMetadata + The Table and its corresponding metadata (column names) that were read in. 
+ """ + cdef vector[string] c_parse_dates_names + cdef vector[int] c_parse_dates_indexes + cdef vector[int] c_parse_hex_names + cdef vector[int] c_parse_hex_indexes + cdef vector[data_type] c_dtypes_list + cdef map[string, data_type] c_dtypes_map + + cdef csv_reader_options options = move( + csv_reader_options.builder(source_info.c_obj) + .compression(compression) + .mangle_dupe_cols(mangle_dupe_cols) + .byte_range_offset(byte_range_offset) + .byte_range_size(byte_range_size) + .nrows(nrows) + .skiprows(skiprows) + .skipfooter(skipfooter) + .quoting(quoting) + .lineterminator(ord(lineterminator)) + .quotechar(ord(quotechar)) + .decimal(ord(decimal)) + .delim_whitespace(delim_whitespace) + .skipinitialspace(skipinitialspace) + .skip_blank_lines(skip_blank_lines) + .doublequote(doublequote) + .keep_default_na(keep_default_na) + .na_filter(na_filter) + .dayfirst(dayfirst) + .build() + ) + + options.set_header(header) + + if col_names is not None: + options.set_names([str(name).encode() for name in col_names]) + + if prefix is not None: + options.set_prefix(prefix.encode()) + + if usecols is not None: + if all([isinstance(col, int) for col in usecols]): + options.set_use_cols_indexes(list(usecols)) + else: + options.set_use_cols_names([str(name).encode() for name in usecols]) + + if delimiter is not None: + options.set_delimiter(ord(delimiter)) + + if thousands is not None: + options.set_thousands(ord(thousands)) + + if comment is not None: + options.set_comment(ord(comment)) + + if parse_dates is not None: + if not all([isinstance(col, (str, int)) for col in parse_dates]): + raise NotImplementedError( + "`parse_dates`: Must pass a list of column names/indices") + + # Set both since users are allowed to mix column names and indices + c_parse_dates_names, c_parse_dates_indexes = \ + _process_parse_dates_hex(parse_dates) + options.set_parse_dates(c_parse_dates_names) + options.set_parse_dates(c_parse_dates_indexes) + + if parse_hex is not None: + if not all([isinstance(col, (str, int)) for col in parse_hex]): + raise NotImplementedError( + "`parse_hex`: Must pass a list of column names/indices") + + # Set both since users are allowed to mix column names and indices + c_parse_hex_names, c_parse_hex_indexes = _process_parse_dates_hex(parse_hex) + options.set_parse_hex(c_parse_hex_names) + options.set_parse_hex(c_parse_hex_indexes) + + if isinstance(dtypes, list): + for dtype in dtypes: + c_dtypes_list.push_back((dtype).c_obj) + options.set_dtypes(c_dtypes_list) + elif isinstance(dtypes, dict): + # dtypes_t is dict + for k, v in dtypes.items(): + c_dtypes_map[str(k).encode()] = (v).c_obj + options.set_dtypes(c_dtypes_map) + elif dtypes is not None: + raise TypeError("dtypes must either by a list/dict") + + if true_values is not None: + options.set_true_values(_make_str_vector(true_values)) + + if false_values is not None: + options.set_false_values(_make_str_vector(false_values)) + + if na_values is not None: + options.set_na_values(_make_str_vector(na_values)) + + cdef table_with_metadata c_result + with nogil: + c_result = move(cpp_read_csv(options)) + + return TableWithMetadata.from_libcudf(c_result) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx b/python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx index aa7fa0efdaf..8f265f585de 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx @@ -7,6 +7,8 @@ from pyarrow.lib cimport NativeFile from cudf._lib.pylibcudf.libcudf.io.arrow_io_source cimport arrow_io_source from 
cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource +import warnings + cdef class Datasource: cdef datasource* get_datasource(self) except * nogil: @@ -16,10 +18,16 @@ cdef class Datasource: cdef class NativeFileDatasource(Datasource): - def __cinit__(self, NativeFile native_file,): + def __cinit__(self, NativeFile native_file): cdef shared_ptr[CRandomAccessFile] ra_src + warnings.warn( + "Support for reading pyarrow's NativeFile is deprecated " + "and will be removed in a future release of cudf.", + FutureWarning, + ) + ra_src = native_file.get_random_access_file() self.c_datasource.reset(new arrow_io_source(ra_src)) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pxd b/python/cudf/cudf/_lib/pylibcudf/io/json.pxd index a91d574131f..2e0e92a054f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/json.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pxd @@ -1,11 +1,30 @@ # Copyright (c) 2024, NVIDIA CORPORATION. - from libcpp cimport bool -from cudf._lib.pylibcudf.io.types cimport SinkInfo, TableWithMetadata +from cudf._lib.pylibcudf.io.types cimport ( + SinkInfo, + SourceInfo, + TableWithMetadata, + compression_type, +) +from cudf._lib.pylibcudf.libcudf.io.json cimport json_recovery_mode_t from cudf._lib.pylibcudf.libcudf.types cimport size_type +cpdef TableWithMetadata read_json( + SourceInfo source_info, + list dtypes = *, + compression_type compression = *, + bool lines = *, + size_type byte_range_offset = *, + size_type byte_range_size = *, + bool keep_quotes = *, + bool mixed_types_as_string = *, + bool prune_columns = *, + json_recovery_mode_t recovery_mode = *, +) + + cpdef void write_json( SinkInfo sink_info, TableWithMetadata tbl, @@ -16,3 +35,14 @@ cpdef void write_json( str true_value = *, str false_value = * ) + +cpdef tuple chunked_read_json( + SourceInfo source_info, + list dtypes = *, + compression_type compression = *, + bool keep_quotes = *, + bool mixed_types_as_string = *, + bool prune_columns = *, + json_recovery_mode_t recovery_mode = *, + int chunk_size= *, +) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx index 7530eba3803..2710ee60075 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx @@ -1,16 +1,262 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
- from libcpp cimport bool from libcpp.limits cimport numeric_limits +from libcpp.map cimport map from libcpp.string cimport string +from libcpp.utility cimport move +from libcpp.vector cimport vector -from cudf._lib.pylibcudf.io.types cimport SinkInfo, TableWithMetadata +from cudf._lib.pylibcudf.concatenate cimport concatenate +from cudf._lib.pylibcudf.io.types cimport ( + SinkInfo, + SourceInfo, + TableWithMetadata, +) from cudf._lib.pylibcudf.libcudf.io.json cimport ( + json_reader_options, + json_recovery_mode_t, json_writer_options, + read_json as cpp_read_json, + schema_element, write_json as cpp_write_json, ) -from cudf._lib.pylibcudf.libcudf.io.types cimport table_metadata -from cudf._lib.pylibcudf.types cimport size_type +from cudf._lib.pylibcudf.libcudf.io.types cimport ( + compression_type, + table_metadata, + table_with_metadata, +) +from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type +from cudf._lib.pylibcudf.types cimport DataType + + +cdef map[string, schema_element] _generate_schema_map(list dtypes): + cdef map[string, schema_element] schema_map + cdef schema_element s_elem + cdef string c_name + + for name, dtype, child_dtypes in dtypes: + if not (isinstance(name, str) and + isinstance(dtype, DataType) and + isinstance(child_dtypes, list)): + + raise ValueError("Must pass a list of a tuple containing " + "(column_name, column_dtype, list of child_dtypes)") + + c_name = name.encode() + + s_elem.type = (dtype).c_obj + s_elem.child_types = _generate_schema_map(child_dtypes) + + schema_map[c_name] = s_elem + return schema_map + + +cdef json_reader_options _setup_json_reader_options( + SourceInfo source_info, + list dtypes, + compression_type compression, + bool lines, + size_type byte_range_offset, + size_type byte_range_size, + bool keep_quotes, + bool mixed_types_as_string, + bool prune_columns, + json_recovery_mode_t recovery_mode): + + cdef vector[data_type] types_vec + cdef json_reader_options opts = move( + json_reader_options.builder(source_info.c_obj) + .compression(compression) + .lines(lines) + .byte_range_offset(byte_range_offset) + .byte_range_size(byte_range_size) + .recovery_mode(recovery_mode) + .build() + ) + + if dtypes is not None: + if isinstance(dtypes[0], tuple): + opts.set_dtypes(move(_generate_schema_map(dtypes))) + else: + for dtype in dtypes: + types_vec.push_back((dtype).c_obj) + opts.set_dtypes(types_vec) + + opts.enable_keep_quotes(keep_quotes) + opts.enable_mixed_types_as_string(mixed_types_as_string) + opts.enable_prune_columns(prune_columns) + return opts + + +cpdef tuple chunked_read_json( + SourceInfo source_info, + list dtypes = None, + compression_type compression = compression_type.AUTO, + bool keep_quotes = False, + bool mixed_types_as_string = False, + bool prune_columns = False, + json_recovery_mode_t recovery_mode = json_recovery_mode_t.FAIL, + int chunk_size=100_000_000, +): + """Reads an JSON file into a :py:class:`~.types.TableWithMetadata`. + + Parameters + ---------- + source_info : SourceInfo + The SourceInfo object to read the JSON file from. + dtypes : list, default None + Set data types for the columns in the JSON file. + + Each element of the list has the format + (column_name, column_dtype, list of child dtypes), where + the list of child dtypes is an empty list if the child is not + a nested type (list or struct dtype), and is of format + (column_child_name, column_child_type, list of grandchild dtypes). + compression: CompressionType, default CompressionType.AUTO + The compression format of the JSON source. 
+ keep_quotes : bool, default False + Whether the reader should keep quotes of string values. + mixed_types_as_string : bool, default False + If True, mixed type columns are returned as string columns. + If `False` parsing mixed type columns will thrown an error. + prune_columns : bool, default False + Whether to only read columns specified in dtypes. + recover_mode : JSONRecoveryMode, default JSONRecoveryMode.FAIL + Whether to raise an error or set corresponding values to null + when encountering an invalid JSON line. + chunk_size : int, default 100_000_000 bytes. + The number of bytes to be read in chunks. + The chunk_size should be set to at least row_size. + + Returns + ------- + tuple + A tuple of (columns, column_name, child_names) + """ + cdef size_type c_range_size = ( + chunk_size if chunk_size is not None else 0 + ) + cdef json_reader_options opts = _setup_json_reader_options( + source_info=source_info, + dtypes=dtypes, + compression=compression, + lines=True, + byte_range_offset=0, + byte_range_size=0, + keep_quotes=keep_quotes, + mixed_types_as_string=mixed_types_as_string, + prune_columns=prune_columns, + recovery_mode=recovery_mode, + ) + + # Read JSON + cdef table_with_metadata c_result + + final_columns = [] + meta_names = None + child_names = None + i = 0 + while True: + opts.set_byte_range_offset(c_range_size * i) + opts.set_byte_range_size(c_range_size) + + try: + with nogil: + c_result = move(cpp_read_json(opts)) + except (ValueError, OverflowError): + break + if meta_names is None: + meta_names = [info.name.decode() for info in c_result.metadata.schema_info] + if child_names is None: + child_names = TableWithMetadata._parse_col_names( + c_result.metadata.schema_info + ) + new_chunk = [ + col for col in TableWithMetadata.from_libcudf( + c_result).columns + ] + + if len(final_columns) == 0: + final_columns = new_chunk + else: + for col_idx in range(len(meta_names)): + final_columns[col_idx] = concatenate( + [final_columns[col_idx], new_chunk[col_idx]] + ) + # Must drop any residual GPU columns to save memory + new_chunk[col_idx] = None + i += 1 + return (final_columns, meta_names, child_names) + + +cpdef TableWithMetadata read_json( + SourceInfo source_info, + list dtypes = None, + compression_type compression = compression_type.AUTO, + bool lines = False, + size_type byte_range_offset = 0, + size_type byte_range_size = 0, + bool keep_quotes = False, + bool mixed_types_as_string = False, + bool prune_columns = False, + json_recovery_mode_t recovery_mode = json_recovery_mode_t.FAIL, +): + """Reads an JSON file into a :py:class:`~.types.TableWithMetadata`. + + Parameters + ---------- + source_info : SourceInfo + The SourceInfo object to read the JSON file from. + dtypes : list, default None + Set data types for the columns in the JSON file. + + Each element of the list has the format + (column_name, column_dtype, list of child dtypes), where + the list of child dtypes is an empty list if the child is not + a nested type (list or struct dtype), and is of format + (column_child_name, column_child_type, list of grandchild dtypes). + compression: CompressionType, default CompressionType.AUTO + The compression format of the JSON source. + byte_range_offset : size_type, default 0 + Number of bytes to skip from source start. + byte_range_size : size_type, default 0 + Number of bytes to read. By default, will read all bytes. + keep_quotes : bool, default False + Whether the reader should keep quotes of string values. 
+ mixed_types_as_string : bool, default False + If True, mixed type columns are returned as string columns. + If `False` parsing mixed type columns will thrown an error. + prune_columns : bool, default False + Whether to only read columns specified in dtypes. + recover_mode : JSONRecoveryMode, default JSONRecoveryMode.FAIL + Whether to raise an error or set corresponding values to null + when encountering an invalid JSON line. + + Returns + ------- + TableWithMetadata + The Table and its corresponding metadata (column names) that were read in. + """ + cdef json_reader_options opts = _setup_json_reader_options( + source_info=source_info, + dtypes=dtypes, + compression=compression, + lines=lines, + byte_range_offset=byte_range_offset, + byte_range_size=byte_range_size, + keep_quotes=keep_quotes, + mixed_types_as_string=mixed_types_as_string, + prune_columns=prune_columns, + recovery_mode=recovery_mode, + ) + + # Read JSON + cdef table_with_metadata c_result + + with nogil: + c_result = move(cpp_read_json(opts)) + + return TableWithMetadata.from_libcudf(c_result) cpdef void write_json( diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd index 88daf54f33b..0094bf6032c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd @@ -28,11 +28,19 @@ cdef class TableWithMetadata: cdef vector[column_name_info] _make_column_info(self, list column_names) + cdef list _make_columns_list(self, dict child_dict) + + @staticmethod + cdef dict _parse_col_names(vector[column_name_info] infos) + @staticmethod cdef TableWithMetadata from_libcudf(table_with_metadata& tbl) cdef class SourceInfo: cdef source_info c_obj + # Keep the bytes converted from stringio alive + # (otherwise we end up with a use after free when they get gc'ed) + cdef list byte_sources cdef class SinkInfo: # This vector just exists to keep the unique_ptrs to the sinks alive diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx index f94e20970a4..68498ff88f4 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx @@ -22,6 +22,11 @@ import errno import io import os +from cudf._lib.pylibcudf.libcudf.io.json import \ + json_recovery_mode_t as JSONRecoveryMode # no-cython-lint +from cudf._lib.pylibcudf.libcudf.io.types import \ + compression_type as CompressionType # no-cython-lint + cdef class TableWithMetadata: """A container holding a table and its associated metadata @@ -69,16 +74,44 @@ cdef class TableWithMetadata: """ return self.tbl.columns() - @property - def column_names(self): + cdef list _make_columns_list(self, dict child_dict): + cdef list names = [] + for child in child_dict: + grandchildren = self._make_columns_list(child_dict[child]) + names.append((child, grandchildren)) + return names + + def column_names(self, include_children=False): """ Return a list containing the column names of the table """ cdef list names = [] + cdef str name + cdef dict child_names = self.child_names for col_info in self.metadata.schema_info: - # TODO: Handle nesting (columns with child columns) - assert col_info.children.size() == 0, "Child column names are not handled!" 
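Editorial note: a minimal sketch of the nested-dtype tuples and the `child_names` round trip accepted by the `read_json` wrapper above:

```python
import cudf._lib.pylibcudf as plc

twm = plc.io.json.read_json(
    plc.io.SourceInfo([b'{"a": 1, "b": {"c": "x"}}\n']),
    dtypes=[
        ("a", plc.DataType(plc.TypeId.INT64), []),
        ("b", plc.DataType(plc.TypeId.STRUCT), [
            ("c", plc.DataType(plc.TypeId.STRING), []),
        ]),
    ],
    lines=True,
)
print(twm.child_names)  # expected: {'a': {}, 'b': {'c': {}}}
```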
- names.append(col_info.name.decode()) + name = col_info.name.decode() + if include_children: + children = self._make_columns_list(child_names[name]) + names.append((name, children)) + else: + names.append(name) + return names + + @property + def child_names(self): + """ + Return a dictionary mapping the names of columns with children + to the names of their child columns + """ + return TableWithMetadata._parse_col_names(self.metadata.schema_info) + + @staticmethod + cdef dict _parse_col_names(vector[column_name_info] infos): + cdef dict child_names = dict() + cdef dict names = dict() + for col_info in infos: + child_names = TableWithMetadata._parse_col_names(col_info.children) + names[col_info.name.decode()] = child_names return names @staticmethod @@ -137,6 +170,15 @@ cdef class SourceInfo: cdef vector[host_buffer] c_host_buffers cdef const unsigned char[::1] c_buffer cdef bint empty_buffer = False + cdef list new_sources = [] + + if isinstance(sources[0], io.StringIO): + for buffer in sources: + if not isinstance(buffer, io.StringIO): + raise ValueError("All sources must be of the same type!") + new_sources.append(buffer.read().encode()) + sources = new_sources + self.byte_sources = sources if isinstance(sources[0], bytes): empty_buffer = True for buffer in sources: @@ -156,7 +198,10 @@ cdef class SourceInfo: c_buffer.shape[0])) else: raise ValueError("Sources must be a list of str/paths, " - "bytes, io.BytesIO, or a Datasource") + "bytes, io.BytesIO, io.StringIO, or a Datasource") + + if empty_buffer is True: + c_host_buffers.push_back(host_buffer(NULL, 0)) self.c_obj = source_info(c_host_buffers) diff --git a/python/cudf/cudf/_lib/pylibcudf/join.pyx b/python/cudf/cudf/_lib/pylibcudf/join.pyx index 308b1b39291..2ded84d84d1 100644 --- a/python/cudf/cudf/_lib/pylibcudf/join.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/join.pyx @@ -10,12 +10,7 @@ from rmm._lib.device_buffer cimport device_buffer from cudf._lib.pylibcudf.libcudf cimport join as cpp_join from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.types cimport ( - data_type, - null_equality, - size_type, - type_id, -) +from cudf._lib.pylibcudf.libcudf.types cimport null_equality from .column cimport Column from .table cimport Table @@ -23,15 +18,11 @@ from .table cimport Table cdef Column _column_from_gather_map(cpp_join.gather_map_type gather_map): # helper to convert a gather map to a Column - cdef device_buffer c_empty - cdef size_type size = dereference(gather_map.get()).size() return Column.from_libcudf( move( make_unique[column]( - data_type(type_id.INT32), - size, - dereference(gather_map.get()).release(), - move(c_empty), + move(dereference(gather_map.get())), + device_buffer(), 0 ) ) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt index 6c66d01ca57..b04e94f1546 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt @@ -12,8 +12,8 @@ # the License. 
# ============================================================================= -set(cython_sources aggregation.pyx binaryop.pyx copying.pyx reduce.pyx replace.pyx round.pyx - stream_compaction.pyx types.pyx unary.pyx +set(cython_sources aggregation.pyx binaryop.pyx copying.pyx expressions.pyx reduce.pyx replace.pyx + round.pyx stream_compaction.pyx types.pyx unary.pyx ) set(linked_libraries cudf::cudf) @@ -22,4 +22,5 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cudf MODULE_PREFIX cpp ) +add_subdirectory(io) add_subdirectory(strings) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd index 0eda7d34ff9..b34fea6a775 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd @@ -1,9 +1,11 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. from libc.stdint cimport int32_t +from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.string cimport string +from cudf._lib.exception_handler cimport cudf_exception_handler from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar @@ -19,9 +21,20 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil: TRUE_DIV FLOOR_DIV MOD + PMOD PYMOD POW INT_POW + LOG_BASE + ATAN2 + SHIFT_LEFT + SHIFT_RIGHT + SHIFT_RIGHT_UNSIGNED + BITWISE_AND + BITWISE_OR + BITWISE_XOR + LOGICAL_AND + LOGICAL_OR EQUAL NOT_EQUAL LESS @@ -29,38 +42,46 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil: LESS_EQUAL GREATER_EQUAL NULL_EQUALS + NULL_MAX + NULL_MIN NULL_NOT_EQUALS - BITWISE_AND - BITWISE_OR - BITWISE_XOR - LOGICAL_AND - LOGICAL_OR GENERIC_BINARY + NULL_LOGICAL_AND + NULL_LOGICAL_OR + INVALID_BINARY cdef unique_ptr[column] binary_operation ( const scalar& lhs, const column_view& rhs, binary_operator op, data_type output_type - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] binary_operation ( const column_view& lhs, const scalar& rhs, binary_operator op, data_type output_type - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] binary_operation ( const column_view& lhs, const column_view& rhs, binary_operator op, data_type output_type - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] binary_operation ( const column_view& lhs, const column_view& rhs, const string& op, data_type output_type - ) except + + ) except +cudf_exception_handler + +cdef extern from "cudf/binaryop.hpp" namespace "cudf::binops" nogil: + cdef bool is_supported_operation( + data_type output_type, + data_type lhs_type, + data_type rhs_type, + binary_operator op + ) except +cudf_exception_handler diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pxd index 279d969db50..427e16d4ff8 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2022-2024, NVIDIA CORPORATION. 
+from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -14,63 +15,63 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/ast/expressions.hpp" namespace "cudf::ast" nogil: - ctypedef enum ast_operator: + cpdef enum class ast_operator(int32_t): # Binary operators - ADD "cudf::ast::ast_operator::ADD" - SUB "cudf::ast::ast_operator::SUB" - MUL "cudf::ast::ast_operator::MUL" - DIV "cudf::ast::ast_operator::DIV" - TRUE_DIV "cudf::ast::ast_operator::TRUE_DIV" - FLOOR_DIV "cudf::ast::ast_operator::FLOOR_DIV" - MOD "cudf::ast::ast_operator::MOD" - PYMOD "cudf::ast::ast_operator::PYMOD" - POW "cudf::ast::ast_operator::POW" - EQUAL "cudf::ast::ast_operator::EQUAL" - NULL_EQUAL "cudf::ast::ast_operator::NULL_EQUAL" - NOT_EQUAL "cudf::ast::ast_operator::NOT_EQUAL" - LESS "cudf::ast::ast_operator::LESS" - GREATER "cudf::ast::ast_operator::GREATER" - LESS_EQUAL "cudf::ast::ast_operator::LESS_EQUAL" - GREATER_EQUAL "cudf::ast::ast_operator::GREATER_EQUAL" - BITWISE_AND "cudf::ast::ast_operator::BITWISE_AND" - BITWISE_OR "cudf::ast::ast_operator::BITWISE_OR" - BITWISE_XOR "cudf::ast::ast_operator::BITWISE_XOR" - NULL_LOGICAL_AND "cudf::ast::ast_operator::NULL_LOGICAL_AND" - LOGICAL_AND "cudf::ast::ast_operator::LOGICAL_AND" - NULL_LOGICAL_OR "cudf::ast::ast_operator::NULL_LOGICAL_OR" - LOGICAL_OR "cudf::ast::ast_operator::LOGICAL_OR" + ADD + SUB + MUL + DIV + TRUE_DIV + FLOOR_DIV + MOD + PYMOD + POW + EQUAL + NULL_EQUAL + NOT_EQUAL + LESS + GREATER + LESS_EQUAL + GREATER_EQUAL + BITWISE_AND + BITWISE_OR + BITWISE_XOR + NULL_LOGICAL_AND + LOGICAL_AND + NULL_LOGICAL_OR + LOGICAL_OR # Unary operators - IDENTITY "cudf::ast::ast_operator::IDENTITY" - IS_NULL "cudf::ast::ast_operator::IS_NULL" - SIN "cudf::ast::ast_operator::SIN" - COS "cudf::ast::ast_operator::COS" - TAN "cudf::ast::ast_operator::TAN" - ARCSIN "cudf::ast::ast_operator::ARCSIN" - ARCCOS "cudf::ast::ast_operator::ARCCOS" - ARCTAN "cudf::ast::ast_operator::ARCTAN" - SINH "cudf::ast::ast_operator::SINH" - COSH "cudf::ast::ast_operator::COSH" - TANH "cudf::ast::ast_operator::TANH" - ARCSINH "cudf::ast::ast_operator::ARCSINH" - ARCCOSH "cudf::ast::ast_operator::ARCCOSH" - ARCTANH "cudf::ast::ast_operator::ARCTANH" - EXP "cudf::ast::ast_operator::EXP" - LOG "cudf::ast::ast_operator::LOG" - SQRT "cudf::ast::ast_operator::SQRT" - CBRT "cudf::ast::ast_operator::CBRT" - CEIL "cudf::ast::ast_operator::CEIL" - FLOOR "cudf::ast::ast_operator::FLOOR" - ABS "cudf::ast::ast_operator::ABS" - RINT "cudf::ast::ast_operator::RINT" - BIT_INVERT "cudf::ast::ast_operator::BIT_INVERT" - NOT "cudf::ast::ast_operator::NOT" + IDENTITY + IS_NULL + SIN + COS + TAN + ARCSIN + ARCCOS + ARCTAN + SINH + COSH + TANH + ARCSINH + ARCCOSH + ARCTANH + EXP + LOG + SQRT + CBRT + CEIL + FLOOR + ABS + RINT + BIT_INVERT + NOT cdef cppclass expression: pass - ctypedef enum table_reference: - LEFT "cudf::ast::table_reference::LEFT" - RIGHT "cudf::ast::table_reference::RIGHT" + cpdef enum class table_reference(int32_t): + LEFT + RIGHT cdef cppclass literal(expression): # Due to https://github.com/cython/cython/issues/3198, we need to diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/CMakeLists.txt new file mode 100644 index 00000000000..6831063ecb9 --- 
/dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/CMakeLists.txt @@ -0,0 +1,26 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +set(cython_sources json.pyx types.pyx) + +set(linked_libraries cudf::cudf) + +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cudf MODULE_PREFIX cpp_io_ +) + +set(targets_using_arrow_headers cpp_io_json cpp_io_types) +link_to_pyarrow_headers("${targets_using_arrow_headers}") diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd index 2e50cccd132..86621ae184f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libc.stdint cimport uint8_t +from libc.stdint cimport int32_t, uint8_t from libcpp cimport bool from libcpp.map cimport map from libcpp.memory cimport shared_ptr, unique_ptr @@ -19,9 +19,9 @@ cdef extern from "cudf/io/json.hpp" \ data_type type map[string, schema_element] child_types - cdef enum json_recovery_mode_t: - FAIL "cudf::io::json_recovery_mode_t::FAIL" - RECOVER_WITH_NULL "cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL" + cpdef enum class json_recovery_mode_t(int32_t): + FAIL + RECOVER_WITH_NULL cdef cppclass json_reader_options: json_reader_options() except + diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd index 0ef6553db56..c38f39f7749 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd @@ -78,6 +78,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: size_t get_max_page_size_bytes() except + size_type get_max_page_size_rows() except + size_t get_max_dictionary_size() except + + bool is_enabled_write_arrow_schema() except + void set_metadata( cudf_io_types.table_input_metadata m @@ -103,6 +104,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: void set_max_page_size_rows(size_type val) except + void set_max_dictionary_size(size_t val) except + void enable_write_v2_headers(bool val) except + + void enable_write_arrow_schema(bool val) except + void set_dictionary_policy(cudf_io_types.dictionary_policy policy) except + cdef cppclass parquet_writer_options(parquet_writer_options_base): @@ -143,6 +145,9 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: BuilderT& utc_timestamps( bool enabled ) except + + BuilderT& write_arrow_schema( + bool enabled + ) except + BuilderT& 
row_group_size_bytes( size_t val ) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/count_elements.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/count_elements.pxd index 38bdd4db0bb..ba57a839fbc 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/count_elements.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/count_elements.pxd @@ -9,4 +9,4 @@ from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( cdef extern from "cudf/lists/count_elements.hpp" namespace "cudf::lists" nogil: - cdef unique_ptr[column] count_elements(const lists_column_view) except + + cdef unique_ptr[column] count_elements(const lists_column_view&) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/extract.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/extract.pxd index caa12f41914..53609ba8830 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/extract.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/extract.pxd @@ -11,10 +11,10 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/lists/extract.hpp" namespace "cudf::lists" nogil: cdef unique_ptr[column] extract_list_element( - const lists_column_view, + const lists_column_view&, size_type ) except + cdef unique_ptr[column] extract_list_element( - const lists_column_view, - column_view + const lists_column_view&, + const column_view& ) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/gather.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/gather.pxd index 17b4c1877a6..ab7ed141365 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/gather.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/gather.pxd @@ -10,6 +10,6 @@ from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( cdef extern from "cudf/lists/gather.hpp" namespace "cudf::lists" nogil: cdef unique_ptr[column] segmented_gather( - const lists_column_view source_column, - const lists_column_view gather_map_list + const lists_column_view& source_column, + const lists_column_view& gather_map_list ) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd index fd21e7b334b..8917a6ac899 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd @@ -10,7 +10,9 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/lists/lists_column_view.hpp" namespace "cudf" nogil: cdef cppclass lists_column_view(column_view): lists_column_view() except + + lists_column_view(const lists_column_view& lists_column) except + lists_column_view(const column_view& lists_column) except + + lists_column_view& operator=(const lists_column_view&) except + column_view parent() except + column_view offsets() except + column_view child() except + diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/reverse.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/reverse.pxd new file mode 100644 index 00000000000..0382a5d42c3 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/reverse.pxd @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
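The const-reference fixes above and the reverse.pxd header this hunk introduces back the pylibcudf.lists helpers declared further down in this diff (reverse, segmented_gather, extract_list_element, count_elements). A minimal usage sketch, assuming pylibcudf is importable as cudf._lib.pylibcudf and that list columns can round-trip through interop.from_arrow; the pyarrow round-trip is an assumption for illustration, not part of this hunk.

    # Sketch of the pylibcudf list utilities added in this diff; the pyarrow
    # round-trip is assumed for illustration.
    import pyarrow as pa

    import cudf._lib.pylibcudf as plc

    lists = plc.interop.from_arrow(pa.array([[3, 1, 2], [9], [5, 4]]))

    reversed_lists = plc.lists.reverse(lists)          # [[2, 1, 3], [9], [4, 5]]
    lengths = plc.lists.count_elements(lists)          # [3, 1, 2]
    firsts = plc.lists.extract_list_element(lists, 0)  # [3, 9, 5]

    gather_map = plc.interop.from_arrow(pa.array([[2, 0], [0], []]))
    gathered = plc.lists.segmented_gather(lists, gather_map)  # [[2, 3], [9], []]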
+ +from libcpp.memory cimport unique_ptr + +from cudf._lib.pylibcudf.libcudf.column.column cimport column +from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( + lists_column_view, +) + + +cdef extern from "cudf/lists/reverse.hpp" namespace "cudf::lists" nogil: + cdef unique_ptr[column] reverse( + const lists_column_view& lists_column, + ) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/unary.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/unary.pxd index 7f8ae2b7617..2a1b189af51 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/unary.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/unary.pxd @@ -1,6 +1,7 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. from libc.stdint cimport int32_t +from libcpp cimport bool from libcpp.memory cimport unique_ptr from cudf._lib.pylibcudf.libcudf.column.column cimport column @@ -43,5 +44,6 @@ cdef extern from "cudf/unary.hpp" namespace "cudf" nogil: cdef extern unique_ptr[column] cast( column_view input, data_type out_type) except + + cdef extern bool is_supported_cast(data_type from_, data_type to) noexcept cdef extern unique_ptr[column] is_nan(column_view input) except + cdef extern unique_ptr[column] is_not_nan(column_view input) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd new file mode 100644 index 00000000000..0cc58af735b --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd @@ -0,0 +1,27 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.vector cimport vector + +from cudf._lib.pylibcudf.libcudf.types cimport data_type + + +cdef extern from "cudf/utilities/traits.hpp" namespace "cudf" nogil: + cdef bool is_relationally_comparable(data_type) + cdef bool is_equality_comparable(data_type) + cdef bool is_numeric(data_type) + cdef bool is_index_type(data_type) + cdef bool is_unsigned(data_type) + cdef bool is_integral(data_type) + cdef bool is_integral_not_bool(data_type) + cdef bool is_floating_point(data_type) + cdef bool is_boolean(data_type) + cdef bool is_timestamp(data_type) + cdef bool is_fixed_point(data_type) + cdef bool is_duration(data_type) + cdef bool is_chrono(data_type) + cdef bool is_dictionary(data_type) + cdef bool is_fixed_width(data_type) + cdef bool is_compound(data_type) + cdef bool is_nested(data_type) + cdef bool is_bit_castable(data_type, data_type) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/type_dispatcher.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/type_dispatcher.pxd new file mode 100644 index 00000000000..890fca3a662 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/type_dispatcher.pxd @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
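The is_supported_cast declaration above is surfaced as a cpdef function, pylibcudf.unary.is_supported_cast, later in this diff. A small sketch of probing cast support before calling cast; module paths follow those used elsewhere in the diff, and the printed results assume libcudf's fixed-width-only cast rules.

    # Probe whether libcudf's cast() supports a conversion before attempting it.
    from cudf._lib.pylibcudf import types as plc_types, unary as plc_unary

    int64 = plc_types.DataType(plc_types.TypeId.INT64)
    float32 = plc_types.DataType(plc_types.TypeId.FLOAT32)
    string = plc_types.DataType(plc_types.TypeId.STRING)

    print(plc_unary.is_supported_cast(int64, float32))  # fixed-width to fixed-width: True
    print(plc_unary.is_supported_cast(int64, string))   # strings go through a separate API: False

    # Also added in this diff: a DataType matching libcudf's size_type.
    print(plc_types.SIZE_TYPE.id())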
+ +from cudf._lib.pylibcudf.libcudf.types cimport type_id + + +cdef extern from "cudf/utilities/type_dispatcher.hpp" namespace "cudf" nogil: + cdef type_id type_to_id[T]() diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd index 2ccf0139e90..38eb575ee8d 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd @@ -12,6 +12,10 @@ ctypedef fused ColumnOrScalar: Column Scalar +ctypedef fused ColumnOrSizeType: + Column + size_type + cpdef Table explode_outer(Table, size_type explode_column_idx) cpdef Column concatenate_rows(Table) @@ -23,3 +27,11 @@ cpdef Column contains(Column, ColumnOrScalar) cpdef Column contains_nulls(Column) cpdef Column index_of(Column, ColumnOrScalar, bool) + +cpdef Column reverse(Column) + +cpdef Column segmented_gather(Column, Column) + +cpdef Column extract_list_element(Column, ColumnOrSizeType) + +cpdef Column count_elements(Column) diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx index a94d940accd..ea469642dd5 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx @@ -9,15 +9,23 @@ from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.lists cimport ( contains as cpp_contains, explode as cpp_explode, + gather as cpp_gather, + reverse as cpp_reverse, ) from cudf._lib.pylibcudf.libcudf.lists.combine cimport ( concatenate_list_elements as cpp_concatenate_list_elements, concatenate_null_policy, concatenate_rows as cpp_concatenate_rows, ) +from cudf._lib.pylibcudf.libcudf.lists.count_elements cimport ( + count_elements as cpp_count_elements, +) +from cudf._lib.pylibcudf.libcudf.lists.extract cimport ( + extract_list_element as cpp_extract_list_element, +) from cudf._lib.pylibcudf.libcudf.table.table cimport table from cudf._lib.pylibcudf.libcudf.types cimport size_type -from cudf._lib.pylibcudf.lists cimport ColumnOrScalar +from cudf._lib.pylibcudf.lists cimport ColumnOrScalar, ColumnOrSizeType from .column cimport Column, ListColumnView from .scalar cimport Scalar @@ -206,3 +214,109 @@ cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_o find_option, )) return Column.from_libcudf(move(c_result)) + + +cpdef Column reverse(Column input): + """Reverse the element order within each list of the input column. + + For details, see :cpp:func:`reverse`. + + Parameters + ---------- + input : Column + The input column. + + Returns + ------- + Column + A new Column with reversed lists. + """ + cdef unique_ptr[column] c_result + cdef ListColumnView list_view = input.list_view() + + with nogil: + c_result = move(cpp_reverse.reverse( + list_view.view(), + )) + return Column.from_libcudf(move(c_result)) + + +cpdef Column segmented_gather(Column input, Column gather_map_list): + """Create a column with elements gathered based on the indices in gather_map_list + + For details, see :cpp:func:`segmented_gather`. + + Parameters + ---------- + input : Column + The input column. + gather_map_list : Column + The indices of the lists column to gather. 
+ + Returns + ------- + Column + A new Column with elements in list of rows + gathered based on gather_map_list + """ + + cdef unique_ptr[column] c_result + cdef ListColumnView list_view1 = input.list_view() + cdef ListColumnView list_view2 = gather_map_list.list_view() + + with nogil: + c_result = move(cpp_gather.segmented_gather( + list_view1.view(), + list_view2.view(), + )) + return Column.from_libcudf(move(c_result)) + + +cpdef Column extract_list_element(Column input, ColumnOrSizeType index): + """Create a column of extracted list elements. + + Parameters + ---------- + input : Column + The input column. + index : Union[Column, size_type] + The selection index or indices. + + Returns + ------- + Column + A new Column with elements extracted. + """ + cdef unique_ptr[column] c_result + cdef ListColumnView list_view = input.list_view() + + with nogil: + c_result = move(cpp_extract_list_element( + list_view.view(), + index.view() if ColumnOrSizeType is Column else index, + )) + return Column.from_libcudf(move(c_result)) + + +cpdef Column count_elements(Column input): + """Count the number of rows in each + list element in the given lists column. + For details, see :cpp:func:`count_elements`. + + Parameters + ---------- + input : Column + The input column + + Returns + ------- + Column + A new Column of the lengths of each list element + """ + cdef ListColumnView list_view = input.list_view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_count_elements(list_view.view())) + + return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/_lib/pylibcudf/traits.pxd b/python/cudf/cudf/_lib/pylibcudf/traits.pxd new file mode 100644 index 00000000000..668fa775202 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/traits.pxd @@ -0,0 +1,25 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool + +from .types cimport DataType + + +cpdef bool is_relationally_comparable(DataType typ) +cpdef bool is_equality_comparable(DataType typ) +cpdef bool is_numeric(DataType typ) +cpdef bool is_index_type(DataType typ) +cpdef bool is_unsigned(DataType typ) +cpdef bool is_integral(DataType typ) +cpdef bool is_integral_not_bool(DataType typ) +cpdef bool is_floating_point(DataType typ) +cpdef bool is_boolean(DataType typ) +cpdef bool is_timestamp(DataType typ) +cpdef bool is_fixed_point(DataType typ) +cpdef bool is_duration(DataType typ) +cpdef bool is_chrono(DataType typ) +cpdef bool is_dictionary(DataType typ) +cpdef bool is_fixed_width(DataType typ) +cpdef bool is_compound(DataType typ) +cpdef bool is_nested(DataType typ) +cpdef bool is_bit_castable(DataType source, DataType target) diff --git a/python/cudf/cudf/_lib/pylibcudf/traits.pyx b/python/cudf/cudf/_lib/pylibcudf/traits.pyx new file mode 100644 index 00000000000..d2370f8d641 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/traits.pyx @@ -0,0 +1,151 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool + +from cudf._lib.pylibcudf.libcudf.utilities cimport traits + +from .types cimport DataType + + +cpdef bool is_relationally_comparable(DataType typ): + """Checks if the given data type supports relational comparisons. + + For details, see :cpp:func:`is_relationally_comparable`. + """ + return traits.is_relationally_comparable(typ.c_obj) + + +cpdef bool is_equality_comparable(DataType typ): + """Checks if the given data type supports equality comparisons. + + For details, see :cpp:func:`is_equality_comparable`. 
+ """ + return traits.is_equality_comparable(typ.c_obj) + + +cpdef bool is_numeric(DataType typ): + """Checks if the given data type is numeric. + + For details, see :cpp:func:`is_numeric`. + """ + return traits.is_numeric(typ.c_obj) + + +cpdef bool is_index_type(DataType typ): + """Checks if the given data type is an index type. + + For details, see :cpp:func:`is_index_type`. + """ + return traits.is_index_type(typ.c_obj) + + +cpdef bool is_unsigned(DataType typ): + """Checks if the given data type is an unsigned type. + + For details, see :cpp:func:`is_unsigned`. + """ + return traits.is_unsigned(typ.c_obj) + + +cpdef bool is_integral(DataType typ): + """Checks if the given data type is an integral type. + + For details, see :cpp:func:`is_integral`. + """ + return traits.is_integral(typ.c_obj) + + +cpdef bool is_integral_not_bool(DataType typ): + """Checks if the given data type is an integral type excluding booleans. + + For details, see :cpp:func:`is_integral_not_bool`. + """ + return traits.is_integral_not_bool(typ.c_obj) + + +cpdef bool is_floating_point(DataType typ): + """Checks if the given data type is a floating point type. + + For details, see :cpp:func:`is_floating_point`. + """ + return traits.is_floating_point(typ.c_obj) + + +cpdef bool is_boolean(DataType typ): + """Checks if the given data type is a boolean type. + + For details, see :cpp:func:`is_boolean`. + """ + return traits.is_boolean(typ.c_obj) + + +cpdef bool is_timestamp(DataType typ): + """Checks if the given data type is a timestamp type. + + For details, see :cpp:func:`is_timestamp`. + """ + return traits.is_timestamp(typ.c_obj) + + +cpdef bool is_fixed_point(DataType typ): + """Checks if the given data type is a fixed point type. + + For details, see :cpp:func:`is_fixed_point`. + """ + return traits.is_fixed_point(typ.c_obj) + + +cpdef bool is_duration(DataType typ): + """Checks if the given data type is a duration type. + + For details, see :cpp:func:`is_duration`. + """ + return traits.is_duration(typ.c_obj) + + +cpdef bool is_chrono(DataType typ): + """Checks if the given data type is a chrono type. + + For details, see :cpp:func:`is_chrono`. + """ + return traits.is_chrono(typ.c_obj) + + +cpdef bool is_dictionary(DataType typ): + """Checks if the given data type is a dictionary type. + + For details, see :cpp:func:`is_dictionary`. + """ + return traits.is_dictionary(typ.c_obj) + + +cpdef bool is_fixed_width(DataType typ): + """Checks if the given data type is a fixed width type. + + For details, see :cpp:func:`is_fixed_width`. + """ + return traits.is_fixed_width(typ.c_obj) + + +cpdef bool is_compound(DataType typ): + """Checks if the given data type is a compound type. + + For details, see :cpp:func:`is_compound`. + """ + return traits.is_compound(typ.c_obj) + + +cpdef bool is_nested(DataType typ): + """Checks if the given data type is a nested type. + + For details, see :cpp:func:`is_nested`. + """ + return traits.is_nested(typ.c_obj) + + +cpdef bool is_bit_castable(DataType source, DataType target): + """Checks if the source type is bit-castable to the target type. + + For details, see :cpp:func:`is_bit_castable`. + """ + return traits.is_bit_castable(source.c_obj, target.c_obj) diff --git a/python/cudf/cudf/_lib/pylibcudf/transform.pxd b/python/cudf/cudf/_lib/pylibcudf/transform.pxd new file mode 100644 index 00000000000..4b21feffe25 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/transform.pxd @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from .column cimport Column +from .gpumemoryview cimport gpumemoryview + + +cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input) diff --git a/python/cudf/cudf/_lib/pylibcudf/transform.pyx b/python/cudf/cudf/_lib/pylibcudf/transform.pyx new file mode 100644 index 00000000000..a734e71b820 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/transform.pyx @@ -0,0 +1,35 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move, pair + +from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer + +from cudf._lib.pylibcudf.libcudf cimport transform as cpp_transform +from cudf._lib.pylibcudf.libcudf.types cimport size_type + +from .column cimport Column +from .gpumemoryview cimport gpumemoryview + + +cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input): + """Create a null mask preserving existing nulls and converting nans to null. + + Parameters + ---------- + input : Column + Column to produce new mask from. + + Returns + ------- + Two-tuple of a gpumemoryview wrapping the null mask and the new null count. + """ + cdef pair[unique_ptr[device_buffer], size_type] c_result + + with nogil: + c_result = move(cpp_transform.nans_to_nulls(input.view())) + + return ( + gpumemoryview(DeviceBuffer.c_from_unique_ptr(move(c_result.first))), + c_result.second + ) diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pyx b/python/cudf/cudf/_lib/pylibcudf/types.pyx index 6dbb287f3c4..c45c6071bb3 100644 --- a/python/cudf/cudf/_lib/pylibcudf/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/types.pyx @@ -2,7 +2,8 @@ from libc.stdint cimport int32_t -from cudf._lib.pylibcudf.libcudf.types cimport data_type, type_id +from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type, type_id +from cudf._lib.pylibcudf.libcudf.utilities.type_dispatcher cimport type_to_id from cudf._lib.pylibcudf.libcudf.types import type_id as TypeId # no-cython-lint, isort:skip from cudf._lib.pylibcudf.libcudf.types import nan_policy as NanPolicy # no-cython-lint, isort:skip @@ -67,3 +68,7 @@ cdef class DataType: cdef DataType ret = DataType.__new__(DataType, type_id.EMPTY) ret.c_obj = dt return ret + + +SIZE_TYPE = DataType(type_to_id[size_type]()) +SIZE_TYPE_ID = SIZE_TYPE.id() diff --git a/python/cudf/cudf/_lib/pylibcudf/unary.pxd b/python/cudf/cudf/_lib/pylibcudf/unary.pxd index 4aa4543bb80..d07df838172 100644 --- a/python/cudf/cudf/_lib/pylibcudf/unary.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/unary.pxd @@ -1,5 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp cimport bool + from cudf._lib.pylibcudf.libcudf.unary cimport unary_operator from .column cimport Column @@ -17,3 +19,5 @@ cpdef Column cast(Column input, DataType data_type) cpdef Column is_nan(Column input) cpdef Column is_not_nan(Column input) + +cpdef bool is_supported_cast(DataType from_, DataType to) diff --git a/python/cudf/cudf/_lib/pylibcudf/unary.pyx b/python/cudf/cudf/_lib/pylibcudf/unary.pyx index 0879b501a49..8da46f0a832 100644 --- a/python/cudf/cudf/_lib/pylibcudf/unary.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/unary.pyx @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -154,3 +155,23 @@ cpdef Column is_not_nan(Column input): result = move(cpp_unary.is_not_nan(input.view())) return Column.from_libcudf(move(result)) + +cpdef bool is_supported_cast(DataType from_, DataType to): + """Check if a cast between datatypes is supported. 
+ + For details, see :cpp:func:`is_supported_cast`. + + Parameters + ---------- + from_ + The source datatype + to + The target datatype + + Returns + ------- + bool + True if the cast is supported. + """ + with nogil: + return cpp_unary.is_supported_cast(from_.c_obj, to.c_obj) diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx index 56bfa0ba332..64634b7a6f9 100644 --- a/python/cudf/cudf/_lib/reduce.pyx +++ b/python/cudf/cudf/_lib/reduce.pyx @@ -1,4 +1,5 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +import warnings import cudf from cudf.core.buffer import acquire_spill_lock @@ -26,11 +27,15 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs): A numpy data type to use for the output, defaults to the same type as the input column """ - - col_dtype = ( - dtype if dtype is not None - else incol._reduction_result_dtype(reduction_op) - ) + if dtype is not None: + warnings.warn( + "dtype is deprecated and will be remove in a future release. " + "Cast the result (e.g. .astype) after the operation instead.", + FutureWarning + ) + col_dtype = dtype + else: + col_dtype = incol._reduction_result_dtype(reduction_op) # check empty case if len(incol) <= incol.null_count: diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index b325173f20d..622725e06a3 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -19,7 +19,8 @@ from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer cimport cudf._lib.pylibcudf.libcudf.transform as libcudf_transform from cudf._lib.column cimport Column -from cudf._lib.expressions cimport Expression +from cudf._lib.pylibcudf cimport transform as plc_transform +from cudf._lib.pylibcudf.expressions cimport Expression from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view from cudf._lib.pylibcudf.libcudf.expressions cimport expression @@ -82,18 +83,10 @@ def mask_to_bools(object mask_buffer, size_type begin_bit, size_type end_bit): @acquire_spill_lock() def nans_to_nulls(Column input): - cdef column_view c_input = input.view() - cdef pair[unique_ptr[device_buffer], size_type] c_output - cdef unique_ptr[device_buffer] c_buffer - - with nogil: - c_output = move(libcudf_transform.nans_to_nulls(c_input)) - c_buffer = move(c_output.first) - - if c_output.second == 0: - return None - - return as_buffer(DeviceBuffer.c_from_unique_ptr(move(c_buffer))) + (mask, _) = plc_transform.nans_to_nulls( + input.to_pylibcudf(mode="read") + ) + return as_buffer(mask) @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index 895e1afc502..253fdf7b0d9 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -21,8 +21,6 @@ from cudf._lib.types cimport ( import cudf from cudf._lib import pylibcudf -size_type_dtype = np.dtype("int32") - class TypeId(IntEnum): EMPTY = libcudf_types.type_id.EMPTY @@ -150,6 +148,8 @@ datetime_unit_map = { TypeId.TIMESTAMP_NANOSECONDS: "ns", } +size_type_dtype = LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[pylibcudf.types.SIZE_TYPE_ID] + class Interpolation(IntEnum): LINEAR = ( @@ -239,6 +239,9 @@ cdef dtype_from_column_view(column_view cv): ] cdef libcudf_types.data_type dtype_to_data_type(dtype) except *: + # Note: This function is to be phased out in favor of + # dtype_to_pylibcudf_type which will return a pylibcudf + # DataType object cdef libcudf_types.type_id tid if isinstance(dtype, cudf.ListDtype): tid 
= libcudf_types.type_id.LIST diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd index 99850d549a1..1d55f7218dc 100644 --- a/python/cudf/cudf/_lib/utils.pxd +++ b/python/cudf/cudf/_lib/utils.pxd @@ -19,3 +19,4 @@ cdef table_view table_view_from_table(tbl, ignore_index=*) except* cdef columns_from_unique_ptr(unique_ptr[table] c_tbl) cdef columns_from_table_view(table_view tv, object owners) cdef columns_from_pylibcudf_table(tbl) +cdef _data_from_columns(columns, column_names, index_names=*) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index de6b9f690b6..f136cd997a7 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -322,7 +322,7 @@ cdef data_from_pylibcudf_io(tbl_with_meta): """ return _data_from_columns( columns=[Column.from_pylibcudf(plc) for plc in tbl_with_meta.columns], - column_names=tbl_with_meta.column_names, + column_names=tbl_with_meta.column_names(include_children=False), index_names=None ) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index d97e9c815b6..294ae2fd985 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -90,7 +90,7 @@ def is_integer(obj): bool """ if isinstance(obj, cudf.Scalar): - return pd.api.types.is_integer_dtype(obj.dtype) + return obj.dtype.kind in "iu" return pd.api.types.is_integer(obj) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index e160fa697ee..c38352009de 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -19,15 +19,7 @@ ) from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default -from cudf.api.types import ( - is_bool_dtype, - is_integer, - is_integer_dtype, - is_list_like, - is_scalar, - is_signed_integer_dtype, - is_unsigned_integer_dtype, -) +from cudf.api.types import is_integer, is_list_like, is_scalar from cudf.core.abc import Serializable from cudf.core.column import ColumnBase, column from cudf.errors import MixedTypeError @@ -38,6 +30,8 @@ if TYPE_CHECKING: from collections.abc import Generator + import cupy + from cudf.core.column_accessor import ColumnAccessor @@ -61,6 +55,12 @@ def copy(self, deep: bool = True) -> Self: def __len__(self): raise NotImplementedError + def __bool__(self): + raise ValueError( + f"The truth value of a {type(self).__name__} is ambiguous. Use " + "a.empty, a.bool(), a.item(), a.any() or a.all()." + ) + @property def size(self): # The size of an index is always its length irrespective of dimension. @@ -608,20 +608,14 @@ def union(self, other, sort=None): ) if cudf.get_option("mode.pandas_compatible"): - if ( - is_bool_dtype(self.dtype) and not is_bool_dtype(other.dtype) - ) or ( - not is_bool_dtype(self.dtype) and is_bool_dtype(other.dtype) + if (self.dtype.kind == "b" and other.dtype.kind != "b") or ( + self.dtype.kind != "b" and other.dtype.kind == "b" ): # Bools + other types will result in mixed type. # This is not yet consistent in pandas and specific to APIs. raise MixedTypeError("Cannot perform union with mixed types") - if ( - is_signed_integer_dtype(self.dtype) - and is_unsigned_integer_dtype(other.dtype) - ) or ( - is_unsigned_integer_dtype(self.dtype) - and is_signed_integer_dtype(other.dtype) + if (self.dtype.kind == "i" and other.dtype.kind == "u") or ( + self.dtype.kind == "u" and other.dtype.kind == "i" ): # signed + unsigned types will result in # mixed type for union in pandas. 
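The index changes above replace pandas-style is_*_dtype helpers with plain NumPy dtype.kind comparisons. For reference, a short illustration of the kind codes these checks rely on.

    # NumPy dtype.kind codes relied on by the checks above:
    #   "b" boolean, "i" signed integer, "u" unsigned integer,
    #   "f" floating point, "M" datetime64, "m" timedelta64
    import numpy as np

    print(np.dtype("bool").kind)            # "b"
    print(np.dtype("uint8").kind)           # "u"
    print(np.dtype("uint8").kind in "iu")   # True: integer, signed or unsigned
    print(np.dtype("datetime64[ns]").kind)  # "M"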
@@ -2001,7 +1995,7 @@ def drop_duplicates( self._column_names, ) - def duplicated(self, keep="first"): + def duplicated(self, keep="first") -> cupy.ndarray: """ Indicate duplicate index values. @@ -2098,7 +2092,7 @@ def _gather(self, gather_map, nullify=False, check_bounds=True): # TODO: For performance, the check and conversion of gather map should # be done by the caller. This check will be removed in future release. - if not is_integer_dtype(gather_map.dtype): + if gather_map.dtype.kind not in "iu": gather_map = gather_map.astype(size_type_dtype) if not _gather_map_is_valid( @@ -2152,7 +2146,7 @@ def _apply_boolean_mask(self, boolean_mask): Rows corresponding to `False` is dropped. """ boolean_mask = cudf.core.column.as_column(boolean_mask) - if not is_bool_dtype(boolean_mask.dtype): + if boolean_mask.dtype.kind != "b": raise ValueError("boolean_mask is not boolean type.") return self._from_columns_like_self( diff --git a/python/cudf/cudf/core/_internals/expressions.py b/python/cudf/cudf/core/_internals/expressions.py index 393a68dd844..63714a78572 100644 --- a/python/cudf/cudf/core/_internals/expressions.py +++ b/python/cudf/cudf/core/_internals/expressions.py @@ -4,7 +4,10 @@ import ast import functools -from cudf._lib.expressions import ( +import pyarrow as pa + +import cudf._lib.pylibcudf as plc +from cudf._lib.pylibcudf.expressions import ( ASTOperator, ColumnReference, Expression, @@ -122,7 +125,9 @@ def visit_Constant(self, node): f"Unsupported literal {repr(node.value)} of type " "{type(node.value).__name__}" ) - self.stack.append(Literal(node.value)) + self.stack.append( + Literal(plc.interop.from_arrow(pa.scalar(node.value))) + ) def visit_UnaryOp(self, node): self.visit(node.operand) @@ -132,7 +137,7 @@ def visit_UnaryOp(self, node): # operand, so there's no way to know whether this should be a float # or an int. We should maybe see what Spark does, and this will # probably require casting. - self.nodes.append(Literal(-1)) + self.nodes.append(Literal(plc.interop.from_arrow(pa.scalar(-1)))) op = ASTOperator.MUL self.stack.append(Operation(op, self.nodes[-1], self.nodes[-2])) elif isinstance(node.op, ast.UAdd): diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 44ce0ddef25..18ab32d2c9e 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -7,18 +7,9 @@ import numpy as np import cudf -from cudf.api.types import ( - _is_non_decimal_numeric_dtype, - is_bool_dtype, - is_scalar, -) +from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar from cudf.core.dtypes import CategoricalDtype -from cudf.utils.dtypes import ( - _can_cast, - _dtype_can_hold_element, - find_common_type, - is_mixed_with_object_dtype, -) +from cudf.utils.dtypes import find_common_type, is_mixed_with_object_dtype if TYPE_CHECKING: from cudf._typing import ScalarLike @@ -48,19 +39,25 @@ def _check_and_cast_columns_with_other( inplace: bool, ) -> tuple[ColumnBase, ScalarLike | ColumnBase]: # Returns type-casted `source_col` & `other` based on `inplace`. 
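With the expressions.py change above, AST literals are wrapped as pylibcudf scalars via plc.interop.from_arrow before Literal nodes are built. A hand-rolled sketch of the same construction; the specific column index and operands are illustrative only.

    # Build expression nodes the way visit_Constant/visit_UnaryOp now do,
    # e.g. something like "col0 * -1 + 2" (operands are illustrative).
    import pyarrow as pa

    import cudf._lib.pylibcudf as plc
    from cudf._lib.pylibcudf.expressions import (
        ASTOperator,
        ColumnReference,
        Literal,
        Operation,
    )

    col0 = ColumnReference(0)
    minus_one = Literal(plc.interop.from_arrow(pa.scalar(-1)))
    two = Literal(plc.interop.from_arrow(pa.scalar(2)))

    negated = Operation(ASTOperator.MUL, col0, minus_one)
    expr = Operation(ASTOperator.ADD, negated, two)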
+ from cudf.core.column import as_column + source_dtype = source_col.dtype if isinstance(source_dtype, CategoricalDtype): return _normalize_categorical(source_col, other) other_is_scalar = is_scalar(other) if other_is_scalar: - if (isinstance(other, float) and not np.isnan(other)) and ( - source_dtype.type(other) != other - ): - raise TypeError( - f"Cannot safely cast non-equivalent " - f"{type(other).__name__} to {source_dtype.name}" - ) + if isinstance(other, (float, np.floating)) and not np.isnan(other): + try: + is_safe = source_dtype.type(other) == other + except OverflowError: + is_safe = False + + if not is_safe: + raise TypeError( + f"Cannot safely cast non-equivalent " + f"{type(other).__name__} to {source_dtype.name}" + ) if cudf.utils.utils.is_na_like(other): return _normalize_categorical( @@ -84,15 +81,9 @@ def _check_and_cast_columns_with_other( ) return _normalize_categorical(source_col, other.astype(source_dtype)) - if _is_non_decimal_numeric_dtype(source_dtype) and _can_cast( - other, source_dtype - ): - common_dtype = source_dtype - elif ( - isinstance(source_col, cudf.core.column.NumericalColumn) - and other_is_scalar - and _dtype_can_hold_element(source_dtype, other) - ): + if _is_non_decimal_numeric_dtype(source_dtype) and as_column( + other + ).can_cast_safely(source_dtype): common_dtype = source_dtype else: common_dtype = find_common_type( @@ -106,7 +97,7 @@ def _check_and_cast_columns_with_other( other = cudf.Scalar(other) if is_mixed_with_object_dtype(other, source_col) or ( - is_bool_dtype(source_dtype) and not is_bool_dtype(common_dtype) + source_dtype.kind == "b" and common_dtype.kind != "b" ): raise TypeError(mixed_err) @@ -128,3 +119,58 @@ def _make_categorical_like(result, column): ordered=column.ordered, ) return result + + +def _can_cast(from_dtype, to_dtype): + """ + Utility function to determine if we can cast + from `from_dtype` to `to_dtype`. This function primarily calls + `np.can_cast` but with some special handling around + cudf specific dtypes. 
+ """ + if cudf.utils.utils.is_na_like(from_dtype): + return True + if isinstance(from_dtype, type): + from_dtype = cudf.dtype(from_dtype) + if isinstance(to_dtype, type): + to_dtype = cudf.dtype(to_dtype) + + # TODO : Add precision & scale checking for + # decimal types in future + + if isinstance(from_dtype, cudf.core.dtypes.DecimalDtype): + if isinstance(to_dtype, cudf.core.dtypes.DecimalDtype): + return True + elif isinstance(to_dtype, np.dtype): + if to_dtype.kind in {"i", "f", "u", "U", "O"}: + return True + else: + return False + elif isinstance(from_dtype, np.dtype): + if isinstance(to_dtype, np.dtype): + return np.can_cast(from_dtype, to_dtype) + elif isinstance(to_dtype, cudf.core.dtypes.DecimalDtype): + if from_dtype.kind in {"i", "f", "u", "U", "O"}: + return True + else: + return False + elif isinstance(to_dtype, cudf.core.types.CategoricalDtype): + return True + else: + return False + elif isinstance(from_dtype, cudf.core.dtypes.ListDtype): + # TODO: Add level based checks too once casting of + # list columns is supported + if isinstance(to_dtype, cudf.core.dtypes.ListDtype): + return np.can_cast(from_dtype.leaf_type, to_dtype.leaf_type) + else: + return False + elif isinstance(from_dtype, cudf.core.dtypes.CategoricalDtype): + if isinstance(to_dtype, cudf.core.dtypes.CategoricalDtype): + return True + elif isinstance(to_dtype, np.dtype): + return np.can_cast(from_dtype._categories.dtype, to_dtype) + else: + return False + else: + return np.can_cast(from_dtype, to_dtype) diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index e8b82ff60c2..6c69fbd2637 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -1,17 +1,22 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +from __future__ import annotations + import warnings +from typing import TYPE_CHECKING import cupy as cp import numpy as np from cudf.core.column import as_column -from cudf.core.copy_types import BooleanMask from cudf.core.index import RangeIndex, ensure_index -from cudf.core.indexed_frame import IndexedFrame from cudf.core.scalar import Scalar from cudf.options import get_option from cudf.utils.dtypes import can_convert_to_column +if TYPE_CHECKING: + from cudf.core.column.column import ColumnBase + from cudf.core.index import BaseIndex + def factorize(values, sort=False, use_na_sentinel=True, size_hint=None): """Encode the input values as integer labels @@ -110,55 +115,31 @@ def factorize(values, sort=False, use_na_sentinel=True, size_hint=None): return labels, cats.values if return_cupy_array else ensure_index(cats) -def _linear_interpolation(column, index=None): - """ - Interpolate over a float column. Implicitly assumes that values are - evenly spaced with respect to the x-axis, for example the data - [1.0, NaN, 3.0] will be interpolated assuming the NaN is half way - between the two valid values, yielding [1.0, 2.0, 3.0] - """ - - index = RangeIndex(start=0, stop=len(column), step=1) - return _index_or_values_interpolation(column, index=index) - - -def _index_or_values_interpolation(column, index=None): +def _interpolation(column: ColumnBase, index: BaseIndex) -> ColumnBase: """ Interpolate over a float column. assumes a linear interpolation strategy using the index of the data to denote spacing of the x values. For example the data and index [1.0, NaN, 4.0], [1, 3, 4] - would result in [1.0, 3.0, 4.0] + would result in [1.0, 3.0, 4.0]. 
""" # figure out where the nans are - mask = cp.isnan(column) + mask = column.isnull() # trivial cases, all nan or no nans - num_nan = mask.sum() - if num_nan == 0 or num_nan == len(column): - return column + if not mask.any() or mask.all(): + return column.copy() - to_interp = IndexedFrame(data={None: column}, index=index) - known_x_and_y = to_interp._apply_boolean_mask( - BooleanMask(~mask, len(to_interp)) - ) - - known_x = known_x_and_y.index.to_cupy() - known_y = known_x_and_y._data.columns[0].values + valid_locs = ~mask + if isinstance(index, RangeIndex): + # Each point is evenly spaced, index values don't matter + known_x = cp.flatnonzero(valid_locs.values) + else: + known_x = index._column.apply_boolean_mask(valid_locs).values # type: ignore[attr-defined] + known_y = column.apply_boolean_mask(valid_locs).values result = cp.interp(index.to_cupy(), known_x, known_y) # find the first nan - first_nan_idx = (mask == 0).argmax().item() + first_nan_idx = valid_locs.values.argmax().item() result[:first_nan_idx] = np.nan - return result - - -def get_column_interpolator(method): - interpolator = { - "linear": _linear_interpolation, - "index": _index_or_values_interpolation, - "values": _index_or_values_interpolation, - }.get(method, None) - if not interpolator: - raise ValueError(f"Interpolation method `{method}` not found") - return interpolator + return as_column(result) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 231af30c06d..9aaccca349d 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -47,7 +47,9 @@ ) -_DEFAULT_CATEGORICAL_VALUE = -1 +# Using np.int8(-1) to allow silent wrap-around when casting to uint +# it may make sense to make this dtype specific or a function. +_DEFAULT_CATEGORICAL_VALUE = np.int8(-1) class CategoricalAccessor(ColumnMethods): @@ -1113,24 +1115,18 @@ def is_monotonic_decreasing(self) -> bool: def as_categorical_column(self, dtype: Dtype) -> CategoricalColumn: if isinstance(dtype, str) and dtype == "category": return self + if isinstance(dtype, pd.CategoricalDtype): + dtype = cudf.CategoricalDtype.from_pandas(dtype) if ( - isinstance( - dtype, (cudf.core.dtypes.CategoricalDtype, pd.CategoricalDtype) - ) - and (dtype.categories is None) - and (dtype.ordered is None) + isinstance(dtype, cudf.CategoricalDtype) + and dtype.categories is None + and dtype.ordered is None ): return self - - if isinstance(dtype, pd.CategoricalDtype): - dtype = CategoricalDtype( - categories=dtype.categories, ordered=dtype.ordered - ) - - if not isinstance(dtype, CategoricalDtype): + elif not isinstance(dtype, CategoricalDtype): raise ValueError("dtype must be CategoricalDtype") - if not isinstance(self.categories, type(dtype.categories._values)): + if not isinstance(self.categories, type(dtype.categories._column)): # If both categories are of different Column types, # return a column full of Nulls. 
return _create_empty_categorical_column(self, dtype) @@ -1142,26 +1138,14 @@ def as_categorical_column(self, dtype: Dtype) -> CategoricalColumn: def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: return self._get_decategorized_column().as_numerical_column(dtype) - def as_string_column( - self, dtype, format: str | None = None - ) -> StringColumn: - return self._get_decategorized_column().as_string_column( - dtype, format=format - ) + def as_string_column(self) -> StringColumn: + return self._get_decategorized_column().as_string_column() - def as_datetime_column( - self, dtype, format: str | None = None - ) -> DatetimeColumn: - return self._get_decategorized_column().as_datetime_column( - dtype, format - ) + def as_datetime_column(self, dtype: Dtype) -> DatetimeColumn: + return self._get_decategorized_column().as_datetime_column(dtype) - def as_timedelta_column( - self, dtype, format: str | None = None - ) -> TimeDeltaColumn: - return self._get_decategorized_column().as_timedelta_column( - dtype, format - ) + def as_timedelta_column(self, dtype: Dtype) -> TimeDeltaColumn: + return self._get_decategorized_column().as_timedelta_column(dtype) def _get_decategorized_column(self) -> ColumnBase: if self.null_count == len(self): diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index e7a2863da8c..32e6aade65b 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -41,7 +41,6 @@ _is_non_decimal_numeric_dtype, _is_pandas_nullable_extension_dtype, infer_dtype, - is_bool_dtype, is_dtype_equal, is_scalar, is_string_dtype, @@ -72,7 +71,7 @@ get_time_unit, is_column_like, is_mixed_with_object_dtype, - min_scalar_type, + min_signed_type, min_unsigned_type, ) from cudf.utils.utils import _array_ufunc, mask_dtype @@ -262,7 +261,7 @@ def all(self, skipna: bool = True) -> bool: if self.null_count == self.size: return True - return libcudf.reduce.reduce("all", self, dtype=np.bool_) + return libcudf.reduce.reduce("all", self) def any(self, skipna: bool = True) -> bool: # Early exit for fast cases. 
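The column-level conversions above drop their dtype/format parameters: explicit formatting now flows through the datetime column's strftime (added later in this diff), while astype keeps choosing a default format. A user-level sketch of the split, assuming standard cudf Series APIs.

    import cudf

    s = cudf.Series(cudf.date_range("2024-01-01", periods=3, freq="D"))

    # Explicit format: handled by the strftime path.
    print(s.dt.strftime("%d/%m/%Y"))

    # No format: astype("str") falls back to a default format for the dtype.
    print(s.astype("str"))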
@@ -272,10 +271,13 @@ def any(self, skipna: bool = True) -> bool: elif skipna and self.null_count == self.size: return False - return libcudf.reduce.reduce("any", self, dtype=np.bool_) + return libcudf.reduce.reduce("any", self) def dropna(self) -> Self: - return drop_nulls([self])[0]._with_type_metadata(self.dtype) + if self.has_nulls(): + return drop_nulls([self])[0]._with_type_metadata(self.dtype) + else: + return self.copy() def to_arrow(self) -> pa.Array: """Convert to PyArrow Array @@ -619,7 +621,7 @@ def _scatter_by_column( key: cudf.core.column.NumericalColumn, value: cudf.core.scalar.Scalar | ColumnBase, ) -> Self: - if is_bool_dtype(key.dtype): + if key.dtype.kind == "b": # `key` is boolean mask if len(key) != len(self): raise ValueError( @@ -644,7 +646,7 @@ def _scatter_by_column( self._check_scatter_key_length(num_keys, value) - if is_bool_dtype(key.dtype): + if key.dtype.kind == "b": return libcudf.copying.boolean_mask_scatter([value], [self], key)[ 0 ]._with_type_metadata(self.dtype) @@ -700,6 +702,9 @@ def fillna( def isnull(self) -> ColumnBase: """Identify missing values in a Column.""" + if not self.has_nulls(include_nan=self.dtype.kind == "f"): + return as_column(False, length=len(self)) + result = libcudf.unary.is_null(self) if self.dtype.kind == "f": @@ -711,6 +716,9 @@ def isnull(self) -> ColumnBase: def notnull(self) -> ColumnBase: """Identify non-missing values in a Column.""" + if not self.has_nulls(include_nan=self.dtype.kind == "f"): + return as_column(True, length=len(self)) + result = libcudf.unary.is_valid(self) if self.dtype.kind == "f": @@ -721,7 +729,7 @@ def notnull(self) -> ColumnBase: return result def indices_of( - self, value: ScalarLike | Self + self, value: ScalarLike ) -> cudf.core.column.NumericalColumn: """ Find locations of value in the column @@ -735,10 +743,10 @@ def indices_of( ------- Column of indices that match value """ - if not isinstance(value, ColumnBase): - value = as_column([value], dtype=self.dtype) + if not is_scalar(value): + raise ValueError("value must be a scalar") else: - assert len(value) == 1 + value = as_column(value, dtype=self.dtype, length=1) mask = libcudf.search.contains(value, self) return apply_boolean_mask( [as_column(range(0, len(self)), dtype=size_type_dtype)], mask @@ -923,15 +931,16 @@ def as_mask(self) -> Buffer: @property def is_unique(self) -> bool: + # distinct_count might already be cached return self.distinct_count(dropna=False) == len(self) - @property + @cached_property def is_monotonic_increasing(self) -> bool: return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted( [self], [True], None ) - @property + @cached_property def is_monotonic_decreasing(self) -> bool: return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted( [self], [False], None @@ -942,6 +951,10 @@ def sort_values( ascending: bool = True, na_position: str = "last", ) -> ColumnBase: + if (not ascending and self.is_monotonic_decreasing) or ( + ascending and self.is_monotonic_increasing + ): + return self.copy() return libcudf.sort.sort( [self], column_order=[ascending], null_precedence=[na_position] )[0] @@ -962,59 +975,59 @@ def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase: if len(self) == 0: dtype = cudf.dtype(dtype) if self.dtype == dtype: - if copy: - return self.copy() - else: - return self + result = self else: - return column_empty(0, dtype=dtype, masked=self.nullable) - if copy: - col = self.copy() - else: - col = self - if dtype == "category": + result = column_empty(0, dtype=dtype, 
masked=self.nullable) + elif dtype == "category": # TODO: Figure out why `cudf.dtype("category")` # astype's different than just the string - return col.as_categorical_column(dtype) + result = self.as_categorical_column(dtype) elif ( isinstance(dtype, str) and dtype == "interval" and isinstance(self.dtype, cudf.IntervalDtype) ): # astype("interval") (the string only) should no-op - return col - was_object = dtype == object or dtype == np.dtype(object) - dtype = cudf.dtype(dtype) - if self.dtype == dtype: - return col - elif isinstance(dtype, CategoricalDtype): - return col.as_categorical_column(dtype) - elif isinstance(dtype, IntervalDtype): - return col.as_interval_column(dtype) - elif isinstance(dtype, (ListDtype, StructDtype)): - if not col.dtype == dtype: - raise NotImplementedError( - f"Casting {self.dtype} columns not currently supported" - ) - return col - elif isinstance(dtype, cudf.core.dtypes.DecimalDtype): - return col.as_decimal_column(dtype) - elif dtype.kind == "M": - return col.as_datetime_column(dtype) - elif dtype.kind == "m": - return col.as_timedelta_column(dtype) - elif dtype.kind == "O": - if cudf.get_option("mode.pandas_compatible") and was_object: - raise ValueError( - f"Casting to {dtype} is not supported, use " - "`.astype('str')` instead." - ) - return col.as_string_column(dtype) + result = self else: - return col.as_numerical_column(dtype) + was_object = dtype == object or dtype == np.dtype(object) + dtype = cudf.dtype(dtype) + if self.dtype == dtype: + result = self + elif isinstance(dtype, CategoricalDtype): + result = self.as_categorical_column(dtype) + elif isinstance(dtype, IntervalDtype): + result = self.as_interval_column(dtype) + elif isinstance(dtype, (ListDtype, StructDtype)): + if not self.dtype == dtype: + raise NotImplementedError( + f"Casting {self.dtype} columns not currently supported" + ) + result = self + elif isinstance(dtype, cudf.core.dtypes.DecimalDtype): + result = self.as_decimal_column(dtype) + elif dtype.kind == "M": + result = self.as_datetime_column(dtype) + elif dtype.kind == "m": + result = self.as_timedelta_column(dtype) + elif dtype.kind == "O": + if cudf.get_option("mode.pandas_compatible") and was_object: + raise ValueError( + f"Casting to {dtype} is not supported, use " + "`.astype('str')` instead." 
+ ) + result = self.as_string_column() + else: + result = self.as_numerical_column(dtype) + + if copy and result is self: + return result.copy() + return result def as_categorical_column(self, dtype) -> ColumnBase: - if isinstance(dtype, (cudf.CategoricalDtype, pd.CategoricalDtype)): + if isinstance(dtype, pd.CategoricalDtype): + dtype = cudf.CategoricalDtype.from_pandas(dtype) + if isinstance(dtype, cudf.CategoricalDtype): ordered = dtype.ordered else: ordered = False @@ -1023,14 +1036,11 @@ def as_categorical_column(self, dtype) -> ColumnBase: if ( isinstance(dtype, cudf.CategoricalDtype) and dtype._categories is not None - ) or ( - isinstance(dtype, pd.CategoricalDtype) - and dtype.categories is not None ): - labels = self._label_encoding(cats=as_column(dtype.categories)) - + cat_col = dtype._categories + labels = self._label_encoding(cats=cat_col) return build_categorical_column( - categories=as_column(dtype.categories), + categories=cat_col, codes=labels, mask=self.mask, ordered=dtype.ordered, @@ -1062,8 +1072,8 @@ def as_numerical_column( raise NotImplementedError def as_datetime_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.DatetimeColumn": + self, dtype: Dtype + ) -> cudf.core.column.DatetimeColumn: raise NotImplementedError def as_interval_column( @@ -1072,13 +1082,11 @@ def as_interval_column( raise NotImplementedError def as_timedelta_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.TimeDeltaColumn": + self, dtype: Dtype + ) -> cudf.core.column.TimeDeltaColumn: raise NotImplementedError - def as_string_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.StringColumn": + def as_string_column(self) -> cudf.core.column.StringColumn: raise NotImplementedError def as_decimal_column( @@ -1088,7 +1096,7 @@ def as_decimal_column( def apply_boolean_mask(self, mask) -> ColumnBase: mask = as_column(mask) - if not is_bool_dtype(mask.dtype): + if mask.dtype.kind != "b": raise ValueError("boolean_mask is not boolean type.") return apply_boolean_mask([self], mask)[0]._with_type_metadata( @@ -1096,11 +1104,22 @@ def apply_boolean_mask(self, mask) -> ColumnBase: ) def argsort( - self, ascending: bool = True, na_position: str = "last" - ) -> "cudf.core.column.NumericalColumn": - return libcudf.sort.order_by( - [self], [ascending], na_position, stable=True - ) + self, + ascending: bool = True, + na_position: Literal["first", "last"] = "last", + ) -> cudf.core.column.NumericalColumn: + if (ascending and self.is_monotonic_increasing) or ( + not ascending and self.is_monotonic_decreasing + ): + return as_column(range(len(self))) + elif (ascending and self.is_monotonic_decreasing) or ( + not ascending and self.is_monotonic_increasing + ): + return as_column(range(len(self) - 1, -1, -1)) + else: + return libcudf.sort.order_by( + [self], [ascending], na_position, stable=True + ) def __arrow_array__(self, type=None): raise TypeError( @@ -1163,9 +1182,12 @@ def unique(self) -> ColumnBase: """ Get unique values in the data """ - return drop_duplicates([self], keep="first")[0]._with_type_metadata( - self.dtype - ) + if self.is_unique: + return self.copy() + else: + return drop_duplicates([self], keep="first")[ + 0 + ]._with_type_metadata(self.dtype) def serialize(self) -> tuple[dict, list]: # data model: @@ -1283,7 +1305,10 @@ def _reduce( skipna=skipna, min_count=min_count ) if isinstance(preprocessed, ColumnBase): - return libcudf.reduce.reduce(op, preprocessed, **kwargs) + dtype = kwargs.pop("dtype", None) + 
return libcudf.reduce.reduce( + op, preprocessed, dtype=dtype, **kwargs + ) return preprocessed def _process_for_reduction( @@ -1314,6 +1339,8 @@ def _reduction_result_dtype(self, reduction_op: str) -> Dtype: Determine the correct dtype to pass to libcudf based on the input dtype, data dtype, and specific reduction op """ + if reduction_op in {"any", "all"}: + return np.dtype(np.bool_) return self.dtype def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase: @@ -1329,7 +1356,7 @@ def _label_encoding( self, cats: ColumnBase, dtype: Dtype | None = None, - na_sentinel: ScalarLike | None = None, + na_sentinel: cudf.Scalar | None = None, ): """ Convert each value in `self` into an integer code, with `cats` @@ -1369,7 +1396,7 @@ def _return_sentinel_column(): return as_column(na_sentinel, dtype=dtype, length=len(self)) if dtype is None: - dtype = min_scalar_type(max(len(cats), na_sentinel), 8) + dtype = min_signed_type(max(len(cats), na_sentinel.value), 8) if is_mixed_with_object_dtype(self, cats): return _return_sentinel_column() @@ -1431,9 +1458,10 @@ def column_empty_like( return column_empty(row_count, dtype, masked) -def _has_any_nan(arbitrary): +def _has_any_nan(arbitrary: pd.Series | np.ndarray) -> bool: + """Check if an object dtype Series or array contains NaN.""" return any( - ((isinstance(x, float) or isinstance(x, np.floating)) and np.isnan(x)) + isinstance(x, (float, np.floating)) and np.isnan(x) for x in np.asarray(arbitrary) ) @@ -2191,25 +2219,26 @@ def as_column( and arbitrary.null_count > 0 ): arbitrary = arbitrary.cast(pa.float64()) - if cudf.get_option( - "default_integer_bitwidth" - ) and pa.types.is_integer(arbitrary.type): - dtype = _maybe_convert_to_default_type("int") - elif cudf.get_option( - "default_float_bitwidth" - ) and pa.types.is_floating(arbitrary.type): - dtype = _maybe_convert_to_default_type("float") + if ( + cudf.get_option("default_integer_bitwidth") + and pa.types.is_integer(arbitrary.type) + ) or ( + cudf.get_option("default_float_bitwidth") + and pa.types.is_floating(arbitrary.type) + ): + dtype = _maybe_convert_to_default_type( + cudf.dtype(arbitrary.type.to_pandas_dtype()) + ) except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError): arbitrary = pd.Series(arbitrary) - if cudf.get_option( - "default_integer_bitwidth" - ) and arbitrary.dtype.kind in set("iu"): - dtype = _maybe_convert_to_default_type("int") - elif ( + if ( + cudf.get_option("default_integer_bitwidth") + and arbitrary.dtype.kind in set("iu") + ) or ( cudf.get_option("default_float_bitwidth") and arbitrary.dtype.kind == "f" ): - dtype = _maybe_convert_to_default_type("float") + dtype = _maybe_convert_to_default_type(arbitrary.dtype) return as_column(arbitrary, nan_as_null=nan_as_null, dtype=dtype) @@ -2285,9 +2314,8 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: # Notice, we can always cast pure null columns not_null_col_dtypes = [o.dtype for o in objs if o.null_count != len(o)] if len(not_null_col_dtypes) and all( - _is_non_decimal_numeric_dtype(dtyp) - and np.issubdtype(dtyp, np.datetime64) - for dtyp in not_null_col_dtypes + _is_non_decimal_numeric_dtype(dtype) and dtype.kind == "M" + for dtype in not_null_col_dtypes ): common_dtype = find_common_type(not_null_col_dtypes) # Cast all columns to the common dtype diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index c10aceba9f4..73902789c11 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -18,7 
+18,6 @@ from cudf import _lib as libcudf from cudf._lib.labeling import label_bins from cudf._lib.search import search_sorted -from cudf.api.types import is_datetime64_dtype, is_timedelta64_dtype from cudf.core._compat import PANDAS_GE_220 from cudf.core._internals.timezones import ( check_ambiguous_and_nonexistent, @@ -178,43 +177,6 @@ def _resolve_mixed_dtypes( return cudf.dtype(f"{base_type}[{units[max(lhs_unit, rhs_unit)]}]") -def _get_datetime_format(col, dtype, time_unit): - format = _dtype_to_format_conversion.get(dtype.name, "%Y-%m-%d %H:%M:%S") - if format.endswith("f"): - sub_second_res_len = 3 - else: - sub_second_res_len = 0 - - has_nanos = time_unit in {"ns"} and col.get_dt_field("nanosecond").any() - has_micros = ( - time_unit in {"ns", "us"} and col.get_dt_field("microsecond").any() - ) - has_millis = ( - time_unit in {"ns", "us", "ms"} - and col.get_dt_field("millisecond").any() - ) - has_seconds = col.get_dt_field("second").any() - has_minutes = col.get_dt_field("minute").any() - has_hours = col.get_dt_field("hour").any() - if sub_second_res_len: - if has_nanos: - # format should be intact and rest of the - # following conditions shouldn't execute. - pass - elif has_micros: - format = format[:-sub_second_res_len] + "%6f" - elif has_millis: - format = format[:-sub_second_res_len] + "%3f" - elif has_seconds or has_minutes or has_hours: - format = format[:-4] - else: - format = format.split(" ")[0] - else: - if not (has_seconds or has_minutes or has_hours): - format = format.split(" ")[0] - return format - - class DatetimeColumn(column.ColumnBase): """ A Column implementation for Date-time types. @@ -381,9 +343,7 @@ def round(self, freq: str) -> ColumnBase: def isocalendar(self) -> dict[str, ColumnBase]: return { - field: self.as_string_column("str", format=directive).astype( - "uint32" - ) + field: self.strftime(format=directive).astype("uint32") for field, directive in zip( ["year", "week", "day"], ["%G", "%V", "%u"] ) @@ -445,17 +405,12 @@ def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike: return NotImplemented - def as_datetime_column( - self, dtype: Dtype, format: str | None = None - ) -> DatetimeColumn: - dtype = cudf.dtype(dtype) + def as_datetime_column(self, dtype: Dtype) -> DatetimeColumn: if dtype == self.dtype: return self return libcudf.unary.cast(self, dtype=dtype) - def as_timedelta_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.TimeDeltaColumn": + def as_timedelta_column(self, dtype: Dtype) -> None: # type: ignore[override] raise TypeError( f"cannot astype a datetimelike from {self.dtype} to {dtype}" ) @@ -472,40 +427,69 @@ def as_numerical_column( ) return cast("cudf.core.column.NumericalColumn", col.astype(dtype)) - def as_string_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.StringColumn": - if format is None: - format = _dtype_to_format_conversion.get( - self.dtype.name, "%Y-%m-%d %H:%M:%S" + def strftime(self, format: str) -> cudf.core.column.StringColumn: + if len(self) == 0: + return cast( + cudf.core.column.StringColumn, + column.column_empty(0, dtype="object", masked=False), ) - if cudf.get_option("mode.pandas_compatible"): - format = _get_datetime_format( - self, dtype=self.dtype, time_unit=self.time_unit - ) if format in _DATETIME_SPECIAL_FORMATS: names = as_column(_DATETIME_NAMES) else: names = cudf.core.column.column_empty( 0, dtype="object", masked=False ) - if len(self) > 0: - return string._datetime_to_str_typecast_functions[ - cudf.dtype(self.dtype) - 
](self, format, names) - else: - return cast( - "cudf.core.column.StringColumn", - column.column_empty(0, dtype="object", masked=False), - ) + return string._datetime_to_str_typecast_functions[self.dtype]( + self, format, names + ) + + def as_string_column(self) -> cudf.core.column.StringColumn: + format = _dtype_to_format_conversion.get( + self.dtype.name, "%Y-%m-%d %H:%M:%S" + ) + if cudf.get_option("mode.pandas_compatible"): + if format.endswith("f"): + sub_second_res_len = 3 + else: + sub_second_res_len = 0 - def mean( - self, skipna=None, min_count: int = 0, dtype=np.float64 - ) -> ScalarLike: + has_nanos = ( + self.time_unit in {"ns"} + and self.get_dt_field("nanosecond").any() + ) + has_micros = ( + self.time_unit in {"ns", "us"} + and self.get_dt_field("microsecond").any() + ) + has_millis = ( + self.time_unit in {"ns", "us", "ms"} + and self.get_dt_field("millisecond").any() + ) + has_seconds = self.get_dt_field("second").any() + has_minutes = self.get_dt_field("minute").any() + has_hours = self.get_dt_field("hour").any() + if sub_second_res_len: + if has_nanos: + # format should be intact and rest of the + # following conditions shouldn't execute. + pass + elif has_micros: + format = format[:-sub_second_res_len] + "%6f" + elif has_millis: + format = format[:-sub_second_res_len] + "%3f" + elif has_seconds or has_minutes or has_hours: + format = format[:-4] + else: + format = format.split(" ")[0] + elif not (has_seconds or has_minutes or has_hours): + format = format.split(" ")[0] + return self.strftime(format) + + def mean(self, skipna=None, min_count: int = 0) -> ScalarLike: return pd.Timestamp( cast( "cudf.core.column.NumericalColumn", self.astype("int64") - ).mean(skipna=skipna, min_count=min_count, dtype=dtype), + ).mean(skipna=skipna, min_count=min_count), unit=self.time_unit, ).as_unit(self.time_unit) @@ -513,12 +497,11 @@ def std( self, skipna: bool | None = None, min_count: int = 0, - dtype: Dtype = np.float64, ddof: int = 1, ) -> pd.Timedelta: return pd.Timedelta( cast("cudf.core.column.NumericalColumn", self.astype("int64")).std( - skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof + skipna=skipna, min_count=min_count, ddof=ddof ) * _unit_to_nanoseconds_conversion[self.time_unit], ).as_unit(self.time_unit) @@ -578,10 +561,8 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: # We check this on `other` before reflection since we already know the # dtype of `self`. 
- other_is_timedelta = is_timedelta64_dtype(other.dtype) - other_is_datetime64 = not other_is_timedelta and is_datetime64_dtype( - other.dtype - ) + other_is_timedelta = other.dtype.kind == "m" + other_is_datetime64 = other.dtype.kind == "M" lhs, rhs = (other, self) if reflect else (self, other) out_dtype = None @@ -645,9 +626,9 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: def indices_of( self, value: ScalarLike ) -> cudf.core.column.NumericalColumn: - value = column.as_column( - pd.to_datetime(value), dtype=self.dtype - ).astype("int64") + value = ( + pd.to_datetime(value).to_numpy().astype(self.dtype).astype("int64") + ) return self.astype("int64").indices_of(value) @property @@ -658,7 +639,7 @@ def isin(self, values: Sequence) -> ColumnBase: return cudf.core.tools.datetimes._isin_datetimelike(self, values) def can_cast_safely(self, to_dtype: Dtype) -> bool: - if np.issubdtype(to_dtype, np.datetime64): + if to_dtype.kind == "M": # type: ignore[union-attr] to_res, _ = np.datetime_data(to_dtype) self_res, _ = np.datetime_data(self.dtype) @@ -872,10 +853,11 @@ def _local_time(self): offsets_from_utc = offsets.take(indices, nullify=True) return self + offsets_from_utc - def as_string_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.StringColumn": - return self._local_time.as_string_column(dtype, format) + def strftime(self, format: str) -> cudf.core.column.StringColumn: + return self._local_time.strftime(format) + + def as_string_column(self) -> cudf.core.column.StringColumn: + return self._local_time.as_string_column() def get_dt_field(self, field: str) -> ColumnBase: return libcudf.datetime.extract_datetime_component( diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 3e238d65cff..6a7f338b065 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -15,7 +15,7 @@ from cudf._lib.strings.convert.convert_fixed_point import ( from_decimal as cpp_from_decimal, ) -from cudf.api.types import is_integer_dtype, is_scalar +from cudf.api.types import is_scalar from cudf.core.buffer import as_buffer from cudf.core.column import ColumnBase from cudf.core.dtypes import ( @@ -62,9 +62,7 @@ def as_decimal_column( return self return libcudf.unary.cast(self, dtype) - def as_string_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.StringColumn": + def as_string_column(self) -> cudf.core.column.StringColumn: if len(self) > 0: return cpp_from_decimal(self) else: @@ -152,7 +150,7 @@ def _validate_fillna_value( def normalize_binop_value(self, other): if isinstance(other, ColumnBase): if isinstance(other, cudf.core.column.NumericalColumn): - if not is_integer_dtype(other.dtype): + if other.dtype.kind not in "iu": raise TypeError( "Decimal columns only support binary operations with " "integer numerical columns." 
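For context: the datetime, timedelta and decimal hunks above (and several hunks below) replace cudf's is_datetime64_dtype / is_timedelta64_dtype / is_bool_dtype / is_integer_dtype helpers with direct NumPy dtype.kind checks. A minimal illustrative sketch of the one-character kind codes being relied on (plain NumPy, not part of this diff):

import numpy as np

# dtype.kind codes assumed by the .kind checks introduced in this PR
assert np.dtype("datetime64[ns]").kind == "M"   # datetime64
assert np.dtype("timedelta64[ns]").kind == "m"  # timedelta64
assert np.dtype("bool").kind == "b"             # boolean
assert np.dtype("int64").kind in "iu"           # signed integer
assert np.dtype("uint8").kind in "iu"           # unsigned integer
assert np.dtype("float32").kind == "f"          # floating point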
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index c548db67344..1b7cd95b3d0 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -73,10 +73,15 @@ def memory_usage(self): child0_size = ( current_base_child.size + 1 - current_offset ) * current_base_child.base_children[0].dtype.itemsize - current_offset = current_base_child.base_children[ - 0 - ].element_indexing(current_offset) n += child0_size + current_offset_col = current_base_child.base_children[0] + if not len(current_offset_col): + # See https://github.com/rapidsai/cudf/issues/16164 why + # offset column can be uninitialized + break + current_offset = current_offset_col.element_indexing( + current_offset + ) current_base_child = current_base_child.base_children[1] n += ( @@ -248,15 +253,11 @@ def from_sequences( ) return res - def as_string_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.StringColumn": + def as_string_column(self) -> cudf.core.column.StringColumn: """ Create a strings column from a list column """ - lc = self._transform_leaves( - lambda col, dtype: col.as_string_column(dtype), dtype - ) + lc = self._transform_leaves(lambda col: col.as_string_column()) # Separator strings to match the Python format separators = as_column([", ", "[", "]"]) @@ -563,10 +564,11 @@ def take(self, lists_indices: ColumnLike) -> ParentType: raise ValueError( "lists_indices and list column is of different " "size." ) - if not _is_non_decimal_numeric_dtype( - lists_indices_col.children[1].dtype - ) or not np.issubdtype( - lists_indices_col.children[1].dtype, np.integer + if ( + not _is_non_decimal_numeric_dtype( + lists_indices_col.children[1].dtype + ) + or lists_indices_col.children[1].dtype.kind not in "iu" ): raise TypeError( "lists_indices should be column of values of index types." @@ -645,9 +647,17 @@ def sort_values( dtype: list .. pandas-compat:: - **ListMethods.sort_values** + `pandas.Series.list.sort_values` + + This method does not exist in pandas but it can be run + as: - The ``inplace`` and ``kind`` arguments are currently not supported. 
+ >>> import pandas as pd + >>> s = pd.Series([[3, 2, 1], [2, 4, 3]]) + >>> print(s.apply(sorted)) + 0 [1, 2, 3] + 1 [2, 3, 4] + dtype: object """ if inplace: raise NotImplementedError("`inplace` not currently implemented.") diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 76c64e1aea0..f9404eb3b40 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -5,7 +5,6 @@ import functools from typing import TYPE_CHECKING, Any, Callable, Sequence, cast -import cupy as cp import numpy as np import pandas as pd from typing_extensions import Self @@ -13,14 +12,7 @@ import cudf from cudf import _lib as libcudf from cudf._lib import pylibcudf -from cudf._lib.types import size_type_dtype -from cudf.api.types import ( - is_bool_dtype, - is_float_dtype, - is_integer, - is_integer_dtype, - is_scalar, -) +from cudf.api.types import is_integer, is_scalar from cudf.core.column import ( ColumnBase, as_column, @@ -32,10 +24,10 @@ from cudf.core.mixins import BinaryOperand from cudf.errors import MixedTypeError from cudf.utils.dtypes import ( + find_common_type, min_column_type, min_signed_type, np_dtypes_to_pandas_dtypes, - numeric_normalize_types, ) from .numerical_base import NumericalBaseColumn @@ -131,12 +123,8 @@ def indices_of(self, value: ScalarLike) -> NumericalColumn: and self.dtype.kind in {"c", "f"} and np.isnan(value) ): - return column.as_column( - cp.argwhere( - cp.isnan(self.data_array_view(mode="read")) - ).flatten(), - dtype=size_type_dtype, - ) + nan_col = libcudf.unary.is_nan(self) + return nan_col.indices_of(True) else: return super().indices_of(value) @@ -165,7 +153,7 @@ def __setitem__(self, key: Any, value: Any): else as_column(value) ) - if not is_bool_dtype(self.dtype) and is_bool_dtype(device_value.dtype): + if self.dtype.kind != "b" and device_value.dtype.kind == "b": raise TypeError(f"Invalid value {value} for dtype {self.dtype}") else: device_value = device_value.astype(self.dtype) @@ -232,25 +220,17 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: tmp = self if reflect else other # Guard against division by zero for integers. 
if ( - (tmp.dtype.type in int_float_dtype_mapping) - and (tmp.dtype.type != np.bool_) - and ( - ( - ( - np.isscalar(tmp) - or ( - isinstance(tmp, cudf.Scalar) - # host to device copy - and tmp.is_valid() - ) - ) - and (0 == tmp) - ) - or ((isinstance(tmp, NumericalColumn)) and (0 in tmp)) - ) + tmp.dtype.type in int_float_dtype_mapping + and tmp.dtype.kind != "b" ): - out_dtype = cudf.dtype("float64") - + if isinstance(tmp, NumericalColumn) and 0 in tmp: + out_dtype = cudf.dtype("float64") + elif isinstance(tmp, cudf.Scalar): + if tmp.is_valid() and tmp == 0: + # tmp == 0 can return NA + out_dtype = cudf.dtype("float64") + elif is_scalar(tmp) and tmp == 0: + out_dtype = cudf.dtype("float64") if op in { "__lt__", "__gt__", @@ -264,19 +244,19 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: out_dtype = "bool" if op in {"__and__", "__or__", "__xor__"}: - if is_float_dtype(self.dtype) or is_float_dtype(other.dtype): + if self.dtype.kind == "f" or other.dtype.kind == "f": raise TypeError( f"Operation 'bitwise {op[2:-2]}' not supported between " f"{self.dtype.type.__name__} and " f"{other.dtype.type.__name__}" ) - if is_bool_dtype(self.dtype) or is_bool_dtype(other.dtype): + if self.dtype.kind == "b" or other.dtype.kind == "b": out_dtype = "bool" if ( op == "__pow__" - and is_integer_dtype(self.dtype) - and (is_integer(other) or is_integer_dtype(other.dtype)) + and self.dtype.kind in "iu" + and (is_integer(other) or other.dtype.kind in "iu") ): op = "INT_POW" @@ -301,15 +281,28 @@ def normalize_binop_value( if isinstance(other, cudf.Scalar): if self.dtype == other.dtype: return other + # expensive device-host transfer just to # adjust the dtype other = other.value + + # NumPy 2 needs a Python scalar to do weak promotion, but + # pandas forces weak promotion always + # TODO: We could use 0, 0.0, and 0j for promotion to avoid copies. + if other.dtype.kind in "ifc": + other = other.item() + elif not isinstance(other, (int, float, complex)): + # Go via NumPy to get the value + other = np.array(other) + if other.dtype.kind in "ifc": + other = other.item() + # Try and match pandas and hence numpy. Deduce the common - # dtype via the _value_ of other, and the dtype of self. TODO: - # When NEP50 is accepted, this might want changed or - # simplified. - # This is not at all simple: - # np.result_type(np.int64(0), np.uint8) + # dtype via the _value_ of other, and the dtype of self on NumPy 1.x + # with NumPy 2, we force weak promotion even for our/NumPy scalars + # to match pandas 2.2. 
+ # Weak promotion is not at all simple: + # np.result_type(0, np.uint8) # => np.uint8 # np.result_type(np.asarray([0], dtype=np.int64), np.uint8) # => np.int64 @@ -331,9 +324,7 @@ def int2ip(self) -> "cudf.core.column.StringColumn": return libcudf.string_casting.int2ip(self) - def as_string_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.StringColumn": + def as_string_column(self) -> cudf.core.column.StringColumn: if len(self) > 0: return string._numeric_to_str_typecast_functions[ cudf.dtype(self.dtype) @@ -345,8 +336,8 @@ def as_string_column( ) def as_datetime_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.DatetimeColumn": + self, dtype: Dtype + ) -> cudf.core.column.DatetimeColumn: return cast( "cudf.core.column.DatetimeColumn", build_column( @@ -359,8 +350,8 @@ def as_datetime_column( ) def as_timedelta_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.TimeDeltaColumn": + self, dtype: Dtype + ) -> cudf.core.column.TimeDeltaColumn: return cast( "cudf.core.column.TimeDeltaColumn", build_column( @@ -391,7 +382,7 @@ def all(self, skipna: bool = True) -> bool: if result_col.null_count == result_col.size: return True - return libcudf.reduce.reduce("all", result_col, dtype=np.bool_) + return libcudf.reduce.reduce("all", result_col) def any(self, skipna: bool = True) -> bool: # Early exit for fast cases. @@ -402,7 +393,7 @@ def any(self, skipna: bool = True) -> bool: elif skipna and result_col.null_count == result_col.size: return False - return libcudf.reduce.reduce("any", result_col, dtype=np.bool_) + return libcudf.reduce.reduce("any", result_col) @functools.cached_property def nan_count(self) -> int: @@ -513,11 +504,15 @@ def find_and_replace( ) elif len(replacement_col) == 1 and len(to_replace_col) == 0: return self.copy() - to_replace_col, replacement_col, replaced = numeric_normalize_types( - to_replace_col, replacement_col, self + common_type = find_common_type( + (to_replace_col.dtype, replacement_col.dtype, self.dtype) ) + replaced = self.astype(common_type) df = cudf.DataFrame._from_data( - {"old": to_replace_col, "new": replacement_col} + { + "old": to_replace_col.astype(common_type), + "new": replacement_col.astype(common_type), + } ) df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True) if df._data["old"].null_count == 1: @@ -628,7 +623,9 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool: min_, max_ = iinfo.min, iinfo.max # best we can do is hope to catch it here and avoid compare - if (self.min() >= min_) and (self.max() <= max_): + # Use Python floats, which have precise comparison for float64. + # NOTE(seberg): it would make sense to limit to the mantissa range. 
+ if (float(self.min()) >= min_) and (float(self.max()) <= max_): filled = self.fillna(0) return (cudf.Series(filled) % 1 == 0).all() else:
@@ -678,15 +675,16 @@ def to_pandas( return super().to_pandas(nullable=nullable, arrow_type=arrow_type) def _reduction_result_dtype(self, reduction_op: str) -> Dtype: - col_dtype = self.dtype if reduction_op in {"sum", "product"}:
- col_dtype = ( - col_dtype if col_dtype.kind == "f" else np.dtype("int64") - ) + if self.dtype.kind == "f": + return self.dtype + return np.dtype("int64") elif reduction_op == "sum_of_squares":
- col_dtype = np.result_dtype(col_dtype, np.dtype("uint64")) + return np.result_type(self.dtype, np.dtype("uint64")) + elif reduction_op in {"var", "std", "mean"}: + return np.dtype("float64")
- return col_dtype + return super()._reduction_result_dtype(reduction_op) def _normalize_find_and_replace_input(
diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 95c78c5efcb..f41010062c8 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py
@@ -144,32 +144,27 @@ def mean( self, skipna: bool | None = None, min_count: int = 0, - dtype=np.float64, ): - return self._reduce( - "mean", skipna=skipna, min_count=min_count, dtype=dtype - ) + return self._reduce("mean", skipna=skipna, min_count=min_count) def var( self, skipna: bool | None = None, min_count: int = 0, - dtype=np.float64, ddof=1, ): return self._reduce( - "var", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof + "var", skipna=skipna, min_count=min_count, ddof=ddof ) def std( self, skipna: bool | None = None, min_count: int = 0, - dtype=np.float64, ddof=1, ): return self._reduce( - "std", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof + "std", skipna=skipna, min_count=min_count, ddof=ddof ) def median(self, skipna: bool | None = None) -> NumericalBaseColumn:
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 936cd1eccb0..ec95c50f455 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py
@@ -612,7 +612,7 @@ def extract( dtype: object .. pandas-compat:: - **StringMethods.extract** + :meth:`pandas.Series.str.extract` The `flags` parameter currently only supports re.DOTALL and re.MULTILINE.
@@ -738,7 +738,7 @@ def contains( dtype: bool .. pandas-compat:: - **StringMethods.contains** + :meth:`pandas.Series.str.contains` The parameters `case` and `na` are not yet supported and will raise a NotImplementedError if anything other than the default
@@ -974,7 +974,7 @@ def replace( dtype: object .. pandas-compat:: - **StringMethods.replace** + :meth:`pandas.Series.str.replace` The parameters `case` and `flags` are not yet supported and will raise a `NotImplementedError` if anything other than the default
@@ -2803,7 +2803,7 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: ) .. pandas-compat:: - **StringMethods.partition** + :meth:`pandas.Series.str.partition` The parameter `expand` is not yet supported and will raise a `NotImplementedError` if anything other than the default
@@ -3527,7 +3527,7 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: Index([0, 0, 2, 1], dtype='int64') .. pandas-compat:: - **StringMethods.count** + :meth:`pandas.Series.str.count` - `flags` parameter currently only supports re.DOTALL and re.MULTILINE.
@@ -3607,7 +3607,7 @@ def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex: dtype: list ..
pandas-compat:: - **StringMethods.findall** + :meth:`pandas.Series.str.findall` The `flags` parameter currently only supports re.DOTALL and re.MULTILINE. @@ -3811,7 +3811,7 @@ def endswith(self, pat: str) -> SeriesOrIndex: dtype: bool .. pandas-compat:: - **StringMethods.endswith** + :meth:`pandas.Series.str.endswith` `na` parameter is not yet supported, as cudf uses native strings instead of Python objects. @@ -4264,7 +4264,7 @@ def match( dtype: bool .. pandas-compat:: - **StringMethods.match** + :meth:`pandas.Series.str.match` Parameters `case` and `na` are currently not supported. The `flags` parameter currently only supports re.DOTALL and @@ -5669,16 +5669,25 @@ def as_numerical_column( result_col = _str_to_numeric_typecast_functions[out_dtype](string_col) return result_col - def _as_datetime_or_timedelta_column(self, dtype, format): - if len(self) == 0: - return cudf.core.column.column_empty(0, dtype=dtype) - - # Check for None strings - if (self == "None").any(): - raise ValueError("Could not convert `None` value to datetime") - - is_nat = self == "NaT" - if dtype.kind == "M": + def strptime( + self, dtype: Dtype, format: str + ) -> cudf.core.column.DatetimeColumn | cudf.core.column.TimeDeltaColumn: + if dtype.kind not in "Mm": # type: ignore[union-attr] + raise ValueError( + f"dtype must be datetime or timedelta type, not {dtype}" + ) + elif self.null_count == len(self): + return column.column_empty(len(self), dtype=dtype, masked=True) # type: ignore[return-value] + elif (self == "None").any(): + raise ValueError( + "Cannot convert `None` value to datetime or timedelta." + ) + elif dtype.kind == "M": # type: ignore[union-attr] + if format.endswith("%z"): + raise NotImplementedError( + "cuDF does not yet support timezone-aware datetimes" + ) + is_nat = self == "NaT" without_nat = self.apply_boolean_mask(is_nat.unary_operator("not")) all_same_length = ( libstrings.count_characters(without_nat).distinct_count( @@ -5699,61 +5708,43 @@ def _as_datetime_or_timedelta_column(self, dtype, format): if not valid.all(): raise ValueError(f"Column contains invalid data for {format=}") - casting_func = ( - str_cast.timestamp2int - if dtype.type == np.datetime64 - else str_cast.timedelta2int - ) + casting_func = str_cast.timestamp2int + add_back_nat = is_nat.any() + elif dtype.kind == "m": # type: ignore[union-attr] + casting_func = str_cast.timedelta2int + add_back_nat = False + result_col = casting_func(self, dtype, format) - if is_nat.any(): + if add_back_nat: result_col[is_nat] = None return result_col def as_datetime_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.DatetimeColumn": - out_dtype = cudf.api.types.dtype(dtype) - - # infer on host from the first not na element - # or return all null column if all values - # are null in current column - if format is None: - if self.null_count == len(self): - return cast( - "cudf.core.column.DatetimeColumn", - column.column_empty( - len(self), dtype=out_dtype, masked=True - ), - ) - else: - format = datetime.infer_format( - self.apply_boolean_mask(self.notnull()).element_indexing(0) - ) - - if format.endswith("%z"): - raise NotImplementedError( - "cuDF does not yet support timezone-aware datetimes" - ) - return self._as_datetime_or_timedelta_column(out_dtype, format) + self, dtype: Dtype + ) -> cudf.core.column.DatetimeColumn: + not_null = self.apply_boolean_mask(self.notnull()) + if len(not_null) == 0: + # We should hit the self.null_count == len(self) condition + # so format doesn't matter + format = "" + else: + # 
infer on host from the first not na element + format = datetime.infer_format(not_null.element_indexing(0)) + return self.strptime(dtype, format) # type: ignore[return-value] def as_timedelta_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.TimeDeltaColumn": - out_dtype = cudf.api.types.dtype(dtype) - if format is None: - format = "%D days %H:%M:%S" - return self._as_datetime_or_timedelta_column(out_dtype, format) + self, dtype: Dtype + ) -> cudf.core.column.TimeDeltaColumn: + return self.strptime(dtype, "%D days %H:%M:%S") # type: ignore[return-value] def as_decimal_column( self, dtype: Dtype ) -> "cudf.core.column.DecimalBaseColumn": return libstrings.to_decimal(self, dtype) - def as_string_column( - self, dtype: Dtype, format: str | None = None - ) -> StringColumn: + def as_string_column(self) -> StringColumn: return self @property diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 5a0171bbbdc..59ea1cc002c 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -12,7 +12,7 @@ import cudf from cudf import _lib as libcudf -from cudf.api.types import is_scalar, is_timedelta64_dtype +from cudf.api.types import is_scalar from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column import ColumnBase, column, string from cudf.utils.dtypes import np_to_pa_dtype @@ -153,7 +153,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: this: ColumnBinaryOperand = self out_dtype = None - if is_timedelta64_dtype(other.dtype): + if other.dtype.kind == "m": # TODO: pandas will allow these operators to work but return false # when comparing to non-timedelta dtypes. We should do the same. if op in { @@ -263,41 +263,35 @@ def as_numerical_column( ) return cast("cudf.core.column.NumericalColumn", col.astype(dtype)) - def as_datetime_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.DatetimeColumn": + def as_datetime_column(self, dtype: Dtype) -> None: # type: ignore[override] raise TypeError( f"cannot astype a timedelta from {self.dtype} to {dtype}" ) - def as_string_column( - self, dtype: Dtype, format: str | None = None - ) -> "cudf.core.column.StringColumn": - if format is None: - format = "%D days %H:%M:%S" - if len(self) > 0: - return string._timedelta_to_str_typecast_functions[ - cudf.dtype(self.dtype) - ](self, format=format) - else: + def strftime(self, format: str) -> cudf.core.column.StringColumn: + if len(self) == 0: return cast( - "cudf.core.column.StringColumn", + cudf.core.column.StringColumn, column.column_empty(0, dtype="object", masked=False), ) + else: + return string._timedelta_to_str_typecast_functions[self.dtype]( + self, format=format + ) - def as_timedelta_column( - self, dtype: Dtype, format: str | None = None - ) -> TimeDeltaColumn: - dtype = cudf.dtype(dtype) + def as_string_column(self) -> cudf.core.column.StringColumn: + return self.strftime("%D days %H:%M:%S") + + def as_timedelta_column(self, dtype: Dtype) -> TimeDeltaColumn: if dtype == self.dtype: return self return libcudf.unary.cast(self, dtype=dtype) - def mean(self, skipna=None, dtype: Dtype = np.float64) -> pd.Timedelta: + def mean(self, skipna=None) -> pd.Timedelta: return pd.Timedelta( cast( "cudf.core.column.NumericalColumn", self.astype("int64") - ).mean(skipna=skipna, dtype=dtype), + ).mean(skipna=skipna), unit=self.time_unit, ).as_unit(self.time_unit) @@ -351,12 +345,11 @@ def std( self, skipna: bool | None = None, 
min_count: int = 0, - dtype: Dtype = np.float64, ddof: int = 1, ) -> pd.Timedelta: return pd.Timedelta( cast("cudf.core.column.NumericalColumn", self.astype("int64")).std( - skipna=skipna, min_count=min_count, ddof=ddof, dtype=dtype + skipna=skipna, min_count=min_count, ddof=ddof ), unit=self.time_unit, ).as_unit(self.time_unit) diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index d9f62f51f92..197f46ee9fe 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -188,9 +188,6 @@ def cut( # adjust bin edges decimal precision int_label_bins = np.around(bins, precision) - # the inputs is a column of the values in the array x - input_arr = as_column(x) - # checking for the correct inclusivity values if right: closed = "right" @@ -242,6 +239,9 @@ def cut( labels if len(set(labels)) == len(labels) else None ) + # the inputs is a column of the values in the array x + input_arr = as_column(x) + if isinstance(bins, pd.IntervalIndex): # get the left and right edges of the bins as columns # we cannot typecast an IntervalIndex, so we need to diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index b249410c2e4..7e07078c95b 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -32,8 +32,6 @@ from cudf.api.extensions import no_default from cudf.api.types import ( _is_scalar_or_zero_d_array, - is_bool_dtype, - is_datetime_dtype, is_dict_like, is_dtype_equal, is_list_like, @@ -85,8 +83,7 @@ cudf_dtype_from_pydata_dtype, find_common_type, is_column_like, - min_scalar_type, - numeric_normalize_types, + min_signed_type, ) from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import GetAttrGetItemMixin, _external_only_api @@ -105,20 +102,6 @@ "var": "nanvar", } -_numeric_reduction_ops = ( - "mean", - "min", - "max", - "sum", - "product", - "prod", - "std", - "var", - "kurtosis", - "kurt", - "skew", -) - def _shape_mismatch_error(x, y): raise ValueError( @@ -172,7 +155,7 @@ def _can_downcast_to_series(self, df, arg): ): return False else: - if is_bool_dtype(as_column(arg[0]).dtype) and not isinstance( + if as_column(arg[0]).dtype.kind == "b" and not isinstance( arg[1], slice ): return True @@ -321,7 +304,7 @@ def _getitem_tuple_arg(self, arg): tmp_arg[1], ) - if is_bool_dtype(tmp_arg[0].dtype): + if tmp_arg[0].dtype.kind == "b": df = columns_df._apply_boolean_mask( BooleanMask(tmp_arg[0], len(columns_df)) ) @@ -430,7 +413,7 @@ def _setitem_tuple_arg(self, key, value): else: value = cupy.asarray(value) - if cupy.ndim(value) == 2: + if value.ndim == 2: # If the inner dimension is 1, it's broadcastable to # all columns of the dataframe. indexed_shape = columns_df.loc[key[0]].shape @@ -462,6 +445,10 @@ def _setitem_tuple_arg(self, key, value): self._frame[col].loc[key[0]] = value[i] +class _DataFrameAtIndexer(_DataFrameLocIndexer): + pass + + class _DataFrameIlocIndexer(_DataFrameIndexer): """ For selection by index. 
@@ -563,7 +550,7 @@ def _setitem_tuple_arg(self, key, value): # TODO: consolidate code path with identical counterpart # in `_DataFrameLocIndexer._setitem_tuple_arg` value = cupy.asarray(value) - if cupy.ndim(value) == 2: + if value.ndim == 2: indexed_shape = columns_df.iloc[key[0]].shape if value.shape[1] == 1: if value.shape[0] != indexed_shape[0]: @@ -584,6 +571,10 @@ def _setitem_tuple_arg(self, key, value): self._frame[col].iloc[key[0]] = value[i] +class _DataFrameiAtIndexer(_DataFrameIlocIndexer): + pass + + class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): """ A GPU Dataframe object. @@ -603,6 +594,9 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): dtype : dtype, default None Data type to force. Only a single dtype is allowed. If None, infer. + copy : bool or None, default None + Copy data from inputs. + Currently not implemented. nan_as_null : bool, Default True If ``None``/``True``, converts ``np.nan`` values to ``null`` values. @@ -689,8 +683,11 @@ def __init__( index=None, columns=None, dtype=None, + copy=None, nan_as_null=no_default, ): + if copy is not None: + raise NotImplementedError("copy is not currently implemented.") super().__init__() if nan_as_null is no_default: nan_as_null = not cudf.get_option("mode.pandas_compatible") @@ -917,7 +914,8 @@ def _init_from_series_list(self, data, columns, index): final_index = ensure_index(index) series_lengths = list(map(len, data)) - data = numeric_normalize_types(*data) + common_dtype = find_common_type([obj.dtype for obj in data]) + data = [obj.astype(common_dtype) for obj in data] if series_lengths.count(series_lengths[0]) == len(series_lengths): # Calculating the final dataframe columns by # getting union of all `index` of the Series objects. @@ -1532,6 +1530,25 @@ def __array_function__(self, func, types, args, kwargs): pass return NotImplemented + def __arrow_c_stream__(self, requested_schema=None): + """ + Export the cudf DataFrame as an Arrow C stream PyCapsule. + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the dataframe should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. Currently not implemented. + + Returns + ------- + PyCapsule + """ + if requested_schema is not None: + raise NotImplementedError("requested_schema is not supported") + return self.to_arrow().__arrow_c_stream__() + # The _get_numeric_data method is necessary for dask compatibility. @_performance_tracking def _get_numeric_data(self): @@ -2192,8 +2209,8 @@ def from_dict( orient = orient.lower() if orient == "index": - if len(data) > 0 and isinstance( - next(iter(data.values())), (cudf.Series, cupy.ndarray) + if isinstance( + next(iter(data.values()), None), (cudf.Series, cupy.ndarray) ): result = cls(data).T result.columns = ( @@ -2243,6 +2260,7 @@ def to_dict( self, orient: str = "dict", into: type[dict] = dict, + index: bool = True, ) -> dict | list[dict]: """ Convert the DataFrame to a dictionary. @@ -2276,6 +2294,13 @@ def to_dict( instance of the mapping type you want. If you want a collections.defaultdict, you must pass it initialized. + index : bool, default True + Whether to include the index item (and index_names item if `orient` + is 'tight') in the returned dictionary. Can only be ``False`` + when `orient` is 'split' or 'tight'. Note that when `orient` is + 'records', this parameter does not take effect (index item always + not included). 
+ Returns ------- dict, list or collections.abc.Mapping @@ -2357,7 +2382,7 @@ def to_dict( raise TypeError(f"unsupported type: {into}") return cons(self.items()) # type: ignore[misc] - return self.to_pandas().to_dict(orient=orient, into=into) + return self.to_pandas().to_dict(orient=orient, into=into, index=index) @_performance_tracking def scatter_by_map( @@ -2581,14 +2606,14 @@ def iat(self): """ Alias for ``DataFrame.iloc``; provided for compatibility with Pandas. """ - return self.iloc + return _DataFrameiAtIndexer(self) @property def at(self): """ Alias for ``DataFrame.loc``; provided for compatibility with Pandas. """ - return self.loc + return _DataFrameAtIndexer(self) @property # type: ignore @_external_only_api( @@ -2744,7 +2769,7 @@ def reindex( Chrome 200 0.02 .. pandas-compat:: - **DataFrame.reindex** + :meth:`pandas.DataFrame.reindex` Note: One difference from Pandas is that ``NA`` is used for rows that do not match, rather than ``NaN``. One side effect of this is @@ -3012,7 +3037,12 @@ def fillna( ) @_performance_tracking - def where(self, cond, other=None, inplace=False): + def where(self, cond, other=None, inplace=False, axis=None, level=None): + if axis is not None: + raise NotImplementedError("axis is not supported.") + elif level is not None: + raise NotImplementedError("level is not supported.") + from cudf.core._internals.where import ( _check_and_cast_columns_with_other, _make_categorical_like, @@ -3344,7 +3374,7 @@ def diff(self, periods=1, axis=0): 5 2 5 20 .. pandas-compat:: - **DataFrame.diff** + :meth:`pandas.DataFrame.diff` Diff currently only supports numeric dtype columns. """ @@ -3549,7 +3579,7 @@ def rename( 30 3 6 .. pandas-compat:: - **DataFrame.rename** + :meth:`pandas.DataFrame.rename` * Not Supporting: level @@ -3585,15 +3615,15 @@ def rename( if level is not None and isinstance(self.index, MultiIndex): level = self.index._get_level_label(level) - out_index = self.index.copy(deep=copy) - level_values = out_index.get_level_values(level) - level_values.to_frame().replace( + level_values = self.index.get_level_values(level) + ca = self.index._data.copy(deep=copy) + ca[level] = level_values._column.find_and_replace( to_replace=list(index.keys()), - value=list(index.values()), - inplace=True, + replacement=list(index.values()), + ) + out_index = type(self.index)._from_data( + ca, name=self.index.name ) - out_index._data[level] = column.as_column(level_values) - out_index._compute_levels_and_codes() else: to_replace = list(index.keys()) vals = list(index.values()) @@ -3622,7 +3652,9 @@ def rename( return result @_performance_tracking - def add_prefix(self, prefix): + def add_prefix(self, prefix, axis=None): + if axis is not None: + raise NotImplementedError("axis is currently not implemented.") # TODO: Change to deep=False when copy-on-write is default out = self.copy(deep=True) out.columns = [ @@ -3664,15 +3696,15 @@ def agg(self, aggs, axis=None): ``DataFrame`` is returned. .. pandas-compat:: - **DataFrame.agg** + :meth:`pandas.DataFrame.agg` * Not supporting: ``axis``, ``*args``, ``**kwargs`` """ dtypes = [self[col].dtype for col in self._column_names] common_dtype = find_common_type(dtypes) - if not is_bool_dtype(common_dtype) and any( - is_bool_dtype(dtype) for dtype in dtypes + if common_dtype.kind != "b" and any( + dtype.kind == "b" for dtype in dtypes ): raise MixedTypeError("Cannot create a column with mixed types") @@ -3837,7 +3869,7 @@ def nlargest(self, n, columns, keep="first"): Brunei 434000 12128 BN .. 
pandas-compat:: - **DataFrame.nlargest** + :meth:`pandas.DataFrame.nlargest` - Only a single column is supported in *columns* """ @@ -3909,7 +3941,7 @@ def nsmallest(self, n, columns, keep="first"): Nauru 337000 182 NR .. pandas-compat:: - **DataFrame.nsmallest** + :meth:`pandas.DataFrame.nsmallest` - Only a single column is supported in *columns* """ @@ -3991,7 +4023,7 @@ def transpose(self): a new (ncol x nrow) dataframe. self is (nrow x ncol) .. pandas-compat:: - **DataFrame.transpose, DataFrame.T** + :meth:`pandas.DataFrame.transpose`, :attr:`pandas.DataFrame.T` Not supporting *copy* because default and only behavior is copy=True @@ -4182,7 +4214,7 @@ def merge( from both sides. .. pandas-compat:: - **DataFrame.merge** + :meth:`pandas.DataFrame.merge` DataFrames merges in cuDF result in non-deterministic row ordering. @@ -4238,6 +4270,7 @@ def join( lsuffix="", rsuffix="", sort=False, + validate: str | None = None, ): """Join columns with other DataFrame on index or on a key column. @@ -4251,19 +4284,33 @@ def join( column names when avoiding conflicts. sort : bool Set to True to ensure sorted ordering. + validate : str, optional + If specified, checks if join is of specified type. + + * "one_to_one" or "1:1": check if join keys are unique in both left + and right datasets. + * "one_to_many" or "1:m": check if join keys are unique in left dataset. + * "many_to_one" or "m:1": check if join keys are unique in right dataset. + * "many_to_many" or "m:m": allowed, but does not result in checks. + + Currently not supported. Returns ------- joined : DataFrame .. pandas-compat:: - **DataFrame.join** + :meth:`pandas.DataFrame.join` - *other* must be a single DataFrame for now. - *on* is not supported yet due to lack of multi-index support. """ if on is not None: raise NotImplementedError("The on parameter is not yet supported") + elif validate is not None: + raise NotImplementedError( + "The validate parameter is not yet supported" + ) df = self.merge( other, @@ -4300,7 +4347,6 @@ def groupby( as_index=True, sort=no_default, group_keys=False, - squeeze=False, observed=True, dropna=True, ): @@ -4311,7 +4357,6 @@ def groupby( as_index, sort, group_keys, - squeeze, observed, dropna, ) @@ -4379,7 +4424,7 @@ def query(self, expr, local_dict=None): 1 2018-10-08 .. pandas-compat:: - **DataFrame.query** + :meth:`pandas.DataFrame.query` One difference from pandas is that ``query`` currently only supports numeric, datetime, timedelta, or bool dtypes. @@ -4414,7 +4459,16 @@ def query(self, expr, local_dict=None): @_performance_tracking def apply( - self, func, axis=1, raw=False, result_type=None, args=(), **kwargs + self, + func, + axis=1, + raw=False, + result_type=None, + args=(), + by_row: Literal[False, "compat"] = "compat", + engine: Literal["python", "numba"] = "python", + engine_kwargs: dict[str, bool] | None = None, + **kwargs, ): """ Apply a function along an axis of the DataFrame. @@ -4442,6 +4496,25 @@ def apply( Not yet supported args: tuple Positional arguments to pass to func in addition to the dataframe. + by_row : False or "compat", default "compat" + Only has an effect when ``func`` is a listlike or dictlike of funcs + and the func isn't a string. + If "compat", will if possible first translate the func into pandas + methods (e.g. ``Series().apply(np.sum)`` will be translated to + ``Series().sum()``). If that doesn't work, will try call to apply again with + ``by_row=True`` and if that fails, will call apply again with + ``by_row=False`` (backward compatible). 
+ If False, the funcs will be passed the whole Series at once. + + Currently not supported. + + engine : {'python', 'numba'}, default 'python' + Unused. Added for compatibility with pandas. + engine_kwargs : dict + Unused. Added for compatibility with pandas. + **kwargs + Additional keyword arguments to pass as keyword arguments to + `func`. Examples --------
@@ -4592,13 +4665,17 @@ def apply( """ if axis != 1: - raise ValueError( + raise NotImplementedError( "DataFrame.apply currently only supports row wise ops" ) if raw: - raise ValueError("The `raw` kwarg is not yet supported.") + raise NotImplementedError("The `raw` kwarg is not yet supported.") if result_type is not None: - raise ValueError("The `result_type` kwarg is not yet supported.") + raise NotImplementedError( + "The `result_type` kwarg is not yet supported." + ) + if by_row != "compat": + raise NotImplementedError("by_row is currently not supported.") return self._apply(func, _get_row_kernel, *args, **kwargs)
@@ -5441,10 +5518,11 @@ def from_arrow(cls, table): 2 3 6 .. pandas-compat:: - **DataFrame.from_arrow** + `pandas.DataFrame.from_arrow` - - Does not support automatically setting index column(s) similar - to how ``to_pandas`` works for PyArrow Tables. + This method does not exist in pandas but it is similar to + how :meth:`pyarrow.Table.to_pandas` works for PyArrow Tables i.e. + it does not support automatically setting index column(s). """ index_col = None col_index_names = None
@@ -5498,7 +5576,7 @@ def from_arrow(cls, table): return out @_performance_tracking - def to_arrow(self, preserve_index=None): + def to_arrow(self, preserve_index=None) -> pa.Table: """ Convert to a PyArrow Table.
@@ -5588,18 +5666,36 @@ def to_arrow(self, preserve_index=None): return out.replace_schema_metadata(metadata) @_performance_tracking - def to_records(self, index=True): + def to_records(self, index=True, column_dtypes=None, index_dtypes=None): """Convert to a numpy recarray Parameters ---------- index : bool Whether to include the index in the output. + column_dtypes : str, type, dict, default None + If a string or type, the data type to store all columns. If + a dictionary, a mapping of column names and indices (zero-indexed) + to specific data types. Currently not supported. + index_dtypes : str, type, dict, default None + If a string or type, the data type to store all index levels. If + a dictionary, a mapping of index level names and indices + (zero-indexed) to specific data types. + This mapping is applied only if `index=True`. + Currently not supported. Returns ------- numpy recarray """ + if column_dtypes is not None: + raise NotImplementedError( + "column_dtypes is currently not supported." + ) + elif index_dtypes is not None: + raise NotImplementedError( + "index_dtypes is currently not supported." + ) members = [("index", self.index.dtype)] if index else [] members += [(col, self[col].dtype) for col in self._data.names] dtype = np.dtype(members)
@@ -5612,7 +5708,16 @@ def to_records(self, index=True): @classmethod @_performance_tracking - def from_records(cls, data, index=None, columns=None, nan_as_null=False): + def from_records( + cls, + data, + index=None, + exclude=None, + columns=None, + coerce_float: bool = False, + nrows: int | None = None, + nan_as_null=False, + ): """ Convert structured or record ndarray to DataFrame.
@@ -5622,13 +5727,32 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): index : str, array-like The name of the index column in *data*. If None, the default index is used.
+ exclude : sequence, default None + Columns or fields to exclude. + Currently not implemented. columns : list of str List of column names to include. + coerce_float : bool, default False + Attempt to convert values of non-string, non-numeric objects (like + decimal.Decimal) to floating point, useful for SQL result sets. + Currently not implemented. + nrows : int, default None + Number of rows to read if data is an iterator. + Currently not implemented. Returns ------- DataFrame """
+ if exclude is not None: + raise NotImplementedError("exclude is currently not supported.") + if coerce_float is not False: + raise NotImplementedError( + "coerce_float is currently not supported." + ) + if nrows is not None: + raise NotImplementedError("nrows is currently not supported.") + if data.ndim != 1 and data.ndim != 2: raise ValueError( f"records dimension expected 1 or 2 but found {data.ndim}" )
@@ -5691,7 +5815,13 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): @classmethod @_performance_tracking - def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): + def _from_arrays( + cls, + data: np.ndarray | cupy.ndarray, + index=None, + columns=None, + nan_as_null=False, + ): """Convert a numpy/cupy array to DataFrame. Parameters
@@ -5709,8 +5839,6 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): ------- DataFrame """ - - data = cupy.asarray(data) if data.ndim != 1 and data.ndim != 2: raise ValueError( f"records dimension expected 1 or 2 but found: {data.ndim}"
@@ -5874,7 +6002,7 @@ def quantile( 0.5 2.5 55.0 .. pandas-compat:: - **DataFrame.quantile** + :meth:`pandas.DataFrame.quantile` One notable difference from Pandas is when DataFrame is of non-numeric types and result is expected to be a Series in case of
@@ -6105,7 +6233,7 @@ def _prepare_for_rowwise_op(self, method, skipna, numeric_only): else: filtered = self.copy(deep=False) - is_pure_dt = all(is_datetime_dtype(dt) for dt in filtered.dtypes) + is_pure_dt = all(dt.kind == "M" for dt in filtered.dtypes) common_dtype = find_common_type(filtered.dtypes) if (
@@ -6164,7 +6292,7 @@ def count(self, axis=0, numeric_only=False): dtype: int64 .. pandas-compat:: - **DataFrame.count** + :meth:`pandas.DataFrame.count` Parameters currently not supported are `axis` and `numeric_only`. """
@@ -6294,8 +6422,8 @@ def _reduce( and any( not is_object_dtype(dtype) for dtype in source_dtypes ) - or not is_bool_dtype(common_dtype) - and any(is_bool_dtype(dtype) for dtype in source_dtypes) + or common_dtype.kind != "b" + and any(dtype.kind == "b" for dtype in source_dtypes) ): raise TypeError( "Columns must all have the same dtype to "
@@ -6402,7 +6530,7 @@ def mode(self, axis=0, numeric_only=False, dropna=True): 1 2.0 .. pandas-compat:: - **DataFrame.mode** + :meth:`pandas.DataFrame.mode` ``axis`` parameter is currently not supported.
""" @@ -6502,7 +6630,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): cudf.utils.dtypes.get_min_float_dtype( prepared._data[col] ) - if not is_datetime_dtype(common_dtype) + if common_dtype.kind != "M" else cudf.dtype("float64") ) .fillna(np.nan)
@@ -6529,7 +6657,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): result_dtype = ( common_dtype if method in type_coerced_methods - or is_datetime_dtype(common_dtype) + or (common_dtype is not None and common_dtype.kind == "M") else None ) result = column.as_column(result, dtype=result_dtype)
@@ -7050,12 +7178,8 @@ def stack(self, level=-1, dropna=no_default, future_stack=False): # Assemble the final index new_index_columns = [*repeated_index._columns, *tiled_index] index_names = [*self.index.names, *unique_named_levels.names] - new_index = MultiIndex.from_frame( - DataFrame._from_data( - dict(zip(range(0, len(new_index_columns)), new_index_columns)) - ), - names=index_names, - ) + new_index = MultiIndex._from_data(dict(enumerate(new_index_columns))) + new_index.names = index_names # Compute the column indices that serves as the input for # `interleave_columns`
@@ -7353,9 +7477,9 @@ def pivot_table( @_performance_tracking @copy_docstring(reshape.unstack) - def unstack(self, level=-1, fill_value=None): + def unstack(self, level=-1, fill_value=None, sort: bool = True): return cudf.core.reshape.unstack( - self, level=level, fill_value=fill_value + self, level=level, fill_value=fill_value, sort=sort ) @_performance_tracking
@@ -7401,7 +7525,12 @@ def explode(self, column, ignore_index=False): return super()._explode(column, ignore_index) def pct_change( - self, periods=1, fill_method=no_default, limit=no_default, freq=None + self, + periods=1, + fill_method=no_default, + limit=no_default, + freq=None, + **kwargs, ): """ Calculates the percent change between sequential elements
@@ -7426,6 +7555,9 @@ def pct_change( freq : str, optional Increment to use from time series API. Not yet implemented. + **kwargs + Additional keyword arguments are passed into + `DataFrame.shift`. Returns -------
@@ -7471,7 +7603,7 @@ def pct_change( data = self.fillna(method=fill_method, limit=limit) return data.diff(periods=periods) / data.shift( - periods=periods, freq=freq + periods=periods, freq=freq, **kwargs ) def __dataframe__(
@@ -7588,7 +7720,7 @@ def interleave_columns(self): The interleaved columns as a single column .. pandas-compat:: - **DataFrame.interleave_columns** + `pandas.DataFrame.interleave_columns` This method does not exist in pandas but it can be run as ``pd.Series(np.vstack(df.to_numpy()).reshape((-1,)))``.
@@ -7690,7 +7822,7 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): 4 5 2 7 3 .. pandas-compat:: - **DataFrame.eval** + :meth:`pandas.DataFrame.eval` * Additional kwargs are not supported. * Bitwise and logical operators are not dtype-dependent.
@@ -7841,7 +7973,26 @@ def value_counts( return result -def from_dataframe(df, allow_copy=False): +def from_dataframe(df, allow_copy: bool = False) -> DataFrame: + """ + Build a :class:`DataFrame` from an object supporting the dataframe interchange protocol. + + .. note:: + + If you have a ``pandas.DataFrame``, use :func:`from_pandas` instead. + + Parameters + ---------- + df : DataFrameXchg + Object supporting the interchange protocol, i.e. ``__dataframe__`` method. + allow_copy : bool, default: False + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested).
+ + Returns + ------- + :class:`DataFrame` + """ return df_protocol.from_dataframe(df, allow_copy=allow_copy) @@ -8280,7 +8431,7 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes): )._column.unique() # Set the column dtype to the codes' dtype. The categories # will be re-assigned at the end - dtypes[idx] = min_scalar_type(len(categories[idx])) + dtypes[idx] = min_signed_type(len(categories[idx])) # Otherwise raise an error if columns have different dtypes elif not all(is_dtype_equal(c.dtype, dtypes[idx]) for c in cols): raise ValueError("All columns must be the same type") diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 9cd573aceb9..a70a42c04af 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -648,7 +648,7 @@ def __dataframe__( def from_dataframe( df: DataFrameObject, allow_copy: bool = False -) -> _CuDFDataFrame: +) -> cudf.DataFrame: """ Construct a ``DataFrame`` from ``df`` if it supports the dataframe interchange protocol (``__dataframe__``). diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 253d200f7d4..c82e073d7b7 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -389,7 +389,7 @@ def values_host(self) -> np.ndarray: return self.to_numpy() @_performance_tracking - def __array__(self, dtype=None): + def __array__(self, dtype=None, copy=None): raise TypeError( "Implicit conversion to a host NumPy array via __array__ is not " "allowed, To explicitly construct a GPU matrix, consider using " @@ -591,7 +591,7 @@ def where(self, cond, other=None, inplace: bool = False) -> Self | None: dtype: int64 .. pandas-compat:: - **DataFrame.where, Series.where** + :meth:`pandas.DataFrame.where`, :meth:`pandas.Series.where` Note that ``where`` treats missing values as falsy, in parallel with pandas treatment of nullable data: @@ -1189,7 +1189,7 @@ def searchsorted( side: Literal["left", "right"] = "left", ascending: bool = True, na_position: Literal["first", "last"] = "last", - ): + ) -> ScalarLike | cupy.ndarray: """Find indices where elements should be inserted to maintain order Parameters @@ -1527,7 +1527,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): @acquire_spill_lock() def _apply_cupy_ufunc_to_operands( self, ufunc, cupy_func, operands, **kwargs - ): + ) -> list[dict[Any, ColumnBase]]: # Note: There are some operations that may be supported by libcudf but # are not supported by pandas APIs. In particular, libcudf binary # operations support logical and/or operations as well as @@ -1538,7 +1538,7 @@ def _apply_cupy_ufunc_to_operands( # without cupy. mask = None - data = [{} for _ in range(ufunc.nout)] + data: list[dict[Any, ColumnBase]] = [{} for _ in range(ufunc.nout)] for name, (left, right, _, _) in operands.items(): cupy_inputs = [] for inp in (left, right) if ufunc.nin == 2 else (left,): @@ -1587,6 +1587,12 @@ def __pos__(self): def __abs__(self): return self._unaryop("abs") + def __bool__(self): + raise ValueError( + f"The truth value of a {type(self).__name__} is ambiguous. Use " + "a.empty, a.bool(), a.item(), a.any() or a.all()." + ) + # Reductions @classmethod @_performance_tracking @@ -1641,7 +1647,7 @@ def min( 1 .. pandas-compat:: - **DataFrame.min, Series.min** + :meth:`pandas.DataFrame.min`, :meth:`pandas.Series.min` Parameters currently not supported are `level`, `numeric_only`. """ @@ -1689,7 +1695,7 @@ def max( dtype: int64 .. 
pandas-compat:: - **DataFrame.max, Series.max** + :meth:`pandas.DataFrame.max`, :meth:`pandas.Series.max` Parameters currently not supported are `level`, `numeric_only`. """ @@ -1742,7 +1748,7 @@ def all(self, axis=0, skipna=True, **kwargs): dtype: bool .. pandas-compat:: - **DataFrame.all, Series.all** + :meth:`pandas.DataFrame.all`, :meth:`pandas.Series.all` Parameters currently not supported are `axis`, `bool_only`, `level`. @@ -1795,7 +1801,7 @@ def any(self, axis=0, skipna=True, **kwargs): dtype: bool .. pandas-compat:: - **DataFrame.any, Series.any** + :meth:`pandas.DataFrame.any`, :meth:`pandas.Series.any` Parameters currently not supported are `axis`, `bool_only`, `level`. diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index eccb3acabf6..3f91be71f29 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -22,7 +22,7 @@ from cudf._lib.sort import segmented_sort_by_key from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default -from cudf.api.types import is_bool_dtype, is_list_like, is_numeric_dtype +from cudf.api.types import is_list_like, is_numeric_dtype from cudf.core._compat import PANDAS_LT_300 from cudf.core.abc import Serializable from cudf.core.column.column import ColumnBase, StructDtype, as_column @@ -35,7 +35,12 @@ from cudf.utils.utils import GetAttrGetItemMixin if TYPE_CHECKING: - from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType + from cudf._typing import ( + AggType, + DataFrameOrSeries, + MultiColumnAggType, + ScalarLike, + ) def _deprecate_collect(): @@ -357,7 +362,7 @@ def groups(self): ) @cached_property - def indices(self): + def indices(self) -> dict[ScalarLike, cp.ndarray]: """ Dict {group name -> group indices}. @@ -739,7 +744,8 @@ def _reduce( Computed {op} of values within each group. .. pandas-compat:: - **{cls}.{op}** + :meth:`pandas.core.groupby.DataFrameGroupBy.{op}`, + :meth:`pandas.core.groupby.SeriesGroupBy.{op}` The numeric_only, min_count """ @@ -1015,18 +1021,16 @@ def ngroup(self, ascending=True): if ascending: # Count ascending from 0 to num_groups - 1 - group_ids = cudf.Series._from_data({None: cp.arange(num_groups)}) + groups = range(num_groups) elif has_null_group: # Count descending from num_groups - 1 to 0, but subtract one more # for the null group making it num_groups - 2 to -1. - group_ids = cudf.Series._from_data( - {None: cp.arange(num_groups - 2, -2, -1)} - ) + groups = range(num_groups - 2, -2, -1) else: # Count descending from num_groups - 1 to 0 - group_ids = cudf.Series._from_data( - {None: cp.arange(num_groups - 1, -1, -1)} - ) + groups = range(num_groups - 1, -1, -1) + + group_ids = cudf.Series._from_data({None: as_column(groups)}) if has_null_group: group_ids.iloc[-1] = cudf.NA @@ -1479,7 +1483,8 @@ def mult(df): 6 2 6 12 .. pandas-compat:: - **GroupBy.apply** + :meth:`pandas.core.groupby.DataFrameGroupBy.apply`, + :meth:`pandas.core.groupby.SeriesGroupBy.apply` cuDF's ``groupby.apply`` is limited compared to pandas. In some situations, Pandas returns the grouped keys as part of @@ -1531,7 +1536,7 @@ def mult(df): # For `sum` & `product`, boolean types # will need to result in `int64` type. 
for name, col in res._data.items(): - if is_bool_dtype(col.dtype): + if col.dtype.kind == "b": res._data[name] = col.astype("int") return res @@ -1713,7 +1718,7 @@ def rolling_avg(val, avg): return grouped_values.apply_chunks(function, **kwargs) @_performance_tracking - def _broadcast(self, values): + def _broadcast(self, values: cudf.Series) -> cudf.Series: """ Broadcast the results of an aggregation to the group @@ -2355,7 +2360,8 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): Object shifted within each group. .. pandas-compat:: - **GroupBy.shift** + :meth:`pandas.core.groupby.DataFrameGroupBy.shift`, + :meth:`pandas.core.groupby.SeriesGroupBy.shift` Parameter ``freq`` is unsupported. """ diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index b398ee2343e..ae20fcd5d9c 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -52,11 +52,9 @@ from cudf.core.single_column_frame import SingleColumnFrame from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( - _NUMPY_SCTYPES, _maybe_convert_to_default_type, find_common_type, is_mixed_with_object_dtype, - numeric_normalize_types, ) from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import _warn_no_dask_cudf, search_range @@ -103,7 +101,7 @@ def __subclasscheck__(self, subclass): def _lexsorted_equal_range( idx: Index | cudf.MultiIndex, - key_as_table: Frame, + keys: list[ColumnBase], is_sorted: bool, ) -> tuple[int, int, ColumnBase | None]: """Get equal range for key in lexicographically sorted index. If index @@ -118,13 +116,13 @@ def _lexsorted_equal_range( sort_vals = idx lower_bound = search_sorted( [*sort_vals._data.columns], - [*key_as_table._columns], + keys, side="left", ascending=sort_vals.is_monotonic_increasing, ).element_indexing(0) upper_bound = search_sorted( [*sort_vals._data.columns], - [*key_as_table._columns], + keys, side="right", ascending=sort_vals.is_monotonic_increasing, ).element_indexing(0) @@ -260,7 +258,9 @@ def searchsorted( ), "Invalid ascending flag" return search_range(value, self._range, side=side) - def factorize(self, sort: bool = False, use_na_sentinel: bool = True): + def factorize( + self, sort: bool = False, use_na_sentinel: bool = True + ) -> tuple[cupy.ndarray, Self]: if sort and self.step < 0: codes = cupy.arange(len(self) - 1, -1, -1) uniques = self[::-1] @@ -355,12 +355,10 @@ def _data(self): @_performance_tracking def __contains__(self, item): hash(item) - if isinstance(item, bool) or not isinstance( - item, - tuple( - _NUMPY_SCTYPES["int"] + _NUMPY_SCTYPES["float"] + [int, float] - ), - ): + if not isinstance(item, (np.floating, np.integer, int, float)): + return False + elif isinstance(item, (np.timedelta64, np.datetime64, bool)): + # Cases that would pass the above check return False try: int_item = int(item) @@ -753,15 +751,16 @@ def difference(self, other, sort=None): super().difference(other, sort=sort) ) - def _try_reconstruct_range_index(self, index): - if isinstance(index, RangeIndex) or index.dtype.kind == "f": + def _try_reconstruct_range_index( + self, index: BaseIndex + ) -> Self | BaseIndex: + if isinstance(index, RangeIndex) or index.dtype.kind not in "iu": return index # Evenly spaced values can return a # RangeIndex instead of a materialized Index. 
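The comment above is the heart of `_try_reconstruct_range_index`: an integer index whose consecutive differences are all equal (and non-zero) can be represented as a `RangeIndex` instead of materialized values. A rough NumPy-only sketch of that check, with illustrative values:

    import numpy as np

    values = np.array([10, 13, 16, 19])        # evenly spaced, step 3
    uniques = np.unique(np.diff(values))
    if len(uniques) == 1 and (diff := uniques[0]) != 0:
        reconstructed = range(values[0], values[-1] + diff, diff)
        # range(10, 22, 3) reproduces 10, 13, 16, 19 without storing them

The walrus expression mirrors the rewritten condition, which reads the single unique difference once instead of fetching it twice.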
- if not index._column.has_nulls(): + if not index._column.has_nulls(): # type: ignore[attr-defined] uniques = cupy.unique(cupy.diff(index.values)) - if len(uniques) == 1 and uniques[0].get() != 0: - diff = uniques[0].get() + if len(uniques) == 1 and (diff := uniques[0].get()) != 0: new_range = range(index[0], index[-1] + diff, diff) return type(self)(new_range, name=index.name) return index @@ -1309,7 +1308,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): return _return_get_indexer_result(result_series.to_cupy()) @_performance_tracking - def get_loc(self, key): + def get_loc(self, key) -> int | slice | cupy.ndarray: if not is_scalar(key): raise TypeError("Should be a scalar-like") @@ -1317,9 +1316,8 @@ def get_loc(self, key): self.is_monotonic_increasing or self.is_monotonic_decreasing ) - target_as_table = cudf.core.frame.Frame({"None": as_column([key])}) lower_bound, upper_bound, sort_inds = _lexsorted_equal_range( - self, target_as_table, is_sorted + self, [as_column([key])], is_sorted ) if lower_bound == upper_bound: @@ -1330,7 +1328,7 @@ def get_loc(self, key): return ( lower_bound if is_sorted - else sort_inds.element_indexing(lower_bound) + else sort_inds.element_indexing(lower_bound) # type: ignore[union-attr] ) if is_sorted: @@ -1339,8 +1337,8 @@ def get_loc(self, key): return slice(lower_bound, upper_bound) # Not sorted and not unique. Return a boolean mask - mask = cupy.full(self._data.nrows, False) - true_inds = sort_inds.slice(lower_bound, upper_bound).values + mask = cupy.full(len(self), False) + true_inds = sort_inds.slice(lower_bound, upper_bound).values # type: ignore[union-attr] mask[true_inds] = True return mask @@ -1458,18 +1456,19 @@ def notna(self): notnull = notna def _is_numeric(self): - return isinstance( - self._values, cudf.core.column.NumericalColumn - ) and self.dtype != cudf.dtype("bool") + return ( + isinstance(self._values, cudf.core.column.NumericalColumn) + and self.dtype.kind != "b" + ) def _is_boolean(self): - return self.dtype == cudf.dtype("bool") + return self.dtype.kind == "b" def _is_integer(self): - return cudf.api.types.is_integer_dtype(self.dtype) + return self.dtype.kind in "iu" def _is_floating(self): - return cudf.api.types.is_float_dtype(self.dtype) + return self.dtype.kind == "f" def _is_object(self): return isinstance(self._values, cudf.core.column.StringColumn) @@ -1599,9 +1598,13 @@ def append(self, other): f"either one of them to same dtypes." ) - if isinstance(self._values, cudf.core.column.NumericalColumn): - if self.dtype != other.dtype: - this, other = numeric_normalize_types(self, other) + if ( + isinstance(self._column, cudf.core.column.NumericalColumn) + and self.dtype != other.dtype + ): + common_type = find_common_type((self.dtype, other.dtype)) + this = this.astype(common_type) + other = other.astype(common_type) to_concat = [this, other] return self._concat(to_concat) @@ -2076,7 +2079,7 @@ def day_of_year(self): @property # type: ignore @_performance_tracking - def is_leap_year(self): + def is_leap_year(self) -> cupy.ndarray: """ Boolean indicator if the date belongs to a leap year. 
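Several helpers above (`_is_numeric`, `_is_boolean`, `_is_integer`, `_is_floating`, and the replaced `is_bool_dtype`/`is_integer_dtype`/`is_float_dtype` call sites) now branch on `dtype.kind` directly. The single-character kind codes are the standard NumPy ones, e.g.:

    import numpy as np

    np.dtype("bool").kind             # "b"
    np.dtype("int32").kind            # "i"  (unsigned integers are "u", hence the `in "iu"` checks)
    np.dtype("float64").kind          # "f"
    np.dtype("datetime64[ns]").kind   # "M"
    np.dtype("timedelta64[ns]").kind  # "m"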
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index ff10051c52d..60cd142db4b 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -26,6 +26,8 @@ import cudf import cudf._lib as libcudf +import cudf.core +import cudf.core.algorithms from cudf.api.extensions import no_default from cudf.api.types import ( _is_non_decimal_numeric_dtype, @@ -495,7 +497,7 @@ def empty(self): True .. pandas-compat:: - **DataFrame.empty, Series.empty** + :attr:`pandas.DataFrame.empty`, :attr:`pandas.Series.empty` If DataFrame/Series contains only `null` values, it is still not considered empty. See the example above. @@ -829,7 +831,7 @@ def replace( 4 4 9 e .. pandas-compat:: - **DataFrame.replace, Series.replace** + :meth:`pandas.DataFrame.replace`, :meth:`pandas.Series.replace` Parameters that are currently not supported are: `limit`, `regex`, `method` @@ -1370,7 +1372,7 @@ def sum( dtype: int64 .. pandas-compat:: - **DataFrame.sum, Series.sum** + :meth:`pandas.DataFrame.sum`, :meth:`pandas.Series.sum` Parameters currently not supported are `level`, `numeric_only`. """ @@ -1431,7 +1433,7 @@ def product( dtype: int64 .. pandas-compat:: - **DataFrame.product, Series.product** + :meth:`pandas.DataFrame.product`, :meth:`pandas.Series.product` Parameters currently not supported are level`, `numeric_only`. """ @@ -1528,7 +1530,7 @@ def median( 17.0 .. pandas-compat:: - **DataFrame.median, Series.median** + :meth:`pandas.DataFrame.median`, :meth:`pandas.Series.median` Parameters currently not supported are `level` and `numeric_only`. """ @@ -1584,7 +1586,7 @@ def std( dtype: float64 .. pandas-compat:: - **DataFrame.std, Series.std** + :meth:`pandas.DataFrame.std`, :meth:`pandas.Series.std` Parameters currently not supported are `level` and `numeric_only` @@ -1643,7 +1645,7 @@ def var( dtype: float64 .. pandas-compat:: - **DataFrame.var, Series.var** + :meth:`pandas.DataFrame.var`, :meth:`pandas.Series.var` Parameters currently not supported are `level` and `numeric_only` @@ -1699,7 +1701,7 @@ def kurtosis(self, axis=0, skipna=True, numeric_only=False, **kwargs): dtype: float64 .. pandas-compat:: - **DataFrame.kurtosis** + :meth:`pandas.DataFrame.kurtosis` Parameters currently not supported are `level` and `numeric_only` """ @@ -1761,7 +1763,7 @@ def skew(self, axis=0, skipna=True, numeric_only=False, **kwargs): dtype: float64 .. pandas-compat:: - **DataFrame.skew, Series.skew, Frame.skew** + :meth:`pandas.DataFrame.skew`, :meth:`pandas.Series.skew` The `axis` parameter is not currently supported. 
""" @@ -1987,6 +1989,8 @@ def interpolate( "Use obj.ffill() or obj.bfill() instead.", FutureWarning, ) + elif method not in {"linear", "values", "index"}: + raise ValueError(f"Interpolation method `{method}` not found") data = self @@ -2000,7 +2004,10 @@ def interpolate( ) ) - interpolator = cudf.core.algorithms.get_column_interpolator(method) + if method == "linear": + interp_index = RangeIndex(self._num_rows) + else: + interp_index = data.index columns = [] for col in data._columns: if isinstance(col, cudf.core.column.StringColumn): @@ -2012,8 +2019,9 @@ def interpolate( if col.nullable: col = col.astype("float64").fillna(np.nan) - # Interpolation methods may or may not need the index - columns.append(interpolator(col, index=data.index)) + columns.append( + cudf.core.algorithms._interpolation(col, index=interp_index) + ) result = self._from_data_like_self( self._data._from_columns_like_self(columns) @@ -2221,7 +2229,7 @@ def truncate(self, before=None, after=None, axis=0, copy=True): 2021-01-01 23:45:27 1 2 .. pandas-compat:: - **DataFrame.truncate, Series.truncate** + :meth:`pandas.DataFrame.truncate`, :meth:`pandas.Series.truncate` The ``copy`` parameter is only present for API compatibility, but ``copy=False`` is not supported. This method always generates a @@ -2657,7 +2665,7 @@ def sort_index( 2 3 1 .. pandas-compat:: - **DataFrame.sort_index, Series.sort_index** + :meth:`pandas.DataFrame.sort_index`, :meth:`pandas.Series.sort_index` * Not supporting: kind, sort_remaining=False """ @@ -3294,7 +3302,7 @@ def pad(self, value=None, axis=None, inplace=None, limit=None): ) return self.ffill(value=value, axis=axis, inplace=inplace, limit=limit) - def add_prefix(self, prefix): + def add_prefix(self, prefix, axis=None): """ Prefix labels with string `prefix`. @@ -3456,6 +3464,7 @@ def sort_values( kind="quicksort", na_position="last", ignore_index=False, + key=None, ): """Sort by the values along either axis. @@ -3471,6 +3480,14 @@ def sort_values( 'first' puts nulls at the beginning, 'last' puts nulls at the end ignore_index : bool, default False If True, index will not be sorted. + key : callable, optional + Apply the key function to the values + before sorting. This is similar to the ``key`` argument in the + builtin ``sorted`` function, with the notable difference that + this ``key`` function should be *vectorized*. It should expect a + ``Series`` and return a Series with the same shape as the input. + It will be applied to each column in `by` independently. + Currently not supported. Returns ------- @@ -3489,7 +3506,7 @@ def sort_values( 1 1 2 .. pandas-compat:: - **DataFrame.sort_values, Series.sort_values** + :meth:`pandas.DataFrame.sort_values`, :meth:`pandas.Series.sort_values` * Support axis='index' only. * Not supporting: inplace, kind @@ -3510,6 +3527,8 @@ def sort_values( ) if axis != 0: raise NotImplementedError("`axis` not currently implemented.") + if key is not None: + raise NotImplementedError("key is not currently supported.") if len(self) == 0: return self @@ -4000,7 +4019,7 @@ def resample( .. pandas-compat:: - **DataFrame.resample, Series.resample** + :meth:`pandas.DataFrame.resample`, :meth:`pandas.Series.resample` Note that the dtype of the index (or the 'on' column if using 'on=') in the result will be of a frequency closest to the @@ -4556,7 +4575,7 @@ def sample( 1 2 4 .. 
pandas-compat:: - **DataFrame.sample, Series.sample** + :meth:`pandas.DataFrame.sample`, :meth:`pandas.Series.sample` When sampling from ``axis=0/'index'``, ``random_state`` can be either a numpy random state (``numpy.random.RandomState``) @@ -5241,7 +5260,6 @@ def groupby( as_index=True, sort=no_default, group_keys=False, - squeeze=False, observed=True, dropna=True, ): @@ -5251,11 +5269,6 @@ def groupby( if axis not in (0, "index"): raise NotImplementedError("axis parameter is not yet implemented") - if squeeze is not False: - raise NotImplementedError( - "squeeze parameter is not yet implemented" - ) - if not observed: raise NotImplementedError( "observed parameter is not yet implemented" @@ -6227,13 +6240,13 @@ def rank( def convert_dtypes( self, - infer_objects=True, - convert_string=True, - convert_integer=True, - convert_boolean=True, - convert_floating=True, + infer_objects: bool = True, + convert_string: bool = True, + convert_integer: bool = True, + convert_boolean: bool = True, + convert_floating: bool = True, dtype_backend=None, - ): + ) -> Self: """ Convert columns to the best possible nullable dtypes. @@ -6244,17 +6257,21 @@ def convert_dtypes( All other dtypes are always returned as-is as all dtypes in cudf are nullable. """ - result = self.copy() - - if convert_floating: - # cast any floating columns to int64 if - # they are all integer data: - for name, col in result._data.items(): + if not (convert_floating and convert_integer): + return self.copy() + else: + cols = [] + for col in self._columns: if col.dtype.kind == "f": col = col.fillna(0) - if cp.allclose(col, col.astype("int64")): - result._data[name] = col.astype("int64") - return result + as_int = col.astype("int64") + if cp.allclose(col, as_int): + cols.append(as_int) + continue + cols.append(col) + return self._from_data_like_self( + self._data._from_columns_like_self(cols, verify=False) + ) @_warn_no_dask_cudf def __dask_tokenize__(self): diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py index a5fed02cbed..a0089242909 100644 --- a/python/cudf/cudf/core/indexing_utils.py +++ b/python/cudf/cudf/core/indexing_utils.py @@ -8,12 +8,7 @@ from typing_extensions import TypeAlias import cudf -from cudf.api.types import ( - _is_scalar_or_zero_d_array, - is_bool_dtype, - is_integer, - is_integer_dtype, -) +from cudf.api.types import _is_scalar_or_zero_d_array, is_integer from cudf.core.copy_types import BooleanMask, GatherMap @@ -230,11 +225,11 @@ def parse_row_iloc_indexer(key: Any, n: int) -> IndexingSpec: key = cudf.core.column.as_column(key) if isinstance(key, cudf.core.column.CategoricalColumn): key = key.astype(key.codes.dtype) - if is_bool_dtype(key.dtype): + if key.dtype.kind == "b": return MaskIndexer(BooleanMask(key, n)) elif len(key) == 0: return EmptyIndexer() - elif is_integer_dtype(key.dtype): + elif key.dtype.kind in "iu": return MapIndexer(GatherMap(key, n, nullify=False)) else: raise TypeError( diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index dd0a4f666a1..32c84763401 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -9,7 +9,7 @@ import numpy as np import cudf -from cudf.api.types import is_decimal_dtype, is_dtype_equal +from cudf.api.types import is_decimal_dtype, is_dtype_equal, is_numeric_dtype from cudf.core.column import CategoricalColumn from cudf.core.dtypes import CategoricalDtype @@ -88,38 +88,25 @@ def _match_join_keys( ) if ( - 
np.issubdtype(ltype, np.number) - and np.issubdtype(rtype, np.number) - and not ( - np.issubdtype(ltype, np.timedelta64) - or np.issubdtype(rtype, np.timedelta64) - ) + is_numeric_dtype(ltype) + and is_numeric_dtype(rtype) + and not (ltype.kind == "m" or rtype.kind == "m") ): common_type = ( max(ltype, rtype) if ltype.kind == rtype.kind else np.result_type(ltype, rtype) ) - elif ( - np.issubdtype(ltype, np.datetime64) - and np.issubdtype(rtype, np.datetime64) - ) or ( - np.issubdtype(ltype, np.timedelta64) - and np.issubdtype(rtype, np.timedelta64) + elif (ltype.kind == "M" and rtype.kind == "M") or ( + ltype.kind == "m" and rtype.kind == "m" ): common_type = max(ltype, rtype) - elif ( - np.issubdtype(ltype, np.datetime64) - or np.issubdtype(ltype, np.timedelta64) - ) and not rcol.fillna(0).can_cast_safely(ltype): + elif ltype.kind in "mM" and not rcol.fillna(0).can_cast_safely(ltype): raise TypeError( f"Cannot join between {ltype} and {rtype}, please type-cast both " "columns to the same type." ) - elif ( - np.issubdtype(rtype, np.datetime64) - or np.issubdtype(rtype, np.timedelta64) - ) and not lcol.fillna(0).can_cast_safely(rtype): + elif rtype.kind in "mM" and not lcol.fillna(0).can_cast_safely(rtype): raise TypeError( f"Cannot join between {rtype} and {ltype}, please type-cast both " "columns to the same type." diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 9cbe863142b..ff4b06c6334 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -7,9 +7,7 @@ import operator import pickle import warnings -from collections import abc from functools import cached_property -from numbers import Integral from typing import TYPE_CHECKING, Any, MutableMapping import cupy as cp @@ -20,9 +18,10 @@ import cudf._lib as libcudf from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default -from cudf.api.types import is_integer, is_list_like, is_object_dtype +from cudf.api.types import is_integer, is_list_like, is_object_dtype, is_scalar from cudf.core import column from cudf.core._base_index import _return_get_indexer_result +from cudf.core.algorithms import factorize from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import Frame from cudf.core.index import ( @@ -63,6 +62,20 @@ def _maybe_indices_to_slice(indices: cp.ndarray) -> slice | cp.ndarray: return indices +def _compute_levels_and_codes( + data: MutableMapping, +) -> tuple[list[cudf.Index], list[column.ColumnBase]]: + """Return MultiIndex level and codes from a ColumnAccessor-like mapping.""" + levels = [] + codes = [] + for col in data.values(): + code, cats = factorize(col) + codes.append(column.as_column(code.astype(np.int64))) + levels.append(cats) + + return levels, codes + + class MultiIndex(Frame, BaseIndex, NotIterable): """A multi-level or hierarchical index. 
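The `_match_join_keys` rewrite above keeps the existing dtype-resolution rule while expressing it through `dtype.kind`: for two numeric, non-timedelta keys the wider dtype wins when the kinds match, and NumPy promotion decides otherwise; datetime/datetime and timedelta/timedelta pairs take the wider of the two. A NumPy-only illustration of the numeric branch (the helper name here is just for illustration, not the cudf code path itself):

    import numpy as np

    def common_join_dtype(ltype, rtype):
        return max(ltype, rtype) if ltype.kind == rtype.kind else np.result_type(ltype, rtype)

    common_join_dtype(np.dtype("int32"), np.dtype("int64"))    # int64: same kind, larger width wins
    common_join_dtype(np.dtype("int64"), np.dtype("float32"))  # float64: mixed kinds fall back to promotion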
@@ -145,50 +158,36 @@ def __init__( raise NotImplementedError( "Use `names`, `name` is not yet supported" ) - if len(levels) == 0: - raise ValueError("Must pass non-zero number of levels/codes") - if not isinstance(codes, cudf.DataFrame) and not isinstance( - codes[0], (abc.Sequence, np.ndarray, cp.ndarray) - ): - raise TypeError("Codes is not a Sequence of sequences") - - if copy: - if isinstance(codes, cudf.DataFrame): - codes = codes.copy(deep=True) - if len(levels) > 0 and isinstance( - levels[0], (cudf.Index, cudf.Series) - ): - levels = [level.copy(deep=True) for level in levels] - - if not isinstance(codes, cudf.DataFrame): - if len(levels) == len(codes): - codes = cudf.DataFrame._from_data( - { - i: column.as_column(code).astype(np.int64) - for i, code in enumerate(codes) - } - ) - else: - raise ValueError( - "MultiIndex has unequal number of levels and " - "codes and is inconsistent!" - ) - - levels = [ensure_index(level) for level in levels] - - if len(levels) != len(codes._data): + if levels is None or codes is None: + raise TypeError("Must pass both levels and codes") + elif not (is_list_like(levels) and len(levels) > 0): + raise ValueError("Must pass non-zero length sequence of levels") + elif not (is_list_like(codes) and len(codes) > 0): + raise ValueError("Must pass non-zero length sequence of codes") + elif len(codes) != len(levels): raise ValueError( - "MultiIndex has unequal number of levels and " - "codes and is inconsistent!" - ) - if len({c.size for c in codes._data.columns}) != 1: - raise ValueError( - "MultiIndex length of codes does not match " - "and is inconsistent!" + f"levels must have the same length ({len(levels)}) " + f"as codes ({len(codes)})." ) + new_levels = [] + for level in levels: + new_level = ensure_index(level) + if copy and new_level is level: + new_level = new_level.copy(deep=True) + new_levels.append(new_level) + + new_codes = [] + for code in codes: + if not (is_list_like(code) or is_column_like(code)): + raise TypeError("Each code must be list-like") + new_code = column.as_column(code).astype("int64") + if copy and new_code is code: + new_code = new_code.copy(deep=True) + new_codes.append(new_code) + source_data = {} - for (column_name, code), level in zip(codes._data.items(), levels): + for i, (code, level) in enumerate(zip(new_codes, new_levels)): if len(code): lo, hi = libcudf.reduce.minmax(code) if lo.value < -1 or hi.value > len(level) - 1: @@ -201,13 +200,11 @@ def __init__( result_col = libcudf.copying.gather( [level._column], code, nullify=True ) - source_data[column_name] = result_col[0]._with_type_metadata( - level.dtype - ) + source_data[i] = result_col[0]._with_type_metadata(level.dtype) - super().__init__(source_data) - self._levels = levels - self._codes = codes + super().__init__(ColumnAccessor(source_data)) + self._levels = new_levels + self._codes = new_codes self._name = None self.names = names @@ -349,10 +346,37 @@ def _from_data( data: MutableMapping, name: Any = None, ) -> MultiIndex: - obj = cls.from_frame(cudf.DataFrame._from_data(data=data)) - if name is not None: - obj.name = name - return obj + """ + Use when you have a ColumnAccessor-like mapping but no codes and levels. 
+ """ + levels, codes = _compute_levels_and_codes(data) + return cls._simple_new( + data=ColumnAccessor(data), + levels=levels, + codes=codes, + names=pd.core.indexes.frozen.FrozenList(data.keys()), + name=name, + ) + + @classmethod + def _simple_new( + cls, + data: ColumnAccessor, + levels: list[cudf.Index], + codes: list[column.ColumnBase], + names: pd.core.indexes.frozen.FrozenList, + name: Any = None, + ) -> Self: + """ + Use when you have a ColumnAccessor-like mapping, codes, and levels. + """ + mi = object.__new__(cls) + mi._data = data + mi._levels = levels + mi._codes = codes + mi._names = names + mi._name = name + return mi @property # type: ignore @_performance_tracking @@ -420,18 +444,17 @@ def copy( 2020-08-28 AMZN 3401.80 MSFT 228.91 """ - - mi = MultiIndex._from_data(self._data.copy(deep=deep)) - if self._levels is not None: - mi._levels = [idx.copy(deep=deep) for idx in self._levels] - if self._codes is not None: - mi._codes = self._codes.copy(deep) if names is not None: - mi.names = names - elif self.names is not None: - mi.names = self.names.copy() - - return mi + names = pd.core.indexes.frozen.FrozenList(names) + else: + names = self.names + return type(self)._simple_new( + data=self._data.copy(deep=deep), + levels=[idx.copy(deep=deep) for idx in self._levels], + codes=[code.copy(deep=deep) for code in self._codes], + names=names, + name=name, + ) @_performance_tracking def __repr__(self): @@ -477,14 +500,8 @@ def __repr__(self): data_output = "\n".join(lines) return output_prefix + data_output - @property - def _codes_frame(self): - if self._codes is None: - self._compute_levels_and_codes() - return self._codes - @property # type: ignore - @_external_only_api("Use ._codes_frame instead") + @_external_only_api("Use ._codes instead") @_performance_tracking def codes(self): """ @@ -504,7 +521,7 @@ def codes(self): FrozenList([[0, 1, 2], [0, 1, 2]]) """ return pd.core.indexes.frozen.FrozenList( - col.values for col in self._codes_frame._columns + col.values for col in self._codes ) def get_slice_bound(self, label, side, kind=None): @@ -518,13 +535,13 @@ def nlevels(self): @property # type: ignore @_performance_tracking - def levels(self): + def levels(self) -> list[cudf.Index]: """ Returns list of levels in the MultiIndex Returns ------- - List of Series objects + List of Index objects Examples -------- @@ -544,9 +561,9 @@ def levels(self): >>> midx.levels [Index([1, 2, 3], dtype='int64', name='a'), Index([10, 11, 12], dtype='int64', name='b')] """ # noqa: E501 - if self._levels is None: - self._compute_levels_and_codes() - return self._levels + return [ + idx.rename(name) for idx, name in zip(self._levels, self.names) + ] @property # type: ignore @_performance_tracking @@ -565,11 +582,10 @@ def _get_level_label(self, level): else if level is index of the level, then level label will be returned as per the index. 
""" - - if level in self._data.names: + if level in self.names: return level else: - return self._data.names[level] + return self.names[level] @_performance_tracking def isin(self, values, level=None): @@ -670,20 +686,6 @@ def where(self, cond, other=None, inplace=False): ".where is not supported for MultiIndex operations" ) - @_performance_tracking - def _compute_levels_and_codes(self): - levels = [] - - codes = {} - for name, col in self._data.items(): - code, cats = cudf.Series._from_data({None: col}).factorize() - cats.name = name - codes[name] = code.astype(np.int64) - levels.append(cats) - - self._levels = levels - self._codes = cudf.DataFrame._from_data(codes) - @_performance_tracking def _compute_validity_mask(self, index, row_tuple, max_length): """Computes the valid set of indices of values in the lookup""" @@ -822,7 +824,7 @@ def _index_and_downcast(self, result, index, index_key): result.names = index.names[size:] index = MultiIndex( levels=index.levels[size:], - codes=index._codes_frame.iloc[:, size:], + codes=index._codes[size:], names=index.names[size:], ) @@ -839,10 +841,6 @@ def _get_row_major( | tuple[Any, ...] | list[tuple[Any, ...]], ) -> DataFrameOrSeries: - if pd.api.types.is_bool_dtype( - list(row_tuple) if isinstance(row_tuple, tuple) else row_tuple - ): - return df[row_tuple] if isinstance(row_tuple, slice): if row_tuple.start is None: row_tuple = slice(self[0], row_tuple.stop, row_tuple.step) @@ -932,28 +930,29 @@ def deserialize(cls, header, frames): def __getitem__(self, index): flatten = isinstance(index, int) - if isinstance(index, (Integral, abc.Sequence)): - index = np.array(index) - elif isinstance(index, slice): + if isinstance(index, slice): start, stop, step = index.indices(len(self)) - index = column.as_column(range(start, stop, step)) - result = MultiIndex.from_frame( - self.to_frame(index=False, name=range(0, self.nlevels)).take( - index - ), - names=self.names, + idx = range(start, stop, step) + elif is_scalar(index): + idx = [index] + else: + idx = index + + indexer = column.as_column(idx) + ca = self._data._from_columns_like_self( + (col.take(indexer) for col in self._columns), verify=False + ) + codes = [code.take(indexer) for code in self._codes] + result = type(self)._simple_new( + data=ca, codes=codes, levels=self._levels, names=self.names ) # we are indexing into a single row of the MultiIndex, # return that row as a tuple: if flatten: return result.to_pandas()[0] - - if self._codes_frame is not None: - result._codes = self._codes_frame.take(index) - if self._levels is not None: - result._levels = self._levels - return result + else: + return result @_performance_tracking def to_frame(self, index=True, name=no_default, allow_duplicates=False): @@ -1269,25 +1268,12 @@ def from_frame(cls, df: pd.DataFrame | cudf.DataFrame, names=None): ('NJ', 'Precip')], names=['state', 'observation']) """ - obj = cls.__new__(cls) - super(cls, obj).__init__() - - source_data = df.copy(deep=False) - source_data.reset_index(drop=True, inplace=True) - if isinstance(source_data, pd.DataFrame): - source_data = cudf.DataFrame.from_pandas(source_data) - - names = names if names is not None else source_data._data.names - # if names are unique - # try using those as the source_data column names: - if len(dict.fromkeys(names)) == len(names): - source_data.columns = names - obj._name = None - obj._data = source_data._data - obj.names = names - obj._codes = None - obj._levels = None - return obj + if isinstance(df, pd.DataFrame): + source_data = 
cudf.DataFrame.from_pandas(df) + else: + source_data = df + names = names if names is not None else source_data._column_names + return cls.from_arrays(source_data._columns, names=names) @classmethod @_performance_tracking @@ -1373,9 +1359,6 @@ def from_arrays( (2, 'blue')], names=['number', 'color']) """ - # Imported here due to circular import - from cudf.core.algorithms import factorize - error_msg = "Input must be a list / sequence of array-likes." if not is_list_like(arrays): raise TypeError(error_msg) @@ -1438,7 +1421,7 @@ def _poplevels(self, level): # update self self.names = names - self._compute_levels_and_codes() + self._levels, self._codes = _compute_levels_and_codes(self._data) return popped @@ -1562,13 +1545,19 @@ def to_pandas( ) -> pd.MultiIndex: # cudf uses np.iinfo(size_type_dtype).min as missing code # pandas uses -1 as missing code - pd_codes = self._codes_frame.replace(np.iinfo(size_type_dtype).min, -1) + pd_codes = ( + code.find_and_replace( + column.as_column(np.iinfo(size_type_dtype).min, length=1), + column.as_column(-1, length=1), + ) + for code in self._codes + ) return pd.MultiIndex( levels=[ level.to_pandas(nullable=nullable, arrow_type=arrow_type) for level in self.levels ], - codes=[col.values_host for col in pd_codes._columns], + codes=[col.values_host for col in pd_codes], names=self.names, ) @@ -1743,13 +1732,9 @@ def _clean_nulls_from_index(self): @_performance_tracking def memory_usage(self, deep=False): - usage = sum(col.memory_usage for col in self._data.columns) - if self.levels: - for level in self.levels: - usage += level.memory_usage(deep=deep) - if self._codes_frame: - for col in self._codes_frame._data.columns: - usage += col.memory_usage + usage = sum(col.memory_usage for col in self._columns) + usage += sum(level.memory_usage(deep=deep) for level in self._levels) + usage += sum(code.memory_usage for code in self._codes) return usage @_performance_tracking @@ -1937,17 +1922,18 @@ def get_loc(self, key): # Handle partial key search. If length of `key` is less than `nlevels`, # Only search levels up to `len(key)` level. - key_as_table = cudf.core.frame.Frame( - {i: column.as_column(k, length=1) for i, k in enumerate(key)} - ) partial_index = self.__class__._from_data( - data=self._data.select_by_index(slice(key_as_table._num_columns)) + data=self._data.select_by_index(slice(len(key))) ) ( lower_bound, upper_bound, sort_inds, - ) = _lexsorted_equal_range(partial_index, key_as_table, is_sorted) + ) = _lexsorted_equal_range( + partial_index, + [column.as_column(k, length=1) for k in key], + is_sorted, + ) if lower_bound == upper_bound: raise KeyError(key) @@ -1972,7 +1958,7 @@ def get_loc(self, key): return true_inds # Not sorted and not unique. 
Return a boolean mask - mask = cp.full(self._data.nrows, False) + mask = cp.full(len(self), False) mask[true_inds] = True return mask @@ -2045,7 +2031,7 @@ def _union(self, other, sort=None): ignore_index=True, ) - midx = MultiIndex.from_frame(result_df.iloc[:, : self.nlevels]) + midx = type(self)._from_data(result_df.iloc[:, : self.nlevels]._data) midx.names = self.names if self.names == other.names else None if sort in {None, True} and len(other): return midx.sort_values() @@ -2069,7 +2055,8 @@ def _intersection(self, other, sort=None): self_df.columns = col_names result_df = cudf.merge(self_df, other_df, how="inner") - midx = self.__class__.from_frame(result_df, names=res_name) + midx = type(self)._from_data(result_df._data) + midx.names = res_name if sort in {None, True} and len(other): return midx.sort_values() return midx @@ -2079,6 +2066,7 @@ def _copy_type_metadata(self: Self, other: Self) -> Self: res = super()._copy_type_metadata(other) if isinstance(other, MultiIndex): res._names = other._names + self._levels, self._codes = _compute_levels_and_codes(res._data) return res @_performance_tracking diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py index cdd4ec6f8e5..4e0c5bd86b9 100644 --- a/python/cudf/cudf/core/resample.py +++ b/python/cudf/cudf/core/resample.py @@ -13,9 +13,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations import pickle import warnings +from typing import TYPE_CHECKING import numpy as np import pandas as pd @@ -23,7 +25,6 @@ import cudf import cudf._lib.labeling import cudf.core.index -from cudf._typing import DataFrameOrSeries from cudf.core.groupby.groupby import ( DataFrameGroupBy, GroupBy, @@ -31,6 +32,9 @@ _Grouping, ) +if TYPE_CHECKING: + from cudf._typing import DataFrameOrSeries + class _Resampler(GroupBy): grouping: "_ResampleGrouping" diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 1120642947b..b538ae34b6f 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1060,7 +1060,7 @@ def pivot(data, columns=None, index=no_default, values=no_default): return result -def unstack(df, level, fill_value=None): +def unstack(df, level, fill_value=None, sort: bool = True): """ Pivot one or more levels of the (necessarily hierarchical) index labels. @@ -1080,6 +1080,9 @@ def unstack(df, level, fill_value=None): levels of the index to pivot fill_value Non-functional argument provided for compatibility with Pandas. + sort : bool, default True + Sort the level(s) in the resulting MultiIndex columns. 
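The `sort` keyword documented above is accepted for pandas signature compatibility only; the guard added further down raises as soon as ``sort=False`` is passed. A short usage sketch, assuming the top-level ``cudf.unstack`` re-export:

    import cudf

    df = cudf.DataFrame(
        {"val": [1, 2, 3, 4]},
        index=cudf.MultiIndex.from_tuples(
            [("a", 1), ("a", 2), ("b", 1), ("b", 2)], names=["outer", "inner"]
        ),
    )
    cudf.unstack(df, level=1)              # default sort=True, behaves as before
    cudf.unstack(df, level=1, sort=False)  # raises NotImplementedError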
+ Returns ------- @@ -1156,6 +1159,8 @@ def unstack(df, level, fill_value=None): if fill_value is not None: raise NotImplementedError("fill_value is not supported.") + elif sort is False: + raise NotImplementedError(f"{sort=} is not supported.") if pd.api.types.is_list_like(level): if not level: return df diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 29460d8c67e..f6331aa1f49 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -8,7 +8,7 @@ import pyarrow as pa import cudf -from cudf.api.types import is_datetime64_dtype, is_scalar, is_timedelta64_dtype +from cudf.api.types import is_scalar from cudf.core.dtypes import ListDtype, StructDtype from cudf.core.missing import NA, NaT from cudf.core.mixins import BinaryOperand @@ -245,11 +245,7 @@ def _preprocess_host_value(self, value, dtype): dtype = cudf.dtype(dtype) if not valid: - value = ( - NaT - if is_datetime64_dtype(dtype) or is_timedelta64_dtype(dtype) - else NA - ) + value = NaT if dtype.kind in "mM" else NA return value, dtype diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 4a60470fafa..d8dbaa897e7 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -22,10 +22,8 @@ from cudf.api.types import ( _is_non_decimal_numeric_dtype, _is_scalar_or_zero_d_array, - is_bool_dtype, is_dict_like, is_integer, - is_integer_dtype, is_scalar, ) from cudf.core import indexing_utils @@ -214,17 +212,17 @@ def __setitem__(self, key, value): and self._frame.dtype.categories.dtype.kind == "f" ) ) - and isinstance(value, (np.float32, np.float64)) + and isinstance(value, np.floating) and np.isnan(value) ): raise MixedTypeError( f"Cannot assign {value=} to " f"non-float dtype={self._frame.dtype}" ) - elif ( - self._frame.dtype.kind == "b" - and not is_bool_dtype(value) - and value not in {None, cudf.NA} + elif self._frame.dtype.kind == "b" and not ( + value in {None, cudf.NA} + or isinstance(value, (np.bool_, bool)) + or (isinstance(value, cudf.Scalar) and value.dtype.kind == "b") ): raise MixedTypeError( f"Cannot assign {value=} to " @@ -357,12 +355,10 @@ def _loc_to_iloc(self, arg): ) if not _is_non_decimal_numeric_dtype(index_dtype) and not ( isinstance(index_dtype, cudf.CategoricalDtype) - and is_integer_dtype(index_dtype.categories.dtype) + and index_dtype.categories.dtype.kind in "iu" ): # TODO: switch to cudf.utils.dtypes.is_integer(arg) - if isinstance(arg, cudf.Scalar) and is_integer_dtype( - arg.dtype - ): + if isinstance(arg, cudf.Scalar) and arg.dtype.kind in "iu": # Do not remove until pandas 3.0 support is added. assert ( PANDAS_LT_300 @@ -961,7 +957,7 @@ def reindex(self, *args, **kwargs): dtype: int64 .. pandas-compat:: - **Series.reindex** + :meth:`pandas.Series.reindex` Note: One difference from Pandas is that ``NA`` is used for rows that do not match, rather than ``NaN``. One side effect of this is @@ -1244,7 +1240,7 @@ def map(self, arg, na_action=None) -> "Series": dtype: int64 .. pandas-compat:: - **Series.map** + :meth:`pandas.Series.map` Please note map currently only supports fixed-width numeric type functions. @@ -2064,6 +2060,7 @@ def sort_values( kind="quicksort", na_position="last", ignore_index=False, + key=None, ): """Sort by the values along either axis. @@ -2077,6 +2074,14 @@ def sort_values( 'first' puts nulls at the beginning, 'last' puts nulls at the end ignore_index : bool, default False If True, index will not be sorted. 
+ key : callable, optional + Apply the key function to the values + before sorting. This is similar to the ``key`` argument in the + builtin ``sorted`` function, with the notable difference that + this ``key`` function should be *vectorized*. It should expect a + ``Series`` and return a Series with the same shape as the input. + It will be applied to each column in `by` independently. + Currently not supported. Returns ------- @@ -2095,7 +2100,7 @@ def sort_values( dtype: int64 .. pandas-compat:: - **Series.sort_values** + :meth:`pandas.Series.sort_values` * Support axis='index' only. * The inplace and kind argument is currently unsupported @@ -2108,6 +2113,7 @@ def sort_values( kind=kind, na_position=na_position, ignore_index=ignore_index, + key=key, ) @_performance_tracking @@ -2551,7 +2557,7 @@ def count(self): 5 .. pandas-compat:: - **Series.count** + :meth:`pandas.Series.count` Parameters currently not supported is `level`. """ @@ -2662,7 +2668,7 @@ def cov(self, other, min_periods=None): -0.015750000000000004 .. pandas-compat:: - **Series.cov** + :meth:`pandas.Series.cov` `min_periods` parameter is not yet supported. """ @@ -3221,7 +3227,7 @@ def describe( percentiles = np.array([0.25, 0.5, 0.75]) dtype = "str" - if is_bool_dtype(self.dtype): + if self.dtype.kind == "b": data = _describe_categorical(self, percentiles) elif isinstance(self._column, cudf.core.column.NumericalColumn): data = _describe_numeric(self, percentiles) @@ -3369,7 +3375,6 @@ def groupby( as_index=True, sort=no_default, group_keys=False, - squeeze=False, observed=True, dropna=True, ): @@ -3380,7 +3385,6 @@ def groupby( as_index, sort, group_keys, - squeeze, observed, dropna, ) @@ -3423,7 +3427,7 @@ def rename(self, index=None, copy=True): 'numeric_series' .. pandas-compat:: - **Series.rename** + :meth:`pandas.Series.rename` - Supports scalar values only for changing name attribute - The ``inplace`` and ``level`` is not supported @@ -3432,7 +3436,9 @@ def rename(self, index=None, copy=True): return Series._from_data(out_data, self.index, name=index) @_performance_tracking - def add_prefix(self, prefix): + def add_prefix(self, prefix, axis=None): + if axis is not None: + raise NotImplementedError("axis is currently not implemented.") return Series._from_data( # TODO: Change to deep=False when copy-on-write is default data=self._data.copy(deep=True), @@ -3530,7 +3536,12 @@ def explode(self, ignore_index=False): @_performance_tracking def pct_change( - self, periods=1, fill_method=no_default, limit=no_default, freq=None + self, + periods=1, + fill_method=no_default, + limit=no_default, + freq=None, + **kwargs, ): """ Calculates the percent change between sequential elements @@ -3555,6 +3566,9 @@ def pct_change( freq : str, optional Increment to use from time series API. Not yet implemented. + **kwargs + Additional keyword arguments are passed into + `Series.shift`. 
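The `key` parameter threaded through both `DataFrame.sort_values` and `Series.sort_values` in this diff is likewise signature-only for now: the shared `IndexedFrame` implementation raises once a callable is supplied. For example:

    import cudf

    s = cudf.Series([3, 1, 2])
    s.sort_values()                  # works as before
    s.sort_values(key=lambda x: -x)  # raises NotImplementedError: key is not currently supported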
Returns ------- @@ -3599,11 +3613,15 @@ def pct_change( warnings.simplefilter("ignore") data = self.fillna(method=fill_method, limit=limit) diff = data.diff(periods=periods) - change = diff / data.shift(periods=periods, freq=freq) + change = diff / data.shift(periods=periods, freq=freq, **kwargs) return change @_performance_tracking - def where(self, cond, other=None, inplace=False): + def where(self, cond, other=None, inplace=False, axis=None, level=None): + if axis is not None: + raise NotImplementedError("axis is not supported.") + elif level is not None: + raise NotImplementedError("level is not supported.") result_col = super().where(cond, other, inplace) return self._mimic_inplace( self._from_data_like_self( @@ -4703,7 +4721,7 @@ def strftime(self, date_format: str, *args, **kwargs) -> Series: dtype: object .. pandas-compat:: - **series.DatetimeProperties.strftime** + :meth:`pandas.DatetimeIndex.strftime` The following date format identifiers are not yet supported: ``%c``, ``%x``,``%X`` @@ -4731,9 +4749,7 @@ def strftime(self, date_format: str, *args, **kwargs) -> Series: f"for tracking purposes." ) return self._return_result_like_self( - self.series._column.as_string_column( - dtype="str", format=date_format - ) + self.series._column.strftime(format=date_format) ) @copy_docstring(DatetimeIndex.tz_localize) diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index f9555aee6a2..b93528f9693 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -11,9 +11,7 @@ from cudf.api.extensions import no_default from cudf.api.types import ( _is_scalar_or_zero_d_array, - is_bool_dtype, is_integer, - is_integer_dtype, is_numeric_dtype, ) from cudf.core.column import ColumnBase, as_column @@ -92,12 +90,6 @@ def shape(self) -> tuple[int]: """Get a tuple representing the dimensionality of the Index.""" return (len(self),) - def __bool__(self): - raise TypeError( - f"The truth value of a {type(self)} is ambiguous. Use " - "a.empty, a.bool(), a.item(), a.any() or a.all()." 
- ) - @property # type: ignore @_performance_tracking def _num_columns(self) -> int: @@ -359,9 +351,9 @@ def _get_elements_from_column(self, arg) -> ScalarLike | ColumnBase: arg = as_column(arg) if len(arg) == 0: arg = cudf.core.column.column_empty(0, dtype="int32") - if is_integer_dtype(arg.dtype): + if arg.dtype.kind in "iu": return self._column.take(arg) - if is_bool_dtype(arg.dtype): + if arg.dtype.kind == "b": if (bn := len(arg)) != (n := len(self)): raise IndexError( f"Boolean mask has wrong length: {bn} not {n}" diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 397bfe1d472..c6e2b5d10e1 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -6,7 +6,6 @@ import warnings from typing import Literal, Sequence -import cupy as cp import numpy as np import pandas as pd import pandas.tseries.offsets as pd_offset @@ -216,25 +215,25 @@ def to_datetime( + arg[unit_rev["day"]].astype("str").str.zfill(2) ) format = "%Y-%m-%d" - col = new_series._column.as_datetime_column( - "datetime64[s]", format=format - ) - for u in ["h", "m", "s", "ms", "us", "ns"]: value = unit_rev.get(u) if value is not None and value in arg: arg_col = arg._data[value] - if arg_col.dtype.kind in ("f"): - col = new_series._column.as_datetime_column( - "datetime64[ns]", format=format + if arg_col.dtype.kind == "f": + col = new_series._column.strptime( + cudf.dtype("datetime64[ns]"), format=format ) break - elif arg_col.dtype.kind in ("O"): + elif arg_col.dtype.kind == "O": if not cpp_is_integer(arg_col).all(): - col = new_series._column.as_datetime_column( - "datetime64[ns]", format=format + col = new_series._column.strptime( + cudf.dtype("datetime64[ns]"), format=format ) break + else: + col = new_series._column.strptime( + cudf.dtype("datetime64[s]"), format=format + ) times_column = None for u in ["h", "m", "s", "ms", "us", "ns"]: @@ -334,15 +333,15 @@ def _process_col( col = ( col.astype("int") .astype("str") - .as_datetime_column( - dtype="datetime64[us]" + .strptime( + dtype=cudf.dtype("datetime64[us]") if "%f" in format - else "datetime64[s]", + else cudf.dtype("datetime64[s]"), format=format, ) ) else: - col = col.as_datetime_column(dtype="datetime64[ns]") + col = col.astype(dtype="datetime64[ns]") elif col.dtype.kind in "iu": if unit in ("D", "h", "m"): @@ -353,11 +352,11 @@ def _process_col( col = col * factor if format is not None: - col = col.astype("str").as_datetime_column( - dtype=_unit_dtype_map[unit], format=format + col = col.astype("str").strptime( + dtype=cudf.dtype(_unit_dtype_map[unit]), format=format ) else: - col = col.as_datetime_column(dtype=_unit_dtype_map[unit]) + col = col.astype(dtype=cudf.dtype(_unit_dtype_map[unit])) elif col.dtype.kind == "O": if unit not in (None, "ns") or col.null_count == len(col): @@ -384,8 +383,8 @@ def _process_col( element=col.element_indexing(0), dayfirst=dayfirst, ) - col = col.as_datetime_column( - dtype=_unit_dtype_map[unit], + col = col.strptime( + dtype=cudf.dtype(_unit_dtype_map[unit]), format=format, ) elif col.dtype.kind != "M": @@ -894,7 +893,7 @@ def date_range( # integers and divide the number range evenly with `periods` elements. 
start = cudf.Scalar(start, dtype=dtype).value.astype("int64") end = cudf.Scalar(end, dtype=dtype).value.astype("int64") - arr = cp.linspace(start=start, stop=end, num=periods) + arr = np.linspace(start=start, stop=end, num=periods) result = cudf.core.column.as_column(arr).astype("datetime64[ns]") return cudf.DatetimeIndex._from_data({name: result}).tz_localize(tz) @@ -991,8 +990,10 @@ def date_range( stop = end_estim.astype("int64") start = start.value.astype("int64") step = _offset_to_nanoseconds_lower_bound(offset) - arr = cp.arange(start=start, stop=stop, step=step, dtype="int64") - res = cudf.core.column.as_column(arr).astype("datetime64[ns]") + arr = range(int(start), int(stop), step) + res = cudf.core.column.as_column(arr, dtype="int64").astype( + "datetime64[ns]" + ) return cudf.DatetimeIndex._from_data({name: res}, freq=freq).tz_localize( tz diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index ef6b86a04a7..07158e4ee61 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -8,12 +8,7 @@ import cudf from cudf import _lib as libcudf from cudf._lib import strings as libstrings -from cudf.api.types import ( - _is_non_decimal_numeric_dtype, - is_datetime_dtype, - is_string_dtype, - is_timedelta_dtype, -) +from cudf.api.types import _is_non_decimal_numeric_dtype, is_string_dtype from cudf.core.column import as_column from cudf.core.dtypes import CategoricalDtype from cudf.utils.dtypes import can_convert_to_column @@ -85,7 +80,7 @@ def to_numeric(arg, errors="raise", downcast=None): dtype: float64 .. pandas-compat:: - **cudf.to_numeric** + :func:`pandas.to_numeric` An important difference from pandas is that this function does not accept mixed numeric/non-numeric type sequences. @@ -114,7 +109,7 @@ def to_numeric(arg, errors="raise", downcast=None): col = as_column(arg) dtype = col.dtype - if is_datetime_dtype(dtype) or is_timedelta_dtype(dtype): + if dtype.kind in "mM": col = col.astype(cudf.dtype("int64")) elif isinstance(dtype, CategoricalDtype): cat_dtype = col.dtype.type diff --git a/python/cudf/cudf/core/window/ewm.py b/python/cudf/cudf/core/window/ewm.py index 21693e106bd..bb153d4b549 100644 --- a/python/cudf/cudf/core/window/ewm.py +++ b/python/cudf/cudf/core/window/ewm.py @@ -56,7 +56,7 @@ class ExponentialMovingWindow(_RollingBase): the equivalent pandas method. .. pandas-compat:: - **cudf.core.window.ExponentialMovingWindow** + :meth:`pandas.DataFrame.ewm` The parameters ``min_periods``, ``ignore_na``, ``axis``, and ``times`` are not yet supported. 
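In the `date_range` changes above, the `periods` branch now gets its evenly spaced timestamps from `np.linspace` over int64 nanoseconds, and the frequency branch builds the sequence from a plain `range` before converting to ``datetime64[ns]``, removing the CuPy calls. A NumPy-only sketch of the `periods` idea, with illustrative endpoints:

    import numpy as np

    start = np.datetime64("2024-01-01").astype("datetime64[ns]").astype("int64")
    end = np.datetime64("2024-01-02").astype("datetime64[ns]").astype("int64")
    stamps = np.linspace(start=start, stop=end, num=5).astype("int64").astype("datetime64[ns]")
    # five evenly spaced timestamps between 2024-01-01 and 2024-01-02, inclusive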
Behavior is defined only for data that begins diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index e909d96309e..0f2820a01e9 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -50,7 +50,7 @@ def read_csv( comment=None, delim_whitespace=False, byte_range=None, - use_python_file_object=True, + use_python_file_object=None, storage_options=None, bytes_per_thread=None, ): diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index 7082a85237a..289292b5182 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -10,6 +10,7 @@ from cudf._lib import orc as liborc from cudf.api.types import is_list_like from cudf.utils import ioutils +from cudf.utils.utils import maybe_filter_deprecation def _make_empty_df(filepath_or_buffer, columns): @@ -280,7 +281,7 @@ def read_orc( num_rows=None, use_index=True, timestamp_type=None, - use_python_file_object=True, + use_python_file_object=None, storage_options=None, bytes_per_thread=None, ): @@ -320,6 +321,9 @@ def read_orc( ) filepaths_or_buffers = [] + have_nativefile = any( + isinstance(source, pa.NativeFile) for source in filepath_or_buffer + ) for source in filepath_or_buffer: if ioutils.is_directory( path_or_data=source, storage_options=storage_options @@ -360,17 +364,24 @@ def read_orc( stripes = selected_stripes if engine == "cudf": - return DataFrame._from_data( - *liborc.read_orc( - filepaths_or_buffers, - columns, - stripes, - skiprows, - num_rows, - use_index, - timestamp_type, + # Don't want to warn if use_python_file_object causes us to get + # a NativeFile (there is a separate deprecation warning for that) + with maybe_filter_deprecation( + not have_nativefile, + message="Support for reading pyarrow's NativeFile is deprecated", + category=FutureWarning, + ): + return DataFrame._from_data( + *liborc.read_orc( + filepaths_or_buffers, + columns, + stripes, + skiprows, + num_rows, + use_index, + timestamp_type, + ) ) - ) else: from pyarrow import orc diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 7733e770d99..0f0a240b5d0 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -15,6 +15,7 @@ import numpy as np import pandas as pd +import pyarrow as pa from pyarrow import dataset as ds import cudf @@ -23,6 +24,7 @@ from cudf.core.column import as_column, build_categorical_column, column_empty from cudf.utils import ioutils from cudf.utils.performance_tracking import _performance_tracking +from cudf.utils.utils import maybe_filter_deprecation BYTE_SIZES = { "kb": 1000, @@ -73,6 +75,7 @@ def _write_parquet( column_encoding=None, column_type_length=None, output_as_binary=None, + write_arrow_schema=True, ): if is_list_like(paths) and len(paths) > 1: if partitions_info is None: @@ -110,6 +113,7 @@ def _write_parquet( "column_encoding": column_encoding, "column_type_length": column_type_length, "output_as_binary": output_as_binary, + "write_arrow_schema": write_arrow_schema, } if all(ioutils.is_fsspec_open_file(buf) for buf in paths_or_bufs): with ExitStack() as stack: @@ -154,6 +158,7 @@ def write_to_dataset( column_encoding=None, column_type_length=None, output_as_binary=None, + store_schema=False, ): """Wraps `to_parquet` to write partitioned Parquet datasets. 
For each combination of partition group and value, @@ -242,6 +247,9 @@ def write_to_dataset( output_as_binary : set, optional, default None If a column name is present in the set, that column will be output as unannotated binary, rather than the default 'UTF-8'. + store_schema : bool, default False + If ``True``, enable computing and writing arrow schema to Parquet + file footer's key-value metadata section for faithful round-tripping. """ fs = ioutils._ensure_filesystem(fs, root_path, storage_options) @@ -285,6 +293,7 @@ def write_to_dataset( column_encoding=column_encoding, column_type_length=column_type_length, output_as_binary=output_as_binary, + store_schema=store_schema, ) else: @@ -312,6 +321,7 @@ def write_to_dataset( column_encoding=column_encoding, column_type_length=column_type_length, output_as_binary=output_as_binary, + store_schema=store_schema, ) return metadata @@ -342,7 +352,7 @@ def read_parquet_metadata(filepath_or_buffer): path_or_data=source, compression=None, fs=fs, - use_python_file_object=True, + use_python_file_object=None, open_file_options=None, storage_options=None, bytes_per_thread=None, @@ -524,7 +534,7 @@ def read_parquet( filters=None, row_groups=None, use_pandas_metadata=True, - use_python_file_object=True, + use_python_file_object=None, categorical_partitions=True, open_file_options=None, bytes_per_thread=None, @@ -607,6 +617,9 @@ def read_parquet( row_groups=row_groups, fs=fs, ) + have_nativefile = any( + isinstance(source, pa.NativeFile) for source in filepath_or_buffer + ) for source in filepath_or_buffer: tmp_source, compression = ioutils.get_reader_filepath_or_buffer( path_or_data=source, @@ -654,19 +667,26 @@ def read_parquet( ) # Convert parquet data to a cudf.DataFrame - df = _parquet_to_frame( - filepaths_or_buffers, - engine, - *args, - columns=columns, - row_groups=row_groups, - use_pandas_metadata=use_pandas_metadata, - partition_keys=partition_keys, - partition_categories=partition_categories, - dataset_kwargs=dataset_kwargs, - **kwargs, - ) + # Don't want to warn if use_python_file_object causes us to get + # a NativeFile (there is a separate deprecation warning for that) + with maybe_filter_deprecation( + not have_nativefile, + message="Support for reading pyarrow's NativeFile is deprecated", + category=FutureWarning, + ): + df = _parquet_to_frame( + filepaths_or_buffers, + engine, + *args, + columns=columns, + row_groups=row_groups, + use_pandas_metadata=use_pandas_metadata, + partition_keys=partition_keys, + partition_categories=partition_categories, + dataset_kwargs=dataset_kwargs, + **kwargs, + ) # Apply filters row-wise (if any are defined), and return df = _apply_post_filters(df, filters) if projected_columns: @@ -908,7 +928,7 @@ def _read_parquet( "cudf engine doesn't support the " f"following positional arguments: {list(args)}" ) - if cudf.get_option("mode.pandas_compatible"): + if cudf.get_option("io.parquet.low_memory"): return libparquet.ParquetReader( filepaths_or_buffers, columns=columns, @@ -968,6 +988,7 @@ def to_parquet( column_encoding=None, column_type_length=None, output_as_binary=None, + store_schema=False, *args, **kwargs, ): @@ -1023,6 +1044,7 @@ def to_parquet( column_encoding=column_encoding, column_type_length=column_type_length, output_as_binary=output_as_binary, + store_schema=store_schema, ) partition_info = ( @@ -1055,6 +1077,7 @@ def to_parquet( column_encoding=column_encoding, column_type_length=column_type_length, output_as_binary=output_as_binary, + write_arrow_schema=store_schema, ) else: diff --git 
a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index 1f539e7f266..94e73021cec 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -325,6 +325,32 @@ def _integer_and_none_validator(val): _make_contains_validator([False, True]), ) +_register_option( + "io.parquet.low_memory", + False, + textwrap.dedent( + """ + If set to `False`, reads entire parquet in one go. + If set to `True`, reads parquet file in chunks. + \tValid values are True or False. Default is False. + """ + ), + _make_contains_validator([False, True]), +) + +_register_option( + "io.json.low_memory", + False, + textwrap.dedent( + """ + If set to `False`, reads entire json in one go. + If set to `True`, reads json file in chunks. + \tValid values are True or False. Default is False. + """ + ), + _make_contains_validator([False, True]), +) + class option_context(ContextDecorator): """ diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index a64bf7772fe..59a243dd7c4 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -260,6 +260,19 @@ def Index__new__(cls, *args, **kwargs): return self +def Index__setattr__(self, name, value): + if name.startswith("_"): + object.__setattr__(self, name, value) + return + if name == "name": + setattr(self._fsproxy_wrapped, "name", value) + if name == "names": + setattr(self._fsproxy_wrapped, "names", value) + return _FastSlowAttribute("__setattr__").__get__(self, type(self))( + name, value + ) + + Index = make_final_proxy_type( "Index", cudf.Index, @@ -277,11 +290,13 @@ def Index__new__(cls, *args, **kwargs): "__iter__": custom_iter, "__init__": _DELETE, "__new__": Index__new__, + "__setattr__": Index__setattr__, "_constructor": _FastSlowAttribute("_constructor"), "__array_ufunc__": _FastSlowAttribute("__array_ufunc__"), "_accessors": set(), "_data": _FastSlowAttribute("_data", private=True), "_mask": _FastSlowAttribute("_mask", private=True), + "name": _FastSlowAttribute("name"), }, ) @@ -292,7 +307,11 @@ def Index__new__(cls, *args, **kwargs): fast_to_slow=lambda fast: fast.to_pandas(), slow_to_fast=cudf.from_pandas, bases=(Index,), - additional_attributes={"__init__": _DELETE}, + additional_attributes={ + "__init__": _DELETE, + "__setattr__": Index__setattr__, + "name": _FastSlowAttribute("name"), + }, ) SparseDtype = make_final_proxy_type( @@ -319,7 +338,11 @@ def Index__new__(cls, *args, **kwargs): fast_to_slow=lambda fast: fast.to_pandas(), slow_to_fast=cudf.from_pandas, bases=(Index,), - additional_attributes={"__init__": _DELETE}, + additional_attributes={ + "__init__": _DELETE, + "__setattr__": Index__setattr__, + "name": _FastSlowAttribute("name"), + }, ) Categorical = make_final_proxy_type( @@ -348,8 +371,10 @@ def Index__new__(cls, *args, **kwargs): bases=(Index,), additional_attributes={ "__init__": _DELETE, + "__setattr__": Index__setattr__, "_data": _FastSlowAttribute("_data", private=True), "_mask": _FastSlowAttribute("_mask", private=True), + "name": _FastSlowAttribute("name"), }, ) @@ -383,8 +408,10 @@ def Index__new__(cls, *args, **kwargs): bases=(Index,), additional_attributes={ "__init__": _DELETE, + "__setattr__": Index__setattr__, "_data": _FastSlowAttribute("_data", private=True), "_mask": _FastSlowAttribute("_mask", private=True), + "name": _FastSlowAttribute("name"), }, ) @@ -439,8 +466,10 @@ def Index__new__(cls, *args, **kwargs): bases=(Index,), additional_attributes={ "__init__": _DELETE, + "__setattr__": Index__setattr__, 
"_data": _FastSlowAttribute("_data", private=True), "_mask": _FastSlowAttribute("_mask", private=True), + "name": _FastSlowAttribute("name"), }, ) @@ -474,6 +503,7 @@ def Index__new__(cls, *args, **kwargs): additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, ) + MultiIndex = make_final_proxy_type( "MultiIndex", cudf.MultiIndex, @@ -481,7 +511,11 @@ def Index__new__(cls, *args, **kwargs): fast_to_slow=lambda fast: fast.to_pandas(), slow_to_fast=cudf.from_pandas, bases=(Index,), - additional_attributes={"__init__": _DELETE}, + additional_attributes={ + "__init__": _DELETE, + "__setattr__": Index__setattr__, + "names": _FastSlowAttribute("names"), + }, ) TimeGrouper = make_intermediate_proxy_type( @@ -667,8 +701,10 @@ def Index__new__(cls, *args, **kwargs): bases=(Index,), additional_attributes={ "__init__": _DELETE, + "__setattr__": Index__setattr__, "_data": _FastSlowAttribute("_data", private=True), "_mask": _FastSlowAttribute("_mask", private=True), + "name": _FastSlowAttribute("name"), }, ) @@ -775,6 +811,18 @@ def Index__new__(cls, *args, **kwargs): pd.core.indexing._LocIndexer, ) +_AtIndexer = make_intermediate_proxy_type( + "_AtIndexer", + cudf.core.dataframe._DataFrameAtIndexer, + pd.core.indexing._AtIndexer, +) + +_iAtIndexer = make_intermediate_proxy_type( + "_iAtIndexer", + cudf.core.dataframe._DataFrameiAtIndexer, + pd.core.indexing._iAtIndexer, +) + FixedForwardWindowIndexer = make_final_proxy_type( "FixedForwardWindowIndexer", _Unusable, @@ -907,6 +955,12 @@ def Index__new__(cls, *args, **kwargs): _eval_func = _FunctionProxy(_Unusable(), pd.eval) +register_proxy_func(pd.read_pickle)( + _FunctionProxy(_Unusable(), pd.read_pickle) +) + +register_proxy_func(pd.to_pickle)(_FunctionProxy(_Unusable(), pd.to_pickle)) + def _get_eval_locals_and_globals(level, local_dict=None, global_dict=None): frame = sys._getframe(level + 3) diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index f8bfe340ae5..ed2c5ca06c9 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -4,17 +4,19 @@ import io import os +import numpy as np import pyarrow as pa import pytest from cudf._lib import pylibcudf as plc +from cudf._lib.pylibcudf.io.types import CompressionType def metadata_from_arrow_type( pa_type: pa.Array, name: str = "", ) -> plc.interop.ColumnMetadata | None: - metadata = plc.interop.ColumnMetadata(name) # None + metadata = plc.interop.ColumnMetadata(name) if pa.types.is_list(pa_type): child_meta = [plc.interop.ColumnMetadata("offsets")] for i in range(pa_type.num_fields): @@ -39,9 +41,25 @@ def metadata_from_arrow_type( def assert_column_eq( - lhs: pa.Array | plc.Column, rhs: pa.Array | plc.Column + lhs: pa.Array | plc.Column, + rhs: pa.Array | plc.Column, + check_field_nullability=True, ) -> None: - """Verify that a pylibcudf array and PyArrow array are equal.""" + """Verify that a pylibcudf array and PyArrow array are equal. + + Parameters + ---------- + lhs: Union[pa.Array, plc.Column] + The array with the expected values + rhs: Union[pa.Array, plc.Column] + The array to check + check_field_nullability: + For list/struct dtypes, whether to check if the nullable attributes + on child fields are equal. + + Useful for checking roundtripping of lossy formats like JSON that may not + preserve this information. + """ # Nested types require children metadata to be passed to the conversion function. 
if isinstance(lhs, (pa.Array, pa.ChunkedArray)) and isinstance( rhs, plc.Column @@ -65,7 +83,47 @@ def assert_column_eq( if isinstance(rhs, pa.ChunkedArray): rhs = rhs.combine_chunks() - assert lhs.equals(rhs) + def _make_fields_nullable(typ): + new_fields = [] + for i in range(typ.num_fields): + child_field = typ.field(i) + if not child_field.nullable: + child_type = child_field.type + if isinstance(child_field.type, (pa.StructType, pa.ListType)): + child_type = _make_fields_nullable(child_type) + new_fields.append( + pa.field(child_field.name, child_type, nullable=True) + ) + else: + new_fields.append(child_field) + + if isinstance(typ, pa.StructType): + return pa.struct(new_fields) + elif isinstance(typ, pa.ListType): + return pa.list_(new_fields[0]) + return typ + + if not check_field_nullability: + rhs_type = _make_fields_nullable(rhs.type) + rhs = rhs.cast(rhs_type) + + lhs_type = _make_fields_nullable(lhs.type) + lhs = rhs.cast(lhs_type) + + if pa.types.is_floating(lhs.type) and pa.types.is_floating(rhs.type): + lhs_nans = pa.compute.is_nan(lhs) + rhs_nans = pa.compute.is_nan(rhs) + assert lhs_nans.equals(rhs_nans) + + if pa.compute.any(lhs_nans) or pa.compute.any(rhs_nans): + # masks must be equal at this point + mask = pa.compute.fill_null(pa.compute.invert(lhs_nans), True) + lhs = lhs.filter(mask) + rhs = rhs.filter(mask) + + np.testing.assert_array_almost_equal(lhs, rhs) + else: + assert lhs.equals(rhs) def assert_table_eq(pa_table: pa.Table, plc_table: plc.Table) -> None: @@ -78,20 +136,32 @@ def assert_table_eq(pa_table: pa.Table, plc_table: plc.Table) -> None: def assert_table_and_meta_eq( - plc_table_w_meta: plc.io.types.TableWithMetadata, pa_table: pa.Table + pa_table: pa.Table, + plc_table_w_meta: plc.io.types.TableWithMetadata, + check_field_nullability=True, + check_types_if_empty=True, + check_names=True, ) -> None: """Verify that the pylibcudf TableWithMetadata and PyArrow table are equal""" plc_table = plc_table_w_meta.tbl plc_shape = (plc_table.num_rows(), plc_table.num_columns()) - assert plc_shape == pa_table.shape + assert ( + plc_shape == pa_table.shape + ), f"{plc_shape} is not equal to {pa_table.shape}" + + if not check_types_if_empty and plc_table.num_rows() == 0: + return for plc_col, pa_col in zip(plc_table.columns(), pa_table.columns): - assert_column_eq(plc_col, pa_col) + assert_column_eq(pa_col, plc_col, check_field_nullability) # Check column name equality - assert plc_table_w_meta.column_names == pa_table.column_names + if check_names: + assert ( + plc_table_w_meta.column_names() == pa_table.column_names + ), f"{plc_table_w_meta.column_names()} != {pa_table.column_names}" def cudf_raises(expected_exception: BaseException, *args, **kwargs): @@ -102,49 +172,10 @@ def cudf_raises(expected_exception: BaseException, *args, **kwargs): return pytest.raises(expected_exception, *args, **kwargs) -# TODO: Consider moving these type utilities into pylibcudf.types itself. 
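With the reworked helpers above, the expected pyarrow value now comes first and the pylibcudf result second, NaNs are compared positionally, and nested-field nullability can be relaxed. A small illustrative call, assuming the `utils` module shown here is importable from a test:

    import pyarrow as pa
    from utils import assert_column_eq

    import cudf._lib.pylibcudf as plc

    expect = pa.array([1.0, float("nan"), 2.0], type=pa.float64())
    got = plc.interop.from_arrow(expect)

    # Expected value first, result second; NaN positions must match and the
    # remaining float values are compared approximately.
    assert_column_eq(expect, got)
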
-def is_signed_integer(plc_dtype: plc.DataType): - return ( - plc.TypeId.INT8.value <= plc_dtype.id().value <= plc.TypeId.INT64.value - ) - - -def is_integer(plc_dtype: plc.DataType): - return plc_dtype.id() in ( - plc.TypeId.INT8, - plc.TypeId.INT16, - plc.TypeId.INT32, - plc.TypeId.INT64, - plc.TypeId.UINT8, - plc.TypeId.UINT16, - plc.TypeId.UINT32, - plc.TypeId.UINT64, - ) - - -def is_floating(plc_dtype: plc.DataType): - return plc_dtype.id() in ( - plc.TypeId.FLOAT32, - plc.TypeId.FLOAT64, - ) - - -def is_boolean(plc_dtype: plc.DataType): - return plc_dtype.id() == plc.TypeId.BOOL8 - - def is_string(plc_dtype: plc.DataType): return plc_dtype.id() == plc.TypeId.STRING -def is_fixed_width(plc_dtype: plc.DataType): - return ( - is_integer(plc_dtype) - or is_floating(plc_dtype) - or is_boolean(plc_dtype) - ) - - def nesting_level(typ) -> tuple[int, int]: """Return list and struct nesting of a pyarrow type.""" if isinstance(typ, pa.ListType): @@ -165,6 +196,48 @@ def is_nested_list(typ): return nesting_level(typ)[0] > 1 +def _convert_numeric_types_to_floating(pa_table): + """ + Useful little helper for testing the + dtypes option in I/O readers. + + Returns a tuple containing the pylibcudf dtypes + and the new pyarrow schema + """ + dtypes = [] + new_fields = [] + for i in range(len(pa_table.schema)): + field = pa_table.schema.field(i) + child_types = [] + + plc_type = plc.interop.from_arrow(field.type) + if pa.types.is_integer(field.type) or pa.types.is_unsigned_integer( + field.type + ): + plc_type = plc.interop.from_arrow(pa.float64()) + field = field.with_type(pa.float64()) + + dtypes.append((field.name, plc_type, child_types)) + + new_fields.append(field) + return dtypes, new_fields + + +def write_source_str(source, input_str): + """ + Write a string to the source + (useful for testing CSV/JSON I/O) + """ + if not isinstance(source, io.IOBase): + with open(source, "w") as source_f: + source_f.write(input_str) + else: + if isinstance(source, io.BytesIO): + input_str = input_str.encode("utf-8") + source.write(input_str) + source.seek(0) + + def sink_to_str(sink): """ Takes a sink (e.g. StringIO/BytesIO, filepath, etc.) @@ -183,6 +256,31 @@ def sink_to_str(sink): return str_result +def make_source(path_or_buf, pa_table, format, **kwargs): + """ + Write a pyarrow Table to a specific format using pandas + by dispatching to the appropriate to_* call. + The caller is responsible for making sure that no arguments + unsupported by pandas are passed in. + """ + df = pa_table.to_pandas() + mode = "w" + if "compression" in kwargs: + kwargs["compression"] = COMPRESSION_TYPE_TO_PANDAS[ + kwargs["compression"] + ] + if kwargs["compression"] is not None and format != "json": + # pandas json method only supports mode="w"/"a" + mode = "wb" + if format == "json": + df.to_json(path_or_buf, mode=mode, **kwargs) + elif format == "csv": + df.to_csv(path_or_buf, mode=mode, **kwargs) + if isinstance(path_or_buf, io.IOBase): + path_or_buf.seek(0) + return path_or_buf + + NUMERIC_PA_TYPES = [pa.int64(), pa.float64(), pa.uint64()] STRING_PA_TYPES = [pa.string()] BOOL_PA_TYPES = [pa.bool_()] @@ -221,4 +319,26 @@ def sink_to_str(sink): + DEFAULT_PA_STRUCT_TESTING_TYPES ) +# Map pylibcudf compression types to pandas ones +# Not all compression types map cleanly, read the comments to learn more! +# If a compression type is unsupported, it maps to False. 
+ +COMPRESSION_TYPE_TO_PANDAS = { + CompressionType.NONE: None, + # Users of this dict will have to special case + # AUTO + CompressionType.AUTO: None, + CompressionType.GZIP: "gzip", + CompressionType.BZIP2: "bz2", + CompressionType.ZIP: "zip", + CompressionType.XZ: "xz", + CompressionType.ZSTD: "zstd", + # Unsupported + CompressionType.ZLIB: False, + CompressionType.LZ4: False, + CompressionType.LZO: False, + # These only work for parquet + CompressionType.SNAPPY: "snappy", + CompressionType.BROTLI: "brotli", +} ALL_PA_TYPES = DEFAULT_PA_TYPES diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py index e4760ea7ac8..4a7194a6d8d 100644 --- a/python/cudf/cudf/pylibcudf_tests/conftest.py +++ b/python/cudf/cudf/pylibcudf_tests/conftest.py @@ -11,6 +11,7 @@ import pytest import cudf._lib.pylibcudf as plc +from cudf._lib.pylibcudf.io.types import CompressionType sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common")) @@ -37,6 +38,37 @@ def numeric_pa_type(request): return request.param +def _get_vals_of_type(pa_type, length, seed): + """ + Returns an list-like of random values of that type + """ + rng = np.random.default_rng(seed=seed) + if pa_type == pa.int64(): + half = length // 2 + negs = rng.integers(-length, 0, half, dtype=np.int64) + pos = rng.integers(0, length, length - half, dtype=np.int64) + return np.concatenate([negs, pos]) + elif pa_type == pa.uint64(): + return rng.integers(0, length, length, dtype=np.uint64) + elif pa_type == pa.float64(): + # Round to 6 decimal places or else we have problems comparing our + # output to pandas due to floating point/rounding differences + return rng.uniform(-length, length, length).round(6) + elif pa_type == pa.bool_(): + return rng.integers(0, 2, length, dtype=bool) + elif pa_type == pa.string(): + # Generate random ASCII strings + strs = [] + for _ in range(length): + chrs = rng.integers(33, 128, length) + strs.append("".join(chr(x) for x in chrs)) + return strs + else: + raise NotImplementedError( + f"random data generation not implemented for {pa_type}" + ) + + # TODO: Consider adding another fixture/adapting this # fixture to consider nullability @pytest.fixture(scope="session", params=[0, 100]) @@ -57,10 +89,9 @@ def table_data(request): # plc.io.TableWithMetadata colnames = [] - np.random.seed(42) + seed = 42 for typ in ALL_PA_TYPES: - rand_vals = np.random.randint(0, nrows, nrows) child_colnames = [] def _generate_nested_data(typ): @@ -88,13 +119,17 @@ def _generate_nested_data(typ): child_colnames.append(("", grandchild_colnames)) else: # typ is scalar type - pa_array = pa.array(rand_vals).cast(typ) + pa_array = pa.array( + _get_vals_of_type(typ, nrows, seed=seed), type=typ + ) return pa_array, child_colnames if isinstance(typ, (pa.ListType, pa.StructType)): rand_arr, child_colnames = _generate_nested_data(typ) else: - rand_arr = pa.array(rand_vals).cast(typ) + rand_arr = pa.array( + _get_vals_of_type(typ, nrows, seed=seed), type=typ + ) table_dict[f"col_{typ}"] = rand_arr colnames.append((f"col_{typ}", child_colnames)) @@ -106,6 +141,20 @@ def _generate_nested_data(typ): ), pa_table +@pytest.fixture(params=[(0, 0), ("half", 0), (-1, "half")]) +def nrows_skiprows(table_data, request): + """ + Parametrized nrows fixture that accompanies table_data + """ + _, pa_table = table_data + nrows, skiprows = request.param + if nrows == "half": + nrows = len(pa_table) // 2 + if skiprows == "half": + skiprows = (len(pa_table) - nrows) // 2 + return nrows, skiprows + + @pytest.fixture( 
params=["a.txt", pathlib.Path("a.txt"), io.BytesIO, io.StringIO], ) @@ -121,6 +170,38 @@ def source_or_sink(request, tmp_path): return fp_or_buf() +unsupported_types = { + # Not supported by pandas + # TODO: find a way to test these + CompressionType.SNAPPY, + CompressionType.BROTLI, + CompressionType.LZ4, + CompressionType.LZO, + CompressionType.ZLIB, +} + +unsupported_text_compression_types = unsupported_types.union( + { + # compressions not supported by libcudf + # for csv/json + CompressionType.XZ, + CompressionType.ZSTD, + } +) + + +@pytest.fixture( + params=set(CompressionType).difference(unsupported_text_compression_types) +) +def text_compression_type(request): + return request.param + + +@pytest.fixture(params=[opt for opt in plc.io.types.CompressionType]) +def compression_type(request): + return request.param + + @pytest.fixture( scope="session", params=[opt for opt in plc.types.Interpolation] ) @@ -136,6 +217,15 @@ def sorted_opt(request): return request.param -@pytest.fixture(scope="session", params=[False, True]) +@pytest.fixture( + scope="session", params=[False, True], ids=["without_nulls", "with_nulls"] +) def has_nulls(request): return request.param + + +@pytest.fixture( + scope="session", params=[False, True], ids=["without_nans", "with_nans"] +) +def has_nans(request): + return request.param diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_avro.py b/python/cudf/cudf/pylibcudf_tests/io/test_avro.py index d6cd86768cd..061d6792ce3 100644 --- a/python/cudf/cudf/pylibcudf_tests/io/test_avro.py +++ b/python/cudf/cudf/pylibcudf_tests/io/test_avro.py @@ -120,4 +120,4 @@ def test_read_avro(avro_dtypes, avro_dtype_data, row_opts, columns, nullable): if columns != []: expected = expected.select(columns) - assert_table_and_meta_eq(res, expected) + assert_table_and_meta_eq(expected, res) diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_csv.py b/python/cudf/cudf/pylibcudf_tests/io/test_csv.py new file mode 100644 index 00000000000..95326a8b681 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/io/test_csv.py @@ -0,0 +1,280 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+import io +import os +from io import StringIO + +import pandas as pd +import pyarrow as pa +import pytest +from utils import ( + _convert_numeric_types_to_floating, + assert_table_and_meta_eq, + make_source, + write_source_str, +) + +import cudf._lib.pylibcudf as plc +from cudf._lib.pylibcudf.io.types import CompressionType + +# Shared kwargs to pass to make_source +_COMMON_CSV_SOURCE_KWARGS = { + "format": "csv", + "index": False, +} + + +@pytest.fixture(scope="module") +def csv_table_data(table_data): + """ + Like the table_data but with nested types dropped + since the CSV reader can't handle that + uint64 is also dropped since it can get confused with int64 + """ + _, pa_table = table_data + pa_table = pa_table.drop_columns( + [ + "col_uint64", + "col_list", + "col_list>", + "col_struct", + "col_struct not null>", + ] + ) + return plc.interop.from_arrow(pa_table), pa_table + + +@pytest.mark.parametrize("delimiter", [",", ";"]) +def test_read_csv_basic( + csv_table_data, + source_or_sink, + text_compression_type, + nrows_skiprows, + delimiter, +): + _, pa_table = csv_table_data + compression_type = text_compression_type + nrows, skiprows = nrows_skiprows + + # can't compress non-binary data with pandas + if isinstance(source_or_sink, io.StringIO): + compression_type = CompressionType.NONE + + source = make_source( + source_or_sink, + pa_table, + compression=compression_type, + sep=delimiter, + **_COMMON_CSV_SOURCE_KWARGS, + ) + + # Rename the table (by reversing the names) to test names argument + pa_table = pa_table.rename_columns(pa_table.column_names[::-1]) + column_names = pa_table.column_names + + # Adapt to nrows/skiprows + pa_table = pa_table.slice( + offset=skiprows, length=nrows if nrows != -1 else None + ) + + res = plc.io.csv.read_csv( + plc.io.SourceInfo([source]), + delimiter=delimiter, + compression=compression_type, + col_names=column_names, + nrows=nrows, + skiprows=skiprows, + ) + + assert_table_and_meta_eq( + pa_table, + res, + check_types_if_empty=False, + check_names=False if skiprows > 0 and column_names is None else True, + ) + + +# Note: make sure chunk size is big enough so that dtype inference +# infers correctly +@pytest.mark.parametrize("chunk_size", [1000, 5999]) +def test_read_csv_byte_range(table_data, chunk_size, tmp_path): + _, pa_table = table_data + if len(pa_table) == 0: + # pandas writes nothing when we have empty table + # and header=None + pytest.skip("Don't test empty table case") + source = f"{tmp_path}/a.csv" + source = make_source( + source, pa_table, header=False, **_COMMON_CSV_SOURCE_KWARGS + ) + file_size = os.stat(source).st_size + tbls_w_meta = [] + for segment in range((file_size + chunk_size - 1) // chunk_size): + tbls_w_meta.append( + plc.io.csv.read_csv( + plc.io.SourceInfo([source]), + byte_range_offset=segment * chunk_size, + byte_range_size=chunk_size, + header=-1, + col_names=pa_table.column_names, + ) + ) + if isinstance(source, io.IOBase): + source.seek(0) + exp = pd.read_csv(source, names=pa_table.column_names, header=None) + tbls = [] + for tbl_w_meta in tbls_w_meta: + if tbl_w_meta.tbl.num_rows() > 0: + tbls.append(plc.interop.to_arrow(tbl_w_meta.tbl)) + full_tbl = pa.concat_tables(tbls) + + full_tbl_plc = plc.io.TableWithMetadata( + plc.interop.from_arrow(full_tbl), + tbls_w_meta[0].column_names(include_children=True), + ) + assert_table_and_meta_eq(pa.Table.from_pandas(exp), full_tbl_plc) + + +@pytest.mark.parametrize("usecols", [None, ["col_int64", "col_bool"], [0, 1]]) +def test_read_csv_dtypes(csv_table_data, 
source_or_sink, usecols): + # Simple test for dtypes where we read in + # all numeric data as floats + _, pa_table = csv_table_data + + source = make_source( + source_or_sink, + pa_table, + **_COMMON_CSV_SOURCE_KWARGS, + ) + # Adjust table for usecols + if usecols is not None: + pa_table = pa_table.select(usecols) + + dtypes, new_fields = _convert_numeric_types_to_floating(pa_table) + # Extract the dtype out of the (name, type, child_types) tuple + # (read_csv doesn't support this format since it doesn't support nested columns) + dtypes = {name: dtype for name, dtype, _ in dtypes} + + new_schema = pa.schema(new_fields) + + res = plc.io.csv.read_csv( + plc.io.SourceInfo([source]), dtypes=dtypes, usecols=usecols + ) + new_table = pa_table.cast(new_schema) + + assert_table_and_meta_eq(new_table, res) + + +@pytest.mark.parametrize("skip_blanks", [True, False]) +@pytest.mark.parametrize("decimal, quotechar", [(".", "'"), ("_", '"')]) +@pytest.mark.parametrize("lineterminator", ["\n", "\r\n"]) +def test_read_csv_parse_options( + source_or_sink, decimal, quotechar, skip_blanks, lineterminator +): + lines = [ + "# first comment line", + "# third comment line", + "1,2,3,4_4,'z'", + '4,5,6,5_5,""', + "7,8,9,9_87,'123'", + "# last comment line", + "1,1,1,10_11,abc", + ] + buffer = lineterminator.join(lines) + + write_source_str(source_or_sink, buffer) + + plc_table_w_meta = plc.io.csv.read_csv( + plc.io.SourceInfo([source_or_sink]), + comment="#", + decimal=decimal, + skip_blank_lines=skip_blanks, + quotechar=quotechar, + ) + df = pd.read_csv( + StringIO(buffer), + comment="#", + decimal=decimal, + skip_blank_lines=skip_blanks, + quotechar=quotechar, + ) + assert_table_and_meta_eq(pa.Table.from_pandas(df), plc_table_w_meta) + + +@pytest.mark.parametrize("na_filter", [True, False]) +@pytest.mark.parametrize("na_values", [["n/a"], ["NV_NAN"]]) +@pytest.mark.parametrize("keep_default_na", [True, False]) +def test_read_csv_na_values( + source_or_sink, na_filter, na_values, keep_default_na +): + lines = ["a,b,c", "n/a,NaN,NV_NAN", "1.0,2.0,3.0"] + buffer = "\n".join(lines) + + write_source_str(source_or_sink, buffer) + + plc_table_w_meta = plc.io.csv.read_csv( + plc.io.SourceInfo([source_or_sink]), + na_filter=na_filter, + na_values=na_values if na_filter else None, + keep_default_na=keep_default_na, + ) + df = pd.read_csv( + StringIO(buffer), + na_filter=na_filter, + na_values=na_values if na_filter else None, + keep_default_na=keep_default_na, + ) + assert_table_and_meta_eq(pa.Table.from_pandas(df), plc_table_w_meta) + + +@pytest.mark.parametrize("header", [0, 10, -1]) +def test_read_csv_header(csv_table_data, source_or_sink, header): + _, pa_table = csv_table_data + + source = make_source( + source_or_sink, + pa_table, + **_COMMON_CSV_SOURCE_KWARGS, + ) + + plc_table_w_meta = plc.io.csv.read_csv( + plc.io.SourceInfo([source]), header=header + ) + if header > 0: + if header < len(pa_table): + names_row = pa_table.take([header - 1]).to_pylist()[0].values() + pa_table = pa_table.slice(header) + col_names = [str(name) for name in names_row] + pa_table = pa_table.rename_columns(col_names) + else: + pa_table = pa.table([]) + elif header < 0: + # neg header means use user-provided names (in this case nothing) + # (the original column names are now data) + tbl_dict = pa_table.to_pydict() + new_tbl_dict = {} + for i, (name, vals) in enumerate(tbl_dict.items()): + str_vals = [str(val) for val in vals] + new_tbl_dict[str(i)] = [name] + str_vals + pa_table = pa.table(new_tbl_dict) + + assert_table_and_meta_eq( + 
pa_table, + plc_table_w_meta, + check_types_if_empty=False, + ) + + +# TODO: test these +# str prefix = "", +# bool mangle_dupe_cols = True, +# size_type skipfooter = 0, +# str thousands = None, +# bool delim_whitespace = False, +# bool skipinitialspace = False, +# quote_style quoting = quote_style.MINIMAL, +# bool doublequote = True, +# bool detect_whitespace_around_quotes = False, +# list parse_dates = None, +# list true_values = None, +# list false_values = None, +# bool dayfirst = False, diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_json.py b/python/cudf/cudf/pylibcudf_tests/io/test_json.py index d6b8bfa6976..4239f2438bb 100644 --- a/python/cudf/cudf/pylibcudf_tests/io/test_json.py +++ b/python/cudf/cudf/pylibcudf_tests/io/test_json.py @@ -1,11 +1,21 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import io +import pandas as pd import pyarrow as pa import pytest -from utils import sink_to_str +from utils import ( + assert_table_and_meta_eq, + make_source, + sink_to_str, + write_source_str, +) import cudf._lib.pylibcudf as plc +from cudf._lib.pylibcudf.io.types import CompressionType + +# Shared kwargs to pass to make_source +_COMMON_JSON_SOURCE_KWARGS = {"format": "json", "orient": "records"} @pytest.mark.parametrize("rows_per_chunk", [8, 100]) @@ -114,3 +124,217 @@ def test_write_json_bool_opts(true_value, false_value): pd_result = pd_result.replace("false", false_value) assert str_result == pd_result + + +@pytest.mark.parametrize("lines", [True, False]) +def test_read_json_basic( + table_data, source_or_sink, lines, text_compression_type +): + compression_type = text_compression_type + + # can't compress non-binary data with pandas + if isinstance(source_or_sink, io.StringIO): + compression_type = CompressionType.NONE + + _, pa_table = table_data + + source = make_source( + source_or_sink, + pa_table, + lines=lines, + compression=compression_type, + **_COMMON_JSON_SOURCE_KWARGS, + ) + + if isinstance(source, io.IOBase): + source.seek(0) + + res = plc.io.json.read_json( + plc.io.SourceInfo([source]), + compression=compression_type, + lines=lines, + ) + + # Adjustments to correct for the fact orient=records is lossy + # and doesn't + # 1) preserve colnames when zero rows in table + # 2) preserve struct nullability + # 3) differentiate int64/uint64 + if len(pa_table) == 0: + pa_table = pa.table([]) + + new_fields = [] + for i in range(len(pa_table.schema)): + curr_field = pa_table.schema.field(i) + if curr_field.type == pa.uint64(): + try: + curr_field = curr_field.with_type(pa.int64()) + except OverflowError: + # There will be no confusion, values are too large + # for int64 anyways + pass + new_fields.append(curr_field) + + pa_table = pa_table.cast(pa.schema(new_fields)) + + # Convert non-nullable struct fields to nullable fields + # since nullable=False cannot roundtrip through orient='records' + # JSON format + assert_table_and_meta_eq(pa_table, res, check_field_nullability=False) + + +def test_read_json_dtypes(table_data, source_or_sink): + # Simple test for dtypes where we read in + # all numeric data as floats + _, pa_table = table_data + source = make_source( + source_or_sink, + pa_table, + lines=True, + **_COMMON_JSON_SOURCE_KWARGS, + ) + + dtypes = [] + new_fields = [] + for i in range(len(pa_table.schema)): + field = pa_table.schema.field(i) + child_types = [] + + def get_child_types(typ): + typ_child_types = [] + for i in range(typ.num_fields): + curr_field = typ.field(i) + typ_child_types.append( + ( + curr_field.name, + curr_field.type, + 
get_child_types(curr_field.type), + ) + ) + return typ_child_types + + plc_type = plc.interop.from_arrow(field.type) + if pa.types.is_integer(field.type) or pa.types.is_unsigned_integer( + field.type + ): + plc_type = plc.interop.from_arrow(pa.float64()) + field = field.with_type(pa.float64()) + + dtypes.append((field.name, plc_type, child_types)) + + new_fields.append(field) + + new_schema = pa.schema(new_fields) + + res = plc.io.json.read_json( + plc.io.SourceInfo([source]), dtypes=dtypes, lines=True + ) + new_table = pa_table.cast(new_schema) + + # orient=records is lossy + # and doesn't preserve column names when there's zero rows in the table + if len(new_table) == 0: + new_table = pa.table([]) + + assert_table_and_meta_eq(new_table, res, check_field_nullability=False) + + +@pytest.mark.parametrize("chunk_size", [10, 15, 20]) +def test_read_json_lines_byte_range(source_or_sink, chunk_size): + source = source_or_sink + if isinstance(source_or_sink, io.StringIO): + pytest.skip("byte_range doesn't work on StringIO") + + json_str = "[1, 2, 3]\n[4, 5, 6]\n[7, 8, 9]\n" + write_source_str(source, json_str) + + tbls_w_meta = [] + for chunk_start in range(0, len(json_str.encode("utf-8")), chunk_size): + tbls_w_meta.append( + plc.io.json.read_json( + plc.io.SourceInfo([source]), + lines=True, + byte_range_offset=chunk_start, + byte_range_size=chunk_start + chunk_size, + ) + ) + + if isinstance(source, io.IOBase): + source.seek(0) + exp = pd.read_json(source, orient="records", lines=True) + + # TODO: can do this operation using pylibcudf + tbls = [] + for tbl_w_meta in tbls_w_meta: + if tbl_w_meta.tbl.num_rows() > 0: + tbls.append(plc.interop.to_arrow(tbl_w_meta.tbl)) + full_tbl = pa.concat_tables(tbls) + + full_tbl_plc = plc.io.TableWithMetadata( + plc.interop.from_arrow(full_tbl), + tbls_w_meta[0].column_names(include_children=True), + ) + assert_table_and_meta_eq(pa.Table.from_pandas(exp), full_tbl_plc) + + +@pytest.mark.parametrize("keep_quotes", [True, False]) +def test_read_json_lines_keep_quotes(keep_quotes, source_or_sink): + source = source_or_sink + + json_bytes = '["a", "b", "c"]\n' + write_source_str(source, json_bytes) + + tbl_w_meta = plc.io.json.read_json( + plc.io.SourceInfo([source]), lines=True, keep_quotes=keep_quotes + ) + + template = "{0}" + if keep_quotes: + template = '"{0}"' + + exp = pa.Table.from_arrays( + [ + [template.format("a")], + [template.format("b")], + [template.format("c")], + ], + names=["0", "1", "2"], + ) + + assert_table_and_meta_eq(exp, tbl_w_meta) + + +@pytest.mark.parametrize( + "recovery_mode", [opt for opt in plc.io.types.JSONRecoveryMode] +) +def test_read_json_lines_recovery_mode(recovery_mode, source_or_sink): + source = source_or_sink + + json_str = '{"a":1,"b":10}\n{"a":2,"b":11}\nabc\n{"a":3,"b":12}\n' + write_source_str(source, json_str) + + if recovery_mode == plc.io.types.JSONRecoveryMode.FAIL: + with pytest.raises(RuntimeError): + plc.io.json.read_json( + plc.io.SourceInfo([source]), + lines=True, + recovery_mode=recovery_mode, + ) + else: + # Recover case (bad values replaced with nulls) + tbl_w_meta = plc.io.json.read_json( + plc.io.SourceInfo([source]), + lines=True, + recovery_mode=recovery_mode, + ) + exp = pa.Table.from_arrays( + [[1, 2, None, 3], [10, 11, None, 12]], names=["a", "b"] + ) + assert_table_and_meta_eq(exp, tbl_w_meta) + + +# TODO: Add tests for these! 
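The JSON tests above follow the same pattern; a minimal lines-mode read, assuming an in-memory source:

    import io

    import cudf._lib.pylibcudf as plc

    src = io.BytesIO(b'{"a": 1, "b": 10}\n{"a": 2, "b": 11}\n')

    tbl_w_meta = plc.io.json.read_json(plc.io.SourceInfo([src]), lines=True)
    print(tbl_w_meta.column_names())  # expected: ['a', 'b']
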
+# Tests were not added in the initial PR porting the JSON reader to pylibcudf +# to save time (and since there are no existing tests for these in Python cuDF) +# mixed_types_as_string = mixed_types_as_string, +# prune_columns = prune_columns, diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py b/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py index 287dd8f21c8..438c482b77a 100644 --- a/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py +++ b/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py @@ -2,11 +2,9 @@ import io -import pyarrow as pa import pytest import cudf._lib.pylibcudf as plc -from cudf._lib.pylibcudf.io.datasource import NativeFileDatasource @pytest.fixture(params=[plc.io.SourceInfo, plc.io.SinkInfo]) @@ -18,10 +16,8 @@ def _skip_invalid_sinks(io_class, sink): """ Skip invalid sinks for SinkInfo """ - if io_class is plc.io.SinkInfo and isinstance( - sink, (bytes, NativeFileDatasource) - ): - pytest.skip(f"{sink} is not a valid input for SinkInfo") + if io_class is plc.io.SinkInfo and isinstance(sink, bytes): + pytest.skip("bytes is not a valid input for SinkInfo") @pytest.mark.parametrize( @@ -30,7 +26,6 @@ def _skip_invalid_sinks(io_class, sink): "a.txt", b"hello world", io.BytesIO(b"hello world"), - NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")), ], ) def test_source_info_ctor(io_class, source, tmp_path): @@ -47,13 +42,12 @@ def test_source_info_ctor(io_class, source, tmp_path): @pytest.mark.parametrize( "sources", [ + ["a.txt"], + [b"hello world"], + [io.BytesIO(b"hello world")], ["a.txt", "a.txt"], [b"hello world", b"hello there"], [io.BytesIO(b"hello world"), io.BytesIO(b"hello there")], - [ - NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")), - NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")), - ], ], ) def test_source_info_ctor_multiple(io_class, sources, tmp_path): @@ -79,11 +73,6 @@ def test_source_info_ctor_multiple(io_class, sources, tmp_path): io.BytesIO(b"hello there"), b"hello world", ], - [ - NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")), - "awef.txt", - b"hello world", - ], ], ) def test_source_info_ctor_mixing_invalid(io_class, sources, tmp_path): diff --git a/python/cudf/cudf/pylibcudf_tests/test_binaryops.py b/python/cudf/cudf/pylibcudf_tests/test_binaryops.py new file mode 100644 index 00000000000..a83caf39ead --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_binaryops.py @@ -0,0 +1,786 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
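The new test_binaryops.py below exercises libcudf binary operations through pylibcudf. A compact sketch of the call pattern, checking supportability before executing, with illustrative data:

    import pyarrow as pa

    import cudf._lib.pylibcudf as plc

    lhs = plc.interop.from_arrow(pa.array([1, 2, 3], type=pa.int64()))
    rhs = plc.interop.from_arrow(pa.array([10, 20, 30], type=pa.int64()))
    out_type = plc.DataType(plc.TypeId.INT64)
    op = plc.binaryop.BinaryOperator.ADD

    # Unsupported type/operator combinations raise TypeError, so check first.
    if plc.binaryop.is_supported_operation(out_type, lhs.type(), rhs.type(), op):
        result = plc.binaryop.binary_operation(lhs, rhs, op, out_type)
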
+ +import math + +import numpy as np +import pyarrow as pa +import pytest +from utils import assert_column_eq + +from cudf._lib import pylibcudf as plc + + +def idfn(param): + ltype, rtype, outtype, plc_op, _ = param + params = (plc_op.name, ltype, rtype, outtype) + return "-".join(map(str, params)) + + +@pytest.fixture(params=[True, False], ids=["nulls", "no_nulls"]) +def nulls(request): + return request.param + + +def make_col(dtype, nulls): + if dtype == "int64": + data = [1, 2, 3, 4, 5] + pa_type = pa.int64() + elif dtype == "uint64": + data = [1, 2, 3, 4, 5] + pa_type = pa.uint64() + elif dtype == "float64": + data = [1.0, 2.0, 3.0, 4.0, 5.0] + pa_type = pa.float64() + elif dtype == "bool": + data = [True, False, True, False, True] + pa_type = pa.bool_() + elif dtype == "timestamp64[ns]": + data = [ + np.datetime64("2022-01-01"), + np.datetime64("2022-01-02"), + np.datetime64("2022-01-03"), + np.datetime64("2022-01-04"), + np.datetime64("2022-01-05"), + ] + pa_type = pa.timestamp("ns") + elif dtype == "timedelta64[ns]": + data = [ + np.timedelta64(1, "ns"), + np.timedelta64(2, "ns"), + np.timedelta64(3, "ns"), + np.timedelta64(4, "ns"), + np.timedelta64(5, "ns"), + ] + pa_type = pa.duration("ns") + else: + raise ValueError("Unsupported dtype") + + if nulls: + data[3] = None + + return pa.array(data, type=pa_type) + + +@pytest.fixture +def pa_data(request, nulls): + ltype, rtype, outtype = request.param + values = make_col(ltype, nulls), make_col(rtype, nulls), outtype + return values + + +@pytest.fixture +def plc_data(pa_data): + lhs, rhs, outtype = pa_data + return ( + plc.interop.from_arrow(lhs), + plc.interop.from_arrow(rhs), + plc.interop.from_arrow(pa.from_numpy_dtype(np.dtype(outtype))), + ) + + +@pytest.fixture +def tests(request, nulls): + ltype, rtype, py_outtype, plc_op, py_op = request.param + pa_lhs, pa_rhs = make_col(ltype, nulls), make_col(rtype, nulls) + plc_lhs, plc_rhs = ( + plc.interop.from_arrow(pa_lhs), + plc.interop.from_arrow(pa_rhs), + ) + plc_dtype = plc.interop.from_arrow( + pa.from_numpy_dtype(np.dtype(py_outtype)) + ) + return ( + pa_lhs, + pa_rhs, + py_outtype, + plc_lhs, + plc_rhs, + plc_dtype, + py_op, + plc_op, + ) + + +def custom_pyop(func): + def wrapper(x, y): + x = x.to_pylist() + y = y.to_pylist() + + def inner(x, y): + if x is None or y is None: + return None + return func(x, y) + + return pa.array([inner(x, y) for x, y in zip(x, y)]) + + return wrapper + + +@custom_pyop +def py_floordiv(x, y): + return x // y + + +@custom_pyop +def py_pmod(x, y): + return (x % y + y) % y + + +@custom_pyop +def py_mod(x, y): + return x % y + + +@custom_pyop +def py_atan2(x, y): + return math.atan2(x, y) + + +@custom_pyop +def py_shift_right_unsigned(x, y): + unsigned_x = np.uint32(x) + result = unsigned_x >> y + return result + + +@pytest.mark.parametrize( + "tests", + [ + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.ADD, + pa.compute.add, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.ADD, + pa.compute.add, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.ADD, + pa.compute.add, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.SUB, + pa.compute.subtract, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.SUB, + pa.compute.subtract, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.SUB, + pa.compute.subtract, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.MUL, + pa.compute.multiply, + 
), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.MUL, + pa.compute.multiply, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.MUL, + pa.compute.multiply, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.DIV, + pa.compute.divide, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.DIV, + pa.compute.divide, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.DIV, + pa.compute.divide, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.TRUE_DIV, + pa.compute.divide, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.TRUE_DIV, + pa.compute.divide, + ), + ( + "int64", + "int64", + "timedelta64[ns]", + plc.binaryop.BinaryOperator.TRUE_DIV, + pa.compute.divide, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.FLOOR_DIV, + py_floordiv, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.FLOOR_DIV, + py_floordiv, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.FLOOR_DIV, + py_floordiv, + ), + ("int64", "int64", "int64", plc.binaryop.BinaryOperator.MOD, py_mod), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.MOD, + py_mod, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.MOD, + py_mod, + ), + ("int64", "int64", "int64", plc.binaryop.BinaryOperator.PMOD, py_pmod), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.PMOD, + py_pmod, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.PMOD, + py_pmod, + ), + ("int64", "int64", "int64", plc.binaryop.BinaryOperator.PYMOD, py_mod), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.PYMOD, + py_mod, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.PYMOD, + py_mod, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.POW, + pa.compute.power, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.POW, + pa.compute.power, + ), + ( + "int64", + "int64", + "timedelta64[ns]", + plc.binaryop.BinaryOperator.POW, + pa.compute.power, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.INT_POW, + pa.compute.power, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.INT_POW, + pa.compute.power, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.INT_POW, + pa.compute.power, + ), + ( + "float64", + "float64", + "float64", + plc.binaryop.BinaryOperator.LOG_BASE, + pa.compute.logb, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.LOG_BASE, + pa.compute.logb, + ), + ( + "int64", + "int64", + "timedelta64[ns]", + plc.binaryop.BinaryOperator.LOG_BASE, + pa.compute.logb, + ), + ( + "float64", + "float64", + "float64", + plc.binaryop.BinaryOperator.ATAN2, + py_atan2, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.ATAN2, + py_atan2, + ), + ( + "int64", + "int64", + "timedelta64[ns]", + plc.binaryop.BinaryOperator.ATAN2, + py_atan2, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.SHIFT_LEFT, + pa.compute.shift_left, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.SHIFT_LEFT, + pa.compute.shift_left, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.SHIFT_LEFT, + pa.compute.shift_left, + ), + ( + "int64", + "int64", + 
"int64", + plc.binaryop.BinaryOperator.SHIFT_RIGHT, + pa.compute.shift_right, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.SHIFT_RIGHT, + pa.compute.shift_right, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.SHIFT_RIGHT, + pa.compute.shift_right, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.SHIFT_RIGHT_UNSIGNED, + py_shift_right_unsigned, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.SHIFT_RIGHT_UNSIGNED, + py_shift_right_unsigned, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.SHIFT_RIGHT_UNSIGNED, + py_shift_right_unsigned, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.BITWISE_AND, + pa.compute.bit_wise_and, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.BITWISE_AND, + pa.compute.bit_wise_and, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.BITWISE_AND, + pa.compute.bit_wise_and, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.BITWISE_OR, + pa.compute.bit_wise_or, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.BITWISE_OR, + pa.compute.bit_wise_or, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.BITWISE_OR, + pa.compute.bit_wise_or, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.BITWISE_XOR, + pa.compute.bit_wise_xor, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.BITWISE_XOR, + pa.compute.bit_wise_xor, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.BITWISE_XOR, + pa.compute.bit_wise_xor, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.LOGICAL_AND, + pa.compute.and_, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.LOGICAL_AND, + pa.compute.and_, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.LOGICAL_AND, + pa.compute.and_, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.LOGICAL_OR, + pa.compute.or_, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.LOGICAL_OR, + pa.compute.or_, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.LOGICAL_OR, + pa.compute.or_, + ), + ( + "int64", + "int64", + "bool", + plc.binaryop.BinaryOperator.EQUAL, + pa.compute.equal, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.EQUAL, + pa.compute.equal, + ), + ( + "int64", + "int64", + "bool", + plc.binaryop.BinaryOperator.NOT_EQUAL, + pa.compute.not_equal, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.NOT_EQUAL, + pa.compute.not_equal, + ), + ( + "int64", + "int64", + "bool", + plc.binaryop.BinaryOperator.LESS, + pa.compute.less, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.LESS, + pa.compute.less, + ), + ( + "int64", + "int64", + "bool", + plc.binaryop.BinaryOperator.GREATER, + pa.compute.greater, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.GREATER, + pa.compute.greater, + ), + ( + "int64", + "int64", + "bool", + plc.binaryop.BinaryOperator.LESS_EQUAL, + pa.compute.less_equal, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.LESS_EQUAL, + pa.compute.less_equal, + ), + ( + "int64", + "int64", + "bool", + plc.binaryop.BinaryOperator.GREATER_EQUAL, + pa.compute.greater_equal, + ), + ( + "int64", + "float64", + 
"float64", + plc.binaryop.BinaryOperator.GREATER_EQUAL, + pa.compute.greater_equal, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.NULL_EQUALS, + pa.compute.equal, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.NULL_EQUALS, + pa.compute.equal, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.NULL_MAX, + pa.compute.max_element_wise, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.NULL_MAX, + pa.compute.max_element_wise, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.NULL_MIN, + pa.compute.min_element_wise, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.NULL_MIN, + pa.compute.min_element_wise, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.NULL_NOT_EQUALS, + pa.compute.not_equal, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.NULL_NOT_EQUALS, + pa.compute.not_equal, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + pa.compute.and_, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + pa.compute.and_, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + pa.compute.or_, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + pa.compute.or_, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.GENERIC_BINARY, + None, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.INVALID_BINARY, + None, + ), + ], + indirect=True, + ids=idfn, +) +def test_binaryops(tests): + ( + pa_lhs, + pa_rhs, + py_outtype, + plc_lhs, + plc_rhs, + plc_outtype, + py_op, + plc_op, + ) = tests + + def get_result(): + return plc.binaryop.binary_operation( + plc_lhs, + plc_rhs, + plc_op, + plc_outtype, + ) + + if not plc.binaryop.is_supported_operation( + plc_outtype, plc_lhs.type(), plc_rhs.type(), plc_op + ): + with pytest.raises(TypeError): + get_result() + else: + expect = py_op(pa_lhs, pa_rhs).cast(py_outtype) + got = get_result() + assert_column_eq(expect, got) diff --git a/python/cudf/cudf/pylibcudf_tests/test_copying.py b/python/cudf/cudf/pylibcudf_tests/test_copying.py index 0a6df198d46..f27fe4e942e 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_copying.py +++ b/python/cudf/cudf/pylibcudf_tests/test_copying.py @@ -9,9 +9,6 @@ assert_column_eq, assert_table_eq, cudf_raises, - is_fixed_width, - is_floating, - is_integer, is_nested_list, is_nested_struct, is_string, @@ -359,9 +356,9 @@ def test_scatter_table_type_mismatch(source_table, index_column, target_table): _, plc_index_column = index_column _, plc_target_table = target_table with cudf_raises(TypeError): - if is_integer( + if plc.traits.is_integral_not_bool( dtype := plc_target_table.columns()[0].type() - ) or is_floating(dtype): + ) or plc.traits.is_floating_point(dtype): pa_array = pa.array([True] * plc_source_table.num_rows()) else: pa_array = pa.array([1] * plc_source_table.num_rows()) @@ -428,9 +425,9 @@ def test_scatter_scalars_type_mismatch(index_column, target_table): _, plc_index_column = index_column _, plc_target_table = target_table with cudf_raises(TypeError): - if is_integer( + if plc.traits.is_integral_not_bool( dtype := plc_target_table.columns()[0].type() - ) or is_floating(dtype): + ) or plc.traits.is_floating_point(dtype): plc_source_scalar = [plc.interop.from_arrow(pa.scalar(True))] else: plc_source_scalar = 
[plc.interop.from_arrow(pa.scalar(1))] @@ -458,7 +455,7 @@ def test_empty_like_table(source_table): @pytest.mark.parametrize("size", [None, 10]) def test_allocate_like(input_column, size): _, plc_input_column = input_column - if is_fixed_width(plc_input_column.type()): + if plc.traits.is_fixed_width(plc_input_column.type()): result = plc.copying.allocate_like( plc_input_column, plc.copying.MaskAllocationPolicy.RETAIN, @@ -484,7 +481,7 @@ def test_copy_range_in_place( pa_target_column, _ = target_column - if not is_fixed_width(mutable_target_column.type()): + if not plc.traits.is_fixed_width(mutable_target_column.type()): with pytest.raises(TypeError): plc.copying.copy_range_in_place( plc_input_column, @@ -516,7 +513,7 @@ def test_copy_range_in_place_out_of_bounds( ): _, plc_input_column = input_column - if is_fixed_width(mutable_target_column.type()): + if plc.traits.is_fixed_width(mutable_target_column.type()): with cudf_raises(IndexError): plc.copying.copy_range_in_place( plc_input_column, @@ -528,7 +525,9 @@ def test_copy_range_in_place_out_of_bounds( def test_copy_range_in_place_different_types(mutable_target_column): - if is_integer(dtype := mutable_target_column.type()) or is_floating(dtype): + if plc.traits.is_integral_not_bool( + dtype := mutable_target_column.type() + ) or plc.traits.is_floating_point(dtype): plc_input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) else: plc_input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) @@ -548,7 +547,7 @@ def test_copy_range_in_place_null_mismatch( ): pa_input_column, _ = input_column - if is_fixed_width(mutable_target_column.type()): + if plc.traits.is_fixed_width(mutable_target_column.type()): pa_input_column = pc.if_else( _pyarrow_index_to_mask([0], len(pa_input_column)), pa_input_column, @@ -568,7 +567,9 @@ def test_copy_range_in_place_null_mismatch( def test_copy_range(input_column, target_column): pa_input_column, plc_input_column = input_column pa_target_column, plc_target_column = target_column - if is_fixed_width(dtype := plc_target_column.type()) or is_string(dtype): + if plc.traits.is_fixed_width( + dtype := plc_target_column.type() + ) or is_string(dtype): result = plc.copying.copy_range( plc_input_column, plc_target_column, @@ -610,7 +611,9 @@ def test_copy_range_out_of_bounds(input_column, target_column): def test_copy_range_different_types(target_column): _, plc_target_column = target_column - if is_integer(dtype := plc_target_column.type()) or is_floating(dtype): + if plc.traits.is_integral_not_bool( + dtype := plc_target_column.type() + ) or plc.traits.is_floating_point(dtype): plc_input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) else: plc_input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) @@ -629,7 +632,9 @@ def test_shift(target_column, source_scalar): pa_source_scalar, plc_source_scalar = source_scalar pa_target_column, plc_target_column = target_column shift = 2 - if is_fixed_width(dtype := plc_target_column.type()) or is_string(dtype): + if plc.traits.is_fixed_width( + dtype := plc_target_column.type() + ) or is_string(dtype): result = plc.copying.shift(plc_target_column, shift, plc_source_scalar) expected = pa.concat_arrays( [pa.array([pa_source_scalar] * shift), pa_target_column[:-shift]] @@ -642,7 +647,9 @@ def test_shift(target_column, source_scalar): def test_shift_type_mismatch(target_column): _, plc_target_column = target_column - if is_integer(dtype := plc_target_column.type()) or is_floating(dtype): + if plc.traits.is_integral_not_bool( + dtype := 
plc_target_column.type() + ) or plc.traits.is_floating_point(dtype): fill_value = plc.interop.from_arrow(pa.scalar("a")) else: fill_value = plc.interop.from_arrow(pa.scalar(1)) @@ -747,7 +754,9 @@ def test_copy_if_else_column_column(target_column, mask, source_scalar): def test_copy_if_else_wrong_type(target_column, mask): _, plc_target_column = target_column _, plc_mask = mask - if is_integer(dtype := plc_target_column.type()) or is_floating(dtype): + if plc.traits.is_integral_not_bool( + dtype := plc_target_column.type() + ) or plc.traits.is_floating_point(dtype): plc_input_column = plc.interop.from_arrow( pa.array(["a"] * plc_target_column.size()) ) @@ -951,9 +960,9 @@ def test_boolean_mask_scatter_from_wrong_num_true(source_table, target_table): def test_boolean_mask_scatter_from_wrong_col_type(target_table, mask): _, plc_target_table = target_table _, plc_mask = mask - if is_integer( + if plc.traits.is_integral_not_bool( dtype := plc_target_table.columns()[0].type() - ) or is_floating(dtype): + ) or plc.traits.is_floating_point(dtype): input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"])) else: input_column = plc.interop.from_arrow(pa.array([1, 2, 3])) diff --git a/python/cudf/cudf/pylibcudf_tests/test_expressions.py b/python/cudf/cudf/pylibcudf_tests/test_expressions.py new file mode 100644 index 00000000000..f661512caad --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_expressions.py @@ -0,0 +1,50 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import pyarrow as pa +import pytest + +import cudf._lib.pylibcudf as plc + +# We can't really evaluate these expressions, so just make sure +# construction works properly + + +def test_literal_construction_invalid(): + with pytest.raises(ValueError): + plc.expressions.Literal( + plc.interop.from_arrow(pa.scalar(None, type=pa.list_(pa.int64()))) + ) + + +@pytest.mark.parametrize( + "tableref", + [ + plc.expressions.TableReference.LEFT, + plc.expressions.TableReference.RIGHT, + ], +) +def test_columnref_construction(tableref): + plc.expressions.ColumnReference(1.0, tableref) + + +def test_columnnameref_construction(): + plc.expressions.ColumnNameReference("abc") + + +@pytest.mark.parametrize( + "kwargs", + [ + # Unary op + { + "op": plc.expressions.ASTOperator.IDENTITY, + "left": plc.expressions.ColumnReference(1), + }, + # Binop + { + "op": plc.expressions.ASTOperator.ADD, + "left": plc.expressions.ColumnReference(1), + "right": plc.expressions.ColumnReference(2), + }, + ], +) +def test_astoperation_construction(kwargs): + plc.expressions.Operation(**kwargs) diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py index c781126e388..7cfed884f90 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_lists.py +++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py @@ -134,3 +134,60 @@ def test_index_of_list_column(test_data, column): expect = pa.array(column[1], type=pa.int32()) assert_column_eq(expect, res) + + +def test_reverse(test_data): + list_column = test_data[0][0] + arr = pa.array(list_column) + plc_column = plc.interop.from_arrow(arr) + + res = plc.lists.reverse(plc_column) + + expect = pa.array([lst[::-1] for lst in list_column]) + + assert_column_eq(expect, res) + + +def test_segmented_gather(test_data): + list_column1 = test_data[0][0] + list_column2 = test_data[0][1] + + plc_column1 = plc.interop.from_arrow(pa.array(list_column1)) + plc_column2 = plc.interop.from_arrow(pa.array(list_column2)) + + res = plc.lists.segmented_gather(plc_column2, plc_column1) + + expect = 
pa.array([[8, 9], [14], [0], [0, 0]]) + + assert_column_eq(expect, res) + + +def test_extract_list_element_scalar(test_data): + arr = pa.array(test_data[0][0]) + plc_column = plc.interop.from_arrow(arr) + + res = plc.lists.extract_list_element(plc_column, 0) + expect = pa.compute.list_element(test_data[0][0], 0) + + assert_column_eq(expect, res) + + +def test_extract_list_element_column(test_data): + arr = pa.array(test_data[0][0]) + plc_column = plc.interop.from_arrow(arr) + indices = plc.interop.from_arrow(pa.array([0, 1, -4, -1])) + + res = plc.lists.extract_list_element(plc_column, indices) + expect = pa.array([0, None, None, 7]) + + assert_column_eq(expect, res) + + +def test_count_elements(test_data): + arr = pa.array(test_data[0][1]) + plc_column = plc.interop.from_arrow(arr) + res = plc.lists.count_elements(plc_column) + + expect = pa.array([1, 1, 0, 3], type=pa.int32()) + + assert_column_eq(expect, res) diff --git a/python/cudf/cudf/pylibcudf_tests/test_traits.py b/python/cudf/cudf/pylibcudf_tests/test_traits.py new file mode 100644 index 00000000000..6c22cb02f21 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_traits.py @@ -0,0 +1,110 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cudf._lib import pylibcudf as plc + + +def test_is_relationally_comparable(): + assert plc.traits.is_relationally_comparable(plc.DataType(plc.TypeId.INT8)) + assert not plc.traits.is_relationally_comparable( + plc.DataType(plc.TypeId.LIST) + ) + + +def test_is_equality_comparable(): + assert plc.traits.is_equality_comparable(plc.DataType(plc.TypeId.INT8)) + assert not plc.traits.is_equality_comparable(plc.DataType(plc.TypeId.LIST)) + + +def test_is_numeric(): + assert plc.traits.is_numeric(plc.DataType(plc.TypeId.FLOAT64)) + assert not plc.traits.is_numeric(plc.DataType(plc.TypeId.LIST)) + + +def test_is_index_type(): + assert plc.traits.is_index_type(plc.DataType(plc.TypeId.INT8)) + assert not plc.traits.is_index_type(plc.DataType(plc.TypeId.BOOL8)) + + +def test_is_unsigned(): + assert plc.traits.is_unsigned(plc.DataType(plc.TypeId.UINT8)) + assert not plc.traits.is_unsigned(plc.DataType(plc.TypeId.INT8)) + + +def test_is_integral(): + assert plc.traits.is_integral(plc.DataType(plc.TypeId.BOOL8)) + assert not plc.traits.is_integral(plc.DataType(plc.TypeId.DECIMAL32)) + + +def test_is_integral_not_bool(): + assert plc.traits.is_integral_not_bool(plc.DataType(plc.TypeId.INT8)) + assert not plc.traits.is_integral_not_bool(plc.DataType(plc.TypeId.BOOL8)) + + +def test_is_floating_point(): + assert plc.traits.is_floating_point(plc.DataType(plc.TypeId.FLOAT64)) + assert not plc.traits.is_floating_point(plc.DataType(plc.TypeId.UINT8)) + + +def test_is_boolean(): + assert plc.traits.is_boolean(plc.DataType(plc.TypeId.BOOL8)) + assert not plc.traits.is_boolean(plc.DataType(plc.TypeId.UINT8)) + + +def test_is_timestamp(): + assert plc.traits.is_timestamp( + plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS) + ) + assert not plc.traits.is_timestamp( + plc.DataType(plc.TypeId.DURATION_MICROSECONDS) + ) + + +def test_is_fixed_point(): + assert plc.traits.is_fixed_point(plc.DataType(plc.TypeId.DECIMAL128)) + assert not plc.traits.is_fixed_point(plc.DataType(plc.TypeId.FLOAT32)) + + +def test_is_duration(): + assert plc.traits.is_duration( + plc.DataType(plc.TypeId.DURATION_MICROSECONDS) + ) + assert not plc.traits.is_duration( + plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS) + ) + + +def test_is_chrono(): + assert plc.traits.is_chrono(plc.DataType(plc.TypeId.DURATION_MICROSECONDS)) + assert 
plc.traits.is_chrono( + plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS) + ) + assert not plc.traits.is_chrono(plc.DataType(plc.TypeId.UINT8)) + + +def test_is_dictionary(): + assert plc.traits.is_dictionary(plc.DataType(plc.TypeId.DICTIONARY32)) + assert not plc.traits.is_dictionary(plc.DataType(plc.TypeId.UINT8)) + + +def test_is_fixed_width(): + assert plc.traits.is_fixed_width(plc.DataType(plc.TypeId.INT8)) + assert not plc.traits.is_fixed_width(plc.DataType(plc.TypeId.STRING)) + + +def test_is_compound(): + assert plc.traits.is_compound(plc.DataType(plc.TypeId.STRUCT)) + assert not plc.traits.is_compound(plc.DataType(plc.TypeId.UINT8)) + + +def test_is_nested(): + assert plc.traits.is_nested(plc.DataType(plc.TypeId.STRUCT)) + assert not plc.traits.is_nested(plc.DataType(plc.TypeId.STRING)) + + +def test_is_bit_castable(): + assert plc.traits.is_bit_castable( + plc.DataType(plc.TypeId.INT8), plc.DataType(plc.TypeId.UINT8) + ) + assert not plc.traits.is_bit_castable( + plc.DataType(plc.TypeId.UINT8), plc.DataType(plc.TypeId.UINT16) + ) diff --git a/python/cudf/cudf/pylibcudf_tests/test_transform.py b/python/cudf/cudf/pylibcudf_tests/test_transform.py new file mode 100644 index 00000000000..312939888dd --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_transform.py @@ -0,0 +1,32 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import math + +import pyarrow as pa +from utils import assert_column_eq + +from cudf._lib import pylibcudf as plc + + +def test_nans_to_nulls(has_nans): + if has_nans: + values = [1, float("nan"), float("nan"), None, 3, None] + else: + values = [1, 4, 5, None, 3, None] + + replaced = [ + None if (v is None or (v is not None and math.isnan(v))) else v + for v in values + ] + + h_input = pa.array(values, type=pa.float32()) + input = plc.interop.from_arrow(h_input) + assert input.null_count() == h_input.null_count + expect = pa.array(replaced, type=pa.float32()) + + mask, null_count = plc.transform.nans_to_nulls(input) + + assert null_count == expect.null_count + got = input.with_mask(mask, null_count) + + assert_column_eq(expect, got) diff --git a/python/cudf/cudf/pylibcudf_tests/test_unary.py b/python/cudf/cudf/pylibcudf_tests/test_unary.py new file mode 100644 index 00000000000..b5e4f0cb0e8 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_unary.py @@ -0,0 +1,19 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
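The test_transform.py test above shows the nans_to_nulls round trip: NaN positions are folded into the null mask returned by the API, which is then attached to the column. An illustrative sketch of that flow:

    import pyarrow as pa

    import cudf._lib.pylibcudf as plc

    col = plc.interop.from_arrow(
        pa.array([1.0, float("nan"), None], type=pa.float32())
    )

    # Build a null mask that also covers NaN positions, then attach it.
    mask, null_count = plc.transform.nans_to_nulls(col)
    col_with_nulls = col.with_mask(mask, null_count)  # null_count == 2 here
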
+ +from cudf._lib import pylibcudf as plc + + +def test_is_supported_cast(): + assert plc.unary.is_supported_cast( + plc.DataType(plc.TypeId.INT8), plc.DataType(plc.TypeId.UINT64) + ) + assert plc.unary.is_supported_cast( + plc.DataType(plc.TypeId.DURATION_MILLISECONDS), + plc.DataType(plc.TypeId.UINT64), + ) + assert not plc.unary.is_supported_cast( + plc.DataType(plc.TypeId.INT32), plc.DataType(plc.TypeId.TIMESTAMP_DAYS) + ) + assert not plc.unary.is_supported_cast( + plc.DataType(plc.TypeId.INT32), plc.DataType(plc.TypeId.STRING) + ) diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index e56c8d867cb..c2072d90e98 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -158,12 +158,12 @@ def assert_column_equal( return True if check_datetimelike_compat: - if np.issubdtype(left.dtype, np.datetime64): + if left.dtype.kind == "M": right = right.astype(left.dtype) - elif np.issubdtype(right.dtype, np.datetime64): + elif right.dtype.kind == "M": left = left.astype(right.dtype) - if np.issubdtype(left.dtype, np.datetime64): + if left.dtype.kind == "M": if not left.equals(right): raise AssertionError( f"[datetimelike_compat=True] {left.values} " @@ -779,9 +779,7 @@ def assert_eq(left, right, **kwargs): tm.assert_index_equal(left, right, **kwargs) elif isinstance(left, np.ndarray) and isinstance(right, np.ndarray): - if np.issubdtype(left.dtype, np.floating) and np.issubdtype( - right.dtype, np.floating - ): + if left.dtype.kind == "f" and right.dtype.kind == "f": assert np.allclose(left, right, equal_nan=True) else: assert np.array_equal(left, right) diff --git a/python/cudf/cudf/tests/series/test_conversion.py b/python/cudf/cudf/tests/series/test_conversion.py index e1dd359e1ba..1d680d7860d 100644 --- a/python/cudf/cudf/tests/series/test_conversion.py +++ b/python/cudf/cudf/tests/series/test_conversion.py @@ -31,5 +31,18 @@ def test_convert_dtypes(data, dtype): assert_eq(expect, got) +def test_convert_integer_false_convert_floating_true(): + data = [1.000000000000000000000000001, 1] + expected = pd.Series(data).convert_dtypes( + convert_integer=False, convert_floating=True + ) + result = ( + cudf.Series(data) + .convert_dtypes(convert_integer=False, convert_floating=True) + .to_pandas(nullable=True) + ) + assert_eq(result, expected) + + # Now write the same test, but construct a DataFrame # as input instead of parametrizing: diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 7d8c3b53115..503b1a975b4 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -539,7 +539,14 @@ def test_series_reflected_ops_scalar(func, dtype, obj_class): if obj_class == "Index": gs = Index(gs) - gs_result = func(gs) + try: + gs_result = func(gs) + except OverflowError: + # An error is fine, if pandas raises the same error: + with pytest.raises(OverflowError): + func(random_series) + + return # class typing if obj_class == "Index": @@ -589,7 +596,14 @@ def test_series_reflected_ops_cudf_scalar(funcs, dtype, obj_class): if obj_class == "Index": gs = Index(gs) - gs_result = gpu_func(gs) + try: + gs_result = gpu_func(gs) + except OverflowError: + # An error is fine, if pandas raises the same error: + with pytest.raises(OverflowError): + cpu_func(random_series) + + return # class typing if obj_class == "Index": @@ -770,7 +784,8 @@ def test_operator_func_series_and_scalar( fill_value=fill_value, ) pdf_series_result = getattr(pdf_series, func)( - scalar, 
fill_value=fill_value + np.array(scalar)[()] if use_cudf_scalar else scalar, + fill_value=fill_value, ) assert_eq(pdf_series_result, gdf_series_result) @@ -1679,12 +1694,7 @@ def test_scalar_null_binops(op, dtype_l, dtype_r): rhs = cudf.Scalar(cudf.NA, dtype=dtype_r) result = op(lhs, rhs) - assert result.value is ( - cudf.NaT - if cudf.api.types.is_datetime64_dtype(result.dtype) - or cudf.api.types.is_timedelta64_dtype(result.dtype) - else cudf.NA - ) + assert result.value is (cudf.NaT if result.dtype.kind in "mM" else cudf.NA) # make sure dtype is the same as had there been a valid scalar valid_lhs = cudf.Scalar(1, dtype=dtype_l) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 09617306606..6a21cb1b9d7 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -1085,8 +1085,9 @@ def test_csv_reader_arrow_nativefile(path_or_buf): # Arrow FileSystem interface expect = cudf.read_csv(path_or_buf("filepath")) fs, path = pa_fs.FileSystem.from_uri(path_or_buf("filepath")) - with fs.open_input_file(path) as fil: - got = cudf.read_csv(fil) + with pytest.warns(FutureWarning): + with fs.open_input_file(path) as fil: + got = cudf.read_csv(fil) assert_eq(expect, got) @@ -1191,7 +1192,7 @@ def test_csv_reader_byte_range_type_corner_case(tmpdir): ).to_csv(fname, chunksize=100000) byte_range = (2_147_483_648, 0) - with pytest.raises(RuntimeError, match="Offset is past end of file"): + with pytest.raises(OverflowError, match="Offset is past end of file"): cudf.read_csv(fname, byte_range=byte_range, header=None) @@ -1617,7 +1618,7 @@ def test_csv_reader_partial_dtype(dtype): StringIO('"A","B","C"\n0,1,2'), dtype=dtype, usecols=["A", "C"] ) - assert names_df == header_df + assert_eq(names_df, header_df) assert all(names_df.dtypes == ["int16", "int64"]) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index f40106a30f4..e2ce5c03b70 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -5234,7 +5234,7 @@ def test_rowwise_ops(data, op, skipna, numeric_only): else (pdf[column].notna().count() == 0) ) or cudf.api.types.is_numeric_dtype(pdf[column].dtype) - or cudf.api.types.is_bool_dtype(pdf[column].dtype) + or pdf[column].dtype.kind == "b" for column in pdf ): with pytest.raises(TypeError): @@ -5457,9 +5457,7 @@ def test_rowwise_ops_datetime_dtypes(data, op, skipna, numeric_only): gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() - if not numeric_only and not all( - cudf.api.types.is_datetime64_dtype(dt) for dt in gdf.dtypes - ): + if not numeric_only and not all(dt.kind == "M" for dt in gdf.dtypes): with pytest.raises(TypeError): got = getattr(gdf, op)( axis=1, skipna=skipna, numeric_only=numeric_only @@ -10835,7 +10833,7 @@ def test_dataframe_contains(name, contains, other_names): expectation = contains is cudf.NA and name is cudf.NA assert (contains in pdf) == expectation assert (contains in gdf) == expectation - elif pd.api.types.is_float_dtype(gdf.columns.dtype): + elif gdf.columns.dtype.kind == "f": # In some cases, the columns are converted to an Index[float] based on # the other column names. That casts name values from None to np.nan. 
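Several hunks in this file (and in the test, scalar, and dtype changes below) replace `np.issubdtype`/`cudf.api.types.is_*_dtype` checks with comparisons against `dtype.kind`. For orientation only (standard NumPy behavior, not part of the diff), the single-character kind codes relied on are:

```python
import numpy as np

assert np.dtype("datetime64[ns]").kind == "M"   # datetime64
assert np.dtype("timedelta64[ms]").kind == "m"  # timedelta64
assert np.dtype("float64").kind == "f"          # floating point
assert np.dtype("bool").kind == "b"             # boolean
assert np.dtype("int64").kind == "i"            # signed integer
assert np.dtype("uint8").kind == "u"            # unsigned integer
```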
expectation = contains is np.nan and (name is None or name is np.nan) @@ -11102,3 +11100,12 @@ def test_from_records_with_index_no_shallow_copy(): data = np.array([(1.0, 2), (3.0, 4)], dtype=[("x", "= version.parse("13.0.0") and from_dtype == np.dtype("float32") - and to_dtype.precision > 7, + and to_dtype.precision > 12, reason="https://github.com/rapidsai/cudf/issues/14169", ) ) diff --git a/python/cudf/cudf/tests/test_doctests.py b/python/cudf/cudf/tests/test_doctests.py index 0da5c6b04d6..794660cffcb 100644 --- a/python/cudf/cudf/tests/test_doctests.py +++ b/python/cudf/cudf/tests/test_doctests.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. import contextlib import doctest import inspect @@ -8,6 +8,7 @@ import numpy as np import pytest +from packaging import version import cudf @@ -80,6 +81,16 @@ def chdir_to_tmp_path(cls, tmp_path): yield os.chdir(original_directory) + @pytest.fixture(autouse=True) + def printoptions(cls): + # TODO: NumPy now prints scalars as `np.int8(1)`, etc.; this should + # be adapted eventually. + if version.parse(np.__version__) >= version.parse("2.0"): + with np.printoptions(legacy="1.25"): + yield + else: + yield + @pytest.mark.parametrize( "docstring", itertools.chain(*[_find_doctests_in_obj(mod) for mod in tests]), diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py index edb534a3618..c62b5889fdd 100644 --- a/python/cudf/cudf/tests/test_dtypes.py +++ b/python/cudf/cudf/tests/test_dtypes.py @@ -341,7 +341,6 @@ def test_dtype(in_dtype, expect): np.complex128, complex, "S", - "a", "V", "float16", np.float16, diff --git a/python/cudf/cudf/tests/test_gcs.py b/python/cudf/cudf/tests/test_gcs.py index fc22d8bc0ea..28fdfb5c2f1 100644 --- a/python/cudf/cudf/tests/test_gcs.py +++ b/python/cudf/cudf/tests/test_gcs.py @@ -46,7 +46,8 @@ def mock_size(*args): # use_python_file_object=True, because the pyarrow # `open_input_file` command will fail (since it doesn't # use the monkey-patched `open` definition) - got = cudf.read_csv(f"gcs://{fpath}", use_python_file_object=False) + with pytest.warns(FutureWarning): + got = cudf.read_csv(f"gcs://{fpath}", use_python_file_object=False) assert_eq(pdf, got) # AbstractBufferedFile -> PythonFile conversion diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 05dcd85df6a..722a64cb553 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -16,7 +16,6 @@ import cudf from cudf.api.extensions import no_default -from cudf.api.types import is_bool_dtype from cudf.core.index import CategoricalIndex, DatetimeIndex, Index, RangeIndex from cudf.testing import assert_eq from cudf.testing._utils import ( @@ -2397,8 +2396,8 @@ def test_intersection_index(idx1, idx2, sort, pandas_compatible): expected, actual, exact=False - if (is_bool_dtype(idx1.dtype) and not is_bool_dtype(idx2.dtype)) - or (not is_bool_dtype(idx1.dtype) or is_bool_dtype(idx2.dtype)) + if (idx1.dtype.kind == "b" and idx2.dtype.kind != "b") + or (idx1.dtype.kind != "b" or idx2.dtype.kind == "b") else True, ) @@ -3295,3 +3294,12 @@ def test_index_assignment_no_shallow_copy(index): df = cudf.DataFrame(range(1)) df.index = index assert df.index is index + + +def test_bool_rangeindex_raises(): + assert_exceptions_equal( + lfunc=bool, + rfunc=bool, + lfunc_args_and_kwargs=[[pd.RangeIndex(0)]], + rfunc_args_and_kwargs=[[cudf.RangeIndex(0)]], + ) diff --git 
a/python/cudf/cudf/tests/test_interpolate.py b/python/cudf/cudf/tests/test_interpolate.py index 4a0dc331e1a..a4f0b9fc97e 100644 --- a/python/cudf/cudf/tests/test_interpolate.py +++ b/python/cudf/cudf/tests/test_interpolate.py @@ -135,3 +135,9 @@ def test_interpolate_dataframe_error_cases(data, kwargs): lfunc_args_and_kwargs=([], kwargs), rfunc_args_and_kwargs=([], kwargs), ) + + +def test_interpolate_noop_new_column(): + ser = cudf.Series([1.0, 2.0, 3.0]) + result = ser.interpolate() + assert ser._column is not result._column diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 297040b6d95..c81c2d1d94b 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -1077,8 +1077,13 @@ def test_json_dtypes_nested_data(): ) pdf = pd.read_json( - StringIO(expected_json_str), orient="records", lines=True + StringIO(expected_json_str), + orient="records", + lines=True, ) + + assert_eq(df, pdf) + pdf.columns = pdf.columns.astype("str") pa_table_pdf = pa.Table.from_pandas( pdf, schema=df.to_arrow().schema, safe=False @@ -1423,3 +1428,19 @@ def test_json_reader_on_bad_lines(on_bad_lines): orient="records", on_bad_lines=on_bad_lines, ) + + +def test_chunked_json_reader(): + df = cudf.DataFrame( + { + "a": ["aaaa"] * 9_00_00_00, + "b": list(range(0, 9_00_00_00)), + } + ) + buf = BytesIO() + df.to_json(buf, lines=True, orient="records", engine="cudf") + buf.seek(0) + df = df.to_pandas() + with cudf.option_context("io.json.low_memory", True): + gdf = cudf.read_json(buf, lines=True) + assert_eq(df, gdf) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index f76143cb381..36bcaa66d7d 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -12,6 +12,7 @@ from cudf import NA from cudf._lib.copying import get_element from cudf.api.types import is_scalar +from cudf.core.column.column import column_empty from cudf.testing import assert_eq from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES @@ -693,12 +694,7 @@ def test_list_scalar_host_construction_null(elem_type, nesting_level): dtype = cudf.ListDtype(dtype) slr = cudf.Scalar(None, dtype=dtype) - assert slr.value is ( - cudf.NaT - if cudf.api.types.is_datetime64_dtype(slr.dtype) - or cudf.api.types.is_timedelta64_dtype(slr.dtype) - else cudf.NA - ) + assert slr.value is (cudf.NaT if slr.dtype.kind in "mM" else cudf.NA) @pytest.mark.parametrize( @@ -926,3 +922,29 @@ def test_list_iterate_error(): def test_list_struct_list_memory_usage(): df = cudf.DataFrame({"a": [[{"b": [1]}]]}) assert df.memory_usage().sum() == 16 + + +def test_empty_nested_list_uninitialized_offsets_memory_usage(): + col = column_empty(0, cudf.ListDtype(cudf.ListDtype("int64"))) + nested_col = col.children[1] + empty_inner = type(nested_col)( + size=nested_col.size, + dtype=nested_col.dtype, + mask=nested_col.mask, + offset=nested_col.offset, + null_count=nested_col.null_count, + children=( + column_empty(0, nested_col.children[0].dtype), + nested_col.children[1], + ), + ) + col_empty_offset = type(col)( + size=col.size, + dtype=col.dtype, + mask=col.mask, + offset=col.offset, + null_count=col.null_count, + children=(column_empty(0, col.children[0].dtype), empty_inner), + ) + ser = cudf.Series._from_data({None: col_empty_offset}) + assert ser.memory_usage() == 8 diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 07c2e9c3fcf..2c00d48266c 100644 --- 
a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -832,25 +832,17 @@ def test_multiindex_copy_deep(data, copy_on_write, deep): # Assert ._levels identity lptrs = [ - lv._data._data[None].base_data.get_ptr(mode="read") - for lv in mi1._levels + lv._column.base_data.get_ptr(mode="read") for lv in mi1._levels ] rptrs = [ - lv._data._data[None].base_data.get_ptr(mode="read") - for lv in mi2._levels + lv._column.base_data.get_ptr(mode="read") for lv in mi2._levels ] assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) # Assert ._codes identity - lptrs = [ - c.base_data.get_ptr(mode="read") - for _, c in mi1._codes._data.items() - ] - rptrs = [ - c.base_data.get_ptr(mode="read") - for _, c in mi2._codes._data.items() - ] + lptrs = [c.base_data.get_ptr(mode="read") for c in mi1._codes] + rptrs = [c.base_data.get_ptr(mode="read") for c in mi2._codes] assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) @@ -2169,3 +2161,12 @@ def test_nunique(array, dropna): result = gidx.nunique(dropna=dropna) expected = pidx.nunique(dropna=dropna) assert result == expected + + +def test_bool_raises(): + assert_exceptions_equal( + lfunc=bool, + rfunc=bool, + lfunc_args_and_kwargs=[[cudf.MultiIndex.from_arrays([range(1)])]], + rfunc_args_and_kwargs=[[pd.MultiIndex.from_arrays([range(1)])]], + ) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 588bc87d268..f2820d9c112 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -711,7 +711,8 @@ def test_parquet_reader_arrow_nativefile(parquet_path_or_buf): expect = cudf.read_parquet(parquet_path_or_buf("filepath")) fs, path = pa_fs.FileSystem.from_uri(parquet_path_or_buf("filepath")) with fs.open_input_file(path) as fil: - got = cudf.read_parquet(fil) + with pytest.warns(FutureWarning): + got = cudf.read_parquet(fil) assert_eq(expect, got) @@ -726,16 +727,18 @@ def test_parquet_reader_use_python_file_object( fs, _, paths = get_fs_token_paths(parquet_path_or_buf("filepath")) # Pass open fsspec file - with fs.open(paths[0], mode="rb") as fil: - got1 = cudf.read_parquet( - fil, use_python_file_object=use_python_file_object - ) + with pytest.warns(FutureWarning): + with fs.open(paths[0], mode="rb") as fil: + got1 = cudf.read_parquet( + fil, use_python_file_object=use_python_file_object + ) assert_eq(expect, got1) # Pass path only - got2 = cudf.read_parquet( - paths[0], use_python_file_object=use_python_file_object - ) + with pytest.warns(FutureWarning): + got2 = cudf.read_parquet( + paths[0], use_python_file_object=use_python_file_object + ) assert_eq(expect, got2) @@ -1617,7 +1620,11 @@ def test_parquet_writer_int96_timestamps(tmpdir, pdf, gdf): assert_eq(pdf, gdf) # Write out the gdf using the GPU accelerated writer with INT96 timestamps - gdf.to_parquet(gdf_fname.strpath, index=None, int96_timestamps=True) + gdf.to_parquet( + gdf_fname.strpath, + index=None, + int96_timestamps=True, + ) assert os.path.exists(gdf_fname) @@ -1789,10 +1796,11 @@ def test_parquet_write_bytes_io(simple_gdf): assert_eq(cudf.read_parquet(output), simple_gdf) -def test_parquet_writer_bytes_io(simple_gdf): +@pytest.mark.parametrize("store_schema", [True, False]) +def test_parquet_writer_bytes_io(simple_gdf, store_schema): output = BytesIO() - writer = ParquetWriter(output) + writer = ParquetWriter(output, store_schema=store_schema) writer.write_table(simple_gdf) writer.write_table(simple_gdf) writer.close() @@ -2124,7 +2132,8 @@ def 
test_parquet_writer_chunked_partitioned_context(tmpdir_factory): @pytest.mark.parametrize("cols", [None, ["b"]]) -def test_parquet_write_to_dataset(tmpdir_factory, cols): +@pytest.mark.parametrize("store_schema", [True, False]) +def test_parquet_write_to_dataset(tmpdir_factory, cols, store_schema): dir1 = tmpdir_factory.mktemp("dir1") dir2 = tmpdir_factory.mktemp("dir2") if cols is None: @@ -2140,7 +2149,7 @@ def test_parquet_write_to_dataset(tmpdir_factory, cols): "b": np.random.choice(np.arange(4), size=size), } ) - gdf.to_parquet(dir1, partition_cols=cols) + gdf.to_parquet(dir1, partition_cols=cols, store_schema=store_schema) cudf.io.write_to_dataset(gdf, dir2, partition_cols=cols) # Read back with cudf @@ -2156,7 +2165,7 @@ def test_parquet_write_to_dataset(tmpdir_factory, cols): } ) with pytest.raises(ValueError): - gdf.to_parquet(dir1, partition_cols=cols) + gdf.to_parquet(dir1, partition_cols=cols, store_schema=store_schema) @pytest.mark.parametrize( @@ -2386,7 +2395,8 @@ def test_parquet_writer_list_large_mixed(tmpdir): assert_eq(expect, got) -def test_parquet_writer_list_chunked(tmpdir): +@pytest.mark.parametrize("store_schema", [True, False]) +def test_parquet_writer_list_chunked(tmpdir, store_schema): table1 = cudf.DataFrame( { "a": list_gen(string_gen, 128, 80, 50), @@ -2407,7 +2417,7 @@ def test_parquet_writer_list_chunked(tmpdir): expect = cudf.concat([table1, table2]) expect = expect.reset_index(drop=True) - writer = ParquetWriter(fname) + writer = ParquetWriter(fname, store_schema=store_schema) writer.write_table(table1) writer.write_table(table2) writer.close() @@ -2542,6 +2552,10 @@ def normalized_equals(value1, value2): value1 = None if value2 is pd.NA or value2 is pd.NaT: value2 = None + if isinstance(value1, np.datetime64): + value1 = pd.Timestamp(value1).to_pydatetime() + if isinstance(value2, np.datetime64): + value2 = pd.Timestamp(value2).to_pydatetime() if isinstance(value1, pd.Timestamp): value1 = value1.to_pydatetime() if isinstance(value2, pd.Timestamp): @@ -2550,6 +2564,9 @@ def normalized_equals(value1, value2): value1 = value1.replace(tzinfo=None) if isinstance(value2, datetime.datetime): value2 = value2.replace(tzinfo=None) + if isinstance(value1, pd.Timedelta): + unit = "ms" if value1.unit == "s" else value1.unit + value2 = pd.Timedelta(value2, unit=unit) # if one is datetime then both values are datetimes now if isinstance(value1, datetime.datetime): @@ -2563,7 +2580,8 @@ def normalized_equals(value1, value2): @pytest.mark.parametrize("add_nulls", [True, False]) -def test_parquet_writer_statistics(tmpdir, pdf, add_nulls): +@pytest.mark.parametrize("store_schema", [True, False]) +def test_parquet_writer_statistics(tmpdir, pdf, add_nulls, store_schema): file_path = tmpdir.join("cudf.parquet") if "col_category" in pdf.columns: pdf = pdf.drop(columns=["col_category", "col_bool"]) @@ -2580,7 +2598,7 @@ def test_parquet_writer_statistics(tmpdir, pdf, add_nulls): if add_nulls: for col in gdf: set_random_null_mask_inplace(gdf[col]) - gdf.to_parquet(file_path, index=False) + gdf.to_parquet(file_path, index=False, store_schema=store_schema) # Read back from pyarrow pq_file = pq.ParquetFile(file_path) @@ -3205,7 +3223,8 @@ def test_parquet_writer_zstd(): assert_eq(expected, got) -def test_parquet_writer_time_delta_physical_type(): +@pytest.mark.parametrize("store_schema", [True, False]) +def test_parquet_writer_time_delta_physical_type(store_schema): df = cudf.DataFrame( { "s": cudf.Series([1], dtype="timedelta64[s]"), @@ -3217,22 +3236,35 @@ def 
test_parquet_writer_time_delta_physical_type(): } ) buffer = BytesIO() - df.to_parquet(buffer) + df.to_parquet(buffer, store_schema=store_schema) got = pd.read_parquet(buffer) - expected = pd.DataFrame( - { - "s": ["00:00:01"], - "ms": ["00:00:00.002000"], - "us": ["00:00:00.000003"], - "ns": ["00:00:00.000004"], - }, - dtype="str", - ) + + if store_schema: + expected = pd.DataFrame( + { + "s": ["0 days 00:00:01"], + "ms": ["0 days 00:00:00.002000"], + "us": ["0 days 00:00:00.000003"], + "ns": ["0 days 00:00:00.000004"], + }, + dtype="str", + ) + else: + expected = pd.DataFrame( + { + "s": ["00:00:01"], + "ms": ["00:00:00.002000"], + "us": ["00:00:00.000003"], + "ns": ["00:00:00.000004"], + }, + dtype="str", + ) assert_eq(got.astype("str"), expected) -def test_parquet_roundtrip_time_delta(): +@pytest.mark.parametrize("store_schema", [True, False]) +def test_parquet_roundtrip_time_delta(store_schema): num_rows = 12345 df = cudf.DataFrame( { @@ -3255,10 +3287,11 @@ def test_parquet_roundtrip_time_delta(): } ) buffer = BytesIO() - df.to_parquet(buffer) - # TODO: Remove `check_dtype` once following issue is fixed in arrow: - # https://github.com/apache/arrow/issues/33321 + df.to_parquet(buffer, store_schema=store_schema) + # `check_dtype` cannot be removed here as timedelta64[s] will change to `timedelta[ms]` assert_eq(df, cudf.read_parquet(buffer), check_dtype=False) + if store_schema: + assert_eq(df, pd.read_parquet(buffer)) def test_parquet_reader_malformed_file(datadir): @@ -3420,35 +3453,87 @@ def test_parquet_reader_roundtrip_with_arrow_schema(): # Check results for reader with schema assert_eq(expected, got) + # Reset buffer + buffer = BytesIO() -def test_parquet_reader_roundtrip_structs_with_arrow_schema(): - # Ensure that the structs with duration types are faithfully being - # roundtripped across Parquet with arrow schema - pdf = pd.DataFrame( - { - "struct": { - "payload": { - "Domain": { - "Name": "abc", - "Id": {"Name": "host", "Value": "127.0.0.8"}, - "Duration": datetime.timedelta(minutes=12), - }, - "StreamId": "12345678", - "Duration": datetime.timedelta(minutes=4), - "Offset": None, - "Resource": [ - { - "Name": "ZoneName", - "Value": "RAPIDS", - "Duration": datetime.timedelta(seconds=1), - } - ], + # Write to buffer with cudf + expected.to_parquet(buffer, store_schema=True) + + # Read parquet with arrow schema + got = cudf.read_parquet(buffer) + # Convert to cudf table for an apple to apple comparison + expected = cudf.from_pandas(pdf) + + +@pytest.mark.parametrize( + "data", + [ + # struct + [ + {"a": 1, "b": 2}, + {"a": 10, "b": 20}, + {"a": None, "b": 22}, + {"a": None, "b": None}, + {"a": 15, "b": None}, + ], + # struct-of-list + [ + {"a": 1, "b": 2, "c": [1, 2, 3]}, + {"a": 10, "b": 20, "c": [4, 5]}, + {"a": None, "b": 22, "c": [6]}, + {"a": None, "b": None, "c": None}, + {"a": 15, "b": None, "c": [-1, -2]}, + None, + {"a": 100, "b": 200, "c": [-10, None, -20]}, + ], + # list-of-struct + [ + [{"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 4, "b": 5}], + None, + [{"a": 10, "b": 20}], + [{"a": 100, "b": 200}, {"a": None, "b": 300}, None], + ], + # struct-of-struct + [ + {"a": 1, "b": {"inner_a": 10, "inner_b": 20}, "c": 2}, + {"a": 3, "b": {"inner_a": 30, "inner_b": 40}, "c": 4}, + {"a": 5, "b": {"inner_a": 50, "inner_b": None}, "c": 6}, + {"a": 7, "b": None, "c": 8}, + {"a": None, "b": {"inner_a": None, "inner_b": None}, "c": None}, + None, + {"a": None, "b": {"inner_a": None, "inner_b": 100}, "c": 10}, + ], + # struct-with-mixed-types + [ + { + "struct": { + 
"payload": { + "Domain": { + "Name": "abc", + "Id": {"Name": "host", "Value": "127.0.0.8"}, + "Duration": datetime.timedelta(minutes=12), + }, + "StreamId": "12345678", + "Duration": datetime.timedelta(minutes=4), + "Offset": None, + "Resource": [ + { + "Name": "ZoneName", + "Value": "RAPIDS", + "Duration": datetime.timedelta(seconds=1), + } + ], + } } } - } - ) + ], + ], +) +def test_parquet_reader_roundtrip_structs_with_arrow_schema(tmpdir, data): + # Ensure that the structs with duration types are faithfully being + # roundtripped across Parquet with arrow schema + pdf = pd.DataFrame({"struct": pd.Series(data)}) - # Reset the buffer and write parquet with arrow buffer = BytesIO() pdf.to_parquet(buffer, engine="pyarrow") @@ -3460,6 +3545,203 @@ def test_parquet_reader_roundtrip_structs_with_arrow_schema(): # Check results assert_eq(expected, got) + # Reset buffer + buffer = BytesIO() + + # Write to buffer with cudf + expected.to_parquet(buffer, store_schema=True) + + # Read parquet with arrow schema + got = cudf.read_parquet(buffer) + # Convert to cudf table for an apple to apple comparison + expected = cudf.from_pandas(pdf) + + # Check results + assert_eq(expected, got) + + +@pytest.mark.parametrize("index", [None, True, False]) +def test_parquet_writer_roundtrip_with_arrow_schema(index): + # Ensure that the concrete and nested types are faithfully being roundtripped + # across Parquet with arrow schema + expected = cudf.DataFrame( + { + "s": cudf.Series([None, None, None], dtype="timedelta64[s]"), + "us": cudf.Series([None, 3456, None], dtype="timedelta64[us]"), + "duration_list": list( + [ + [ + datetime.timedelta(minutes=7, seconds=4), + datetime.timedelta(minutes=7), + ], + [ + None, + None, + ], + [ + datetime.timedelta(minutes=7, seconds=4), + None, + ], + ] + ), + "int64": cudf.Series([-1234, 123, 4123], dtype="int64"), + "uint32": cudf.Series([1234, 123, 4123], dtype="uint32"), + "list": list([[1, 2], [1, 2], [1, 2]]), + "bool": cudf.Series([True, None, False], dtype=bool), + "fixed32": cudf.Series([0.00, 1.0, None]).astype( + cudf.Decimal32Dtype(7, 2) + ), + "fixed64": cudf.Series([0.00, 1.0, None]).astype( + cudf.Decimal64Dtype(7, 2) + ), + "fixed128": cudf.Series([0.00, 1.0, None]).astype( + cudf.Decimal128Dtype(7, 2) + ), + "datetime": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"), + "map": cudf.Series(["cat", "dog", "lion"]).map( + {"cat": "kitten", "dog": "puppy", "lion": "cub"} + ), + } + ) + + # Write to Parquet with arrow schema for faithful roundtrip + buffer = BytesIO() + expected.to_parquet(buffer, store_schema=True, index=index) + + # Convert decimal types to d128 + expected = expected.astype({"fixed32": cudf.Decimal128Dtype(9, 2)}) + expected = expected.astype({"fixed64": cudf.Decimal128Dtype(18, 2)}) + + # Read parquet with pyarrow, pandas and cudf readers + got = cudf.DataFrame.from_arrow(pq.read_table(buffer)) + got2 = cudf.DataFrame.from_pandas(pd.read_parquet(buffer)) + got3 = cudf.read_parquet(buffer) + + # drop the index column for comparison: __index_level_0__ + if index: + got.drop(columns="__index_level_0__", inplace=True) + got2.drop(columns="__index_level_0__", inplace=True) + + # Check results + assert_eq(expected, got) + assert_eq(expected, got2) + assert_eq(expected, got3) + + +def test_parquet_writer_int96_timestamps_and_arrow_schema(): + df = cudf.DataFrame( + { + "timestamp": cudf.Series( + [1234, 123, 4123], dtype="datetime64[ms]" + ), + } + ) + + # Output buffer + buffer = BytesIO() + + # Writing out parquet with both INT96 timestamps 
and arrow_schema + # enabled should throw an exception. + with pytest.raises(RuntimeError): + df.to_parquet(buffer, int96_timestamps=True, store_schema=True) + + +@pytest.mark.parametrize( + "data", + [ + # struct + [ + {"a": 1, "b": 2}, + {"a": 10, "b": 20}, + {"a": None, "b": 22}, + {"a": None, "b": None}, + {"a": 15, "b": None}, + ], + # struct-of-list + [ + {"a": 1, "b": 2, "c": [1, 2, 3]}, + {"a": 10, "b": 20, "c": [4, 5]}, + {"a": None, "b": 22, "c": [6]}, + {"a": None, "b": None, "c": None}, + {"a": 15, "b": None, "c": [-1, -2]}, + None, + {"a": 100, "b": 200, "c": [-10, None, -20]}, + ], + # list-of-struct + [ + [{"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 4, "b": 5}], + None, + [{"a": 10, "b": 20}], + [{"a": 100, "b": 200}, {"a": None, "b": 300}, None], + ], + # struct-of-struct + [ + {"a": 1, "b": {"inner_a": 10, "inner_b": 20}, "c": 2}, + {"a": 3, "b": {"inner_a": 30, "inner_b": 40}, "c": 4}, + {"a": 5, "b": {"inner_a": 50, "inner_b": None}, "c": 6}, + {"a": 7, "b": None, "c": 8}, + {"a": None, "b": {"inner_a": None, "inner_b": None}, "c": None}, + None, + {"a": None, "b": {"inner_a": None, "inner_b": 100}, "c": 10}, + ], + # struct-with-mixed-types + [ + { + "struct": { + "payload": { + "Domain": { + "Name": "abc", + "Id": {"Name": "host", "Value": "127.0.0.8"}, + "Duration": datetime.timedelta(minutes=12), + }, + "StreamId": "12345678", + "Duration": datetime.timedelta(minutes=4), + "Offset": None, + "Resource": [ + { + "Name": "ZoneName", + "Value": "RAPIDS", + "Duration": datetime.timedelta(seconds=1), + } + ], + } + } + } + ], + ], +) +@pytest.mark.parametrize("index", [None, True, False]) +def test_parquet_writer_roundtrip_structs_with_arrow_schema( + tmpdir, data, index +): + # Ensure that the structs are faithfully being roundtripped across + # Parquet with arrow schema + pa_expected = pa.Table.from_pydict({"struct": data}) + + expected = cudf.DataFrame.from_arrow(pa_expected) + + # Write expected data frame to Parquet with arrow schema + buffer = BytesIO() + expected.to_parquet(buffer, store_schema=True, index=index) + + # Read Parquet with pyarrow + pa_got = pq.read_table(buffer) + + # drop the index column for comparison: __index_level_0__ + if index: + pa_got = pa_got.drop(columns="__index_level_0__") + + # Check results + assert_eq(pa_expected, pa_got) + + # Convert to cuDF table and also read Parquet with cuDF reader + got = cudf.DataFrame.from_arrow(pa_got) + got2 = cudf.read_parquet(buffer) + + # Check results + assert_eq(expected, got) + assert_eq(expected, got2) + @pytest.mark.parametrize("chunk_read_limit", [0, 240, 1024000000]) @pytest.mark.parametrize("pass_read_limit", [0, 240, 1024000000]) @@ -3493,6 +3775,6 @@ def test_parquet_reader_pandas_compatibility(): ) buffer = BytesIO() df.to_parquet(buffer) - with cudf.option_context("mode.pandas_compatible", True): + with cudf.option_context("io.parquet.low_memory", True): expected = cudf.read_parquet(buffer) assert_eq(expected, df) diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index 1247fa362ce..8be6463c699 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -248,16 +248,11 @@ def test_sum_masked(nelem): def test_sum_boolean(): s = Series(np.arange(100000)) - got = (s > 1).sum(dtype=np.int32) + got = (s > 1).sum() expect = 99998 assert expect == got - got = (s > 1).sum(dtype=np.bool_) - expect = True - - assert expect == got - def test_date_minmax(): np_data = np.random.normal(size=10**3) @@ -371,3 +366,11 
@@ def test_reduction_column_multiindex(): result = df.mean() expected = df.to_pandas().mean() assert_eq(result, expected) + + +@pytest.mark.parametrize("op", ["sum", "product"]) +def test_dtype_deprecated(op): + ser = cudf.Series(range(5)) + with pytest.warns(FutureWarning): + result = getattr(ser, op)(dtype=np.dtype(np.int8)) + assert isinstance(result, np.int8) diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 193d64a9e7f..a013745f71e 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -186,13 +186,11 @@ def test_MI(): } ) levels = [["a", "b", "c", "d"], ["w", "x", "y", "z"], ["m", "n"]] - codes = cudf.DataFrame( - { - "a": [0, 0, 0, 0, 1, 1, 2, 2, 3, 3], - "b": [0, 1, 2, 3, 0, 1, 2, 3, 0, 1], - "c": [0, 1, 0, 1, 0, 1, 0, 1, 0, 1], - } - ) + codes = [ + [0, 0, 0, 0, 1, 1, 2, 2, 3, 3], + [0, 1, 2, 3, 0, 1, 2, 3, 0, 1], + [0, 1, 0, 1, 0, 1, 0, 1, 0, 1], + ] pd.options.display.max_rows = 999 pd.options.display.max_columns = 0 gdf = gdf.set_index(cudf.MultiIndex(levels=levels, codes=codes)) diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index a44bf791767..3ae318d3bf5 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -138,22 +138,24 @@ def test_read_csv(s3_base, s3so, pdf, bytes_per_thread): buffer = pdf.to_csv(index=False) # Use fsspec file object - with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_csv( - f"s3://{bucket}/{fname}", - storage_options=s3so, - bytes_per_thread=bytes_per_thread, - use_python_file_object=False, - ) + with pytest.warns(FutureWarning): + with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): + got = cudf.read_csv( + f"s3://{bucket}/{fname}", + storage_options=s3so, + bytes_per_thread=bytes_per_thread, + use_python_file_object=False, + ) assert_eq(pdf, got) # Use Arrow PythonFile object - with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_csv( - f"s3://{bucket}/{fname}", - storage_options=s3so, - use_python_file_object=True, - ) + with pytest.warns(FutureWarning): + with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): + got = cudf.read_csv( + f"s3://{bucket}/{fname}", + storage_options=s3so, + use_python_file_object=True, + ) assert_eq(pdf, got) @@ -166,8 +168,9 @@ def test_read_csv_arrow_nativefile(s3_base, s3so, pdf): fs = pa_fs.S3FileSystem( endpoint_override=s3so["client_kwargs"]["endpoint_url"], ) - with fs.open_input_file(f"{bucket}/{fname}") as fil: - got = cudf.read_csv(fil) + with pytest.warns(FutureWarning): + with fs.open_input_file(f"{bucket}/{fname}") as fil: + got = cudf.read_csv(fil) assert_eq(pdf, got) @@ -184,17 +187,18 @@ def test_read_csv_byte_range( # Use fsspec file object with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_csv( - f"s3://{bucket}/{fname}", - storage_options=s3so, - byte_range=(74, 73), - bytes_per_thread=bytes_per_thread - if not use_python_file_object - else None, - header=None, - names=["Integer", "Float", "Integer2", "String", "Boolean"], - use_python_file_object=use_python_file_object, - ) + with pytest.warns(FutureWarning): + got = cudf.read_csv( + f"s3://{bucket}/{fname}", + storage_options=s3so, + byte_range=(74, 73), + bytes_per_thread=bytes_per_thread + if not use_python_file_object + else None, + header=None, + names=["Integer", "Float", "Integer2", "String", "Boolean"], + 
use_python_file_object=use_python_file_object, + ) assert_eq(pdf.iloc[-2:].reset_index(drop=True), got) @@ -241,18 +245,19 @@ def test_read_parquet( # Check direct path handling buffer.seek(0) with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got1 = cudf.read_parquet( - f"s3://{bucket}/{fname}", - open_file_options=( - {"precache_options": {"method": precache}} - if use_python_file_object - else None - ), - storage_options=s3so, - bytes_per_thread=bytes_per_thread, - columns=columns, - use_python_file_object=use_python_file_object, - ) + with pytest.warns(FutureWarning): + got1 = cudf.read_parquet( + f"s3://{bucket}/{fname}", + open_file_options=( + {"precache_options": {"method": precache}} + if use_python_file_object + else None + ), + storage_options=s3so, + bytes_per_thread=bytes_per_thread, + columns=columns, + use_python_file_object=use_python_file_object, + ) expect = pdf[columns] if columns else pdf assert_eq(expect, got1) @@ -263,12 +268,13 @@ def test_read_parquet( f"s3://{bucket}/{fname}", storage_options=s3so )[0] with fs.open(f"s3://{bucket}/{fname}", mode="rb") as f: - got2 = cudf.read_parquet( - f, - bytes_per_thread=bytes_per_thread, - columns=columns, - use_python_file_object=use_python_file_object, - ) + with pytest.warns(FutureWarning): + got2 = cudf.read_parquet( + f, + bytes_per_thread=bytes_per_thread, + columns=columns, + use_python_file_object=use_python_file_object, + ) assert_eq(expect, got2) @@ -353,11 +359,12 @@ def test_read_parquet_arrow_nativefile(s3_base, s3so, pdf, columns): pdf.to_parquet(path=buffer) buffer.seek(0) with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - fs = pa_fs.S3FileSystem( - endpoint_override=s3so["client_kwargs"]["endpoint_url"], - ) - with fs.open_input_file(f"{bucket}/{fname}") as fil: - got = cudf.read_parquet(fil, columns=columns) + with pytest.warns(FutureWarning): + fs = pa_fs.S3FileSystem( + endpoint_override=s3so["client_kwargs"]["endpoint_url"], + ) + with fs.open_input_file(f"{bucket}/{fname}") as fil: + got = cudf.read_parquet(fil, columns=columns) expect = pdf[columns] if columns else pdf assert_eq(expect, got) @@ -372,12 +379,13 @@ def test_read_parquet_filters(s3_base, s3so, pdf_ext, precache): buffer.seek(0) filters = [("String", "==", "Omega")] with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_parquet( - f"s3://{bucket}/{fname}", - storage_options=s3so, - filters=filters, - open_file_options={"precache_options": {"method": precache}}, - ) + with pytest.warns(FutureWarning): + got = cudf.read_parquet( + f"s3://{bucket}/{fname}", + storage_options=s3so, + filters=filters, + open_file_options={"precache_options": {"method": precache}}, + ) # All row-groups should be filtered out assert_eq(pdf_ext.iloc[:0], got.reset_index(drop=True)) @@ -449,12 +457,13 @@ def test_read_orc(s3_base, s3so, datadir, use_python_file_object, columns): buffer = f.read() with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_orc( - f"s3://{bucket}/{fname}", - columns=columns, - storage_options=s3so, - use_python_file_object=use_python_file_object, - ) + with pytest.warns(FutureWarning): + got = cudf.read_orc( + f"s3://{bucket}/{fname}", + columns=columns, + storage_options=s3so, + use_python_file_object=use_python_file_object, + ) if columns: expect = expect[columns] @@ -475,8 +484,9 @@ def test_read_orc_arrow_nativefile(s3_base, s3so, datadir, columns): fs = pa_fs.S3FileSystem( 
endpoint_override=s3so["client_kwargs"]["endpoint_url"], ) - with fs.open_input_file(f"{bucket}/{fname}") as fil: - got = cudf.read_orc(fil, columns=columns) + with pytest.warns(FutureWarning): + with fs.open_input_file(f"{bucket}/{fname}") as fil: + got = cudf.read_orc(fil, columns=columns) if columns: expect = expect[columns] diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index 05a91a8fea3..f2faf4343b6 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -8,6 +8,7 @@ import pandas as pd import pyarrow as pa import pytest +from packaging import version import rmm @@ -211,9 +212,7 @@ def test_scalar_roundtrip(value): ) def test_null_scalar(dtype): s = cudf.Scalar(None, dtype=dtype) - if cudf.api.types.is_datetime64_dtype( - dtype - ) or cudf.api.types.is_timedelta64_dtype(dtype): + if s.dtype.kind in "mM": assert s.value is cudf.NaT else: assert s.value is cudf.NA @@ -253,6 +252,22 @@ def test_generic_null_scalar_construction_fails(value): cudf.Scalar(value) +@pytest.mark.parametrize( + "value, dtype", [(1000, "uint8"), (2**30, "int16"), (-1, "uint16")] +) +@pytest.mark.filterwarnings("ignore::DeprecationWarning") +def test_scalar_out_of_bounds_pyint_fails(value, dtype): + # Test that we align with NumPy on scalar creation behavior from + # Python integers. + if version.parse(np.__version__) >= version.parse("2.0"): + with pytest.raises(OverflowError): + cudf.Scalar(value, dtype) + else: + # NumPy allowed this, but it gives a DeprecationWarning on newer + # versions (which cudf did not used to do). + assert cudf.Scalar(value, dtype).value == np.dtype(dtype).type(value) + + @pytest.mark.parametrize( "dtype", NUMERIC_TYPES + DATETIME_TYPES + TIMEDELTA_TYPES + ["object"] ) @@ -352,12 +367,7 @@ def test_scalar_implicit_int_conversion(value): @pytest.mark.parametrize("dtype", sorted(set(ALL_TYPES) - {"category"})) def test_scalar_invalid_implicit_conversion(cls, dtype): try: - cls( - pd.NaT - if cudf.api.types.is_datetime64_dtype(dtype) - or cudf.api.types.is_timedelta64_dtype(dtype) - else pd.NA - ) + cls(pd.NaT if cudf.dtype(dtype).kind in "mM" else pd.NA) except TypeError as e: with pytest.raises(TypeError, match=re.escape(str(e))): slr = cudf.Scalar(None, dtype=dtype) diff --git a/python/cudf/cudf/tests/test_unaops.py b/python/cudf/cudf/tests/test_unaops.py index dbbf4fba3a6..5f5d79c1dce 100644 --- a/python/cudf/cudf/tests/test_unaops.py +++ b/python/cudf/cudf/tests/test_unaops.py @@ -81,7 +81,10 @@ def generate_valid_scalar_unaop_combos(): @pytest.mark.parametrize("slr,dtype,op", generate_valid_scalar_unaop_combos()) def test_scalar_unary_operations(slr, dtype, op): slr_host = np.array([slr])[0].astype(cudf.dtype(dtype)) - slr_device = cudf.Scalar(slr, dtype=dtype) + # The scalar may be out of bounds, so go via array force-cast + # NOTE: This is a change in behavior + slr = np.array(slr).astype(dtype)[()] + slr_device = cudf.Scalar(slr) expect = op(slr_host) got = op(slr_device) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 2aa3129ab30..b0788bcc0fc 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -1,7 +1,9 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
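The scalar-related changes above (`test_scalar_out_of_bounds_pyint_fails` and the `test_unaops` force-cast) track a NumPy 2.0 behavior change. A minimal illustration, assuming NumPy >= 2.0 is installed and using nothing beyond NumPy itself:

```python
import numpy as np

# NumPy 2.0 raises for out-of-bounds Python ints converted directly to a
# fixed-width dtype (older versions wrapped, with a DeprecationWarning on
# recent 1.x releases).
try:
    np.uint8(1000)
except OverflowError:
    pass

# An explicit array cast still force-casts (wraps) the value, which is why
# test_scalar_unary_operations now goes through np.array(...).astype(...).
wrapped = np.array(1000).astype("uint8")[()]  # wraps to 232 here
```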
+from __future__ import annotations import datetime from decimal import Decimal +from typing import TYPE_CHECKING import cupy as cp import numpy as np @@ -10,8 +12,9 @@ from pandas.core.dtypes.common import infer_dtype_from_object import cudf -from cudf._typing import DtypeObj -from cudf.api.types import is_bool, is_float, is_integer + +if TYPE_CHECKING: + from cudf._typing import DtypeObj """Map numpy dtype to pyarrow types. Note that np.bool_ bitwidth (8) is different from pa.bool_ (1). Special @@ -91,10 +94,6 @@ BOOL_TYPES = {"bool"} ALL_TYPES = NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | OTHER_TYPES -# The NumPy scalar types are a bit of a mess as they align with the C types -# so for now we use the `sctypes` dict (although it was made private in 2.0) -_NUMPY_SCTYPES = np.sctypes if hasattr(np, "sctypes") else np._core.sctypes - def np_to_pa_dtype(dtype): """Util to convert numpy dtype to PyArrow dtype.""" @@ -116,12 +115,6 @@ def np_to_pa_dtype(dtype): return _np_pa_dtypes[cudf.dtype(dtype).type] -def numeric_normalize_types(*args): - """Cast all args to a common type using numpy promotion logic""" - dtype = np.result_type(*[a.dtype for a in args]) - return [a.astype(dtype) for a in args] - - def _find_common_type_decimal(dtypes): # Find the largest scale and the largest difference between # precision and scale of the columns to be concatenated @@ -253,16 +246,18 @@ def to_cudf_compatible_scalar(val, dtype=None): elif isinstance(val, datetime.timedelta): val = np.timedelta64(val) - val = _maybe_convert_to_default_type( - cudf.api.types.pandas_dtype(type(val)) - ).type(val) - if dtype is not None: - if isinstance(val, str) and np.dtype(dtype).kind == "M": + dtype = np.dtype(dtype) + if isinstance(val, str) and dtype.kind == "M": # pd.Timestamp can handle str, but not np.str_ val = pd.Timestamp(str(val)).to_datetime64().astype(dtype) else: - val = val.astype(dtype) + # At least datetimes cannot be converted to scalar via dtype.type: + val = np.array(val, dtype)[()] + else: + val = _maybe_convert_to_default_type( + cudf.api.types.pandas_dtype(type(val)) + ).type(val) if val.dtype.type is np.datetime64: time_unit, _ = np.datetime_data(val.dtype) @@ -330,32 +325,28 @@ def can_convert_to_column(obj): return is_column_like(obj) or cudf.api.types.is_list_like(obj) -def min_scalar_type(a, min_size=8): - return min_signed_type(a, min_size=min_size) - - -def min_signed_type(x, min_size=8): +def min_signed_type(x: int, min_size: int = 8) -> np.dtype: """ Return the smallest *signed* integer dtype that can represent the integer ``x`` """ - for int_dtype in _NUMPY_SCTYPES["int"]: + for int_dtype in (np.int8, np.int16, np.int32, np.int64): if (cudf.dtype(int_dtype).itemsize * 8) >= min_size: if np.iinfo(int_dtype).min <= x <= np.iinfo(int_dtype).max: - return int_dtype + return np.dtype(int_dtype) # resort to using `int64` and let numpy raise appropriate exception: return np.int64(x).dtype -def min_unsigned_type(x, min_size=8): +def min_unsigned_type(x: int, min_size: int = 8) -> np.dtype: """ Return the smallest *unsigned* integer dtype that can represent the integer ``x`` """ - for int_dtype in _NUMPY_SCTYPES["uint"]: + for int_dtype in (np.uint8, np.uint16, np.uint32, np.uint64): if (cudf.dtype(int_dtype).itemsize * 8) >= min_size: if 0 <= x <= np.iinfo(int_dtype).max: - return int_dtype + return np.dtype(int_dtype) # resort to using `uint64` and let numpy raise appropriate exception: return np.uint64(x).dtype @@ -373,10 +364,10 @@ def min_column_type(x, expected_type): if x.null_count == 
len(x): return x.dtype - if np.issubdtype(x.dtype, np.floating): + if x.dtype.kind == "f": return get_min_float_dtype(x) - elif np.issubdtype(expected_type, np.integer): + elif cudf.dtype(expected_type).kind in "iu": max_bound_dtype = np.min_scalar_type(x.max()) min_bound_dtype = np.min_scalar_type(x.min()) result_type = np.promote_types(max_bound_dtype, min_bound_dtype) @@ -422,9 +413,7 @@ def get_time_unit(obj): def _get_nan_for_dtype(dtype): dtype = cudf.dtype(dtype) - if pd.api.types.is_datetime64_dtype( - dtype - ) or pd.api.types.is_timedelta64_dtype(dtype): + if dtype.kind in "mM": time_unit, _ = np.datetime_data(dtype) return dtype.type("nat", time_unit) elif dtype.kind == "f": @@ -525,16 +514,14 @@ def find_common_type(dtypes): return cudf.dtype("O") # Aggregate same types - dtypes = set(dtypes) + dtypes = {cudf.dtype(dtype) for dtype in dtypes} + if len(dtypes) == 1: + return dtypes.pop() if any( isinstance(dtype, cudf.core.dtypes.DecimalDtype) for dtype in dtypes ): - if all( - cudf.api.types.is_decimal_dtype(dtype) - or cudf.api.types.is_numeric_dtype(dtype) - for dtype in dtypes - ): + if all(cudf.api.types.is_numeric_dtype(dtype) for dtype in dtypes): return _find_common_type_decimal( [ dtype @@ -544,40 +531,28 @@ def find_common_type(dtypes): ) else: return cudf.dtype("O") - if any(isinstance(dtype, cudf.ListDtype) for dtype in dtypes): - if len(dtypes) == 1: - return dtypes.get(0) - else: - # TODO: As list dtypes allow casting - # to identical types, improve this logic of returning a - # common dtype, for example: - # ListDtype(int64) & ListDtype(int32) common - # dtype could be ListDtype(int64). - raise NotImplementedError( - "Finding a common type for `ListDtype` is currently " - "not supported" - ) - if any(isinstance(dtype, cudf.StructDtype) for dtype in dtypes): - if len(dtypes) == 1: - return dtypes.get(0) - else: - raise NotImplementedError( - "Finding a common type for `StructDtype` is currently " - "not supported" - ) + elif any( + isinstance(dtype, (cudf.ListDtype, cudf.StructDtype)) + for dtype in dtypes + ): + # TODO: As list dtypes allow casting + # to identical types, improve this logic of returning a + # common dtype, for example: + # ListDtype(int64) & ListDtype(int32) common + # dtype could be ListDtype(int64). + raise NotImplementedError( + "Finding a common type for `ListDtype` or `StructDtype` is currently " + "not supported" + ) # Corner case 1: # Resort to np.result_type to handle "M" and "m" types separately - dt_dtypes = set( - filter(lambda t: cudf.api.types.is_datetime_dtype(t), dtypes) - ) + dt_dtypes = set(filter(lambda t: t.kind == "M", dtypes)) if len(dt_dtypes) > 0: dtypes = dtypes - dt_dtypes dtypes.add(np.result_type(*dt_dtypes)) - td_dtypes = set( - filter(lambda t: pd.api.types.is_timedelta64_dtype(t), dtypes) - ) + td_dtypes = set(filter(lambda t: t.kind == "m", dtypes)) if len(td_dtypes) > 0: dtypes = dtypes - td_dtypes dtypes.add(np.result_type(*td_dtypes)) @@ -598,121 +573,22 @@ def _dtype_pandas_compatible(dtype): return dtype -def _can_cast(from_dtype, to_dtype): - """ - Utility function to determine if we can cast - from `from_dtype` to `to_dtype`. This function primarily calls - `np.can_cast` but with some special handling around - cudf specific dtypes. 
- """ - if cudf.utils.utils.is_na_like(from_dtype): - return True - if isinstance(from_dtype, type): - from_dtype = cudf.dtype(from_dtype) - if isinstance(to_dtype, type): - to_dtype = cudf.dtype(to_dtype) - - # TODO : Add precision & scale checking for - # decimal types in future - - if isinstance(from_dtype, cudf.core.dtypes.DecimalDtype): - if isinstance(to_dtype, cudf.core.dtypes.DecimalDtype): - return True - elif isinstance(to_dtype, np.dtype): - if to_dtype.kind in {"i", "f", "u", "U", "O"}: - return True - else: - return False - elif isinstance(from_dtype, np.dtype): - if isinstance(to_dtype, np.dtype): - return np.can_cast(from_dtype, to_dtype) - elif isinstance(to_dtype, cudf.core.dtypes.DecimalDtype): - if from_dtype.kind in {"i", "f", "u", "U", "O"}: - return True - else: - return False - elif isinstance(to_dtype, cudf.core.types.CategoricalDtype): - return True - else: - return False - elif isinstance(from_dtype, cudf.core.dtypes.ListDtype): - # TODO: Add level based checks too once casting of - # list columns is supported - if isinstance(to_dtype, cudf.core.dtypes.ListDtype): - return np.can_cast(from_dtype.leaf_type, to_dtype.leaf_type) - else: - return False - elif isinstance(from_dtype, cudf.core.dtypes.CategoricalDtype): - if isinstance(to_dtype, cudf.core.dtypes.CategoricalDtype): - return True - elif isinstance(to_dtype, np.dtype): - return np.can_cast(from_dtype._categories.dtype, to_dtype) - else: - return False - else: - return np.can_cast(from_dtype, to_dtype) - - -def _maybe_convert_to_default_type(dtype): +def _maybe_convert_to_default_type(dtype: DtypeObj) -> DtypeObj: """Convert `dtype` to default if specified by user. If not specified, return as is. """ - if cudf.get_option("default_integer_bitwidth"): - if cudf.api.types.is_signed_integer_dtype(dtype): - return cudf.dtype( - f'i{cudf.get_option("default_integer_bitwidth")//8}' - ) - elif cudf.api.types.is_unsigned_integer_dtype(dtype): - return cudf.dtype( - f'u{cudf.get_option("default_integer_bitwidth")//8}' - ) - if cudf.get_option( - "default_float_bitwidth" - ) and cudf.api.types.is_float_dtype(dtype): - return cudf.dtype(f'f{cudf.get_option("default_float_bitwidth")//8}') - + if ib := cudf.get_option("default_integer_bitwidth"): + if dtype.kind == "i": + return cudf.dtype(f"i{ib//8}") + elif dtype.kind == "u": + return cudf.dtype(f"u{ib//8}") + if (fb := cudf.get_option("default_float_bitwidth")) and dtype.kind == "f": + return cudf.dtype(f"f{fb//8}") return dtype -def _dtype_can_hold_range(rng: range, dtype: np.dtype) -> bool: - if not len(rng): - return True - return np.can_cast(rng[0], dtype) and np.can_cast(rng[-1], dtype) - - -def _dtype_can_hold_element(dtype: np.dtype, element) -> bool: - if dtype.kind in {"i", "u"}: - if isinstance(element, range): - if _dtype_can_hold_range(element, dtype): - return True - return False - - elif is_integer(element) or ( - is_float(element) and element.is_integer() - ): - info = np.iinfo(dtype) - if info.min <= element <= info.max: - return True - return False - - elif dtype.kind == "f": - if is_integer(element) or is_float(element): - casted = dtype.type(element) - if np.isnan(casted) or casted == element: - return True - # otherwise e.g. 
overflow see TestCoercionFloat32 - return False - - elif dtype.kind == "b": - if is_bool(element): - return True - return False - - raise NotImplementedError(f"Unsupported dtype: {dtype}") - - -def _get_base_dtype(dtype: DtypeObj) -> DtypeObj: +def _get_base_dtype(dtype: pd.DatetimeTZDtype) -> np.dtype: # TODO: replace the use of this function with just `dtype.base` # when Pandas 2.1.0 is the minimum version we support: # https://github.com/pandas-dev/pandas/pull/52706 diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 0209c692935..80555750b3a 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -6,6 +6,7 @@ import warnings from io import BufferedWriter, BytesIO, IOBase, TextIOWrapper from threading import Thread +from typing import Callable import fsspec import fsspec.implementations.local @@ -15,6 +16,7 @@ from pyarrow import PythonFile as ArrowPythonFile from pyarrow.lib import NativeFile +from cudf.api.extensions import no_default from cudf.core._compat import PANDAS_LT_300 from cudf.utils.docutils import docfmt_partial @@ -24,7 +26,6 @@ except ImportError: fsspec_parquet = None - _BYTES_PER_THREAD_DEFAULT = 256 * 1024 * 1024 _ROW_GROUP_SIZE_BYTES_DEFAULT = 128 * 1024 * 1024 @@ -86,7 +87,7 @@ 1 20 rapids 2 30 ai """.format(remote_data_sources=_docstring_remote_sources) -doc_read_avro = docfmt_partial(docstring=_docstring_read_avro) +doc_read_avro: Callable = docfmt_partial(docstring=_docstring_read_avro) _docstring_read_parquet_metadata = """ Read a Parquet file's metadata and schema @@ -174,15 +175,23 @@ columns are also loaded. use_python_file_object : boolean, default True If True, Arrow-backed PythonFile objects will be used in place of fsspec - AbstractBufferedFile objects at IO time. Setting this argument to `False` - will require the entire file to be copied to host memory, and is highly - discouraged. + AbstractBufferedFile objects at IO time. + + .. deprecated:: 24.08 + `use_python_file_object` is deprecated and will be removed in a future + version of cudf, as PyArrow NativeFiles will no longer be accepted as + input/output in cudf readers/writers in the future. open_file_options : dict, optional Dictionary of key-value pairs to pass to the function used to open remote files. By default, this will be `fsspec.parquet.open_parquet_file`. To deactivate optimized precaching, set the "method" to `None` under the "precache_options" key. Note that the `open_file_func` key can also be used to specify a custom file-open function. + + .. deprecated:: 24.08 + `open_file_options` is deprecated as it was intended for + pyarrow file inputs, which will no longer be accepted as + input/output cudf readers/writers in the future. bytes_per_thread : int, default None Determines the number of bytes to be allocated per thread to read the files in parallel. When there is a file of large size, we get slightly @@ -322,6 +331,12 @@ output_as_binary : set, optional, default None If a column name is present in the set, that column will be output as unannotated binary, rather than the default 'UTF-8'. +store_schema : bool, default False + If ``True``, writes arrow schema to Parquet file footer's key-value + metadata section to faithfully round-trip ``duration`` types with arrow. + This cannot be used with ``int96_timestamps`` enabled as int96 timestamps + are deprecated in arrow. Also, all decimal32 and decimal64 columns will be + converted to decimal128 as arrow only supports decimal128 and decimal256 types. 
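A usage-level sketch of the new ``store_schema`` option documented above, based on the tests added in this PR (hedged illustration, not additional PR content):

```python
from io import BytesIO

import cudf

# Writing the arrow schema into the Parquet footer lets duration
# (timedelta) columns round-trip with their original types.
df = cudf.DataFrame(
    {"us": cudf.Series([None, 3456, None], dtype="timedelta64[us]")}
)
buf = BytesIO()
df.to_parquet(buf, store_schema=True)  # new keyword introduced in this PR
roundtripped = cudf.read_parquet(buf)

# Per the docstring above: store_schema=True cannot be combined with
# int96_timestamps=True, and decimal32/decimal64 columns are written
# as decimal128.
```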
**kwargs Additional parameters will be passed to execution engines other than ``cudf``. @@ -462,8 +477,12 @@ If True, use row index if available for faster seeking. use_python_file_object : boolean, default True If True, Arrow-backed PythonFile objects will be used in place of fsspec - AbstractBufferedFile objects at IO time. This option is likely to improve - performance when making small reads from larger ORC files. + AbstractBufferedFile objects at IO time. + + .. deprecated:: 24.08 + `use_python_file_object` is deprecated and will be removed in a future + version of cudf, as PyArrow NativeFiles will no longer be accepted as + input/output in cudf readers/writers in the future. storage_options : dict, optional, default None Extra options that make sense for a particular storage connection, e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value @@ -928,7 +947,7 @@ -------- cudf.DataFrame.to_hdf : Write a HDF file from a DataFrame. """ -doc_read_hdf = docfmt_partial(docstring=_docstring_read_hdf) +doc_read_hdf: Callable = docfmt_partial(docstring=_docstring_read_hdf) _docstring_to_hdf = """ Write the contained data to an HDF5 file using HDFStore. @@ -1000,7 +1019,7 @@ cudf.DataFrame.to_parquet : Write a DataFrame to the binary parquet format. cudf.DataFrame.to_feather : Write out feather-format for DataFrames. """ -doc_to_hdf = docfmt_partial(docstring=_docstring_to_hdf) +doc_to_hdf: Callable = docfmt_partial(docstring=_docstring_to_hdf) _docstring_read_feather = """ Load an feather object from the file path, returning a DataFrame. @@ -1182,8 +1201,12 @@ the end of the range. use_python_file_object : boolean, default True If True, Arrow-backed PythonFile objects will be used in place of fsspec - AbstractBufferedFile objects at IO time. This option is likely to improve - performance when making small reads from larger CSV files. + AbstractBufferedFile objects at IO time. + + .. deprecated:: 24.08 + `use_python_file_object` is deprecated and will be removed in a future + version of cudf, as PyArrow NativeFiles will no longer be accepted as + input/output in cudf readers/writers in the future. storage_options : dict, optional, default None Extra options that make sense for a particular storage connection, e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value @@ -1403,7 +1426,7 @@ result : Series """ -doc_read_text = docfmt_partial(docstring=_docstring_text_datasource) +doc_read_text: Callable = docfmt_partial(docstring=_docstring_text_datasource) _docstring_get_reader_filepath_or_buffer = """ @@ -1424,9 +1447,19 @@ use_python_file_object : boolean, default False If True, Arrow-backed PythonFile objects will be used in place of fsspec AbstractBufferedFile objects. + + .. deprecated:: 24.08 + `use_python_file_object` is deprecated and will be removed in a future + version of cudf, as PyArrow NativeFiles will no longer be accepted as + input/output in cudf readers/writers. open_file_options : dict, optional Optional dictionary of keyword arguments to pass to `_open_remote_files` (used for remote storage only). + + .. deprecated:: 24.08 + `open_file_options` is deprecated as it was intended for + pyarrow file inputs, which will no longer be accepted as + input/output cudf readers/writers in the future. allow_raw_text_input : boolean, default False If True, this indicates the input `path_or_data` could be a raw text input and will not check for its existence in the filesystem. 
If False, @@ -1702,7 +1735,8 @@ def get_reader_filepath_or_buffer( mode="rb", fs=None, iotypes=(BytesIO, NativeFile), - use_python_file_object=False, + # no_default aliases to False + use_python_file_object=no_default, open_file_options=None, allow_raw_text_input=False, storage_options=None, @@ -1714,6 +1748,30 @@ def get_reader_filepath_or_buffer( path_or_data = stringify_pathlike(path_or_data) + if use_python_file_object is no_default: + use_python_file_object = False + elif use_python_file_object is not None: + warnings.warn( + "The 'use_python_file_object' keyword is deprecated and " + "will be removed in a future version.", + FutureWarning, + ) + else: + # Preserve the readers (e.g. read_csv) default of True + # if no use_python_file_object option is specified by the user + # for now (note: this is different from the default for this + # function of False) + # TODO: when non-pyarrow file reading perf is good enough + # we can default this to False + use_python_file_object = True + + if open_file_options is not None: + warnings.warn( + "The 'open_file_options' keyword is deprecated and " + "will be removed in a future version.", + FutureWarning, + ) + if isinstance(path_or_data, str): # Get a filesystem object if one isn't already available paths = [path_or_data] diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 7347ec7866a..c9b343e0f9f 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -6,6 +6,7 @@ import os import traceback import warnings +from contextlib import contextmanager import numpy as np import pandas as pd @@ -403,3 +404,28 @@ def _all_bools_with_nulls(lhs, rhs, bool_fill_value): if result_mask is not None: result_col = result_col.set_mask(result_mask.as_mask()) return result_col + + +@contextmanager +def maybe_filter_deprecation( + condition: bool, message: str, category: type[Warning] +): + """Conditionally filter a warning category. 
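(A hypothetical usage sketch of this helper; the flag and warning message are illustrative, not taken from the codebase:)

    import warnings
    from cudf.utils.utils import maybe_filter_deprecation

    caller_passed_native_file = False  # illustrative condition
    with maybe_filter_deprecation(
        not caller_passed_native_file,
        message="Passing a NativeFile is deprecated",
        category=FutureWarning,
    ):
        # Suppressed when the condition is True, surfaced to the user otherwise.
        warnings.warn("Passing a NativeFile is deprecated", FutureWarning)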
+ + Parameters + ---------- + condition + If true, filter the warning + message + Message to match, passed to :func:`warnings.filterwarnings` + category + Category of warning, passed to :func:`warnings.filterwarnings` + """ + with warnings.catch_warnings(): + if condition: + warnings.filterwarnings( + "ignore", + message, + category=category, + ) + yield diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index f51ce103677..6292022d8e4 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -1080,6 +1080,13 @@ def test_pickle(obj): tm.assert_equal(obj, copy) + with tempfile.TemporaryFile() as f: + xpd.to_pickle(obj, f) + f.seek(0) + copy = xpd.read_pickle(f) + + tm.assert_equal(obj, copy) + def test_dataframe_query(): cudf_pandas_df = xpd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]}) @@ -1566,3 +1573,62 @@ def test_arrow_string_arrays(): ) tm.assert_equal(cu_arr, pd_arr) + + +@pytest.mark.parametrize("indexer", ["at", "iat"]) +def test_at_iat(indexer): + df = xpd.DataFrame(range(3)) + result = getattr(df, indexer)[0, 0] + assert result == 0 + + getattr(df, indexer)[0, 0] = 1 + expected = pd.DataFrame([1, 1, 2]) + tm.assert_frame_equal(df, expected) + + +def test_at_setitem_empty(): + df = xpd.DataFrame({"name": []}, dtype="float64") + df.at[0, "name"] = 1.0 + df.at[0, "new"] = 2.0 + expected = pd.DataFrame({"name": [1.0], "new": [2.0]}) + tm.assert_frame_equal(df, expected) + + +@pytest.mark.parametrize( + "index", + [ + xpd.Index([1, 2, 3], name="foo"), + xpd.Index(["a", "b", "c"], name="foo"), + xpd.RangeIndex(start=0, stop=3, step=1, name="foo"), + xpd.CategoricalIndex(["a", "b", "a"], name="foo"), + xpd.DatetimeIndex( + ["2024-04-24", "2025-04-24", "2026-04-24"], name="foo" + ), + xpd.TimedeltaIndex(["1 days", "2 days", "3 days"], name="foo"), + xpd.PeriodIndex( + ["2024-06", "2023-06", "2022-06"], freq="M", name="foo" + ), + xpd.IntervalIndex.from_breaks([0, 1, 2, 3], name="foo"), + xpd.MultiIndex.from_tuples( + [(1, "a"), (2, "b"), (3, "c")], names=["foo1", "bar1"] + ), + ], +) +def test_change_index_name(index): + s = xpd.Series([1, 2, object()], index=index) + df = xpd.DataFrame({"values": [1, 2, object()]}, index=index) + + if isinstance(index, xpd.MultiIndex): + names = ["foo2", "bar2"] + s.index.names = names + df.index.names = names + + assert s.index.names == names + assert df.index.names == names + else: + name = "bar" + s.index.name = name + df.index.name = name + + assert s.index.name == name + assert df.index.name == name diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 20b731624df..dcb33b1fc1a 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -121,7 +121,7 @@ skip = [ build-backend = "scikit_build_core.build" dependencies-file = "../../dependencies.yaml" requires = [ - "cmake>=3.26.4", + "cmake>=3.26.4,!=3.30.0", "cython>=3.0.3", "ninja", "numpy==1.23.*", diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 11e18cd4f32..badfdf06d15 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -101,7 +101,7 @@ regex = "(?P.*)" build-backend = "scikit_build_core.build" dependencies-file = "../../dependencies.yaml" requires = [ - "cmake>=3.26.4", + "cmake>=3.26.4,!=3.30.0", "cython>=3.0.3", "ninja", "numpy==1.23.*", diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index 
979087d5273..764cdd3b3ca 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -34,7 +34,12 @@ def _callback( return ir.evaluate(cache={}).to_polars() -def execute_with_cudf(nt: NodeTraverser, *, raise_on_fail: bool = False) -> None: +def execute_with_cudf( + nt: NodeTraverser, + *, + raise_on_fail: bool = False, + exception: type[Exception] | tuple[type[Exception], ...] = Exception, +) -> None: """ A post optimization callback that attempts to execute the plan with cudf. @@ -47,11 +52,15 @@ def execute_with_cudf(nt: NodeTraverser, *, raise_on_fail: bool = False) -> None Should conversion raise an exception rather than continuing without setting a callback. + exception + Optional exception, or tuple of exceptions, to catch during + translation. Defaults to ``Exception``. + The NodeTraverser is mutated if the libcudf executor can handle the plan. """ try: with nvtx.annotate(message="ConvertIR", domain="cudf_polars"): nt.set_udf(partial(_callback, translate_ir(nt))) - except NotImplementedError: + except exception: if raise_on_fail: raise diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index 28685f0c4ed..02018548b2c 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -13,6 +13,8 @@ if TYPE_CHECKING: from typing_extensions import Self + import polars as pl + __all__: list[str] = ["Column", "NamedColumn"] @@ -76,12 +78,49 @@ def sorted_like(self, like: Column, /) -> Self: See Also -------- - set_sorted + set_sorted, copy_metadata """ return self.set_sorted( is_sorted=like.is_sorted, order=like.order, null_order=like.null_order ) + def copy_metadata(self, from_: pl.Series, /) -> Self: + """ + Copy metadata from a host series onto self. + + Parameters + ---------- + from_ + Polars series to copy metadata from + + Returns + ------- + Self with metadata set. 
+ + See Also + -------- + set_sorted, sorted_like + """ + if len(from_) <= 1: + return self + ascending = from_.flags["SORTED_ASC"] + descending = from_.flags["SORTED_DESC"] + if ascending or descending: + has_null_first = from_.item(0) is None + has_null_last = from_.item(-1) is None + order = ( + plc.types.Order.ASCENDING if ascending else plc.types.Order.DESCENDING + ) + null_order = plc.types.NullOrder.BEFORE + if (descending and has_null_first) or (ascending and has_null_last): + null_order = plc.types.NullOrder.AFTER + return self.set_sorted( + is_sorted=plc.types.Sorted.YES, + order=order, + null_order=null_order, + ) + return self + def set_sorted( self, *, @@ -128,24 +167,28 @@ def copy(self) -> Self: ) def mask_nans(self) -> Self: - """Return a copy of self with nans masked out.""" - if self.nan_count > 0: - raise NotImplementedError("Need to port transform.hpp to pylibcudf") + """Return a shallow copy of self with nans masked out.""" + if plc.traits.is_floating_point(self.obj.type()): + old_count = self.obj.null_count() + mask, new_count = plc.transform.nans_to_nulls(self.obj) + result = type(self)(self.obj.with_mask(mask, new_count)) + if old_count == new_count: + return result.sorted_like(self) + return result return self.copy() @functools.cached_property def nan_count(self) -> int: """Return the number of NaN values in the column.""" - if self.obj.type().id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64): - return 0 - return plc.interop.to_arrow( - plc.reduce.reduce( - plc.unary.is_nan(self.obj), - plc.aggregation.sum(), - # TODO: pylibcudf needs to have a SizeType DataType singleton - plc.DataType(plc.TypeId.INT32), - ) - ).as_py() + if plc.traits.is_floating_point(self.obj.type()): + return plc.interop.to_arrow( + plc.reduce.reduce( + plc.unary.is_nan(self.obj), + plc.aggregation.sum(), + plc.types.SIZE_TYPE, + ) + ).as_py() + return 0 class NamedColumn(Column): @@ -187,3 +230,17 @@ def copy(self, *, new_name: str | None = None) -> Self: order=self.order, null_order=self.null_order, ) + + def mask_nans(self) -> Self: + """Return a shallow copy of self with nans masked out.""" + # Annoying, the inheritance is not right (can't call the + # super-type mask_nans), but will sort that by refactoring + # later. 
+ if plc.traits.is_floating_point(self.obj.type()): + old_count = self.obj.null_count() + mask, new_count = plc.transform.nans_to_nulls(self.obj) + result = type(self)(self.obj.with_mask(mask, new_count), self.name) + if old_count == new_count: + return result.sorted_like(self) + return result + return self.copy() diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index ec8d00c3123..cbeadf1426a 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -5,19 +5,22 @@ from __future__ import annotations +import itertools from functools import cached_property from typing import TYPE_CHECKING, cast +import pyarrow as pa + import polars as pl import cudf._lib.pylibcudf as plc from cudf_polars.containers.column import NamedColumn +from cudf_polars.utils import dtypes if TYPE_CHECKING: from collections.abc import Mapping, Sequence, Set - import pyarrow as pa from typing_extensions import Self import cudf @@ -49,8 +52,16 @@ def to_polars(self) -> pl.DataFrame: self.table, [plc.interop.ColumnMetadata(name=c.name) for c in self.columns], ) - - return cast(pl.DataFrame, pl.from_arrow(table)) + return cast(pl.DataFrame, pl.from_arrow(table)).with_columns( + *( + pl.col(c.name).set_sorted( + descending=c.order == plc.types.Order.DESCENDING + ) + if c.is_sorted + else pl.col(c.name) + for c in self.columns + ) + ) @cached_property def column_names_set(self) -> frozenset[str]: @@ -82,6 +93,35 @@ def from_cudf(cls, df: cudf.DataFrame) -> Self: ] ) + @classmethod + def from_polars(cls, df: pl.DataFrame) -> Self: + """ + Create from a polars dataframe. + + Parameters + ---------- + df + Polars dataframe to convert + + Returns + ------- + New dataframe representing the input. + """ + table = df.to_arrow() + schema = table.schema + for i, field in enumerate(schema): + schema = schema.set( + i, pa.field(field.name, dtypes.downcast_arrow_lists(field.type)) + ) + # No-op if the schema is unchanged. + d_table = plc.interop.from_arrow(table.cast(schema)) + return cls( + [ + NamedColumn(column, h_col.name).copy_metadata(h_col) + for column, h_col in zip(d_table.columns(), df.iter_columns()) + ] + ) + @classmethod def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self: """ @@ -160,7 +200,10 @@ def with_columns(self, columns: Sequence[NamedColumn]) -> Self: ----- If column names overlap, newer names replace older ones. """ - return type(self)([*self.columns, *columns]) + columns = list( + {c.name: c for c in itertools.chain(self.columns, columns)}.values() + ) + return type(self)(columns) def discard_columns(self, names: Set[str]) -> Self: """Drop columns by name.""" diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index fe859c8d958..a034d55120a 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -32,7 +32,7 @@ if TYPE_CHECKING: from collections.abc import Mapping, Sequence - import polars.polars as plrs + import polars as pl import polars.type_aliases as pl_types from cudf_polars.containers import DataFrame @@ -44,6 +44,7 @@ "Col", "BooleanFunction", "StringFunction", + "TemporalFunction", "Sort", "SortBy", "Gather", @@ -369,6 +370,10 @@ def do_evaluate( # datatype of pyarrow scalar is correct by construction. 
return Column(plc.Column.from_scalar(plc.interop.from_arrow(self.value), 1)) + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + return AggInfo([]) + class LiteralColumn(Expr): __slots__ = ("value",) @@ -376,11 +381,18 @@ class LiteralColumn(Expr): value: pa.Array[Any, Any] children: tuple[()] - def __init__(self, dtype: plc.DataType, value: plrs.PySeries) -> None: + def __init__(self, dtype: plc.DataType, value: pl.Series) -> None: super().__init__(dtype) data = value.to_arrow() self.value = data.cast(dtypes.downcast_arrow_lists(data.type)) + def get_hash(self) -> int: + """Compute a hash of the column.""" + # This is stricter than necessary, but we only need this hash + # for identity in groupby replacements so it's OK. And this + # way we avoid doing potentially expensive compute. + return hash((type(self), self.dtype, id(self.value))) + def do_evaluate( self, df: DataFrame, @@ -392,6 +404,10 @@ def do_evaluate( # datatype of pyarrow array is correct by construction. return Column(plc.interop.from_arrow(self.value)) + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + return AggInfo([]) + class Col(Expr): __slots__ = ("name",) @@ -703,6 +719,7 @@ def _validate_input(self): pl_expr.StringFunction.EndsWith, pl_expr.StringFunction.StartsWith, pl_expr.StringFunction.Contains, + pl_expr.StringFunction.Slice, ): raise NotImplementedError(f"String function {self.name}") if self.name == pl_expr.StringFunction.Contains: @@ -716,6 +733,11 @@ def _validate_input(self): raise NotImplementedError( "Regex contains only supports a scalar pattern" ) + elif self.name == pl_expr.StringFunction.Slice: + if not all(isinstance(child, Literal) for child in self.children[1:]): + raise NotImplementedError( + "Slice only supports literal start and stop values" + ) def do_evaluate( self, @@ -744,6 +766,36 @@ def do_evaluate( flags=plc.strings.regex_flags.RegexFlags.DEFAULT, ) return Column(plc.strings.contains.contains_re(column.obj, prog)) + elif self.name == pl_expr.StringFunction.Slice: + child, expr_offset, expr_length = self.children + assert isinstance(expr_offset, Literal) + assert isinstance(expr_length, Literal) + + column = child.evaluate(df, context=context, mapping=mapping) + # libcudf slices via [start,stop). + # polars slices with offset + length where start == offset + # stop = start + length. Negative values for start look backward + # from the last element of the string. If the end index would be + # below zero, an empty string is returned. + # Do this maths on the host + start = expr_offset.value.as_py() + length = expr_length.value.as_py() + + if length == 0: + stop = start + else: + # No length indicates a scan to the end + # The libcudf equivalent is a null stop + stop = start + length if length else None + if length and start < 0 and length >= -start: + stop = None + return Column( + plc.strings.slice.slice_strings( + column.obj, + plc.interop.from_arrow(pa.scalar(start, type=pa.int32())), + plc.interop.from_arrow(pa.scalar(stop, type=pa.int32())), + ) + ) columns = [ child.evaluate(df, context=context, mapping=mapping) for child in self.children @@ -779,6 +831,159 @@ def do_evaluate( ) # pragma: no cover; handled by init raising +class TemporalFunction(Expr): + __slots__ = ("name", "options", "children") + _non_child = ("dtype", "name", "options") + children: tuple[Expr, ...] 
+ + def __init__( + self, + dtype: plc.DataType, + name: pl_expr.TemporalFunction, + options: tuple[Any, ...], + *children: Expr, + ) -> None: + super().__init__(dtype) + self.options = options + self.name = name + self.children = children + if self.name != pl_expr.TemporalFunction.Year: + raise NotImplementedError(f"String function {self.name}") + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + columns = [ + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ] + if self.name == pl_expr.TemporalFunction.Year: + (column,) = columns + return Column(plc.datetime.extract_year(column.obj)) + raise NotImplementedError( + f"TemporalFunction {self.name}" + ) # pragma: no cover; init trips first + + +class UnaryFunction(Expr): + __slots__ = ("name", "options", "children") + _non_child = ("dtype", "name", "options") + children: tuple[Expr, ...] + + def __init__( + self, dtype: plc.DataType, name: str, options: tuple[Any, ...], *children: Expr + ) -> None: + super().__init__(dtype) + self.name = name + self.options = options + self.children = children + if self.name not in ("mask_nans", "round", "setsorted", "unique"): + raise NotImplementedError(f"Unary function {name=}") + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + if self.name == "mask_nans": + (child,) = self.children + return child.evaluate(df, context=context, mapping=mapping).mask_nans() + if self.name == "round": + (decimal_places,) = self.options + (values,) = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + return Column( + plc.round.round( + values.obj, decimal_places, plc.round.RoundingMethod.HALF_UP + ) + ).sorted_like(values) + elif self.name == "unique": + (maintain_order,) = self.options + (values,) = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + # Only one column, so keep_any is the same as keep_first + # for stable distinct + keep = plc.stream_compaction.DuplicateKeepOption.KEEP_ANY + if values.is_sorted: + maintain_order = True + result = plc.stream_compaction.unique( + plc.Table([values.obj]), + [0], + keep, + plc.types.NullEquality.EQUAL, + ) + else: + distinct = ( + plc.stream_compaction.stable_distinct + if maintain_order + else plc.stream_compaction.distinct + ) + result = distinct( + plc.Table([values.obj]), + [0], + keep, + plc.types.NullEquality.EQUAL, + plc.types.NanEquality.ALL_EQUAL, + ) + (column,) = result.columns() + if maintain_order: + return Column(column).sorted_like(values) + return Column(column) + elif self.name == "setsorted": + (column,) = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + (asc,) = self.options + order = ( + plc.types.Order.ASCENDING + if asc == "ascending" + else plc.types.Order.DESCENDING + ) + null_order = plc.types.NullOrder.BEFORE + if column.obj.null_count() > 0 and (n := column.obj.size()) > 1: + # PERF: This invokes four stream synchronisations! 
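                # (Reviewer note: the two get_element calls that follow copy the
                # first and last elements of the column into scalars and query
                # their validity on the host, which is how the code decides
                # whether nulls sit at the front or the back of the sorted column.)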
+ has_nulls_first = not plc.copying.get_element(column.obj, 0).is_valid() + has_nulls_last = not plc.copying.get_element( + column.obj, n - 1 + ).is_valid() + if (order == plc.types.Order.DESCENDING and has_nulls_first) or ( + order == plc.types.Order.ASCENDING and has_nulls_last + ): + null_order = plc.types.NullOrder.AFTER + return column.set_sorted( + is_sorted=plc.types.Sorted.YES, + order=order, + null_order=null_order, + ) + raise NotImplementedError( + f"Unimplemented unary function {self.name=}" + ) # pragma: no cover; init trips first + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + if depth == 1: + # inside aggregation, need to pre-evaluate, groupby + # construction has checked that we don't have nested aggs, + # so stop the recursion and return ourselves for pre-eval + return AggInfo([(self, plc.aggregation.collect_list(), self)]) + else: + (child,) = self.children + return child.collect_agg(depth=depth) + + class Sort(Expr): __slots__ = ("options", "children") _non_child = ("dtype", "options") @@ -1055,12 +1260,19 @@ def collect_agg(self, *, depth: int) -> AggInfo: raise NotImplementedError( "Nested aggregations in groupby" ) # pragma: no cover; check_agg trips first + if (isminmax := self.name in {"min", "max"}) and self.options: + raise NotImplementedError("Nan propagation in groupby for min/max") (child,) = self.children ((expr, _, _),) = child.collect_agg(depth=depth + 1).requests if self.request is None: raise NotImplementedError( f"Aggregation {self.name} in groupby" ) # pragma: no cover; __init__ trips first + if isminmax and plc.traits.is_floating_point(self.dtype): + assert expr is not None + # Ignore nans in these groupby aggs, do this by masking + # nans in the input + expr = UnaryFunction(self.dtype, "mask_nans", (), expr) return AggInfo([(expr, self.request, self)]) def _reduce( @@ -1182,7 +1394,8 @@ def __init__( self.children = (left, right) if ( op in (plc.binaryop.BinaryOperator.ADD, plc.binaryop.BinaryOperator.SUB) - and ({left.dtype.id(), right.dtype.id()}.issubset(dtypes.TIMELIKE_TYPES)) + and plc.traits.is_chrono(left.dtype) + and plc.traits.is_chrono(right.dtype) and not dtypes.have_compatible_resolution(left.dtype.id(), right.dtype.id()) ): raise NotImplementedError("Casting rules for timelike types") diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 9b3096becd4..a84fe73810e 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -15,9 +15,9 @@ import dataclasses import itertools -import json import types from functools import cache +from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, ClassVar import pyarrow as pa @@ -30,7 +30,7 @@ import cudf_polars.dsl.expr as expr from cudf_polars.containers import DataFrame, NamedColumn -from cudf_polars.utils import dtypes, sorting +from cudf_polars.utils import sorting if TYPE_CHECKING: from collections.abc import MutableMapping @@ -96,6 +96,8 @@ def broadcast( ``target_length`` is provided and not all columns are length-1 (i.e. ``n != 1``), then ``target_length`` must be equal to ``n``. """ + if len(columns) == 0: + return [] lengths: set[int] = {column.obj.size() for column in columns} if lengths == {1}: if target_length is None: @@ -183,8 +185,10 @@ class Scan(IR): typ: str """What type of file are we reading? Parquet, CSV, etc...""" - options: tuple[Any, ...] 
- """Type specific options, as json-encoded strings.""" + reader_options: dict[str, Any] + """Reader-specific options, as dictionary.""" + cloud_options: dict[str, Any] | None + """Cloud-related authentication options, currently ignored.""" paths: list[str] """List of paths to read from.""" file_options: Any @@ -204,9 +208,33 @@ def __post_init__(self) -> None: if self.file_options.n_rows is not None: raise NotImplementedError("row limit in scan") if self.typ not in ("csv", "parquet"): + raise NotImplementedError(f"Unhandled scan type: {self.typ}") + if self.cloud_options is not None and any( + self.cloud_options[k] is not None for k in ("aws", "azure", "gcp") + ): raise NotImplementedError( - f"Unhandled scan type: {self.typ}" - ) # pragma: no cover; polars raises on the rust side for now + "Read from cloud storage" + ) # pragma: no cover; no test yet + if self.typ == "csv": + if self.reader_options["skip_rows_after_header"] != 0: + raise NotImplementedError("Skipping rows after header in CSV reader") + parse_options = self.reader_options["parse_options"] + if ( + null_values := parse_options["null_values"] + ) is not None and "Named" in null_values: + raise NotImplementedError( + "Per column null value specification not supported for CSV reader" + ) + if ( + comment := parse_options["comment_prefix"] + ) is not None and "Multi" in comment: + raise NotImplementedError( + "Multi-character comment prefix not supported for CSV reader" + ) + if not self.reader_options["has_header"]: + # Need to do some file introspection to get the number + # of columns so that column projection works right. + raise NotImplementedError("Reading CSV without header") def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -214,14 +242,72 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: with_columns = options.with_columns row_index = options.row_index if self.typ == "csv": - opts, cloud_opts = map(json.loads, self.options) - df = DataFrame.from_cudf( - cudf.concat( - [cudf.read_csv(p, usecols=with_columns) for p in self.paths] + parse_options = self.reader_options["parse_options"] + sep = chr(parse_options["separator"]) + quote = chr(parse_options["quote_char"]) + eol = chr(parse_options["eol_char"]) + if self.reader_options["schema"] is not None: + # Reader schema provides names + column_names = list(self.reader_options["schema"]["inner"].keys()) + else: + # file provides column names + column_names = None + usecols = with_columns + # TODO: support has_header=False + header = 0 + + # polars defaults to no null recognition + null_values = [""] + if parse_options["null_values"] is not None: + ((typ, nulls),) = parse_options["null_values"].items() + if typ == "AllColumnsSingle": + # Single value + null_values.append(nulls) + else: + # List of values + null_values.extend(nulls) + if parse_options["comment_prefix"] is not None: + comment = chr(parse_options["comment_prefix"]["Single"]) + else: + comment = None + decimal = "," if parse_options["decimal_comma"] else "." 
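            # (Reviewer note: after json.loads, parse_options is a plain dict.
            # An illustrative payload, with byte values for the delimiter
            # characters, looks roughly like
            #   {"separator": 44, "quote_char": 34, "eol_char": 10,
            #    "null_values": {"AllColumnsSingle": "NA"},
            #    "comment_prefix": {"Single": 35}, "decimal_comma": False}
            # The exact shape is polars' serialization and may differ between
            # versions.)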
+ + # polars skips blank lines at the beginning of the file + pieces = [] + for p in self.paths: + skiprows = self.reader_options["skip_rows"] + path = Path(p) + with path.open() as f: + while f.readline() == "\n": + skiprows += 1 + tbl_w_meta = plc.io.csv.read_csv( + plc.io.SourceInfo([path]), + delimiter=sep, + quotechar=quote, + lineterminator=eol, + col_names=column_names, + header=header, + usecols=usecols, + na_filter=True, + na_values=null_values, + keep_default_na=False, + skiprows=skiprows, + comment=comment, + decimal=decimal, + dtypes=self.schema, + ) + pieces.append(tbl_w_meta) + tables, colnames = zip( + *( + (piece.tbl, piece.column_names(include_children=False)) + for piece in pieces ) ) + df = DataFrame.from_table( + plc.concatenate.concatenate(list(tables)), + colnames[0], + ) elif self.typ == "parquet": - opts, cloud_opts = map(json.loads, self.options) cdf = cudf.read_parquet(self.paths, columns=with_columns) assert isinstance(cdf, cudf.DataFrame) df = DataFrame.from_cudf(cdf) @@ -229,7 +315,12 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: raise NotImplementedError( f"Unhandled scan type: {self.typ}" ) # pragma: no cover; post init trips first - if row_index is not None: + if ( + row_index is not None + # TODO: remove condition when dropping support for polars 1.0 + # https://github.com/pola-rs/polars/pull/17363 + and row_index[0] in self.schema + ): name, offset = row_index dtype = self.schema[name] step = plc.interop.from_arrow( @@ -301,17 +392,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: pdf = pl.DataFrame._from_pydf(self.df) if self.projection is not None: pdf = pdf.select(self.projection) - table = pdf.to_arrow() - schema = table.schema - for i, field in enumerate(schema): - schema = schema.set( - i, pa.field(field.name, dtypes.downcast_arrow_lists(field.type)) - ) - # No-op if the schema is unchanged. - table = table.cast(schema) - df = DataFrame.from_table( - plc.interop.from_arrow(table), list(self.schema.keys()) - ) + df = DataFrame.from_polars(pdf) assert all( c.obj.type() == dtype for c, dtype in zip(df.columns, self.schema.values()) ) @@ -431,11 +512,11 @@ def check_agg(agg: expr.Expr) -> int: NotImplementedError For unsupported expression nodes. 
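    (Reviewer sketch, not part of the docstring: the returned depth is the nesting level of aggregations, e.g.

        pl.col("a")                  -> 0   # Col
        pl.col("a").max()            -> 1   # Agg over a Col
        pl.col("a").max() + 1        -> 1   # BinOp does not add depth
        pl.col("a").round(1).sum()   -> 1   # UnaryFunction does not add depth
    )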
""" - if isinstance(agg, (expr.BinOp, expr.Cast)): + if isinstance(agg, (expr.BinOp, expr.Cast, expr.UnaryFunction)): return max(GroupBy.check_agg(child) for child in agg.children) elif isinstance(agg, expr.Agg): return 1 + max(GroupBy.check_agg(child) for child in agg.children) - elif isinstance(agg, (expr.Len, expr.Col, expr.Literal)): + elif isinstance(agg, (expr.Len, expr.Col, expr.Literal, expr.LiteralColumn)): return 0 else: raise NotImplementedError(f"No handler for {agg=}") @@ -458,16 +539,17 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: keys = broadcast( *(k.evaluate(df) for k in self.keys), target_length=df.num_rows ) - # TODO: use sorted information, need to expose column_order - # and null_precedence in pylibcudf groupby constructor - # sorted = ( - # plc.types.Sorted.YES - # if all(k.is_sorted for k in keys) - # else plc.types.Sorted.NO - # ) + sorted = ( + plc.types.Sorted.YES + if all(k.is_sorted for k in keys) + else plc.types.Sorted.NO + ) grouper = plc.groupby.GroupBy( plc.Table([k.obj for k in keys]), null_handling=plc.types.NullPolicy.INCLUDE, + keys_are_sorted=sorted, + column_order=[k.order for k in keys], + null_precedence=[k.null_order for k in keys], ) # TODO: uniquify requests = [] @@ -494,7 +576,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: results = [ req.evaluate(result_subs, mapping=mapping) for req in self.agg_requests ] - return DataFrame([*result_keys, *results]).slice(self.options.slice) + return DataFrame(broadcast(*result_keys, *results)).slice(self.options.slice) @dataclasses.dataclass @@ -573,6 +655,59 @@ def _joiners( else: assert_never(how) + def _reorder_maps( + self, + left_rows: int, + lg: plc.Column, + left_policy: plc.copying.OutOfBoundsPolicy, + right_rows: int, + rg: plc.Column, + right_policy: plc.copying.OutOfBoundsPolicy, + ) -> list[plc.Column]: + """ + Reorder gather maps to satisfy polars join order restrictions. + + Parameters + ---------- + left_rows + Number of rows in left table + lg + Left gather map + left_policy + Nullify policy for left map + right_rows + Number of rows in right table + rg + Right gather map + right_policy + Nullify policy for right map + + Returns + ------- + list of reordered left and right gather maps. + + Notes + ----- + For a left join, the polars result preserves the order of the + left keys, and is stable wrt the right keys. For all other + joins, there is no order obligation. 
+ """ + dt = plc.interop.to_arrow(plc.types.SIZE_TYPE) + init = plc.interop.from_arrow(pa.scalar(0, type=dt)) + step = plc.interop.from_arrow(pa.scalar(1, type=dt)) + left_order = plc.copying.gather( + plc.Table([plc.filling.sequence(left_rows, init, step)]), lg, left_policy + ) + right_order = plc.copying.gather( + plc.Table([plc.filling.sequence(right_rows, init, step)]), rg, right_policy + ) + return plc.sorting.stable_sort_by_key( + plc.Table([lg, rg]), + plc.Table([*left_order.columns(), *right_order.columns()]), + [plc.types.Order.ASCENDING, plc.types.Order.ASCENDING], + [plc.types.NullOrder.AFTER, plc.types.NullOrder.AFTER], + ).columns() + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" left = self.left.evaluate(cache=cache) @@ -613,6 +748,11 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: result = DataFrame.from_table(table, left.column_names) else: lg, rg = join_fn(left_on.table, right_on.table, null_equality) + if how == "left": + # Order of left table is preserved + lg, rg = self._reorder_maps( + left.num_rows, lg, left_policy, right.num_rows, rg, right_policy + ) if coalesce and how == "inner": right = right.discard_columns(right_on.column_names_set) left = DataFrame.from_table( diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index a2fdb3c3d79..dec45679c75 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -5,6 +5,7 @@ from __future__ import annotations +import json from contextlib import AbstractContextManager, nullcontext from functools import singledispatch from typing import Any @@ -12,6 +13,7 @@ import pyarrow as pa from typing_extensions import assert_never +import polars as pl import polars.polars as plrs from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir @@ -88,10 +90,16 @@ def _( node: pl_ir.Scan, visitor: NodeTraverser, schema: dict[str, plc.DataType] ) -> ir.IR: typ, *options = node.scan_type + if typ == "ndjson": + (reader_options,) = map(json.loads, options) + cloud_options = None + else: + reader_options, cloud_options = map(json.loads, options) return ir.Scan( schema, typ, - tuple(options), + reader_options, + cloud_options, node.paths, node.file_options, translate_named_expr(visitor, n=node.predicate) @@ -361,8 +369,23 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex options, *(translate_expr(visitor, n=n) for n in node.input), ) - else: - raise NotImplementedError(f"No handler for Expr function node with {name=}") + elif isinstance(name, pl_expr.TemporalFunction): + return expr.TemporalFunction( + dtype, + name, + options, + *(translate_expr(visitor, n=n) for n in node.input), + ) + elif isinstance(name, str): + return expr.UnaryFunction( + dtype, + name, + options, + *(translate_expr(visitor, n=n) for n in node.input), + ) + raise NotImplementedError( + f"No handler for Expr function node with {name=}" + ) # pragma: no cover; polars raises on the rust side for now @_translate_expr.register @@ -387,7 +410,7 @@ def _(node: pl_expr.Window, visitor: NodeTraverser, dtype: plc.DataType) -> expr @_translate_expr.register def _(node: pl_expr.Literal, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: if isinstance(node.value, plrs.PySeries): - return expr.LiteralColumn(dtype, node.value) + return expr.LiteralColumn(dtype, pl.Series._from_pyseries(node.value)) value = 
pa.scalar(node.value, type=plc.interop.to_arrow(dtype)) return expr.Literal(dtype, value) @@ -432,8 +455,11 @@ def _(node: pl_expr.Cast, visitor: NodeTraverser, dtype: plc.DataType) -> expr.E # Push casts into literals so we can handle Cast(Literal(Null)) if isinstance(inner, expr.Literal): return expr.Literal(dtype, inner.value.cast(plc.interop.to_arrow(dtype))) - else: - return expr.Cast(dtype, inner) + elif isinstance(inner, expr.Cast): + # Translation of Len/Count-agg put in a cast, remove double + # casts if we have one. + (inner,) = inner.children + return expr.Cast(dtype, inner) @_translate_expr.register @@ -443,12 +469,15 @@ def _(node: pl_expr.Column, visitor: NodeTraverser, dtype: plc.DataType) -> expr @_translate_expr.register def _(node: pl_expr.Agg, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: - return expr.Agg( + value = expr.Agg( dtype, node.name, node.options, *(translate_expr(visitor, n=n) for n in node.arguments), ) + if value.name == "count" and value.dtype.id() != plc.TypeId.INT32: + return expr.Cast(value.dtype, value) + return value @_translate_expr.register @@ -475,7 +504,10 @@ def _( @_translate_expr.register def _(node: pl_expr.Len, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr: - return expr.Len(dtype) + value = expr.Len(dtype) + if dtype.id() != plc.TypeId.INT32: + return expr.Cast(dtype, value) + return value # pragma: no cover; never reached since polars len has uint32 dtype def translate_expr(visitor: NodeTraverser, *, n: int) -> expr.Expr: diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 507acb5d33a..918cd024fa2 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -17,19 +17,6 @@ __all__ = ["from_polars", "downcast_arrow_lists", "have_compatible_resolution"] -TIMELIKE_TYPES: frozenset[plc.TypeId] = frozenset( - [ - plc.TypeId.TIMESTAMP_MILLISECONDS, - plc.TypeId.TIMESTAMP_MICROSECONDS, - plc.TypeId.TIMESTAMP_NANOSECONDS, - plc.TypeId.TIMESTAMP_DAYS, - plc.TypeId.DURATION_MILLISECONDS, - plc.TypeId.DURATION_MICROSECONDS, - plc.TypeId.DURATION_NANOSECONDS, - ] -) - - def have_compatible_resolution(lid: plc.TypeId, rid: plc.TypeId): """ Do two datetime typeids have matching resolution for a binop. diff --git a/python/cudf_polars/cudf_polars/utils/versions.py b/python/cudf_polars/cudf_polars/utils/versions.py new file mode 100644 index 00000000000..9807cffb384 --- /dev/null +++ b/python/cudf_polars/cudf_polars/utils/versions.py @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Version utilities so that cudf_polars supports a range of polars versions.""" + +# ruff: noqa: SIM300 +from __future__ import annotations + +from packaging.version import parse + +from polars import __version__ + +POLARS_VERSION = parse(__version__) + +POLARS_VERSION_GE_10 = POLARS_VERSION >= parse("1.0") +POLARS_VERSION_GE_11 = POLARS_VERSION >= parse("1.1") +POLARS_VERSION_GE_12 = POLARS_VERSION >= parse("1.2") +POLARS_VERSION_GE_121 = POLARS_VERSION >= parse("1.2.1") +POLARS_VERSION_GT_10 = POLARS_VERSION > parse("1.0") +POLARS_VERSION_GT_11 = POLARS_VERSION > parse("1.1") +POLARS_VERSION_GT_12 = POLARS_VERSION > parse("1.2") + +POLARS_VERSION_LE_12 = POLARS_VERSION <= parse("1.2") +POLARS_VERSION_LE_11 = POLARS_VERSION <= parse("1.1") +POLARS_VERSION_LT_12 = POLARS_VERSION < parse("1.2") +POLARS_VERSION_LT_11 = POLARS_VERSION < parse("1.1") + +if POLARS_VERSION < parse("1.0"): # pragma: no cover + raise ImportError("cudf_polars requires py-polars v1.0 or greater.") diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index bf4673fcc50..0b559f7a8e9 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -182,5 +182,3 @@ docstring-code-format = true [tool.rapids-build-backend] build-backend = "setuptools.build_meta" dependencies-file = "../../dependencies.yaml" -# Pure python -disable-cuda = true diff --git a/python/cudf_polars/tests/containers/test_column.py b/python/cudf_polars/tests/containers/test_column.py index 3291d8db161..4f3c0de5975 100644 --- a/python/cudf_polars/tests/containers/test_column.py +++ b/python/cudf_polars/tests/containers/test_column.py @@ -3,12 +3,14 @@ from __future__ import annotations +from functools import partial + import pyarrow import pytest import cudf._lib.pylibcudf as plc -from cudf_polars.containers import Column +from cudf_polars.containers import Column, NamedColumn def test_non_scalar_access_raises(): @@ -54,17 +56,21 @@ def test_shallow_copy(): @pytest.mark.parametrize("typeid", [plc.TypeId.INT8, plc.TypeId.FLOAT32]) -def test_mask_nans(typeid): +@pytest.mark.parametrize("constructor", [Column, partial(NamedColumn, name="name")]) +def test_mask_nans(typeid, constructor): dtype = plc.DataType(typeid) values = pyarrow.array([0, 0, 0], type=plc.interop.to_arrow(dtype)) - column = Column(plc.interop.from_arrow(values)) + column = constructor(plc.interop.from_arrow(values)) masked = column.mask_nans() - assert column.obj is masked.obj + assert column.obj.null_count() == masked.obj.null_count() -def test_mask_nans_float_with_nan_notimplemented(): +def test_mask_nans_float(): dtype = plc.DataType(plc.TypeId.FLOAT32) values = pyarrow.array([0, 0, float("nan")], type=plc.interop.to_arrow(dtype)) column = Column(plc.interop.from_arrow(values)) - with pytest.raises(NotImplementedError): - _ = column.mask_nans() + masked = column.mask_nans() + expect = pyarrow.array([0, 0, None], type=plc.interop.to_arrow(dtype)) + got = pyarrow.array(plc.interop.to_arrow(masked.obj)) + + assert expect == got diff --git a/python/cudf_polars/tests/containers/test_dataframe.py b/python/cudf_polars/tests/containers/test_dataframe.py index 2e385e39eef..87508e17407 100644 --- a/python/cudf_polars/tests/containers/test_dataframe.py +++ b/python/cudf_polars/tests/containers/test_dataframe.py @@ -5,6 +5,8 @@ import pytest +import polars as pl + import cudf._lib.pylibcudf as plc from cudf_polars.containers import DataFrame, NamedColumn @@ -90,3 +92,52 @@ def test_shallow_copy(): 
) assert df.columns[0].is_sorted == plc.types.Sorted.YES assert copy.columns[0].is_sorted == plc.types.Sorted.NO + + +def test_sorted_flags_preserved_empty(): + df = pl.DataFrame({"a": pl.Series([], dtype=pl.Int8())}) + df.select(pl.col("a").sort()) + + gf = DataFrame.from_polars(df) + + (a,) = gf.columns + + assert a.is_sorted == plc.types.Sorted.YES + + assert df.flags == gf.to_polars().flags + + +@pytest.mark.parametrize("nulls_last", [True, False]) +def test_sorted_flags_preserved(with_nulls, nulls_last): + values = [1, 2, -1, 2, 4, 5] + if with_nulls: + values[4] = None + df = pl.DataFrame({"a": values, "b": values, "c": values}) + + df = df.select( + pl.col("a").sort(descending=False, nulls_last=nulls_last), + pl.col("b").sort(descending=True, nulls_last=nulls_last), + pl.col("c"), + ) + + gf = DataFrame.from_polars(df) + + a_null_order = ( + plc.types.NullOrder.AFTER + if nulls_last and with_nulls + else plc.types.NullOrder.BEFORE + ) + b_null_order = ( + plc.types.NullOrder.AFTER + if not nulls_last and with_nulls + else plc.types.NullOrder.BEFORE + ) + a, b, c = gf.columns + assert a.is_sorted == plc.types.Sorted.YES + assert a.order == plc.types.Order.ASCENDING + assert a.null_order == a_null_order + assert b.is_sorted == plc.types.Sorted.YES + assert b.order == plc.types.Order.DESCENDING + assert b.null_order == b_null_order + assert c.is_sorted == plc.types.Sorted.NO + assert df.flags == gf.to_polars().flags diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py index 267d0a99692..245bde3acab 100644 --- a/python/cudf_polars/tests/expressions/test_agg.py +++ b/python/cudf_polars/tests/expressions/test_agg.py @@ -20,13 +20,7 @@ def dtype(request): return request.param -@pytest.fixture( - params=[ - False, - pytest.param(True, marks=pytest.mark.xfail(reason="No handler for set_sorted")), - ], - ids=["unsorted", "sorted"], -) +@pytest.fixture(params=[False, True], ids=["unsorted", "sorted"]) def is_sorted(request): return request.param @@ -59,14 +53,25 @@ def test_agg(df, agg): @pytest.mark.parametrize( - "propagate_nans", - [pytest.param(False, marks=pytest.mark.xfail(reason="Need to mask nans")), True], - ids=["mask_nans", "propagate_nans"], + "op", [pl.Expr.min, pl.Expr.nan_min, pl.Expr.max, pl.Expr.nan_max] ) -@pytest.mark.parametrize("op", ["min", "max"]) -def test_agg_float_with_nans(propagate_nans, op): - df = pl.LazyFrame({"a": pl.Series([1, 2, float("nan")], dtype=pl.Float64())}) - op = getattr(pl.Expr, f"nan_{op}" if propagate_nans else op) +def test_agg_float_with_nans(op): + df = pl.LazyFrame( + { + "a": pl.Series([1, 2, float("nan")], dtype=pl.Float64()), + "b": pl.Series([1, 2, None], dtype=pl.Int8()), + } + ) + q = df.select(op(pl.col("a")), op(pl.col("b"))) + + assert_gpu_result_equal(q) + + +@pytest.mark.xfail(reason="https://github.com/pola-rs/polars/issues/17513") +@pytest.mark.parametrize("op", [pl.Expr.max, pl.Expr.min]) +def test_agg_singleton(op): + df = pl.LazyFrame({"a": pl.Series([float("nan")])}) + q = df.select(op(pl.col("a"))) assert_gpu_result_equal(q) diff --git a/python/cudf_polars/tests/expressions/test_datetime_basic.py b/python/cudf_polars/tests/expressions/test_datetime_basic.py index 6ba2a1dce1e..218101bf87c 100644 --- a/python/cudf_polars/tests/expressions/test_datetime_basic.py +++ b/python/cudf_polars/tests/expressions/test_datetime_basic.py @@ -2,6 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import datetime +from operator import 
methodcaller + import pytest import polars as pl @@ -32,3 +35,28 @@ def test_datetime_dataframe_scan(dtype): query = ldf.select(pl.col("b"), pl.col("a")) assert_gpu_result_equal(query) + + +@pytest.mark.parametrize( + "field", + [ + methodcaller("year"), + pytest.param( + methodcaller("day"), + marks=pytest.mark.xfail(reason="day extraction not implemented"), + ), + ], +) +def test_datetime_extract(field): + ldf = pl.LazyFrame( + {"dates": [datetime.date(2024, 1, 1), datetime.date(2024, 10, 11)]} + ) + q = ldf.select(field(pl.col("dates").dt)) + + with pytest.raises(AssertionError): + # polars produces int32, libcudf produces int16 for the year extraction + # libcudf can lose data here. + # https://github.com/rapidsai/cudf/issues/16196 + assert_gpu_result_equal(q) + + assert_gpu_result_equal(q, check_dtypes=False) diff --git a/python/cudf_polars/tests/expressions/test_round.py b/python/cudf_polars/tests/expressions/test_round.py new file mode 100644 index 00000000000..3af3a0ce6d1 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_round.py @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import math + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.fixture(params=[pl.Float32, pl.Float64]) +def dtype(request): + return request.param + + +@pytest.fixture +def df(dtype, with_nulls): + a = [-math.e, 10, 22.5, 1.5, 2.5, -1.5, math.pi, 8] + if with_nulls: + a[2] = None + a[-1] = None + return pl.LazyFrame({"a": a}, schema={"a": dtype}) + + +@pytest.mark.parametrize("decimals", [0, 2, 4]) +def test_round(df, decimals): + q = df.select(pl.col("a").round(decimals=decimals)) + + assert_gpu_result_equal(q, check_exact=False) diff --git a/python/cudf_polars/tests/expressions/test_sort.py b/python/cudf_polars/tests/expressions/test_sort.py index 0195266f5c6..d46df92db94 100644 --- a/python/cudf_polars/tests/expressions/test_sort.py +++ b/python/cudf_polars/tests/expressions/test_sort.py @@ -8,6 +8,9 @@ import polars as pl +import cudf._lib.pylibcudf as plc + +from cudf_polars import translate_ir from cudf_polars.testing.asserts import assert_gpu_result_equal @@ -51,3 +54,31 @@ def test_sort_by_expression(descending, nulls_last, maintain_order): ) ) assert_gpu_result_equal(query, check_row_order=maintain_order) + + +@pytest.mark.parametrize("descending", [False, True]) +@pytest.mark.parametrize("nulls_last", [False, True]) +def test_setsorted(descending, nulls_last, with_nulls): + values = sorted([1, 2, 3, 4, 5, 6, -2], reverse=descending) + if with_nulls: + values[-1 if nulls_last else 0] = None + df = pl.LazyFrame({"a": values}) + + q = df.set_sorted("a", descending=descending) + + assert_gpu_result_equal(q) + + df = translate_ir(q._ldf.visit()).evaluate(cache={}) + + (a,) = df.columns + + assert a.is_sorted == plc.types.Sorted.YES + null_order = ( + plc.types.NullOrder.AFTER + if (descending ^ nulls_last) and with_nulls + else plc.types.NullOrder.BEFORE + ) + assert a.null_order == null_order + assert a.order == ( + plc.types.Order.DESCENDING if descending else plc.types.Order.ASCENDING + ) diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py index 9729e765948..8cf65dd51ac 100644 --- a/python/cudf_polars/tests/expressions/test_stringfunction.py +++ b/python/cudf_polars/tests/expressions/test_stringfunction.py 
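(Before the slice tests below, a small sketch of the host-side index math that StringFunction.do_evaluate applies for ``Slice``; it mirrors the arithmetic added in expr.py above and is not part of the patch:)

    def polars_slice_to_libcudf(start: int, length: int | None) -> tuple[int, int | None]:
        # polars takes an offset plus optional length; libcudf slices [start, stop)
        # where stop=None means "to the end of the string".
        if length == 0:
            return start, start                       # empty slice
        stop = start + length if length else None     # length None => scan to end
        if length and start < 0 and length >= -start:
            stop = None                               # counting back past the end
        return start, stop

    # e.g. .str.slice(1, 3)  -> slice_strings(start=1,  stop=4)
    #      .str.slice(-3, 4) -> slice_strings(start=-3, stop=None)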
@@ -37,6 +37,30 @@ def ldf(with_nulls): return pl.LazyFrame({"a": a, "b": range(len(a))}) +slice_cases = [ + (1, 3), + (0, 3), + (0, 0), + (-3, 1), + (-100, 5), + (1, 1), + (100, 100), + (-3, 4), + (-3, 3), +] + + +@pytest.fixture(params=slice_cases) +def slice_column_data(ldf, request): + start, length = request.param + if length: + return ldf.with_columns( + pl.lit(start).alias("start"), pl.lit(length).alias("length") + ) + else: + return ldf.with_columns(pl.lit(start).alias("start")) + + def test_supported_stringfunction_expression(ldf): query = ldf.select( pl.col("a").str.starts_with("Z"), @@ -104,3 +128,25 @@ def test_contains_invalid(ldf): query.collect() with pytest.raises(pl.exceptions.ComputeError): query.collect(post_opt_callback=partial(execute_with_cudf, raise_on_fail=True)) + + +@pytest.mark.parametrize("offset", [1, -1, 0, 100, -100]) +def test_slice_scalars_offset(ldf, offset): + query = ldf.select(pl.col("a").str.slice(offset)) + assert_gpu_result_equal(query) + + +@pytest.mark.parametrize("offset,length", slice_cases) +def test_slice_scalars_length_and_offset(ldf, offset, length): + query = ldf.select(pl.col("a").str.slice(offset, length)) + assert_gpu_result_equal(query) + + +def test_slice_column(slice_column_data): + if "length" in slice_column_data.collect_schema(): + query = slice_column_data.select( + pl.col("a").str.slice(pl.col("start"), pl.col("length")) + ) + else: + query = slice_column_data.select(pl.col("a").str.slice(pl.col("start"))) + assert_ir_translation_raises(query, NotImplementedError) diff --git a/python/cudf_polars/tests/expressions/test_unique.py b/python/cudf_polars/tests/expressions/test_unique.py new file mode 100644 index 00000000000..9b009a422c2 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_unique.py @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.mark.parametrize("maintain_order", [False, True], ids=["unstable", "stable"]) +@pytest.mark.parametrize("pre_sorted", [False, True], ids=["unsorted", "sorted"]) +def test_unique(maintain_order, pre_sorted): + ldf = pl.DataFrame( + { + "b": [1.5, 2.5, None, 1.5, 3, float("nan"), 3], + } + ).lazy() + if pre_sorted: + ldf = ldf.sort("b") + + query = ldf.select(pl.col("b").unique(maintain_order=maintain_order)) + assert_gpu_result_equal(query, check_row_order=maintain_order) diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index aefad59eb91..a75825ef3d3 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import itertools + import pytest import polars as pl @@ -10,6 +12,7 @@ assert_gpu_result_equal, assert_ir_translation_raises, ) +from cudf_polars.utils import versions @pytest.fixture @@ -26,12 +29,12 @@ def df(): @pytest.fixture( params=[ - ["key1"], - ["key2"], + [pl.col("key1")], + [pl.col("key2")], [pl.col("key1") * pl.col("key2")], - ["key1", "key2"], + [pl.col("key1"), pl.col("key2")], [pl.col("key1") == pl.col("key2")], - ["key2", pl.col("key1") == pl.lit(1, dtype=pl.Int64)], + [pl.col("key2"), pl.col("key1") == pl.lit(1, dtype=pl.Int64)], ], ids=lambda keys: "-".join(map(str, keys)), ) @@ -47,6 +50,8 @@ def keys(request): [pl.col("float").max() - pl.col("int").min()], [pl.col("float").mean(), pl.col("int").std()], [(pl.col("float") - pl.lit(2)).max()], + [pl.col("float").sum().round(decimals=1)], + [pl.col("float").round(decimals=1).sum()], ], ids=lambda aggs: "-".join(map(str, aggs)), ) @@ -80,13 +85,39 @@ def test_groupby(df: pl.LazyFrame, maintain_order, keys, exprs): assert_gpu_result_equal(q, check_exact=False) +def test_groupby_sorted_keys(df: pl.LazyFrame, keys, exprs): + sorted_keys = [ + key.sort(descending=descending) + for key, descending in zip(keys, itertools.cycle([False, True])) + ] + + q = df.group_by(*sorted_keys).agg(*exprs) + + schema = q.collect_schema() + sort_keys = list(schema.keys())[: len(keys)] + # Multiple keys don't do sorting + qsorted = q.sort(*sort_keys) + if len(keys) > 1: + with pytest.raises(AssertionError): + # https://github.com/pola-rs/polars/issues/17556 + assert_gpu_result_equal(q, check_exact=False) + if versions.POLARS_VERSION_LT_12 and schema[sort_keys[1]] == pl.Boolean(): + # https://github.com/pola-rs/polars/issues/17557 + with pytest.raises(AssertionError): + assert_gpu_result_equal(qsorted, check_exact=False) + else: + assert_gpu_result_equal(qsorted, check_exact=False) + elif schema[sort_keys[0]] == pl.Boolean(): + # Boolean keys don't do sorting, so we get random order + assert_gpu_result_equal(qsorted, check_exact=False) + else: + assert_gpu_result_equal(q, check_exact=False) + + def test_groupby_len(df, keys): q = df.group_by(*keys).agg(pl.len()) - # TODO: polars returns UInt32, libcudf returns Int32 - with pytest.raises(AssertionError): - assert_gpu_result_equal(q, check_row_order=False) - assert_gpu_result_equal(q, check_dtypes=False, check_row_order=False) + assert_gpu_result_equal(q, check_row_order=False) @pytest.mark.parametrize( @@ -100,3 +131,55 @@ def test_groupby_unsupported(df, expr): q = df.group_by("key1").agg(expr) 
assert_ir_translation_raises(q, NotImplementedError) + + +@pytest.mark.xfail(reason="https://github.com/pola-rs/polars/issues/17513") +def test_groupby_minmax_with_nan(): + df = pl.LazyFrame( + {"key": [1, 2, 2, 2], "value": [float("nan"), 1, -1, float("nan")]} + ) + + q = df.group_by("key").agg( + pl.col("value").max().alias("max"), pl.col("value").min().alias("min") + ) + + assert_gpu_result_equal(q) + + +@pytest.mark.parametrize("op", [pl.Expr.nan_max, pl.Expr.nan_min]) +def test_groupby_nan_minmax_raises(op): + df = pl.LazyFrame( + {"key": [1, 2, 2, 2], "value": [float("nan"), 1, -1, float("nan")]} + ) + + q = df.group_by("key").agg(op(pl.col("value"))) + + assert_ir_translation_raises(q, NotImplementedError) + + +@pytest.mark.parametrize( + "key", + [ + pytest.param( + 1, + marks=pytest.mark.xfail( + versions.POLARS_VERSION_GE_121, reason="polars 1.2.1 disallows this" + ), + ), + pl.col("key1"), + ], +) +@pytest.mark.parametrize( + "expr", + [ + pl.lit(1).alias("value"), + pl.lit([[4, 5, 6]]).alias("value"), + pl.col("float") * (1 - pl.col("int")), + [pl.lit(2).alias("value"), pl.col("float") * 2], + ], +) +def test_groupby_literal_in_agg(df, key, expr): + # check_row_order=False doesn't work for list aggregations + # so just sort by the group key + q = df.group_by(key).agg(expr).sort(key, maintain_order=True) + assert_gpu_result_equal(q) diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index 89f6fd3455b..1ffbf3c0ef4 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -53,7 +53,7 @@ def test_join(how, coalesce, join_nulls, join_expr): query = left.join( right, on=join_expr, how=how, join_nulls=join_nulls, coalesce=coalesce ) - assert_gpu_result_equal(query, check_row_order=False) + assert_gpu_result_equal(query, check_row_order=how == "left") def test_cross_join(): diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index f129cc7ca32..0981a96a34a 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import os + import pytest import polars as pl @@ -10,6 +12,7 @@ assert_gpu_result_equal, assert_ir_translation_raises, ) +from cudf_polars.utils import versions @pytest.fixture( @@ -22,22 +25,22 @@ def row_index(request): @pytest.fixture( params=[ - (None, 0), + None, pytest.param( - (2, 1), marks=pytest.mark.xfail(reason="No handling of row limit in scan") + 2, marks=pytest.mark.xfail(reason="No handling of row limit in scan") ), pytest.param( - (3, 0), marks=pytest.mark.xfail(reason="No handling of row limit in scan") + 3, marks=pytest.mark.xfail(reason="No handling of row limit in scan") ), ], ids=["all-rows", "n_rows-with-skip", "n_rows-no-skip"], ) -def n_rows_skip_rows(request): +def n_rows(request): return request.param @pytest.fixture(params=["csv", "parquet"]) -def df(request, tmp_path, row_index, n_rows_skip_rows): +def df(request, tmp_path, row_index, n_rows): df = pl.DataFrame( { "a": [1, 2, 3, None], @@ -46,14 +49,12 @@ def df(request, tmp_path, row_index, n_rows_skip_rows): } ) name, offset = row_index - n_rows, skip_rows = n_rows_skip_rows if request.param == "csv": df.write_csv(tmp_path / "file.csv") return pl.scan_csv( tmp_path / "file.csv", row_index_name=name, row_index_offset=offset, - skip_rows_after_header=skip_rows, n_rows=n_rows, ) else: @@ -97,3 +98,138 @@ def test_scan_unsupported_raises(tmp_path): 
     df.write_ndjson(tmp_path / "df.json")
     q = pl.scan_ndjson(tmp_path / "df.json")
     assert_ir_translation_raises(q, NotImplementedError)
+
+
+@pytest.mark.xfail(
+    versions.POLARS_VERSION_LT_11,
+    reason="https://github.com/pola-rs/polars/issues/15730",
+)
+def test_scan_row_index_projected_out(tmp_path):
+    df = pl.DataFrame({"a": [1, 2, 3]})
+
+    df.write_parquet(tmp_path / "df.pq")
+
+    q = pl.scan_parquet(tmp_path / "df.pq").with_row_index().select(pl.col("a"))
+
+    assert_gpu_result_equal(q)
+
+
+def test_scan_csv_column_renames_projection_schema(tmp_path):
+    with (tmp_path / "test.csv").open("w") as f:
+        f.write("""foo,bar,baz\n1,2\n3,4,5""")
+
+    q = pl.scan_csv(
+        tmp_path / "test.csv",
+        with_column_names=lambda names: [f"{n}_suffix" for n in names],
+        schema_overrides={
+            "foo_suffix": pl.String(),
+            "bar_suffix": pl.Int8(),
+            "baz_suffix": pl.UInt16(),
+        },
+    )
+
+    assert_gpu_result_equal(q)
+
+
+@pytest.mark.parametrize(
+    "filename,glob",
+    [
+        (["test1.csv", "test2.csv"], True),
+        ("test*.csv", True),
+        # Make sure we don't expand glob when
+        # trying to read a file like test*.csv
+        # when glob=False
+        ("test*.csv", False),
+    ],
+)
+def test_scan_csv_multi(tmp_path, filename, glob):
+    with (tmp_path / "test1.csv").open("w") as f:
+        f.write("""foo,bar,baz\n1,2\n3,4,5""")
+    with (tmp_path / "test2.csv").open("w") as f:
+        f.write("""foo,bar,baz\n1,2\n3,4,5""")
+    with (tmp_path / "test*.csv").open("w") as f:
+        f.write("""foo,bar,baz\n1,2\n3,4,5""")
+    os.chdir(tmp_path)
+    q = pl.scan_csv(filename, glob=glob)
+
+    assert_gpu_result_equal(q)
+
+
+def test_scan_csv_multi_differing_colnames(tmp_path):
+    with (tmp_path / "test1.csv").open("w") as f:
+        f.write("""foo,bar,baz\n1,2\n3,4,5""")
+    with (tmp_path / "test2.csv").open("w") as f:
+        f.write("""abc,def,ghi\n1,2\n3,4,5""")
+    q = pl.scan_csv(
+        [tmp_path / "test1.csv", tmp_path / "test2.csv"],
+    )
+    with pytest.raises(pl.exceptions.ComputeError):
+        q.explain()
+
+
+def test_scan_csv_skip_after_header_not_implemented(tmp_path):
+    with (tmp_path / "test.csv").open("w") as f:
+        f.write("""foo,bar,baz\n1,2,3\n3,4,5""")
+
+    q = pl.scan_csv(tmp_path / "test.csv", skip_rows_after_header=1)
+
+    assert_ir_translation_raises(q, NotImplementedError)
+
+
+def test_scan_csv_null_values_per_column_not_implemented(tmp_path):
+    with (tmp_path / "test.csv").open("w") as f:
+        f.write("""foo,bar,baz\n1,2,3\n3,4,5""")
+
+    q = pl.scan_csv(tmp_path / "test.csv", null_values={"foo": "1", "baz": "5"})
+
+    assert_ir_translation_raises(q, NotImplementedError)
+
+
+def test_scan_csv_comment_str_not_implemented(tmp_path):
+    with (tmp_path / "test.csv").open("w") as f:
+        f.write("""foo,bar,baz\n// 1,2,3\n3,4,5""")
+
+    q = pl.scan_csv(tmp_path / "test.csv", comment_prefix="// ")
+
+    assert_ir_translation_raises(q, NotImplementedError)
+
+
+def test_scan_csv_comment_char(tmp_path):
+    with (tmp_path / "test.csv").open("w") as f:
+        f.write("""foo,bar,baz\n# 1,2,3\n3,4,5""")
+
+    q = pl.scan_csv(tmp_path / "test.csv", comment_prefix="#")
+
+    assert_gpu_result_equal(q)
+
+
+@pytest.mark.parametrize("nulls", [None, "3", ["3", "5"]])
+def test_scan_csv_null_values(tmp_path, nulls):
+    with (tmp_path / "test.csv").open("w") as f:
+        f.write("""foo,bar,baz\n1,2,3\n3,4,5\n5,,2""")
+
+    q = pl.scan_csv(tmp_path / "test.csv", null_values=nulls)
+
+    assert_gpu_result_equal(q)
+
+
+def test_scan_csv_decimal_comma(tmp_path):
+    with (tmp_path / "test.csv").open("w") as f:
+        f.write("""foo|bar|baz\n1,23|2,34|3,56\n1""")
+
+    q = pl.scan_csv(tmp_path / "test.csv", separator="|", decimal_comma=True)
+
+    assert_gpu_result_equal(q)
+
+
+def test_scan_csv_skip_initial_empty_rows(tmp_path):
+    with (tmp_path / "test.csv").open("w") as f:
+        f.write("""\n\n\n\nfoo|bar|baz\n1|2|3\n1""")
+
+    q = pl.scan_csv(tmp_path / "test.csv", separator="|", skip_rows=1, has_header=False)
+
+    assert_ir_translation_raises(q, NotImplementedError)
+
+    q = pl.scan_csv(tmp_path / "test.csv", separator="|", skip_rows=1)
+
+    assert_gpu_result_equal(q)
diff --git a/python/cudf_polars/tests/test_union.py b/python/cudf_polars/tests/test_union.py
index b021d832910..865b95a7d91 100644
--- a/python/cudf_polars/tests/test_union.py
+++ b/python/cudf_polars/tests/test_union.py
@@ -46,3 +46,12 @@ def test_concat_vertical():
     q = pl.concat([ldf, ldf2], how="vertical")
 
     assert_gpu_result_equal(q)
+
+
+def test_concat_diagonal_empty():
+    df1 = pl.LazyFrame()
+    df2 = pl.LazyFrame({"a": [1, 2]})
+
+    q = pl.concat([df1, df2], how="diagonal_relaxed")
+
+    assert_gpu_result_equal(q, collect_kwargs={"no_optimization": True})
diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index 1f55a59ea55..4bdb5d921ec 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -667,17 +667,10 @@ def from_dict(
         )
 
     @staticmethod
-    def read_json(*args, engine="auto", **kwargs):
-        return _default_backend(
-            dd.read_json,
-            *args,
-            engine=(
-                partial(cudf.read_json, engine=engine)
-                if isinstance(engine, str)
-                else engine
-            ),
-            **kwargs,
-        )
+    def read_json(*args, **kwargs):
+        from dask_cudf.io.json import read_json as read_json_impl
+
+        return read_json_impl(*args, **kwargs)
 
     @staticmethod
     def read_orc(*args, **kwargs):
diff --git a/python/dask_cudf/dask_cudf/io/json.py b/python/dask_cudf/dask_cudf/io/json.py
index 2a6ad603414..8705d98e9d6 100644
--- a/python/dask_cudf/dask_cudf/io/json.py
+++ b/python/dask_cudf/dask_cudf/io/json.py
@@ -1,15 +1,71 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 from functools import partial
 
+import numpy as np
+from fsspec.core import get_compression, get_fs_token_paths
+
 import dask
+from dask.utils import parse_bytes
 
 import cudf
+from cudf.core.column import as_column
+from cudf.utils.ioutils import _is_local_filesystem
 
 from dask_cudf.backends import _default_backend
 
 
-def read_json(url_path, engine="auto", **kwargs):
+def _read_json_partition(
+    paths,
+    fs=None,
+    include_path_column=False,
+    path_converter=None,
+    **kwargs,
+):
+    # Transfer all data up front for remote storage
+    sources = (
+        paths
+        if fs is None
+        else fs.cat_ranges(
+            paths,
+            [0] * len(paths),
+            fs.sizes(paths),
+        )
+    )
+
+    if include_path_column:
+        # Add "path" column.
+        # Must iterate over sources sequentially
+        if not isinstance(include_path_column, str):
+            include_path_column = "path"
+        converted_paths = (
+            paths
+            if path_converter is None
+            else [path_converter(path) for path in paths]
+        )
+        dfs = []
+        for i, source in enumerate(sources):
+            df = cudf.read_json(source, **kwargs)
+            df[include_path_column] = as_column(
+                converted_paths[i], length=len(df)
+            )
+            dfs.append(df)
+        return cudf.concat(dfs)
+    else:
+        # Pass sources directly to cudf
+        return cudf.read_json(sources, **kwargs)
+
+
+def read_json(
+    url_path,
+    engine="auto",
+    blocksize=None,
+    orient="records",
+    lines=None,
+    compression="infer",
+    aggregate_files=True,
+    **kwargs,
+):
     """Read JSON data into a :class:`.DataFrame`.
 
     This function wraps :func:`dask.dataframe.read_json`, and passes
@@ -30,7 +86,13 @@ def read_json(url_path, engine="auto", **kwargs):
         data. The default value is "auto", so that
         ``engine=partial(cudf.read_json, engine="auto")`` will be
         passed to :func:`dask.dataframe.read_json` by default.
-
+    aggregate_files : bool or int
+        Whether to map multiple files to each output partition. If True,
+        the `blocksize` argument will be used to determine the number of
+        files in each partition. If any one file is larger than `blocksize`,
+        the `aggregate_files` argument will be ignored. If an integer value
+        is specified, the `blocksize` argument will be ignored, and that
+        number of files will be mapped to each partition. Default is True.
     **kwargs :
         Key-word arguments to pass through to :func:`dask.dataframe.read_json`.
 
@@ -60,9 +122,77 @@
     """
-    # TODO: Add optimized code path to leverage the
-    # `byte_range` argument in `cudf.read_json` for
-    # local storage (see `dask_cudf.read_csv`)
+    if lines is None:
+        lines = orient == "records"
+    if orient != "records" and lines:
+        raise ValueError(
+            'Line-delimited JSON is only available with orient="records".'
+        )
+    if blocksize and (orient != "records" or not lines):
+        raise ValueError(
+            "JSON file chunking only allowed for JSON-lines "
+            "input (orient='records', lines=True)."
+        )
+
+    inputs = []
+    if aggregate_files and blocksize or int(aggregate_files) > 1:
+        # Attempt custom read if we are mapping multiple files
+        # to each output partition. Otherwise, upstream logic
+        # is sufficient.
+
+        storage_options = kwargs.get("storage_options", {})
+        fs, _, paths = get_fs_token_paths(
+            url_path, mode="rb", storage_options=storage_options
+        )
+        if isinstance(aggregate_files, int) and aggregate_files > 1:
+            # Map a static file count to each partition
+            inputs = [
+                paths[offset : offset + aggregate_files]
+                for offset in range(0, len(paths), aggregate_files)
+            ]
+        elif aggregate_files is True and blocksize:
+            # Map files dynamically (using blocksize)
+            file_sizes = fs.sizes(paths)  # NOTE: This can be slow
+            blocksize = parse_bytes(blocksize)
+            if all([file_size <= blocksize for file_size in file_sizes]):
+                counts = np.unique(
+                    np.floor(np.cumsum(file_sizes) / blocksize),
+                    return_counts=True,
+                )[1]
+                offsets = np.concatenate([[0], counts.cumsum()])
+                inputs = [
+                    paths[offsets[i] : offsets[i + 1]]
+                    for i in range(len(offsets) - 1)
+                ]
+
+    if inputs:
+        # Inputs were successfully populated.
+        # Use custom _read_json_partition function
+        # to generate each partition.
+
+        compression = get_compression(
+            url_path[0] if isinstance(url_path, list) else url_path,
+            compression,
+        )
+        _kwargs = dict(
+            orient=orient,
+            lines=lines,
+            compression=compression,
+            include_path_column=kwargs.get("include_path_column", False),
+            path_converter=kwargs.get("path_converter"),
+        )
+        if not _is_local_filesystem(fs):
+            _kwargs["fs"] = fs
+        # TODO: Generate meta more efficiently
+        meta = _read_json_partition(inputs[0][:1], **_kwargs)
+        return dask.dataframe.from_map(
+            _read_json_partition,
+            inputs,
+            meta=meta,
+            **_kwargs,
+        )
+
+    # Fall back to dask.dataframe.read_json
     return _default_backend(
         dask.dataframe.read_json,
         url_path,
@@ -71,5 +201,9 @@ def read_json(url_path, engine="auto", **kwargs):
             if isinstance(engine, str)
             else engine
         ),
+        blocksize=blocksize,
+        orient=orient,
+        lines=lines,
+        compression=compression,
         **kwargs,
     )
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_json.py b/python/dask_cudf/dask_cudf/io/tests/test_json.py
index dc780478794..abafbffd197 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_json.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_json.py
@@ -1,5 +1,6 @@
 # Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
+import math
 import os
 
 import pandas as pd
@@ -97,3 +98,31 @@ def test_read_json_nested(tmp_path):
     # Ensure not passing kwargs also reads the file.
     actual = dask_cudf.read_json(f)
     dd.assert_eq(actual, actual_pd)
+
+
+def test_read_json_aggregate_files(tmp_path):
+    df1 = dask.datasets.timeseries(
+        dtypes={"x": int, "y": int}, freq="120s"
+    ).reset_index(drop=True)
+    json_path = str(tmp_path / "data-*.json")
+    df1.to_json(json_path)
+
+    df2 = dask_cudf.read_json(json_path, aggregate_files=2)
+    assert df2.npartitions == math.ceil(df1.npartitions / 2)
+    dd.assert_eq(df1, df2, check_index=False)
+
+    df2 = dask_cudf.read_json(
+        json_path, aggregate_files=True, blocksize="1GiB"
+    )
+    assert df2.npartitions == 1
+    dd.assert_eq(df1, df2, check_index=False)
+
+    for include_path_column, name in [(True, "path"), ("file", "file")]:
+        df2 = dask_cudf.read_json(
+            json_path,
+            aggregate_files=2,
+            include_path_column=include_path_column,
+        )
+        assert name in df2.columns
+        assert len(df2[name].compute().unique()) == df1.npartitions
+        dd.assert_eq(df1, df2.drop(columns=[name]), check_index=False)
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py
index a67404da4fe..3947c69aaa5 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py
@@ -138,5 +138,7 @@ def test_read_parquet(s3_base, s3so, open_file_options):
         storage_options=s3so,
         open_file_options=open_file_options,
     )
-    assert df.a.sum().compute() == 10
-    assert df.b.sum().compute() == 9
+    with pytest.warns(FutureWarning):
+        assert df.a.sum().compute() == 10
+    with pytest.warns(FutureWarning):
+        assert df.b.sum().compute() == 9
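Usage note (illustrative only, not part of the patch): the `aggregate_files` and `blocksize` options that this diff adds to `dask_cudf.read_json` can be exercised roughly as sketched below. The file pattern, blocksize value, and partition counts are hypothetical; only the parameter names and behavior come from the new code above.

    import dask_cudf

    # Map a fixed number of JSON-lines files into each output partition.
    ddf = dask_cudf.read_json("data-*.json", aggregate_files=2)

    # Or let blocksize decide how many files fit per partition; if any single
    # file is larger than the parsed blocksize, aggregation is skipped and the
    # call falls back to dask.dataframe.read_json with a cudf-backed engine.
    ddf = dask_cudf.read_json("data-*.json", aggregate_files=True, blocksize="256MiB")

    # include_path_column adds a column recording each row's source file; it is
    # applied per file by the new _read_json_partition helper.
    ddf = dask_cudf.read_json(
        "data-*.json", aggregate_files=2, include_path_column=True
    )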