diff --git a/.github/workflows/jni-docker-build.yml b/.github/workflows/jni-docker-build.yml
deleted file mode 100644
index 0bdc409d0ab..00000000000
--- a/.github/workflows/jni-docker-build.yml
+++ /dev/null
@@ -1,53 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-name: JNI Docker Build
-
-on:
-  workflow_dispatch: # manual trigger only
-
-concurrency:
-  group: jni-docker-build-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  docker-build:
-    if: github.repository == 'rapidsai/cudf'
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v2
-
-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v2
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v2
-
-      - name: Login to DockerHub
-        uses: docker/login-action@v2
-        with:
-          username: ${{ secrets.GPUCIBOT_DOCKERHUB_USER }}
-          password: ${{ secrets.GPUCIBOT_DOCKERHUB_TOKEN }}
-
-      - name: Set ENVs
-        run: |
-          echo "IMAGE_NAME=rapidsai/cudf-jni-build" >> $GITHUB_ENV
-          echo "IMAGE_REF=${GITHUB_REF_NAME}" >> $GITHUB_ENV
-
-      - name: Build and Push
-        uses: docker/build-push-action@v3
-        with:
-          push: true
-          file: java/ci/Dockerfile.centos7
-          tags: "${{ env.IMAGE_NAME }}:${{ env.IMAGE_REF }}"
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 3e99cf3fa9a..0ae745257cb 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -2,7 +2,7 @@
 
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.5.0
+    rev: v4.6.0
     hooks:
       - id: trailing-whitespace
         exclude: |
@@ -24,11 +24,11 @@ repos:
         files: python/.*
         types_or: [python, cython, pyi]
   - repo: https://github.com/MarcoGorelli/cython-lint
-    rev: v0.16.0
+    rev: v0.16.2
     hooks:
       - id: cython-lint
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: 'v1.3.0'
+    rev: 'v1.10.0'
     hooks:
       - id: mypy
         additional_dependencies: [types-cachetools]
@@ -39,7 +39,7 @@ repos:
                "python/dask_cudf/dask_cudf"]
         pass_filenames: false
   - repo: https://github.com/nbQA-dev/nbQA
-    rev: 1.7.1
+    rev: 1.8.5
     hooks:
       - id: nbqa-isort
         # Use the cudf_kafka isort orderings in notebooks so that dask
@@ -52,7 +52,7 @@ repos:
         types_or: [c, c++, cuda]
         args: ["-fallback-style=none", "-style=file", "-i"]
   - repo: https://github.com/sirosen/texthooks
-    rev: 0.6.3
+    rev: 0.6.6
     hooks:
       - id: fix-smartquotes
         exclude: |
@@ -124,12 +124,12 @@ repos:
             ^CHANGELOG.md$
           )
   - repo: https://github.com/rapidsai/dependency-file-generator
-    rev: v1.8.0
+    rev: v1.13.4
     hooks:
       - id: rapids-dependency-file-generator
         args: ["--clean"]
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.3.4
+    rev: v0.4.3
     hooks:
       - id: ruff
         files: python/.*$
diff --git a/build.sh b/build.sh
index e5daf2f3451..43bb04f7a18 100755
--- a/build.sh
+++ b/build.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 # cuDF build script
 
@@ -109,8 +109,8 @@ function buildAll {
 }
 
 function buildLibCudfJniInDocker {
-    local cudaVersion="11.5.0"
-    local imageName="cudf-build:${cudaVersion}-devel-centos7"
+    local cudaVersion="11.8.0"
+    local imageName="cudf-build:${cudaVersion}-devel-rocky8"
     local CMAKE_GENERATOR="${CMAKE_GENERATOR:-Ninja}"
     local workspaceDir="/rapids"
     local localMavenRepo=${LOCAL_MAVEN_REPO:-"$HOME/.m2/repository"}
@@ -120,7 +120,7 @@ function buildLibCudfJniInDocker {
     mkdir -p "$CUDF_JAR_JAVA_BUILD_DIR/libcudf-cmake-build"
     mkdir -p "$HOME/.ccache" "$HOME/.m2"
     nvidia-docker build \
-        -f java/ci/Dockerfile.centos7 \
+        -f java/ci/Dockerfile.rocky \
         --build-arg CUDA_VERSION=${cudaVersion} \
         -t $imageName .
     nvidia-docker run -it -u $(id -u):$(id -g) --rm \
diff --git a/ci/cudf_pandas_scripts/pandas-tests/diff.sh b/ci/cudf_pandas_scripts/pandas-tests/diff.sh
index f87a3a36fcc..6cf70a2347f 100755
--- a/ci/cudf_pandas_scripts/pandas-tests/diff.sh
+++ b/ci/cudf_pandas_scripts/pandas-tests/diff.sh
@@ -17,10 +17,8 @@ MAIN_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.main-${RAPIDS_FULL_VER
 PR_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.pr-${RAPIDS_FULL_VERSION}-results.json
 
 rapids-logger "Fetching latest available results from nightly"
-aws s3api list-objects-v2 --bucket rapids-downloads --prefix "nightly/" --query "sort_by(Contents[?ends_with(Key, '_py${PY_VER}.main-${RAPIDS_FULL_VERSION}-results.json')], &LastModified)[::-1].[Key]" --output text > s3_output.txt
-
-read -r COMPARE_ENV < s3_output.txt
-export COMPARE_ENV
+aws s3api list-objects-v2 --bucket rapids-downloads --prefix "nightly/" --query "sort_by(Contents[?ends_with(Key, '_py${PY_VER}.main-${RAPIDS_FULL_VERSION}-results.json')], &LastModified)[::].[Key]" --output text  | tee s3_output.txt
+COMPARE_ENV=$(tail -n 1 s3_output.txt)
 rapids-logger "Latest available results from nightly: ${COMPARE_ENV}"
 
 aws s3 cp "s3://rapids-downloads/${COMPARE_ENV}" main-results.json
diff --git a/conda/recipes/cudf/conda_build_config.yaml b/conda/recipes/cudf/conda_build_config.yaml
index c98c2701653..d399e440edd 100644
--- a/conda/recipes/cudf/conda_build_config.yaml
+++ b/conda/recipes/cudf/conda_build_config.yaml
@@ -4,7 +4,10 @@ c_compiler_version:
 cxx_compiler_version:
   - 11
 
-sysroot_version:
+c_stdlib:
+  - sysroot
+
+c_stdlib_version:
   - "2.17"
 
 cmake_version:
diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index ae2d938250b..ddcadfd1570 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -57,7 +57,7 @@ requirements:
     - {{ compiler('cuda') }}
     {% endif %}
     - cuda-version ={{ cuda_version }}
-    - sysroot_{{ target_platform }} {{ sysroot_version }}
+    - {{ stdlib("c") }}
   host:
     - python
     - cython >=3.0.3
diff --git a/conda/recipes/cudf_kafka/conda_build_config.yaml b/conda/recipes/cudf_kafka/conda_build_config.yaml
index c98c2701653..d399e440edd 100644
--- a/conda/recipes/cudf_kafka/conda_build_config.yaml
+++ b/conda/recipes/cudf_kafka/conda_build_config.yaml
@@ -4,7 +4,10 @@ c_compiler_version:
 cxx_compiler_version:
   - 11
 
-sysroot_version:
+c_stdlib:
+  - sysroot
+
+c_stdlib_version:
   - "2.17"
 
 cmake_version:
diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml
index 45e41bf8de7..ab41d9e1f15 100644
--- a/conda/recipes/cudf_kafka/meta.yaml
+++ b/conda/recipes/cudf_kafka/meta.yaml
@@ -53,7 +53,7 @@ requirements:
     - {{ compiler('cuda') }}
     {% endif %}
     - cuda-version ={{ cuda_version }}
-    - sysroot_{{ target_platform }} {{ sysroot_version }}
+    - {{ stdlib("c") }}
   host:
     - python
     - cython >=3.0.3
diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml
index b7fbaab9306..ba5e96fb6cf 100644
--- a/conda/recipes/libcudf/conda_build_config.yaml
+++ b/conda/recipes/libcudf/conda_build_config.yaml
@@ -10,7 +10,10 @@ cuda_compiler:
 cuda11_compiler:
   - nvcc
 
-sysroot_version:
+c_stdlib:
+  - sysroot
+
+c_stdlib_version:
   - "2.17"
 
 cmake_version:
diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 695c515b9d4..76115362b6c 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -43,7 +43,7 @@ requirements:
     {% endif %}
     - cuda-version ={{ cuda_version }}
     - ninja
-    - sysroot_{{ target_platform }} {{ sysroot_version }}
+    - {{ stdlib("c") }}
   host:
     - librmm ={{ minor_version }}
     - libkvikio ={{ minor_version }}
@@ -170,7 +170,7 @@ outputs:
         {% endif %}
         - cuda-version ={{ cuda_version }}
         - ninja
-        - sysroot_{{ target_platform }} {{ sysroot_version }}
+        - {{ stdlib("c") }}
       host:
         - {{ pin_subpackage('libcudf', exact=True) }}
         {% if cuda_major == "11" %}
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 53da710f0ea..f11f3fc3c9a 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -395,8 +395,9 @@ add_library(
   src/io/orc/dict_enc.cu
   src/io/orc/orc.cpp
   src/io/orc/reader_impl.cu
+  src/io/orc/reader_impl_chunking.cu
+  src/io/orc/reader_impl_decode.cu
   src/io/orc/reader_impl_helpers.cpp
-  src/io/orc/reader_impl_preprocess.cu
   src/io/orc/stats_enc.cu
   src/io/orc/stripe_data.cu
   src/io/orc/stripe_enc.cu
@@ -429,6 +430,7 @@ add_library(
   src/io/text/multibyte_split.cu
   src/io/utilities/arrow_io_source.cpp
   src/io/utilities/column_buffer.cpp
+  src/io/utilities/column_buffer_strings.cu
   src/io/utilities/config_utils.cpp
   src/io/utilities/data_casting.cu
   src/io/utilities/data_sink.cpp
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 5fd328dfc68..7e61d881f07 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -346,6 +346,11 @@ target_link_libraries(MULTIBYTE_SPLIT_NVBENCH PRIVATE ZLIB::ZLIB)
 # ---------------------------------------------------------------------------------
 ConfigureNVBench(DECIMAL_NVBENCH decimal/convert_floating.cpp)
 
+# ##################################################################################################
+# * reshape benchmark
+# ---------------------------------------------------------------------------------
+ConfigureNVBench(RESHAPE_NVBENCH reshape/interleave.cpp)
+
 add_custom_target(
   run_benchmarks
   DEPENDS CUDF_BENCHMARKS
diff --git a/cpp/benchmarks/io/orc/orc_reader_input.cpp b/cpp/benchmarks/io/orc/orc_reader_input.cpp
index fdb7dbe59b8..b7c214a8374 100644
--- a/cpp/benchmarks/io/orc/orc_reader_input.cpp
+++ b/cpp/benchmarks/io/orc/orc_reader_input.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,31 +24,59 @@
 
 #include <nvbench/nvbench.cuh>
 
+namespace {
+
 // Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to
 // run on most GPUs, but large enough to allow highest throughput
-constexpr int64_t data_size        = 512 << 20;
 constexpr cudf::size_type num_cols = 64;
+constexpr std::size_t data_size    = 512 << 20;
+constexpr std::size_t Mbytes       = 1024 * 1024;
 
+template <bool is_chunked_read>
 void orc_read_common(cudf::size_type num_rows_to_read,
                      cuio_source_sink_pair& source_sink,
                      nvbench::state& state)
 {
-  cudf::io::orc_reader_options read_opts =
-    cudf::io::orc_reader_options::builder(source_sink.make_source_info());
+  auto const read_opts =
+    cudf::io::orc_reader_options::builder(source_sink.make_source_info()).build();
 
   auto mem_stats_logger = cudf::memory_stats_logger();  // init stats logger
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
-  state.exec(
-    nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
-      try_drop_l3_cache();
-
-      timer.start();
-      auto const result = cudf::io::read_orc(read_opts);
-      timer.stop();
 
-      CUDF_EXPECTS(result.tbl->num_columns() == num_cols, "Unexpected number of columns");
-      CUDF_EXPECTS(result.tbl->num_rows() == num_rows_to_read, "Unexpected number of rows");
-    });
+  if constexpr (is_chunked_read) {
+    state.exec(
+      nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch&, auto& timer) {
+        try_drop_l3_cache();
+        auto const output_limit_MB =
+          static_cast<std::size_t>(state.get_int64("chunk_read_limit_MB"));
+        auto const read_limit_MB = static_cast<std::size_t>(state.get_int64("pass_read_limit_MB"));
+
+        auto reader =
+          cudf::io::chunked_orc_reader(output_limit_MB * Mbytes, read_limit_MB * Mbytes, read_opts);
+        cudf::size_type num_rows{0};
+
+        timer.start();
+        do {
+          auto chunk = reader.read_chunk();
+          num_rows += chunk.tbl->num_rows();
+        } while (reader.has_next());
+        timer.stop();
+
+        CUDF_EXPECTS(num_rows == num_rows_to_read, "Unexpected number of rows");
+      });
+  } else {  // not is_chunked_read
+    state.exec(
+      nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch&, auto& timer) {
+        try_drop_l3_cache();
+
+        timer.start();
+        auto const result = cudf::io::read_orc(read_opts);
+        timer.stop();
+
+        CUDF_EXPECTS(result.tbl->num_columns() == num_cols, "Unexpected number of columns");
+        CUDF_EXPECTS(result.tbl->num_rows() == num_rows_to_read, "Unexpected number of rows");
+      });
+  }
 
   auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
   state.add_element_count(static_cast<double>(data_size) / time, "bytes_per_second");
@@ -57,6 +85,8 @@ void orc_read_common(cudf::size_type num_rows_to_read,
   state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
 }
 
+}  // namespace
+
 template <data_type DataType, cudf::io::io_type IOType>
 void BM_orc_read_data(nvbench::state& state,
                       nvbench::type_list<nvbench::enum_type<DataType>, nvbench::enum_type<IOType>>)
@@ -79,13 +109,11 @@ void BM_orc_read_data(nvbench::state& state,
     return view.num_rows();
   }();
 
-  orc_read_common(num_rows_written, source_sink, state);
+  orc_read_common<false>(num_rows_written, source_sink, state);
 }
 
-template <cudf::io::io_type IOType, cudf::io::compression_type Compression>
-void BM_orc_read_io_compression(
-  nvbench::state& state,
-  nvbench::type_list<nvbench::enum_type<IOType>, nvbench::enum_type<Compression>>)
+template <cudf::io::io_type IOType, cudf::io::compression_type Compression, bool chunked_read>
+void orc_read_io_compression(nvbench::state& state)
 {
   auto const d_type = get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL_SIGNED),
                                          static_cast<int32_t>(data_type::FLOAT),
@@ -95,15 +123,21 @@ void BM_orc_read_io_compression(
                                          static_cast<int32_t>(data_type::LIST),
                                          static_cast<int32_t>(data_type::STRUCT)});
 
-  cudf::size_type const cardinality = state.get_int64("cardinality");
-  cudf::size_type const run_length  = state.get_int64("run_length");
+  auto const [cardinality, run_length] = [&]() -> std::pair<cudf::size_type, cudf::size_type> {
+    if constexpr (chunked_read) {
+      return {0, 4};
+    } else {
+      return {static_cast<cudf::size_type>(state.get_int64("cardinality")),
+              static_cast<cudf::size_type>(state.get_int64("run_length"))};
+    }
+  }();
   cuio_source_sink_pair source_sink(IOType);
 
   auto const num_rows_written = [&]() {
     auto const tbl = create_random_table(
       cycle_dtypes(d_type, num_cols),
       table_size_bytes{data_size},
-      data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
+      data_profile_builder{}.cardinality(cardinality).avg_run_length(run_length));
     auto const view = tbl->view();
 
     cudf::io::orc_writer_options opts =
@@ -113,7 +147,23 @@ void BM_orc_read_io_compression(
     return view.num_rows();
   }();
 
-  orc_read_common(num_rows_written, source_sink, state);
+  orc_read_common<chunked_read>(num_rows_written, source_sink, state);
+}
+
+template <cudf::io::io_type IOType, cudf::io::compression_type Compression>
+void BM_orc_read_io_compression(
+  nvbench::state& state,
+  nvbench::type_list<nvbench::enum_type<IOType>, nvbench::enum_type<Compression>>)
+{
+  return orc_read_io_compression<IOType, Compression, false>(state);
+}
+
+template <cudf::io::compression_type Compression>
+void BM_orc_chunked_read_io_compression(nvbench::state& state,
+                                        nvbench::type_list<nvbench::enum_type<Compression>>)
+{
+  // Only run benchmark using HOST_BUFFER IO.
+  return orc_read_io_compression<cudf::io::io_type::HOST_BUFFER, Compression, true>(state);
 }
 
 using d_type_list = nvbench::enum_type_list<data_type::INTEGRAL_SIGNED,
@@ -146,3 +196,13 @@ NVBENCH_BENCH_TYPES(BM_orc_read_io_compression, NVBENCH_TYPE_AXES(io_list, compr
   .set_min_samples(4)
   .add_int64_axis("cardinality", {0, 1000})
   .add_int64_axis("run_length", {1, 32});
+
+// Chunked counterpart of `BM_orc_read_io_compression` for comparison; HOST_BUFFER IO only.
+NVBENCH_BENCH_TYPES(BM_orc_chunked_read_io_compression, NVBENCH_TYPE_AXES(compression_list))
+  .set_name("orc_chunked_read_io_compression")
+  .set_type_axes_names({"compression"})
+  .set_min_samples(4)
+  // The input has approximately 520MB and 127K rows.
+  // The limits below are given in MBs.
+  .add_int64_axis("chunk_read_limit_MB", {50, 250, 700})
+  .add_int64_axis("pass_read_limit_MB", {50, 250, 700});
diff --git a/cpp/benchmarks/join/generate_input_tables.cuh b/cpp/benchmarks/join/generate_input_tables.cuh
index 93401f01026..f7984b29d6b 100644
--- a/cpp/benchmarks/join/generate_input_tables.cuh
+++ b/cpp/benchmarks/join/generate_input_tables.cuh
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
@@ -34,7 +35,7 @@
 
 CUDF_KERNEL void init_curand(curandState* state, int const nstates)
 {
-  int ithread = threadIdx.x + blockIdx.x * blockDim.x;
+  auto const ithread = cudf::detail::grid_1d::global_thread_id();  // keep full width, no narrowing
 
   if (ithread < nstates) { curand_init(1234ULL, ithread, 0, state + ithread); }
 }
@@ -46,13 +47,14 @@ CUDF_KERNEL void init_build_tbl(key_type* const build_tbl,
                                 curandState* state,
                                 int const num_states)
 {
-  auto const start_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  auto const stride    = blockDim.x * gridDim.x;
+  auto const start_idx = cudf::detail::grid_1d::global_thread_id();
+  auto const stride    = cudf::detail::grid_1d::grid_stride();
   assert(start_idx < num_states);
 
   curandState localState = state[start_idx];
 
-  for (size_type idx = start_idx; idx < build_tbl_size; idx += stride) {
+  for (cudf::thread_index_type tidx = start_idx; tidx < build_tbl_size; tidx += stride) {
+    auto const idx = static_cast<size_type>(tidx);
     double const x = curand_uniform_double(&localState);
 
     build_tbl[idx] = static_cast<key_type>(x * (build_tbl_size / multiplicity));
@@ -71,13 +73,14 @@ CUDF_KERNEL void init_probe_tbl(key_type* const probe_tbl,
                                 curandState* state,
                                 int const num_states)
 {
-  auto const start_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  auto const stride    = blockDim.x * gridDim.x;
+  auto const start_idx = cudf::detail::grid_1d::global_thread_id();
+  auto const stride    = cudf::detail::grid_1d::grid_stride();
   assert(start_idx < num_states);
 
   curandState localState = state[start_idx];
 
-  for (size_type idx = start_idx; idx < probe_tbl_size; idx += stride) {
+  for (cudf::thread_index_type tidx = start_idx; tidx < probe_tbl_size; tidx += stride) {
+    auto const idx = static_cast<size_type>(tidx);
     key_type val;
     double x = curand_uniform_double(&localState);
 
diff --git a/cpp/benchmarks/json/json.cu b/cpp/benchmarks/json/json.cu
index a54d7d48dc4..c65db187f42 100644
--- a/cpp/benchmarks/json/json.cu
+++ b/cpp/benchmarks/json/json.cu
@@ -22,7 +22,7 @@
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/json/json.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.hpp>
 #include <cudf/strings/strings_column_view.hpp>
@@ -77,8 +77,9 @@ struct json_benchmark_row_builder {
   cudf::column_device_view const d_book_pct;           // Book percentage
   cudf::column_device_view const d_misc_order;         // Misc-Store order
   cudf::column_device_view const d_store_order;        // Books-Bicycles order
-  int32_t* d_offsets{};
+  cudf::size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
   thrust::minstd_rand rng{5236};
   thrust::uniform_int_distribution<int> dist{};
 
@@ -155,7 +156,7 @@ struct json_benchmark_row_builder {
       output_str += Misc;
     }
     output_str += brace2;
-    if (!output_str.ptr) d_offsets[idx] = output_str.bytes;
+    if (!output_str.ptr) { d_sizes[idx] = output_str.bytes; }
   }
 };
 
@@ -177,7 +178,7 @@ auto build_json_string_column(int desired_bytes, int num_rows)
   auto d_store_order = cudf::column_device_view::create(float_2bool_columns->get_column(2));
   json_benchmark_row_builder jb{
     desired_bytes, num_rows, {*d_books, *d_bicycles}, *d_book_pct, *d_misc_order, *d_store_order};
-  auto [offsets, chars] = cudf::strings::detail::make_strings_children(
+  auto [offsets, chars] = cudf::strings::detail::experimental::make_strings_children(
     jb, num_rows, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
   return cudf::make_strings_column(num_rows, std::move(offsets), chars.release(), 0, {});
 }
diff --git a/cpp/benchmarks/reshape/interleave.cpp b/cpp/benchmarks/reshape/interleave.cpp
new file mode 100644
index 00000000000..4499e34af77
--- /dev/null
+++ b/cpp/benchmarks/reshape/interleave.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+
+#include <cudf/reshape.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+// Benchmark cudf::interleave_columns over `columns` random STRING columns of `num_rows` rows.
+static void bench_interleave(nvbench::state& state)
+{
+  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const num_cols  = static_cast<cudf::size_type>(state.get_int64("columns"));
+  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) * num_cols >=
+      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
+    state.skip("Skip benchmarks greater than size_type limit");
+    return;  // skipped configurations must not build the oversized input table
+  }
+
+  data_profile const str_profile = data_profile_builder().distribution(
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
+  std::vector<cudf::type_id> types(num_cols, cudf::type_id::STRING);
+  auto const source_table = create_random_table(types, row_count{num_rows}, str_profile);
+  auto const source_view  = source_table->view();
+  auto const stream       = cudf::get_default_stream();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  std::size_t chars_size = 0;  // total string bytes over all input columns
+  for (cudf::size_type col = 0; col < num_cols; ++col) {
+    chars_size += cudf::strings_column_view(source_view.column(col)).chars_size(stream);
+  }
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);   // all bytes are read
+  state.add_global_memory_writes<nvbench::int8_t>(chars_size);  // all bytes are written
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
+    [[maybe_unused]] auto result = cudf::interleave_columns(source_view);
+  });
+}
+
+NVBENCH_BENCH(bench_interleave)
+  .set_name("interleave_strings")
+  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216})
+  .add_int64_axis("columns", {2, 10, 100});
diff --git a/cpp/benchmarks/string/contains.cpp b/cpp/benchmarks/string/contains.cpp
index 6d839c1de64..ae6c8b844c8 100644
--- a/cpp/benchmarks/string/contains.cpp
+++ b/cpp/benchmarks/string/contains.cpp
@@ -80,7 +80,7 @@ std::unique_ptr<cudf::column> build_input_column(cudf::size_type n_rows,
 }
 
 // longer pattern lengths demand more working memory per string
-std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$"};
+std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$", "5W43"};
 
 static void bench_contains(nvbench::state& state)
 {
@@ -114,4 +114,4 @@ NVBENCH_BENCH(bench_contains)
   .add_int64_axis("row_width", {32, 64, 128, 256, 512})
   .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216})
   .add_int64_axis("hit_rate", {50, 100})  // percentage
-  .add_int64_axis("pattern", {0, 1});
+  .add_int64_axis("pattern", {0, 1, 2});
diff --git a/cpp/benchmarks/string/count.cpp b/cpp/benchmarks/string/count.cpp
index a656010dca5..f964bc5d224 100644
--- a/cpp/benchmarks/string/count.cpp
+++ b/cpp/benchmarks/string/count.cpp
@@ -25,10 +25,13 @@
 
 #include <nvbench/nvbench.cuh>
 
+static std::string patterns[] = {"\\d+", "a"};
+
 static void bench_count(nvbench::state& state)
 {
-  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
-  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const num_rows      = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const row_width     = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const pattern_index = static_cast<cudf::size_type>(state.get_int64("pattern"));
 
   if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
       static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
@@ -41,7 +44,7 @@ static void bench_count(nvbench::state& state)
     create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile);
   cudf::strings_column_view input(table->view().column(0));
 
-  std::string pattern = "\\d+";
+  auto const pattern = patterns[pattern_index];
 
   auto prog = cudf::strings::regex_program::create(pattern);
 
@@ -59,4 +62,5 @@ static void bench_count(nvbench::state& state)
 NVBENCH_BENCH(bench_count)
   .set_name("count")
   .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048})
-  .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216});
+  .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216})
+  .add_int64_axis("pattern", {0, 1});
diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu
index 161328ae088..3aff75d840e 100644
--- a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu
+++ b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu
@@ -60,13 +60,15 @@ constexpr int block_size = 256;
 template <FunctorType functor_type, class T>
 CUDF_KERNEL void no_dispatching_kernel(T** A, cudf::size_type n_rows, cudf::size_type n_cols)
 {
-  using F               = Functor<T, functor_type>;
-  cudf::size_type index = blockIdx.x * blockDim.x + threadIdx.x;
-  while (index < n_rows) {
+  using F           = Functor<T, functor_type>;
+  auto tidx         = cudf::detail::grid_1d::global_thread_id();
+  auto const stride = cudf::detail::grid_1d::grid_stride();
+  while (tidx < n_rows) {
+    auto const index = static_cast<cudf::size_type>(tidx);
     for (int c = 0; c < n_cols; c++) {
       A[c][index] = F::f(A[c][index]);
     }
-    index += blockDim.x * gridDim.x;
+    tidx += stride;
   }
 }
 
@@ -74,12 +76,14 @@ CUDF_KERNEL void no_dispatching_kernel(T** A, cudf::size_type n_rows, cudf::size
 template <FunctorType functor_type, class T>
 CUDF_KERNEL void host_dispatching_kernel(cudf::mutable_column_device_view source_column)
 {
-  using F               = Functor<T, functor_type>;
-  T* A                  = source_column.data<T>();
-  cudf::size_type index = blockIdx.x * blockDim.x + threadIdx.x;
-  while (index < source_column.size()) {
-    A[index] = F::f(A[index]);
-    index += blockDim.x * gridDim.x;
+  using F           = Functor<T, functor_type>;
+  T* A              = source_column.data<T>();
+  auto tidx         = cudf::detail::grid_1d::global_thread_id();
+  auto const stride = cudf::detail::grid_1d::grid_stride();
+  while (tidx < source_column.size()) {
+    auto const index = static_cast<cudf::size_type>(tidx);
+    A[index]         = F::f(A[index]);
+    tidx += stride;
   }
 }
 
@@ -127,14 +131,15 @@ template <FunctorType functor_type>
 CUDF_KERNEL void device_dispatching_kernel(cudf::mutable_table_device_view source)
 {
   cudf::size_type const n_rows = source.num_rows();
-  cudf::size_type index        = threadIdx.x + blockIdx.x * blockDim.x;
-
-  while (index < n_rows) {
+  auto tidx                    = cudf::detail::grid_1d::global_thread_id();
+  auto const stride            = cudf::detail::grid_1d::grid_stride();
+  while (tidx < n_rows) {
+    auto const index = static_cast<cudf::size_type>(tidx);
     for (cudf::size_type i = 0; i < source.num_columns(); i++) {
       cudf::type_dispatcher(
         source.column(i).type(), RowHandle<functor_type>{}, source.column(i), index);
     }
-    index += blockDim.x * gridDim.x;
+    tidx += stride;
   }  // while
 }
 
diff --git a/cpp/cmake/thirdparty/get_nvbench.cmake b/cpp/cmake/thirdparty/get_nvbench.cmake
index bbd22693ba4..84c27dd9d56 100644
--- a/cpp/cmake/thirdparty/get_nvbench.cmake
+++ b/cpp/cmake/thirdparty/get_nvbench.cmake
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -18,9 +18,6 @@ function(find_and_configure_nvbench)
   include(${rapids-cmake-dir}/cpm/nvbench.cmake)
   include(${rapids-cmake-dir}/cpm/package_override.cmake)
 
-  set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches")
-  rapids_cpm_package_override("${cudf_patch_dir}/nvbench_override.json")
-
   rapids_cpm_nvbench(BUILD_STATIC)
 
 endfunction()
diff --git a/cpp/cmake/thirdparty/patches/nvbench_override.json b/cpp/cmake/thirdparty/patches/nvbench_override.json
deleted file mode 100644
index ef0deb4c1e9..00000000000
--- a/cpp/cmake/thirdparty/patches/nvbench_override.json
+++ /dev/null
@@ -1,9 +0,0 @@
-
-{
-  "packages" : {
-    "nvbench" : {
-      "git_url": "https://github.com/NVIDIA/nvbench.git",
-      "git_tag": "555d628e9b250868c9da003e4407087ff1982e8e"
-    }
-  }
-}
diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
index ce9840050a9..05f8e4585cc 100644
--- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
+++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
@@ -84,7 +84,7 @@ prefixed with an underscore.
 
 ```c++
 template <typename IteratorType>
-void algorithm_function(int x, rmm::cuda_stream_view s, rmm::device_memory_resource* mr)
+void algorithm_function(int x, rmm::cuda_stream_view s, rmm::device_async_resource_ref mr)
 {
   ...
 }
@@ -194,9 +194,10 @@ and produce `unique_ptr`s to owning objects as output. For example,
 std::unique_ptr<table> sort(table_view const& input);
 ```
 
-## rmm::device_memory_resource
+## Memory Resources
 
-libcudf allocates all device memory via RMM memory resources (MR). See the
+libcudf allocates all device memory via RMM memory resources (MR) or CUDA MRs. Either type
+can be passed to libcudf functions via `rmm::device_async_resource_ref` parameters. See the
 [RMM documentation](https://github.com/rapidsai/rmm/blob/main/README.md) for details.
 
 ### Current Device Memory Resource
@@ -206,6 +207,27 @@ RMM provides a "default" memory resource for each device that can be accessed an
 respectively. All memory resource parameters should be defaulted to use the return value of
 `rmm::mr::get_current_device_resource()`.
 
+### Resource Refs
+
+Memory resources are passed via resource ref parameters. A resource ref is a memory resource wrapper
+that enables consumers to specify properties of resources that they expect. These are defined
+in the `cuda::mr` namespace of libcu++, but RMM provides some convenience wrappers in
+`rmm/resource_ref.hpp`:
+ - `rmm::device_resource_ref` accepts a memory resource that provides synchronous allocation
+    of device-accessible memory.
+ - `rmm::device_async_resource_ref` accepts a memory resource that provides stream-ordered allocation
+    of device-accessible memory.
+ - `rmm::host_resource_ref` accepts a memory resource that provides synchronous allocation of host-
+    accessible memory.
+ - `rmm::host_async_resource_ref` accepts a memory resource that provides stream-ordered allocation
+    of host-accessible memory.
+ - `rmm::host_device_resource_ref` accepts a memory resource that provides synchronous allocation of
+    host- and device-accessible memory.
+ - `rmm::host_device_async_resource_ref` accepts a memory resource that provides stream-ordered
+    allocation of host- and device-accessible memory.
+
+See the libcu++ [docs on `resource_ref`](https://nvidia.github.io/cccl/libcudacxx/extended_api/memory_resource/resource_ref.html) for more information.
+
 ## cudf::column
 
 `cudf::column` is a core owning data structure in libcudf. Most libcudf public APIs produce either
@@ -519,23 +541,23 @@ how device memory is allocated.
 
 ### Output Memory
 
-Any libcudf API that allocates memory that is *returned* to a user must accept a pointer to a
-`device_memory_resource` as the last parameter. Inside the API, this memory resource must be used
-to allocate any memory for returned objects. It should therefore be passed into functions whose
-outputs will be returned. Example:
+Any libcudf API that allocates memory that is *returned* to a user must accept a
+`rmm::device_async_resource_ref` as the last parameter. Inside the API, this memory resource must
+be used to allocate any memory for returned objects. It should therefore be passed into functions
+whose outputs will be returned. Example:
 
 ```c++
 // Returned `column` contains newly allocated memory,
 // therefore the API must accept a memory resource pointer
 std::unique_ptr<column> returns_output_memory(
-  ..., rmm::device_memory_resource * mr = rmm::mr::get_current_device_resource());
+  ..., rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 // This API does not allocate any new *output* memory, therefore
 // a memory resource is unnecessary
 void does_not_allocate_output_memory(...);
 ```
 
-This rule automatically applies to all detail APIs that allocates memory. Any detail API may be
+This rule automatically applies to all detail APIs that allocate memory. Any detail API may be
 called by any public API, and therefore could be allocating memory that is returned to the user.
 To support such uses cases, all detail APIs allocating memory resources should accept an `mr`
 parameter. Callers are responsible for either passing through a provided `mr` or
@@ -549,7 +571,7 @@ obtained from `rmm::mr::get_current_device_resource()` for temporary memory allo
 
 ```c++
 rmm::device_buffer some_function(
-  ..., rmm::mr::device_memory_resource mr * = rmm::mr::get_current_device_resource()) {
+  ..., rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) {
     rmm::device_buffer returned_buffer(..., mr); // Returned buffer uses the passed in MR
     ...
     rmm::device_buffer temporary_buffer(...); // Temporary buffer uses default MR
@@ -561,11 +583,11 @@ rmm::device_buffer some_function(
 ### Memory Management
 
 libcudf code generally eschews raw pointers and direct memory allocation. Use RMM classes built to
-use `device_memory_resource`s for device memory allocation with automated lifetime management.
+use memory resources for device memory allocation with automated lifetime management.
 
 #### rmm::device_buffer
 Allocates a specified number of bytes of untyped, uninitialized device memory using a
-`device_memory_resource`. If no resource is explicitly provided, uses
+memory resource. If no `rmm::device_async_resource_ref` is explicitly provided, it uses
 `rmm::mr::get_current_device_resource()`.
 
 `rmm::device_buffer` is movable and copyable on a stream. A copy performs a deep copy of the
@@ -921,13 +943,14 @@ Use the `CUDF_EXPECTS` macro to enforce runtime conditions necessary for correct
 Example usage:
 
 ```c++
-CUDF_EXPECTS(lhs.type() == rhs.type(), "Column type mismatch");
+CUDF_EXPECTS(cudf::have_same_types(lhs, rhs), "Type mismatch", cudf::data_type_error);
 ```
 
 The first argument is the conditional expression expected to resolve to `true` under normal
-conditions. If the conditional evaluates to `false`, then an error has occurred and an instance of
-`cudf::logic_error` is thrown. The second argument to `CUDF_EXPECTS` is a short description of the
-error that has occurred and is used for the exception's `what()` message.
+conditions. The second argument to `CUDF_EXPECTS` is a short description of the error that has
+occurred and is used for the exception's `what()` message. If the conditional evaluates to
+`false`, then an error has occurred and an instance of the exception class in the third argument
+(or the default, `cudf::logic_error`) is thrown.
 
 There are times where a particular code path, if reached, should indicate an error no matter what.
 For example, often the `default` case of a `switch` statement represents an invalid alternative.
@@ -1026,6 +1049,12 @@ types such as numeric types and timestamps/durations, adding support for nested
 Enabling an algorithm differently for different types uses either template specialization or SFINAE,
 as discussed in [Specializing Type-Dispatched Code Paths](#specializing-type-dispatched-code-paths).
 
+## Comparing Data Types
+
+When comparing the data types of two columns or scalars, do not directly compare
+`a.type() == b.type()`. Nested types such as lists of structs of integers will not be handled
+properly if only the top level type is compared. Instead, use the `cudf::have_same_types` function.
+
 # Type Dispatcher
 
 libcudf stores data (for columns and scalars) "type erased" in `void*` device memory. This
diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh
index ac5cb0ad141..8418e279ce7 100644
--- a/cpp/include/cudf/detail/copy_if_else.cuh
+++ b/cpp/include/cudf/detail/copy_if_else.cuh
@@ -45,29 +45,30 @@ __launch_bounds__(block_size) CUDF_KERNEL
                            mutable_column_device_view out,
                            size_type* __restrict__ const valid_count)
 {
-  size_type const tid            = threadIdx.x + blockIdx.x * block_size;
-  int const warp_id              = tid / warp_size;
-  size_type const warps_per_grid = gridDim.x * block_size / warp_size;
+  auto tidx                      = cudf::detail::grid_1d::global_thread_id<block_size>();
+  auto const stride              = cudf::detail::grid_1d::grid_stride<block_size>();
+  int const warp_id              = tidx / cudf::detail::warp_size;
+  size_type const warps_per_grid = gridDim.x * block_size / cudf::detail::warp_size;
 
   // begin/end indices for the column data
-  size_type begin = 0;
-  size_type end   = out.size();
+  size_type const begin = 0;
+  size_type const end   = out.size();
   // warp indices.  since 1 warp == 32 threads == sizeof(bitmask_type) * 8,
   // each warp will process one (32 bit) of the validity mask via
   // __ballot_sync()
-  size_type warp_begin = cudf::word_index(begin);
-  size_type warp_end   = cudf::word_index(end - 1);
+  size_type const warp_begin = cudf::word_index(begin);
+  size_type const warp_end   = cudf::word_index(end - 1);
 
   // lane id within the current warp
   constexpr size_type leader_lane{0};
-  int const lane_id = threadIdx.x % warp_size;
+  int const lane_id = threadIdx.x % cudf::detail::warp_size;
 
   size_type warp_valid_count{0};
 
   // current warp.
   size_type warp_cur = warp_begin + warp_id;
-  size_type index    = tid;
   while (warp_cur <= warp_end) {
+    auto const index = static_cast<size_type>(tidx);
     auto const opt_value =
       (index < end) ? (filter(index) ? lhs[index] : rhs[index]) : thrust::nullopt;
     if (opt_value) { out.element<T>(index) = static_cast<T>(*opt_value); }
@@ -85,7 +86,7 @@ __launch_bounds__(block_size) CUDF_KERNEL
 
     // next grid
     warp_cur += warps_per_grid;
-    index += block_size * gridDim.x;
+    tidx += stride;
   }
 
   if (has_nulls) {
@@ -159,7 +160,7 @@ std::unique_ptr<column> copy_if_else(bool nullable,
   using Element = typename thrust::iterator_traits<LeftIter>::value_type::value_type;
 
   size_type size           = std::distance(lhs_begin, lhs_end);
-  size_type num_els        = cudf::util::round_up_safe(size, warp_size);
+  size_type num_els        = cudf::util::round_up_safe(size, cudf::detail::warp_size);
   constexpr int block_size = 256;
   cudf::detail::grid_1d grid{num_els, block_size, 1};
 
diff --git a/cpp/include/cudf/detail/distinct_hash_join.cuh b/cpp/include/cudf/detail/distinct_hash_join.cuh
index 93d52d5dda3..de3d23e9470 100644
--- a/cpp/include/cudf/detail/distinct_hash_join.cuh
+++ b/cpp/include/cudf/detail/distinct_hash_join.cuh
@@ -85,16 +85,10 @@ struct hasher_adapter {
 template <cudf::has_nested HasNested>
 struct distinct_hash_join {
  private:
-  /// Row equality type for nested columns
-  using nested_row_equal = cudf::experimental::row::equality::strong_index_comparator_adapter<
-    cudf::experimental::row::equality::device_row_comparator<true, cudf::nullate::DYNAMIC>>;
-  /// Row equality type for flat columns
-  using flat_row_equal = cudf::experimental::row::equality::strong_index_comparator_adapter<
-    cudf::experimental::row::equality::device_row_comparator<false, cudf::nullate::DYNAMIC>>;
-
   /// Device row equal type
-  using d_equal_type =
-    std::conditional_t<HasNested == cudf::has_nested::YES, nested_row_equal, flat_row_equal>;
+  using d_equal_type = cudf::experimental::row::equality::strong_index_comparator_adapter<
+    cudf::experimental::row::equality::device_row_comparator<HasNested == cudf::has_nested::YES,
+                                                             cudf::nullate::DYNAMIC>>;
   using hasher              = hasher_adapter<thrust::identity<hash_value_type>>;
   using probing_scheme_type = cuco::linear_probing<1, hasher>;
   using cuco_storage_type   = cuco::storage<1>;
diff --git a/cpp/include/cudf/detail/scatter.cuh b/cpp/include/cudf/detail/scatter.cuh
index 7eb661f7833..80bc87731ca 100644
--- a/cpp/include/cudf/detail/scatter.cuh
+++ b/cpp/include/cudf/detail/scatter.cuh
@@ -29,7 +29,9 @@
 #include <cudf/strings/detail/scatter.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
@@ -213,8 +215,9 @@ struct column_scatterer_impl<dictionary32> {
     // check the keys match
     dictionary_column_view const source(source_in);
     dictionary_column_view const target(target_in);
-    CUDF_EXPECTS(source.keys().type() == target.keys().type(),
-                 "scatter dictionary keys must be the same type");
+    CUDF_EXPECTS(cudf::have_same_types(source.keys(), target.keys()),
+                 "scatter dictionary keys must be the same type",
+                 cudf::data_type_error);
 
     // first combine keys so both dictionaries have the same set
     auto target_matched    = dictionary::detail::add_keys(target, source.keys(), stream, mr);
diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh
index 86c85ca8d06..f1775c6d6d7 100644
--- a/cpp/include/cudf/detail/utilities/cuda.cuh
+++ b/cpp/include/cudf/detail/utilities/cuda.cuh
@@ -93,6 +93,19 @@ class grid_1d {
     return global_thread_id(threadIdx.x, blockIdx.x, blockDim.x);
   }
 
+  /**
+   * @brief Returns the global thread index of the current thread in a 1D grid.
+   *
+   * @tparam num_threads_per_block The number of threads per block
+   *
+   * @return thread_index_type The global thread index
+   */
+  template <thread_index_type num_threads_per_block>
+  static __device__ thread_index_type global_thread_id()
+  {
+    return global_thread_id(threadIdx.x, blockIdx.x, num_threads_per_block);
+  }
+
   /**
    * @brief Returns the stride of a 1D grid.
    *
@@ -115,6 +128,19 @@ class grid_1d {
    * @return thread_index_type The number of threads in the grid.
    */
   static __device__ thread_index_type grid_stride() { return grid_stride(blockDim.x, gridDim.x); }
+
+  /**
+   * @brief Returns the stride of the current 1D grid.
+   *
+   * @tparam num_threads_per_block The number of threads per block
+   *
+   * @return thread_index_type The number of threads in the grid.
+   */
+  template <thread_index_type num_threads_per_block>
+  static __device__ thread_index_type grid_stride()
+  {
+    return grid_stride(num_threads_per_block, gridDim.x);
+  }
 };
 
 /**
diff --git a/cpp/include/cudf/detail/utilities/rmm_host_vector.hpp b/cpp/include/cudf/detail/utilities/rmm_host_vector.hpp
index 858501877b0..6901a19473e 100644
--- a/cpp/include/cudf/detail/utilities/rmm_host_vector.hpp
+++ b/cpp/include/cudf/detail/utilities/rmm_host_vector.hpp
@@ -109,30 +109,6 @@ class rmm_host_allocator {
   {
   }
 
-  /**
-   * @brief Copy constructor
-   */
-  rmm_host_allocator(rmm_host_allocator const& other) = default;
-
-  /**
-   * @brief Move constructor
-   */
-  rmm_host_allocator(rmm_host_allocator&& other) = default;
-
-  /**
-   * @brief Assignment operator
-   */
-  rmm_host_allocator& operator=(rmm_host_allocator const& other)
-  {
-    mr = other.mr;
-    return *this;
-  }
-
-  /**
-   * @brief rmm_host_allocator's null destructor does nothing.
-   */
-  inline ~rmm_host_allocator() {}
-
   /**
    * @brief This method allocates storage for objects in host memory.
    *
@@ -183,7 +159,10 @@ class rmm_host_allocator {
    *  @param x The other \p rmm_host_allocator of interest.
    *  @return This method always returns \c true.
    */
-  inline bool operator==(rmm_host_allocator const& x) const { return x.mr == mr; }
+  inline bool operator==(rmm_host_allocator const& x) const
+  {
+    return x.mr == mr && x.stream == stream;
+  }
 
   /**
    * @brief This method tests this \p rmm_host_allocator for inequality
diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh
index 66163d6059a..64a3c4edf78 100644
--- a/cpp/include/cudf/detail/valid_if.cuh
+++ b/cpp/include/cudf/detail/valid_if.cuh
@@ -50,8 +50,8 @@ CUDF_KERNEL void valid_if_kernel(
 {
   constexpr size_type leader_lane{0};
   auto const lane_id{threadIdx.x % warp_size};
-  auto i            = cudf::detail::grid_1d::global_thread_id();
-  auto const stride = cudf::detail::grid_1d::grid_stride();
+  auto i            = cudf::detail::grid_1d::global_thread_id<block_size>();
+  auto const stride = cudf::detail::grid_1d::grid_stride<block_size>();
   size_type warp_valid_count{0};
 
   auto active_mask = __ballot_sync(0xFFFF'FFFFu, i < size);
diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp
index cf8e23c2d93..540a584908d 100644
--- a/cpp/include/cudf/io/detail/json.hpp
+++ b/cpp/include/cudf/io/detail/json.hpp
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <cudf/io/datasource.hpp>
 #include <cudf/io/json.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -56,22 +57,22 @@ void write_json(data_sink* sink,
 /**
  * @brief Normalize single quotes to double quotes using FST
  *
- * @param inbuf Input device buffer
+ * @param indata Input device buffer
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource to use for device memory allocation
  */
-rmm::device_uvector<char> normalize_single_quotes(rmm::device_uvector<char>&& inbuf,
-                                                  rmm::cuda_stream_view stream,
-                                                  rmm::device_async_resource_ref mr);
+void normalize_single_quotes(datasource::owning_buffer<rmm::device_uvector<char>>& indata,
+                             rmm::cuda_stream_view stream,
+                             rmm::device_async_resource_ref mr);
 
 /**
  * @brief Normalize unquoted whitespace (space and tab characters) using FST
  *
- * @param inbuf Input device buffer
+ * @param indata Input device buffer
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource to use for device memory allocation
  */
-rmm::device_uvector<char> normalize_whitespace(rmm::device_uvector<char>&& inbuf,
-                                               rmm::cuda_stream_view stream,
-                                               rmm::device_async_resource_ref mr);
+void normalize_whitespace(datasource::owning_buffer<rmm::device_uvector<char>>& indata,
+                          rmm::cuda_stream_view stream,
+                          rmm::device_async_resource_ref mr);
 }  // namespace cudf::io::json::detail
diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp
index 9aeb9ae4267..597ddd9cf0a 100644
--- a/cpp/include/cudf/io/detail/orc.hpp
+++ b/cpp/include/cudf/io/detail/orc.hpp
@@ -38,13 +38,15 @@ class chunked_orc_writer_options;
 
 namespace orc::detail {
 
+// Forward declaration of the internal reader class
+class reader_impl;
+
 /**
  * @brief Class to read ORC dataset data into columns.
  */
 class reader {
  private:
-  class impl;
-  std::unique_ptr<impl> _impl;
+  std::unique_ptr<reader_impl> _impl;
 
  public:
   /**
@@ -68,10 +70,63 @@ class reader {
   /**
    * @brief Reads the entire dataset.
    *
-   * @param options Settings for controlling reading behavior
    * @return The set of columns along with table metadata
    */
-  table_with_metadata read(orc_reader_options const& options);
+  table_with_metadata read();
+};
+
+/**
+ * @brief The reader class that supports iterative reading from an array of data sources.
+ */
+class chunked_reader {
+ private:
+  std::unique_ptr<reader_impl> _impl;
+
+ public:
+  /**
+   * @copydoc cudf::io::chunked_orc_reader::chunked_orc_reader(std::size_t, std::size_t, size_type,
+   * orc_reader_options const&, rmm::cuda_stream_view, rmm::device_async_resource_ref)
+   *
+   * @param sources Input `datasource` objects to read the dataset from
+   */
+  explicit chunked_reader(std::size_t chunk_read_limit,
+                          std::size_t pass_read_limit,
+                          size_type output_row_granularity,
+                          std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
+                          orc_reader_options const& options,
+                          rmm::cuda_stream_view stream,
+                          rmm::device_async_resource_ref mr);
+  /**
+   * @copydoc cudf::io::chunked_orc_reader::chunked_orc_reader(std::size_t, std::size_t,
+   * orc_reader_options const&, rmm::cuda_stream_view, rmm::device_async_resource_ref)
+   *
+   * @param sources Input `datasource` objects to read the dataset from
+   */
+  explicit chunked_reader(std::size_t chunk_read_limit,
+                          std::size_t pass_read_limit,
+                          std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
+                          orc_reader_options const& options,
+                          rmm::cuda_stream_view stream,
+                          rmm::device_async_resource_ref mr);
+
+  /**
+   * @brief Destructor explicitly declared to avoid being inlined in the header.
+   *
+   * Since the definition of the internal `_impl` class is not available in this header, this
+   * destructor needs to be defined in a separate source file that has access to that class's
+   * definition.
+   */
+  ~chunked_reader();
+
+  /**
+   * @copydoc cudf::io::chunked_orc_reader::has_next
+   */
+  [[nodiscard]] bool has_next() const;
+
+  /**
+   * @copydoc cudf::io::chunked_orc_reader::read_chunk
+   */
+  [[nodiscard]] table_with_metadata read_chunk() const;
 };
 
 /**
@@ -126,5 +181,6 @@ class writer {
    */
   void close();
 };
+
 }  // namespace orc::detail
 }  // namespace cudf::io
diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index a6112b8db4c..7374ffc37e6 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -101,6 +101,8 @@ class json_reader_options {
   bool _lines = false;
   // Parse mixed types as a string column
   bool _mixed_types_as_string = false;
+  // Prune columns on read, selected based on the _dtypes option
+  bool _prune_columns = false;
 
   // Bytes to skip from the start
   size_t _byte_range_offset = 0;
@@ -241,6 +243,17 @@ class json_reader_options {
    */
   bool is_enabled_mixed_types_as_string() const { return _mixed_types_as_string; }
 
+  /**
+   * @brief Whether to prune columns on read, selected based on the @ref set_dtypes option.
+   *
+   * When set as true, if the reader options include @ref set_dtypes, then
+   * the reader will only return those columns which are mentioned in @ref set_dtypes.
+   * If false, then all columns are returned, independent of the @ref set_dtypes setting.
+   *
+   * @return True if column pruning is enabled
+   */
+  bool is_enabled_prune_columns() const { return _prune_columns; }
+
   /**
    * @brief Whether to parse dates as DD/MM versus MM/DD.
    *
@@ -342,6 +355,17 @@ class json_reader_options {
    */
   void enable_mixed_types_as_string(bool val) { _mixed_types_as_string = val; }
 
+  /**
+   * @brief Set whether to prune columns on read, selected based on the @ref set_dtypes option.
+   *
+   * When set as true, if the reader options include @ref set_dtypes, then
+   * the reader will only return those columns which are mentioned in @ref set_dtypes.
+   * If false, then all columns are returned, independent of the @ref set_dtypes setting.
+   *
+   * @param val Boolean value to enable/disable column pruning
+   */
+  void enable_prune_columns(bool val) { _prune_columns = val; }
+
   /**
    * @brief Set whether to parse dates as DD/MM versus MM/DD.
    *
@@ -508,6 +532,22 @@ class json_reader_options_builder {
     return *this;
   }
 
+  /**
+   * @brief Set whether to prune columns on read, selected based on the @ref dtypes option.
+   *
+   * When set as true, if the reader options include @ref dtypes, then
+   * the reader will only return those columns which are mentioned in @ref dtypes.
+   * If false, then all columns are returned, independent of the @ref dtypes setting.
+   *
+   * @param val Boolean value to enable/disable column pruning
+   * @return this for chaining
+   */
+  json_reader_options_builder& prune_columns(bool val)
+  {
+    options._prune_columns = val;
+    return *this;
+  }
+
   /**
    * @brief Set whether to parse dates as DD/MM versus MM/DD.
    *
diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp
index bceb258cb38..8140f8897b7 100644
--- a/cpp/include/cudf/io/orc.hpp
+++ b/cpp/include/cudf/io/orc.hpp
@@ -58,10 +58,10 @@ class orc_reader_options {
 
   // List of individual stripes to read (ignored if empty)
   std::vector<std::vector<size_type>> _stripes;
-  // Rows to skip from the start; ORC stores the number of rows as uint64_t
-  uint64_t _skip_rows = 0;
+  // Rows to skip from the start
+  int64_t _skip_rows = 0;
   // Rows to read; `nullopt` is all
-  std::optional<size_type> _num_rows;
+  std::optional<int64_t> _num_rows;
 
   // Whether to use row index to speed-up reading
   bool _use_index = true;
@@ -125,7 +125,7 @@ class orc_reader_options {
    *
    * @return Number of rows to skip from the start
    */
-  uint64_t get_skip_rows() const { return _skip_rows; }
+  int64_t get_skip_rows() const { return _skip_rows; }
 
   /**
    * @brief Returns number of row to read.
@@ -133,7 +133,7 @@ class orc_reader_options {
    * @return Number of rows to read; `nullopt` if the option hasn't been set (in which case the file
    * is read until the end)
    */
-  std::optional<size_type> const& get_num_rows() const { return _num_rows; }
+  std::optional<int64_t> const& get_num_rows() const { return _num_rows; }
 
   /**
    * @brief Whether to use row index to speed-up reading.
@@ -198,10 +198,10 @@ class orc_reader_options {
    * @throw cudf::logic_error if a negative value is passed
    * @throw cudf::logic_error if stripes have been previously set
    */
-  void set_skip_rows(uint64_t rows)
+  void set_skip_rows(int64_t rows)
   {
+    CUDF_EXPECTS(rows >= 0, "skip_rows cannot be negative");
     CUDF_EXPECTS(rows == 0 or _stripes.empty(), "Can't set both skip_rows along with stripes");
-    CUDF_EXPECTS(rows <= std::numeric_limits<int64_t>::max(), "skip_rows is too large");
     _skip_rows = rows;
   }
 
@@ -213,7 +213,7 @@ class orc_reader_options {
    * @throw cudf::logic_error if a negative value is passed
    * @throw cudf::logic_error if stripes have been previously set
    */
-  void set_num_rows(size_type nrows)
+  void set_num_rows(int64_t nrows)
   {
     CUDF_EXPECTS(nrows >= 0, "num_rows cannot be negative");
     CUDF_EXPECTS(_stripes.empty(), "Can't set both num_rows and stripes");
@@ -271,7 +271,7 @@ class orc_reader_options_builder {
    *
    * @param src The source information used to read orc file
    */
-  explicit orc_reader_options_builder(source_info src) : options{std::move(src)} {};
+  explicit orc_reader_options_builder(source_info src) : options{std::move(src)} {}
 
   /**
    * @brief Sets names of the column to read.
@@ -303,7 +303,7 @@ class orc_reader_options_builder {
    * @param rows Number of rows
    * @return this for chaining
    */
-  orc_reader_options_builder& skip_rows(uint64_t rows)
+  orc_reader_options_builder& skip_rows(int64_t rows)
   {
     options.set_skip_rows(rows);
     return *this;
@@ -315,7 +315,7 @@ class orc_reader_options_builder {
    * @param nrows Number of rows
    * @return this for chaining
    */
-  orc_reader_options_builder& num_rows(size_type nrows)
+  orc_reader_options_builder& num_rows(int64_t nrows)
   {
     options.set_num_rows(nrows);
     return *this;
@@ -406,6 +406,144 @@ table_with_metadata read_orc(
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief The chunked orc reader class to read an ORC file iteratively into a series of
+ * tables, chunk by chunk.
+ *
+ * This class is designed to address the reading issue when reading very large ORC files such
+ * that sizes of their columns exceed the limit that can be stored in cudf columns. By reading the
+ * file content by chunks using this class, each chunk is guaranteed to have its size stay within
+ * the given limit.
+ */
+class chunked_orc_reader {
+ public:
+  /**
+   * @brief Default constructor, this should never be used.
+   *
+   * This is added just to satisfy cython.
+   */
+  chunked_orc_reader() = default;
+
+  /**
+   * @brief Construct the reader from input/output size limits, output row granularity, along with
+   * other ORC reader options.
+   *
+   * The typical usage should be similar to this:
+   * ```
+   *  do {
+   *    auto const chunk = reader.read_chunk();
+   *    // Process chunk
+   *  } while (reader.has_next());
+   *
+   * ```
+   *
+   * If `chunk_read_limit == 0` (i.e., no output limit) and `pass_read_limit == 0` (no temporary
+   * memory size limit), a call to `read_chunk()` will read the whole data source and return a table
+   * containing all rows.
+   *
+   * The `chunk_read_limit` parameter controls the size of the output table to be returned per
+   * `read_chunk()` call. If the user specifies a 100 MB limit, the reader will attempt to return
+   * tables that have a total bytes size (over all columns) of 100 MB or less.
+   * This is a soft limit and the code will not fail if it cannot satisfy the limit.
+   *
+   * The `pass_read_limit` parameter controls how much temporary memory is used in the entire
+   * process of loading, decompressing and decoding of data. Again, this is also a soft limit and
+   * the reader will try to make the best effort.
+   *
+   * Finally, the parameter `output_row_granularity` controls the changes in row number of the
+   * output chunk. For each call to `read_chunk()`, with respect to the given `pass_read_limit`, a
+   * subset of stripes may be loaded, decompressed and decoded into an intermediate table. The
+   * reader will then subdivide that table into smaller tables for final output using
+   * `output_row_granularity` as the subdivision step.
+   *
+   * @param chunk_read_limit Limit on total number of bytes to be returned per `read_chunk()` call,
+   *        or `0` if there is no limit
+   * @param pass_read_limit Limit on temporary memory usage for reading the data sources,
+   *        or `0` if there is no limit
+   * @param output_row_granularity The granularity parameter used for subdividing the decoded
+   *        table for final output
+   * @param options Settings for controlling reading behaviors
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   * @param mr Device memory resource to use for device memory allocation
+   *
+   * @throw cudf::logic_error if `output_row_granularity` is non-positive
+   */
+  explicit chunked_orc_reader(
+    std::size_t chunk_read_limit,
+    std::size_t pass_read_limit,
+    size_type output_row_granularity,
+    orc_reader_options const& options,
+    rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+  /**
+   * @brief Construct the reader from input/output size limits along with other ORC reader options.
+   *
+   * This constructor implicitly calls the other constructor with `output_row_granularity` set to
+   * `DEFAULT_OUTPUT_ROW_GRANULARITY` rows.
+   *
+   * @param chunk_read_limit Limit on total number of bytes to be returned per `read_chunk()` call,
+   *        or `0` if there is no limit
+   * @param pass_read_limit Limit on temporary memory usage for reading the data sources,
+   *        or `0` if there is no limit
+   * @param options Settings for controlling reading behaviors
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   * @param mr Device memory resource to use for device memory allocation
+   */
+  explicit chunked_orc_reader(
+    std::size_t chunk_read_limit,
+    std::size_t pass_read_limit,
+    orc_reader_options const& options,
+    rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+  /**
+   * @brief Construct the reader from output size limits along with other ORC reader options.
+   *
+   * This constructor implicitly calls the other constructor with `pass_read_limit` set to `0` and
+   * `output_row_granularity` set to `DEFAULT_OUTPUT_ROW_GRANULARITY` rows.
+   *
+   * @param chunk_read_limit Limit on total number of bytes to be returned per `read_chunk()` call,
+   *        or `0` if there is no limit
+   * @param options Settings for controlling reading behaviors
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   * @param mr Device memory resource to use for device memory allocation
+   */
+  explicit chunked_orc_reader(
+    std::size_t chunk_read_limit,
+    orc_reader_options const& options,
+    rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+  /**
+   * @brief Destructor, destroying the internal reader instance.
+   */
+  ~chunked_orc_reader();
+
+  /**
+   * @brief Check if there is any data in the given data sources that has not yet been read.
+   *
+   * @return A boolean value indicating if there is any data left to read
+   */
+  [[nodiscard]] bool has_next() const;
+
+  /**
+   * @brief Read a chunk of rows in the given data sources.
+   *
+   * The sequence of returned tables, if concatenated in order, is guaranteed to form the same
+   * complete dataset as reading the entire given data sources at once.
+   *
+   * An empty table will be returned if the given sources are empty, or all the data has
+   * been read and returned by the previous calls.
+   *
+   * @return An output `cudf::table` along with its metadata
+   */
+  [[nodiscard]] table_with_metadata read_chunk() const;
+
+ private:
+  std::unique_ptr<cudf::io::orc::detail::chunked_reader> reader;
+};
+
 /** @} */  // end of group
 /**
  * @addtogroup io_writers
diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh
index d0d5b1ad823..c550ad5b94f 100644
--- a/cpp/include/cudf/lists/detail/scatter.cuh
+++ b/cpp/include/cudf/lists/detail/scatter.cuh
@@ -101,7 +101,7 @@ std::unique_ptr<column> scatter_impl(rmm::device_uvector<unbound_list_view> cons
                                      rmm::cuda_stream_view stream,
                                      rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(column_types_equal(source, target), "Mismatched column types.");
+  CUDF_EXPECTS(have_same_types(source, target), "Mismatched column types.");
 
   auto const child_column_type = lists_column_view(target).child().type();
 
diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh
index 94bce6bddd5..fcd74bebfe8 100644
--- a/cpp/include/cudf/strings/detail/gather.cuh
+++ b/cpp/include/cudf/strings/detail/gather.cuh
@@ -19,23 +19,19 @@
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/offsets_iterator_factory.cuh>
-#include <cudf/detail/sizes_to_offsets_iterator.cuh>
 #include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 #include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
-#include <thrust/advance.h>
 #include <thrust/binary_search.h>
 #include <thrust/distance.h>
 #include <thrust/execution_policy.h>
-#include <thrust/functional.h>
-#include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
 
 namespace cudf {
@@ -226,7 +222,7 @@ rmm::device_uvector<char> gather_chars(StringIterator strings_begin,
                                        MapIterator map_begin,
                                        MapIterator map_end,
                                        cudf::detail::input_offsetalator const offsets,
-                                       size_type chars_bytes,
+                                       int64_t chars_bytes,
                                        rmm::cuda_stream_view stream,
                                        rmm::device_async_resource_ref mr)
 {
@@ -239,9 +235,9 @@ rmm::device_uvector<char> gather_chars(StringIterator strings_begin,
   constexpr int warps_per_threadblock = 4;
   // String parallel strategy will be used if average string length is above this threshold.
   // Otherwise, char parallel strategy will be used.
-  constexpr size_type string_parallel_threshold = 32;
+  constexpr int64_t string_parallel_threshold = 32;
 
-  size_type average_string_length = chars_bytes / output_count;
+  int64_t const average_string_length = chars_bytes / output_count;
 
   if (average_string_length > string_parallel_threshold) {
     constexpr int max_threadblocks = 65536;
@@ -302,7 +298,7 @@ std::unique_ptr<cudf::column> gather(strings_column_view const& strings,
     strings.is_empty() ? make_empty_column(type_id::INT32)->view() : strings.offsets(),
     strings.offset());
 
-  auto offsets_itr = thrust::make_transform_iterator(
+  auto sizes_itr = thrust::make_transform_iterator(
     begin,
     cuda::proclaim_return_type<size_type>(
       [d_strings = *d_strings, d_in_offsets] __device__(size_type idx) {
@@ -310,8 +306,8 @@ std::unique_ptr<cudf::column> gather(strings_column_view const& strings,
         if (not d_strings.is_valid(idx)) { return 0; }
         return static_cast<size_type>(d_in_offsets[idx + 1] - d_in_offsets[idx]);
       }));
-  auto [out_offsets_column, total_bytes] =
-    cudf::detail::make_offsets_child_column(offsets_itr, offsets_itr + output_count, stream, mr);
+  auto [out_offsets_column, total_bytes] = cudf::strings::detail::make_offsets_child_column(
+    sizes_itr, sizes_itr + output_count, stream, mr);
 
   // build chars column
   auto const offsets_view =
diff --git a/cpp/include/cudf/table/table_view.hpp b/cpp/include/cudf/table/table_view.hpp
index 4f3b23747e6..ad12b1eef4e 100644
--- a/cpp/include/cudf/table/table_view.hpp
+++ b/cpp/include/cudf/table/table_view.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -339,15 +339,6 @@ bool has_nested_nullable_columns(table_view const& input);
  */
 std::vector<column_view> get_nullable_columns(table_view const& table);
 
-/**
- * @brief Checks if two `table_view`s have columns of same types
- *
- * @param lhs left-side table_view operand
- * @param rhs right-side table_view operand
- * @return boolean comparison result
- */
-bool have_same_types(table_view const& lhs, table_view const& rhs);
-
 /**
  * @brief Copy column_views from a table_view into another table_view according to
  * a column indices map.
diff --git a/cpp/include/cudf/utilities/type_checks.hpp b/cpp/include/cudf/utilities/type_checks.hpp
index b925fc8ae92..fd3b0581c11 100644
--- a/cpp/include/cudf/utilities/type_checks.hpp
+++ b/cpp/include/cudf/utilities/type_checks.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,11 +16,16 @@
 #pragma once
 
 #include <cudf/column/column_view.hpp>
+#include <cudf/scalar/scalar.hpp>
+
+#include <algorithm>
 
 namespace cudf {
 
 /**
- * @brief Compares the type of two `column_view`s
+ * @brief Compare the types of two `column_view`s
+ *
+ * @deprecated Since 24.06. Use cudf::have_same_types instead.
  *
  * This function returns true if the type of `lhs` equals that of `rhs`.
  * - For fixed point types, the scale is compared.
@@ -34,10 +39,11 @@ namespace cudf {
  * @param rhs The second `column_view` to compare
  * @return true if column types match
  */
-bool column_types_equal(column_view const& lhs, column_view const& rhs);
+[[deprecated]] bool column_types_equal(column_view const& lhs, column_view const& rhs);
 
 /**
  * @brief Compare the type IDs of two `column_view`s
+ *
  * This function returns true if the type of `lhs` equals that of `rhs`.
  * - For fixed point types, the scale is ignored.
  *
@@ -47,4 +53,98 @@ bool column_types_equal(column_view const& lhs, column_view const& rhs);
  */
 bool column_types_equivalent(column_view const& lhs, column_view const& rhs);
 
+/**
+ * @brief Compare the types of two `column_view`s
+ *
+ * This function returns true if the type of `lhs` equals that of `rhs`.
+ * - For fixed point types, the scale is compared.
+ * - For dictionary types, the types of the keys are compared if both are
+ *   non-empty columns.
+ * - For lists types, the types of child columns are compared recursively.
+ * - For struct types, the types of each field are compared in order.
+ * - For all other types, the `id` of `data_type` is compared.
+ *
+ * @param lhs The first `column_view` to compare
+ * @param rhs The second `column_view` to compare
+ * @return true if types match
+ */
+bool have_same_types(column_view const& lhs, column_view const& rhs);
+
+/**
+ * @brief Compare the types of a `column_view` and a `scalar`
+ *
+ * This function returns true if the type of `lhs` equals that of `rhs`.
+ * - For fixed point types, the scale is compared.
+ * - For dictionary column types, the type of the keys is compared to the
+ *   scalar type.
+ * - For lists types, the types of child columns are compared recursively.
+ * - For struct types, the types of each field are compared in order.
+ * - For all other types, the `id` of `data_type` is compared.
+ *
+ * @param lhs The `column_view` to compare
+ * @param rhs The `scalar` to compare
+ * @return true if types match
+ */
+bool have_same_types(column_view const& lhs, scalar const& rhs);
+
+/**
+ * @brief Compare the types of a `scalar` and a `column_view`
+ *
+ * This function returns true if the type of `lhs` equals that of `rhs`.
+ * - For fixed point types, the scale is compared.
+ * - For dictionary column types, the type of the keys is compared to the
+ *   scalar type.
+ * - For lists types, the types of child columns are compared recursively.
+ * - For struct types, the types of each field are compared in order.
+ * - For all other types, the `id` of `data_type` is compared.
+ *
+ * @param lhs The `scalar` to compare
+ * @param rhs The `column_view` to compare
+ * @return true if types match
+ */
+bool have_same_types(scalar const& lhs, column_view const& rhs);
+
+/**
+ * @brief Compare the types of two `scalar`s
+ *
+ * This function returns true if the type of `lhs` equals that of `rhs`.
+ * - For fixed point types, the scale is compared.
+ * - For lists types, the types of child columns are compared recursively.
+ * - For struct types, the types of each field are compared in order.
+ * - For all other types, the `id` of `data_type` is compared.
+ *
+ * @param lhs The first `scalar` to compare
+ * @param rhs The second `scalar` to compare
+ * @return true if types match
+ */
+bool have_same_types(scalar const& lhs, scalar const& rhs);
+
+/**
+ * @brief Checks if two `table_view`s have columns of same types
+ *
+ * @param lhs left-side table_view operand
+ * @param rhs right-side table_view operand
+ * @return boolean comparison result
+ */
+bool have_same_types(table_view const& lhs, table_view const& rhs);
+
+/**
+ * @brief Compare the types of a range of `column_view` or `scalar` objects
+ *
+ * This function returns true if the range is empty, or if every element after the first has the
+ * same type as the first element, in the sense of cudf::have_same_types.
+ *
+ * @tparam ForwardIt Forward iterator
+ * @param first Iterator to the first element of the range
+ * @param last Iterator one past the last element of the range
+ * @return true if all types match, or if the range is empty
+ */
+template <typename ForwardIt>
+inline bool all_have_same_types(ForwardIt first, ForwardIt last)
+{
+  return first == last || std::all_of(std::next(first), last, [want = *first](auto const& c) {
+           return cudf::have_same_types(want, c);
+         });
+}
+
 }  // namespace cudf
diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu
index 4da2e502ce6..d0faeea8336 100644
--- a/cpp/src/bitmask/null_mask.cu
+++ b/cpp/src/bitmask/null_mask.cu
@@ -269,8 +269,8 @@ CUDF_KERNEL void count_set_bits_kernel(bitmask_type const* bitmask,
 
   auto const first_word_index{word_index(first_bit_index)};
   auto const last_word_index{word_index(last_bit_index)};
-  thread_index_type const tid         = grid_1d::global_thread_id();
-  thread_index_type const stride      = grid_1d::grid_stride();
+  thread_index_type const tid         = grid_1d::global_thread_id<block_size>();
+  thread_index_type const stride      = grid_1d::grid_stride<block_size>();
   thread_index_type thread_word_index = tid + first_word_index;
   size_type thread_count{0};
 
diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu
index 7c57be8e7c0..47e74a5cb48 100644
--- a/cpp/src/copying/concatenate.cu
+++ b/cpp/src/copying/concatenate.cu
@@ -30,6 +30,8 @@
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
@@ -119,8 +121,8 @@ CUDF_KERNEL void concatenate_masks_kernel(column_device_view const* views,
                                           size_type number_of_mask_bits,
                                           size_type* out_valid_count)
 {
-  auto tidx         = cudf::detail::grid_1d::global_thread_id();
-  auto const stride = cudf::detail::grid_1d::grid_stride();
+  auto tidx         = cudf::detail::grid_1d::global_thread_id<block_size>();
+  auto const stride = cudf::detail::grid_1d::grid_stride<block_size>();
   auto active_mask  = __ballot_sync(0xFFFF'FFFFu, tidx < number_of_mask_bits);
 
   size_type warp_valid_count = 0;
@@ -461,12 +463,9 @@ void traverse_children::operator()<cudf::list_view>(host_span<column_view const>
  */
 void bounds_and_type_check(host_span<column_view const> cols, rmm::cuda_stream_view stream)
 {
-  CUDF_EXPECTS(std::all_of(cols.begin(),
-                           cols.end(),
-                           [expected_type = cols.front().type()](auto const& c) {
-                             return c.type() == expected_type;
-                           }),
-               "Type mismatch in columns to concatenate.");
+  CUDF_EXPECTS(cudf::all_have_same_types(cols.begin(), cols.end()),
+               "Type mismatch in columns to concatenate.",
+               cudf::data_type_error);
 
   // total size of all concatenated rows
   size_t const total_row_count =
diff --git a/cpp/src/copying/copy.cu b/cpp/src/copying/copy.cu
index 92fb2e61741..e86a1f8d6f1 100644
--- a/cpp/src/copying/copy.cu
+++ b/cpp/src/copying/copy.cu
@@ -26,6 +26,7 @@
 #include <cudf/strings/string_view.cuh>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
@@ -362,9 +363,10 @@ std::unique_ptr<column> copy_if_else(column_view const& lhs,
   CUDF_EXPECTS(boolean_mask.size() == lhs.size(),
                "Boolean mask column must be the same size as lhs and rhs columns",
                std::invalid_argument);
-  CUDF_EXPECTS(lhs.size() == rhs.size(), "Both columns must be of the size", std::invalid_argument);
   CUDF_EXPECTS(
-    lhs.type() == rhs.type(), "Both inputs must be of the same type", cudf::data_type_error);
+    lhs.size() == rhs.size(), "Both columns must be of the same size", std::invalid_argument);
+  CUDF_EXPECTS(
+    cudf::have_same_types(lhs, rhs), "Both inputs must be of the same type", cudf::data_type_error);
 
   return copy_if_else(lhs, rhs, lhs.has_nulls(), rhs.has_nulls(), boolean_mask, stream, mr);
 }
@@ -378,11 +380,8 @@ std::unique_ptr<column> copy_if_else(scalar const& lhs,
   CUDF_EXPECTS(boolean_mask.size() == rhs.size(),
                "Boolean mask column must be the same size as rhs column",
                std::invalid_argument);
-
-  auto rhs_type =
-    cudf::is_dictionary(rhs.type()) ? cudf::dictionary_column_view(rhs).keys_type() : rhs.type();
   CUDF_EXPECTS(
-    lhs.type() == rhs_type, "Both inputs must be of the same type", cudf::data_type_error);
+    cudf::have_same_types(rhs, lhs), "Both inputs must be of the same type", cudf::data_type_error);
 
   return copy_if_else(lhs, rhs, !lhs.is_valid(stream), rhs.has_nulls(), boolean_mask, stream, mr);
 }
@@ -396,11 +395,8 @@ std::unique_ptr<column> copy_if_else(column_view const& lhs,
   CUDF_EXPECTS(boolean_mask.size() == lhs.size(),
                "Boolean mask column must be the same size as lhs column",
                std::invalid_argument);
-
-  auto lhs_type =
-    cudf::is_dictionary(lhs.type()) ? cudf::dictionary_column_view(lhs).keys_type() : lhs.type();
   CUDF_EXPECTS(
-    lhs_type == rhs.type(), "Both inputs must be of the same type", cudf::data_type_error);
+    cudf::have_same_types(lhs, rhs), "Both inputs must be of the same type", cudf::data_type_error);
 
   return copy_if_else(lhs, rhs, lhs.has_nulls(), !rhs.is_valid(stream), boolean_mask, stream, mr);
 }
@@ -412,7 +408,7 @@ std::unique_ptr<column> copy_if_else(scalar const& lhs,
                                      rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(
-    lhs.type() == rhs.type(), "Both inputs must be of the same type", cudf::data_type_error);
+    cudf::have_same_types(lhs, rhs), "Both inputs must be of the same type", cudf::data_type_error);
   return copy_if_else(
     lhs, rhs, !lhs.is_valid(stream), !rhs.is_valid(stream), boolean_mask, stream, mr);
 }
diff --git a/cpp/src/copying/copy_range.cu b/cpp/src/copying/copy_range.cu
index d2ea7036952..dd18f99a3c8 100644
--- a/cpp/src/copying/copy_range.cu
+++ b/cpp/src/copying/copy_range.cu
@@ -32,6 +32,7 @@
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
@@ -147,8 +148,9 @@ std::unique_ptr<cudf::column> out_of_place_copy_range_dispatch::operator()<cudf:
   // check the keys in the source and target
   cudf::dictionary_column_view const dict_source(source);
   cudf::dictionary_column_view const dict_target(target);
-  CUDF_EXPECTS(dict_source.keys().type() == dict_target.keys().type(),
-               "dictionary keys must be the same type");
+  CUDF_EXPECTS(cudf::have_same_types(dict_source.keys(), dict_target.keys()),
+               "dictionary keys must be the same type",
+               cudf::data_type_error);
 
   // combine keys so both dictionaries have the same set
   auto target_matched =
@@ -211,7 +213,7 @@ void copy_range_in_place(column_view const& source,
                  (target_begin <= target.size() - (source_end - source_begin)),
                "Range is out of bounds.",
                std::out_of_range);
-  CUDF_EXPECTS(target.type() == source.type(), "Data type mismatch.", cudf::data_type_error);
+  CUDF_EXPECTS(cudf::have_same_types(target, source), "Data type mismatch.", cudf::data_type_error);
   CUDF_EXPECTS(target.nullable() || not source.has_nulls(),
                "target should be nullable if source has null values.",
                std::invalid_argument);
@@ -239,7 +241,7 @@ std::unique_ptr<column> copy_range(column_view const& source,
                  (target_begin <= target.size() - (source_end - source_begin)),
                "Range is out of bounds.",
                std::out_of_range);
-  CUDF_EXPECTS(target.type() == source.type(), "Data type mismatch.", cudf::data_type_error);
+  CUDF_EXPECTS(cudf::have_same_types(target, source), "Data type mismatch.", cudf::data_type_error);
 
   return cudf::type_dispatcher<dispatch_storage_type>(
     target.type(),
diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu
index cfcbe4724df..993ee074f14 100644
--- a/cpp/src/copying/scatter.cu
+++ b/cpp/src/copying/scatter.cu
@@ -32,6 +32,8 @@
 #include <cudf/structs/struct_view.hpp>
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
@@ -112,7 +114,7 @@ struct column_scalar_scatterer_impl {
                                      rmm::cuda_stream_view stream,
                                      rmm::device_async_resource_ref mr) const
   {
-    CUDF_EXPECTS(source.get().type() == target.type(),
+    CUDF_EXPECTS(cudf::have_same_types(target, source.get()),
                  "scalar and column types must match",
                  cudf::data_type_error);
 
@@ -145,7 +147,7 @@ struct column_scalar_scatterer_impl<string_view, MapIterator> {
                                      rmm::cuda_stream_view stream,
                                      rmm::device_async_resource_ref mr) const
   {
-    CUDF_EXPECTS(source.get().type() == target.type(),
+    CUDF_EXPECTS(cudf::have_same_types(target, source.get()),
                  "scalar and column types must match",
                  cudf::data_type_error);
 
@@ -315,12 +317,7 @@ std::unique_ptr<table> scatter(table_view const& source,
   CUDF_EXPECTS(scatter_map.size() <= source.num_rows(),
                "Size of scatter map must be equal to or less than source rows",
                std::invalid_argument);
-  CUDF_EXPECTS(std::equal(source.begin(),
-                          source.end(),
-                          target.begin(),
-                          [](auto const& col1, auto const& col2) {
-                            return col1.type().id() == col2.type().id();
-                          }),
+  CUDF_EXPECTS(cudf::have_same_types(source, target),
                "Column types do not match between source and target",
                cudf::data_type_error);
   CUDF_EXPECTS(not scatter_map.has_nulls(), "Scatter map contains nulls", std::invalid_argument);
@@ -452,14 +449,9 @@ std::unique_ptr<table> boolean_mask_scatter(table_view const& input,
                "Mask must be of Boolean type",
                cudf::data_type_error);
   // Count valid pair of input and columns as per type at each column index i
-  CUDF_EXPECTS(
-    std::all_of(thrust::counting_iterator<size_type>(0),
-                thrust::counting_iterator<size_type>(target.num_columns()),
-                [&input, &target](auto index) {
-                  return ((input.column(index).type().id()) == (target.column(index).type().id()));
-                }),
-    "Type mismatch in input column and target column",
-    cudf::data_type_error);
+  CUDF_EXPECTS(cudf::have_same_types(input, target),
+               "Type mismatch in input column and target column",
+               cudf::data_type_error);
 
   if (target.num_rows() != 0) {
     std::vector<std::unique_ptr<column>> out_columns(target.num_columns());
@@ -496,14 +488,13 @@ std::unique_ptr<table> boolean_mask_scatter(
                cudf::data_type_error);
 
   // Count valid pair of input and columns as per type at each column/scalar index i
-  CUDF_EXPECTS(
-    std::all_of(thrust::counting_iterator<size_type>(0),
-                thrust::counting_iterator<size_type>(target.num_columns()),
-                [&input, &target](auto index) {
-                  return (input[index].get().type().id() == target.column(index).type().id());
-                }),
-    "Type mismatch in input scalar and target column",
-    cudf::data_type_error);
+  CUDF_EXPECTS(std::all_of(thrust::counting_iterator<size_type>(0),
+                           thrust::counting_iterator<size_type>(target.num_columns()),
+                           [&input, &target](auto index) {
+                             return cudf::have_same_types(target.column(index), input[index].get());
+                           }),
+               "Type mismatch in input scalar and target column",
+               cudf::data_type_error);
 
   if (target.num_rows() != 0) {
     std::vector<std::unique_ptr<column>> out_columns(target.num_columns());
diff --git a/cpp/src/copying/shift.cu b/cpp/src/copying/shift.cu
index bdc741887f7..91254f21170 100644
--- a/cpp/src/copying/shift.cu
+++ b/cpp/src/copying/shift.cu
@@ -26,6 +26,7 @@
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_checks.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -158,7 +159,7 @@ std::unique_ptr<column> shift(column_view const& input,
                               rmm::cuda_stream_view stream,
                               rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(input.type() == fill_value.type(),
+  CUDF_EXPECTS(cudf::have_same_types(input, fill_value),
                "shift requires each fill value type to match the corresponding column type.",
                cudf::data_type_error);
 
diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu
index 5fd21ee0094..0ed9006f88b 100644
--- a/cpp/src/dictionary/add_keys.cu
+++ b/cpp/src/dictionary/add_keys.cu
@@ -29,6 +29,8 @@
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
@@ -54,7 +56,8 @@ std::unique_ptr<column> add_keys(dictionary_column_view const& dictionary_column
 {
   CUDF_EXPECTS(!new_keys.has_nulls(), "Keys must not have nulls");
   auto old_keys = dictionary_column.keys();  // [a,b,c,d,f]
-  CUDF_EXPECTS(new_keys.type() == old_keys.type(), "Keys must be the same type");
+  CUDF_EXPECTS(
+    cudf::have_same_types(new_keys, old_keys), "Keys must be the same type", cudf::data_type_error);
   // first, concatenate the keys together
   // [a,b,c,d,f] + [d,b,e] = [a,b,c,d,f,d,b,e]
   auto combined_keys = cudf::detail::concatenate(
diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu
index 62a6c816493..fdc3d9d0ecf 100644
--- a/cpp/src/dictionary/detail/concatenate.cu
+++ b/cpp/src/dictionary/detail/concatenate.cu
@@ -26,6 +26,8 @@
 #include <cudf/dictionary/dictionary_factories.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
@@ -82,13 +84,13 @@ struct compute_children_offsets_fn {
   }
 
   /**
-   * @brief Return the first keys().type of the dictionary columns.
+   * @brief Return the keys() of the first non-empty dictionary column.
    */
-  data_type get_keys_type()
+  column_view get_keys()
   {
     auto const view(*std::find_if(
       columns_ptrs.begin(), columns_ptrs.end(), [](auto pcv) { return pcv->size() > 0; }));
-    return dictionary_column_view(*view).keys().type();
+    return dictionary_column_view(*view).keys();
   }
 
   /**
@@ -214,14 +216,16 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
 
   // concatenate the keys (and check the keys match)
   compute_children_offsets_fn child_offsets_fn{columns};
-  auto keys_type = child_offsets_fn.get_keys_type();
+  auto expected_keys = child_offsets_fn.get_keys();
   std::vector<column_view> keys_views(columns.size());
-  std::transform(columns.begin(), columns.end(), keys_views.begin(), [keys_type](auto cv) {
+  std::transform(columns.begin(), columns.end(), keys_views.begin(), [expected_keys](auto cv) {
     auto dict_view = dictionary_column_view(cv);
     // empty column may not have keys so we create an empty column_view place-holder
-    if (dict_view.is_empty()) return column_view{keys_type, 0, nullptr, nullptr, 0};
+    if (dict_view.is_empty()) return column_view{expected_keys.type(), 0, nullptr, nullptr, 0};
     auto keys = dict_view.keys();
-    CUDF_EXPECTS(keys.type() == keys_type, "key types of all dictionary columns must match");
+    CUDF_EXPECTS(cudf::have_same_types(keys, expected_keys),
+                 "key types of all dictionary columns must match",
+                 cudf::data_type_error);
     return keys;
   });
   auto all_keys =
@@ -275,7 +279,7 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
 
   // now recompute the indices values for the new keys_column;
   // the keys offsets (pair.first) are for mapping to the input keys
-  auto indices_column = type_dispatcher(keys_type,
+  auto indices_column = type_dispatcher(expected_keys.type(),
                                         dispatch_compute_indices{},
                                         all_keys->view(),     // old keys
                                         all_indices->view(),  // old indices
diff --git a/cpp/src/dictionary/remove_keys.cu b/cpp/src/dictionary/remove_keys.cu
index 718ca419289..35387efa56b 100644
--- a/cpp/src/dictionary/remove_keys.cu
+++ b/cpp/src/dictionary/remove_keys.cu
@@ -26,6 +26,8 @@
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
@@ -155,7 +157,9 @@ std::unique_ptr<column> remove_keys(dictionary_column_view const& dictionary_col
 {
   CUDF_EXPECTS(!keys_to_remove.has_nulls(), "keys_to_remove must not have nulls");
   auto const keys_view = dictionary_column.keys();
-  CUDF_EXPECTS(keys_view.type() == keys_to_remove.type(), "keys types must match");
+  CUDF_EXPECTS(cudf::have_same_types(keys_view, keys_to_remove),
+               "keys types must match",
+               cudf::data_type_error);
 
   // locate keys to remove by searching the keys column
   auto const matches = cudf::detail::contains(keys_to_remove, keys_view, stream, mr);
diff --git a/cpp/src/dictionary/replace.cu b/cpp/src/dictionary/replace.cu
index bb6b08c243d..bc17dfd4bab 100644
--- a/cpp/src/dictionary/replace.cu
+++ b/cpp/src/dictionary/replace.cu
@@ -24,6 +24,8 @@
 #include <cudf/dictionary/detail/search.hpp>
 #include <cudf/dictionary/detail/update_keys.hpp>
 #include <cudf/dictionary/dictionary_factories.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
@@ -84,7 +86,9 @@ std::unique_ptr<column> replace_nulls(dictionary_column_view const& input,
 {
   if (input.is_empty()) { return cudf::empty_like(input.parent()); }
   if (!input.has_nulls()) { return std::make_unique<cudf::column>(input.parent(), stream, mr); }
-  CUDF_EXPECTS(input.keys().type() == replacement.keys().type(), "keys must match");
+  CUDF_EXPECTS(cudf::have_same_types(input.keys(), replacement.keys()),
+               "keys must match",
+               cudf::data_type_error);
   CUDF_EXPECTS(replacement.size() == input.size(), "column sizes must match");
 
   // first combine the keys so both input dictionaries have the same set
@@ -119,7 +123,9 @@ std::unique_ptr<column> replace_nulls(dictionary_column_view const& input,
   if (!input.has_nulls() || !replacement.is_valid(stream)) {
     return std::make_unique<cudf::column>(input.parent(), stream, mr);
   }
-  CUDF_EXPECTS(input.keys().type() == replacement.type(), "keys must match scalar type");
+  CUDF_EXPECTS(cudf::have_same_types(input.parent(), replacement),
+               "keys must match scalar type",
+               cudf::data_type_error);
 
   // first add the replacement to the keys so only the indices need to be processed
   auto input_matched = dictionary::detail::add_keys(
diff --git a/cpp/src/dictionary/search.cu b/cpp/src/dictionary/search.cu
index 680eadddba8..231619836f9 100644
--- a/cpp/src/dictionary/search.cu
+++ b/cpp/src/dictionary/search.cu
@@ -19,7 +19,9 @@
 #include <cudf/dictionary/detail/search.hpp>
 #include <cudf/dictionary/search.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_checks.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -72,10 +74,12 @@ struct find_index_fn {
                                      rmm::cuda_stream_view stream,
                                      rmm::device_async_resource_ref mr) const
   {
-    if (!key.is_valid(stream))
+    if (!key.is_valid(stream)) {
       return type_dispatcher(input.indices().type(), dispatch_scalar_index{}, 0, false, stream, mr);
-    CUDF_EXPECTS(input.keys().type() == key.type(),
-                 "search key type must match dictionary keys type");
+    }
+    CUDF_EXPECTS(cudf::have_same_types(input.parent(), key),
+                 "search key type must match dictionary keys type",
+                 cudf::data_type_error);
 
     using ScalarType = cudf::scalar_type_t<Element>;
     auto find_key    = static_cast<ScalarType const&>(key).value(stream);
@@ -114,10 +118,12 @@ struct find_insert_index_fn {
                                      rmm::cuda_stream_view stream,
                                      rmm::device_async_resource_ref mr) const
   {
-    if (!key.is_valid(stream))
+    if (!key.is_valid(stream)) {
       return type_dispatcher(input.indices().type(), dispatch_scalar_index{}, 0, false, stream, mr);
-    CUDF_EXPECTS(input.keys().type() == key.type(),
-                 "search key type must match dictionary keys type");
+    }
+    CUDF_EXPECTS(cudf::have_same_types(input.parent(), key),
+                 "search key type must match dictionary keys type",
+                 cudf::data_type_error);
 
     using ScalarType = cudf::scalar_type_t<Element>;
     auto find_key    = static_cast<ScalarType const&>(key).value(stream);
diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu
index b56eec9401a..08a33d40abe 100644
--- a/cpp/src/dictionary/set_keys.cu
+++ b/cpp/src/dictionary/set_keys.cu
@@ -29,6 +29,8 @@
 #include <cudf/dictionary/dictionary_factories.hpp>
 #include <cudf/stream_compaction.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
@@ -116,7 +118,6 @@ struct dispatch_compute_indices {
 
 }  // namespace
 
-//
 std::unique_ptr<column> set_keys(dictionary_column_view const& dictionary_column,
                                  column_view const& new_keys,
                                  rmm::cuda_stream_view stream,
@@ -124,7 +125,8 @@ std::unique_ptr<column> set_keys(dictionary_column_view const& dictionary_column
 {
   CUDF_EXPECTS(!new_keys.has_nulls(), "keys parameter must not have nulls");
   auto keys = dictionary_column.keys();
-  CUDF_EXPECTS(keys.type() == new_keys.type(), "keys types must match");
+  CUDF_EXPECTS(
+    cudf::have_same_types(keys, new_keys), "keys types must match", cudf::data_type_error);
 
   // copy the keys -- use cudf::distinct to make sure there are no duplicates,
   // then sort the results.
diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu
index c4d786bd73b..1fc9ed31c09 100644
--- a/cpp/src/filling/fill.cu
+++ b/cpp/src/filling/fill.cu
@@ -33,6 +33,7 @@
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
@@ -110,7 +111,7 @@ struct out_of_place_fill_range_dispatch {
                                            rmm::cuda_stream_view stream,
                                            rmm::device_async_resource_ref mr)
   {
-    CUDF_EXPECTS(input.type() == value.type(), "Data type mismatch.");
+    CUDF_EXPECTS(cudf::have_same_types(input, value), "Data type mismatch.", cudf::data_type_error);
     auto p_ret = std::make_unique<cudf::column>(input, stream, mr);
 
     if (end != begin) {  // otherwise no fill
@@ -137,7 +138,7 @@ std::unique_ptr<cudf::column> out_of_place_fill_range_dispatch::operator()<cudf:
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(input.type() == value.type(), "Data type mismatch.");
+  CUDF_EXPECTS(cudf::have_same_types(input, value), "Data type mismatch.", cudf::data_type_error);
   using ScalarType = cudf::scalar_type_t<cudf::string_view>;
   auto p_scalar    = static_cast<ScalarType const*>(&value);
   return cudf::strings::detail::fill(
@@ -153,7 +154,8 @@ std::unique_ptr<cudf::column> out_of_place_fill_range_dispatch::operator()<cudf:
 {
   if (input.is_empty()) return std::make_unique<cudf::column>(input, stream, mr);
   cudf::dictionary_column_view const target(input);
-  CUDF_EXPECTS(target.keys().type() == value.type(), "Data type mismatch.");
+  CUDF_EXPECTS(
+    cudf::have_same_types(target.parent(), value), "Data type mismatch.", cudf::data_type_error);
 
   // if the scalar is invalid, then just copy the column and fill the null mask
   if (!value.is_valid(stream)) {
@@ -219,7 +221,8 @@ void fill_in_place(mutable_column_view& destination,
                "Range is out of bounds.");
   CUDF_EXPECTS(destination.nullable() || value.is_valid(stream),
                "destination should be nullable or value should be non-null.");
-  CUDF_EXPECTS(destination.type() == value.type(), "Data type mismatch.");
+  CUDF_EXPECTS(
+    cudf::have_same_types(destination, value), "Data type mismatch.", cudf::data_type_error);
 
   if (end != begin) {  // otherwise no-op
     cudf::type_dispatcher(
diff --git a/cpp/src/filling/sequence.cu b/cpp/src/filling/sequence.cu
index f7067c3a91b..ee1745b8498 100644
--- a/cpp/src/filling/sequence.cu
+++ b/cpp/src/filling/sequence.cu
@@ -24,6 +24,7 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
@@ -128,7 +129,9 @@ std::unique_ptr<column> sequence(size_type size,
                                  rmm::cuda_stream_view stream,
                                  rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(init.type() == step.type(), "init and step must be of the same type.");
+  CUDF_EXPECTS(cudf::have_same_types(init, step),
+               "init and step must be of the same type.",
+               cudf::data_type_error);
   CUDF_EXPECTS(size >= 0, "size must be >= 0");
   CUDF_EXPECTS(is_numeric(init.type()), "Input scalar types must be numeric");
 
diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu
index 73cb4efd283..e43dfcb4d98 100644
--- a/cpp/src/groupby/groupby.cu
+++ b/cpp/src/groupby/groupby.cu
@@ -36,6 +36,7 @@
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
@@ -312,12 +313,15 @@ std::pair<std::unique_ptr<table>, std::unique_ptr<table>> groupby::shift(
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(values.num_columns() == static_cast<size_type>(fill_values.size()),
                "Mismatch number of fill_values and columns.");
-  CUDF_EXPECTS(
-    std::all_of(thrust::make_counting_iterator(0),
-                thrust::make_counting_iterator(values.num_columns()),
-                [&](auto i) { return values.column(i).type() == fill_values[i].get().type(); }),
-    "values and fill_value should have the same type.");
-
+  CUDF_EXPECTS(std::equal(values.begin(),
+                          values.end(),
+                          fill_values.cbegin(),
+                          fill_values.cend(),
+                          [](auto const& col, auto const& scalar) {
+                            return cudf::have_same_types(col, scalar.get());
+                          }),
+               "values and fill_value should have the same type.",
+               cudf::data_type_error);
   auto stream = cudf::get_default_stream();
   std::vector<std::unique_ptr<column>> results;
   auto const& group_offsets = helper().group_offsets(stream);
diff --git a/cpp/src/hash/md5_hash.cu b/cpp/src/hash/md5_hash.cu
index 8f490ada8ff..0b559e8e86c 100644
--- a/cpp/src/hash/md5_hash.cu
+++ b/cpp/src/hash/md5_hash.cu
@@ -309,7 +309,7 @@ std::unique_ptr<column> md5(table_view const& input,
   // Result column allocation and creation
   auto begin = thrust::make_constant_iterator(digest_size);
   auto [offsets_column, bytes] =
-    cudf::detail::make_offsets_child_column(begin, begin + input.num_rows(), stream, mr);
+    cudf::strings::detail::make_offsets_child_column(begin, begin + input.num_rows(), stream, mr);
 
   rmm::device_uvector<char> chars(bytes, stream, mr);
   auto d_chars = chars.data();
@@ -322,7 +322,7 @@ std::unique_ptr<column> md5(table_view const& input,
     thrust::make_counting_iterator(0),
     thrust::make_counting_iterator(input.num_rows()),
     [d_chars, device_input = *device_input] __device__(auto row_index) {
-      MD5Hasher hasher(d_chars + (row_index * digest_size));
+      MD5Hasher hasher(d_chars + (static_cast<int64_t>(row_index) * digest_size));
       for (auto const& col : device_input) {
         if (col.is_valid(row_index)) {
           if (col.type().id() == type_id::LIST) {
diff --git a/cpp/src/hash/sha_hash.cuh b/cpp/src/hash/sha_hash.cuh
index 005578cb2c2..6976241057e 100644
--- a/cpp/src/hash/sha_hash.cuh
+++ b/cpp/src/hash/sha_hash.cuh
@@ -518,7 +518,7 @@ std::unique_ptr<column> sha_hash(table_view const& input,
   // Result column allocation and creation
   auto begin = thrust::make_constant_iterator(Hasher::digest_size);
   auto [offsets_column, bytes] =
-    cudf::detail::make_offsets_child_column(begin, begin + input.num_rows(), stream, mr);
+    cudf::strings::detail::make_offsets_child_column(begin, begin + input.num_rows(), stream, mr);
 
   auto chars   = rmm::device_uvector<char>(bytes, stream, mr);
   auto d_chars = chars.data();
@@ -526,19 +526,20 @@ std::unique_ptr<column> sha_hash(table_view const& input,
   auto const device_input = table_device_view::create(input, stream);
 
   // Hash each row, hashing each element sequentially left to right
-  thrust::for_each(rmm::exec_policy(stream),
-                   thrust::make_counting_iterator(0),
-                   thrust::make_counting_iterator(input.num_rows()),
-                   [d_chars, device_input = *device_input] __device__(auto row_index) {
-                     Hasher hasher(d_chars + (row_index * Hasher::digest_size));
-                     for (auto const& col : device_input) {
-                       if (col.is_valid(row_index)) {
-                         cudf::type_dispatcher<dispatch_storage_type>(
-                           col.type(), HasherDispatcher(&hasher, col), row_index);
-                       }
-                     }
-                     hasher.finalize();
-                   });
+  thrust::for_each(
+    rmm::exec_policy(stream),
+    thrust::make_counting_iterator(0),
+    thrust::make_counting_iterator(input.num_rows()),
+    [d_chars, device_input = *device_input] __device__(auto row_index) {
+      Hasher hasher(d_chars + (static_cast<int64_t>(row_index) * Hasher::digest_size));
+      for (auto const& col : device_input) {
+        if (col.is_valid(row_index)) {
+          cudf::type_dispatcher<dispatch_storage_type>(
+            col.type(), HasherDispatcher(&hasher, col), row_index);
+        }
+      }
+      hasher.finalize();
+    });
 
   return make_strings_column(input.num_rows(), std::move(offsets_column), chars.release(), 0, {});
 }
diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp
index 3109a36cbcf..78ddd7f5ad5 100644
--- a/cpp/src/interop/dlpack.cpp
+++ b/cpp/src/interop/dlpack.cpp
@@ -21,6 +21,7 @@
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_checks.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -231,9 +232,9 @@ DLManagedTensor* to_dlpack(table_view const& input,
   DLDataType const dltype = data_type_to_DLDataType(type);
 
   // Ensure all columns are the same type
-  CUDF_EXPECTS(
-    std::all_of(input.begin(), input.end(), [type](auto const& col) { return col.type() == type; }),
-    "All columns required to have same data type");
+  CUDF_EXPECTS(cudf::all_have_same_types(input.begin(), input.end()),
+               "All columns required to have same data type",
+               cudf::data_type_error);
 
   // Ensure none of the columns have nulls
   CUDF_EXPECTS(
diff --git a/cpp/src/io/comp/cpu_unbz2.cpp b/cpp/src/io/comp/cpu_unbz2.cpp
index a116335b254..44535cff589 100644
--- a/cpp/src/io/comp/cpu_unbz2.cpp
+++ b/cpp/src/io/comp/cpu_unbz2.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -221,7 +221,7 @@ int32_t bz2_decompress_block(unbz_state_s* s)
   if (getbits(s, 1)) return BZ_DATA_ERROR;  // blockRandomized not supported (old bzip versions)
 
   s->origPtr = getbits(s, 24);
-  if (s->origPtr < 0 || s->origPtr > 10 + 100000 * s->blockSize100k) return BZ_DATA_ERROR;
+  if (s->origPtr > 10 + 100000 * s->blockSize100k) return BZ_DATA_ERROR;
 
   // Receive the mapping table
   inUse16 = getbits(s, 16);
@@ -436,7 +436,7 @@ int32_t bz2_decompress_block(unbz_state_s* s)
   }
 
   // Now we know what nblock is, we can do a better sanity check on s->origPtr.
-  if (s->origPtr < 0 || s->origPtr >= nblock) return BZ_DATA_ERROR;
+  if (s->origPtr >= nblock) return BZ_DATA_ERROR;
 
   // compute the T^(-1) vector
   {
diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu
index 335ce77e3e3..58a74654405 100644
--- a/cpp/src/io/csv/writer_impl.cu
+++ b/cpp/src/io/csv/writer_impl.cu
@@ -33,7 +33,7 @@
 #include <cudf/strings/detail/combine.hpp>
 #include <cudf/strings/detail/converters.hpp>
 #include <cudf/strings/detail/replace.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/utilities/error.hpp>
@@ -75,8 +75,9 @@ namespace {
 struct escape_strings_fn {
   column_device_view const d_column;
   string_view const d_delimiter;  // check for column delimiter
-  size_type* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   __device__ void write_char(char_utf8 chr, char*& d_buffer, size_type& bytes)
   {
@@ -89,7 +90,7 @@ struct escape_strings_fn {
   __device__ void operator()(size_type idx)
   {
     if (d_column.is_null(idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
 
@@ -115,7 +116,7 @@ struct escape_strings_fn {
     }
     if (quote_row) write_char(quote, d_buffer, bytes);
 
-    if (!d_chars) d_offsets[idx] = bytes;
+    if (!d_chars) { d_sizes[idx] = bytes; }
   }
 };
 
@@ -182,7 +183,7 @@ struct column_to_strings_fn {
     auto d_column = column_device_view::create(column_v, stream_);
     escape_strings_fn fn{*d_column, delimiter.value(stream_)};
     auto [offsets_column, chars] =
-      cudf::strings::detail::make_strings_children(fn, column_v.size(), stream_, mr_);
+      cudf::strings::detail::experimental::make_strings_children(fn, column_v.size(), stream_, mr_);
 
     return make_strings_column(column_v.size(),
                                std::move(offsets_column),
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index 12059dffa4e..98b010109ec 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -420,7 +420,7 @@ table_with_metadata read_orc(orc_reader_options const& options,
 
   auto datasources = make_datasources(options.get_source());
   auto reader = std::make_unique<orc::detail::reader>(std::move(datasources), options, stream, mr);
-  return reader->read(options);
+  return reader->read();
 }
 
 /**
@@ -440,6 +440,64 @@ void write_orc(orc_writer_options const& options, rmm::cuda_stream_view stream)
   writer->write(options.get_table());
 }
 
+chunked_orc_reader::chunked_orc_reader(std::size_t chunk_read_limit,
+                                       std::size_t pass_read_limit,
+                                       size_type output_row_granularity,
+                                       orc_reader_options const& options,
+                                       rmm::cuda_stream_view stream,
+                                       rmm::device_async_resource_ref mr)
+  : reader{std::make_unique<orc::detail::chunked_reader>(chunk_read_limit,
+                                                         pass_read_limit,
+                                                         output_row_granularity,
+                                                         make_datasources(options.get_source()),
+                                                         options,
+                                                         stream,
+                                                         mr)}
+{
+}
+
+chunked_orc_reader::chunked_orc_reader(std::size_t chunk_read_limit,
+                                       std::size_t pass_read_limit,
+                                       orc_reader_options const& options,
+                                       rmm::cuda_stream_view stream,
+                                       rmm::device_async_resource_ref mr)
+  : reader{std::make_unique<orc::detail::chunked_reader>(chunk_read_limit,
+                                                         pass_read_limit,
+                                                         make_datasources(options.get_source()),
+                                                         options,
+                                                         stream,
+                                                         mr)}
+{
+}
+
+chunked_orc_reader::chunked_orc_reader(std::size_t chunk_read_limit,
+                                       orc_reader_options const& options,
+                                       rmm::cuda_stream_view stream,
+                                       rmm::device_async_resource_ref mr)
+  : chunked_orc_reader(chunk_read_limit, 0UL, options, stream, mr)
+{
+}
+
+// This destructor destroys the internal reader instance.
+// Since the declaration of the internal `reader` object does not exist in the header, this
+// destructor needs to be defined in a separate source file that has access to that object's
+// declaration.
+chunked_orc_reader::~chunked_orc_reader() = default;
+
+bool chunked_orc_reader::has_next() const
+{
+  CUDF_FUNC_RANGE();
+  CUDF_EXPECTS(reader != nullptr, "Reader has not been constructed properly.");
+  return reader->has_next();
+}
+
+table_with_metadata chunked_orc_reader::read_chunk() const
+{
+  CUDF_FUNC_RANGE();
+  CUDF_EXPECTS(reader != nullptr, "Reader has not been constructed properly.");
+  return reader->read_chunk();
+}
+
 /**
  * @copydoc cudf::io::orc_chunked_writer::orc_chunked_writer
  */
diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index 7117af8948b..631f8adbd6d 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -564,7 +564,7 @@ void make_device_json_column(device_span<SymbolT const> input,
     }
   };
   auto init_to_zero = [stream](auto& v) {
-    thrust::uninitialized_fill(rmm::exec_policy(stream), v.begin(), v.end(), 0);
+    thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), v.begin(), v.end(), 0);
   };
 
   auto initialize_json_columns = [&](auto i, auto& col) {
@@ -625,13 +625,14 @@ void make_device_json_column(device_span<SymbolT const> input,
   // find column_ids which are values, but should be ignored in validity
   std::vector<uint8_t> ignore_vals(num_columns, 0);
   std::vector<uint8_t> is_mixed_type_column(num_columns, 0);
+  std::vector<uint8_t> is_pruned(num_columns, 0);
   columns.try_emplace(parent_node_sentinel, std::ref(root));
 
-  for (auto const this_col_id : unique_col_ids) {
-    if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) {
-      continue;
-    }
-    // Struct, List, String, Value
+  auto name_and_parent_index = [&is_array_of_arrays,
+                                &row_array_parent_col_id,
+                                &column_parent_ids,
+                                &column_categories,
+                                &column_names](auto this_col_id) {
     std::string name   = "";
     auto parent_col_id = column_parent_ids[this_col_id];
     if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) {
@@ -647,11 +648,46 @@ void make_device_json_column(device_span<SymbolT const> input,
     } else {
       CUDF_FAIL("Unexpected parent column category");
     }
+    return std::pair{name, parent_col_id};
+  };
+
+  // Prune columns that are not required to be parsed.
+  if (options.is_enabled_prune_columns()) {
+    for (auto const this_col_id : unique_col_ids) {
+      if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) {
+        continue;
+      }
+      // Struct, List, String, Value
+      auto [name, parent_col_id] = name_and_parent_index(this_col_id);
+      // get path of this column, and get its dtype if present in options
+      auto const nt                             = tree_path.get_path(this_col_id);
+      std::optional<data_type> const user_dtype = get_path_data_type(nt, options);
+      if (!user_dtype.has_value() and parent_col_id != parent_node_sentinel) {
+        is_pruned[this_col_id] = 1;
+        continue;
+      } else {
+        // make sure all its parents are not pruned.
+        while (parent_col_id != parent_node_sentinel and is_pruned[parent_col_id] == 1) {
+          is_pruned[parent_col_id] = 0;
+          parent_col_id            = column_parent_ids[parent_col_id];
+        }
+      }
+    }
+  }
+
+  // Build the column tree; this also handles mixed types.
+  for (auto const this_col_id : unique_col_ids) {
+    if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) {
+      continue;
+    }
+    // Struct, List, String, Value
+    auto [name, parent_col_id] = name_and_parent_index(this_col_id);
 
-    if (parent_col_id != parent_node_sentinel && is_mixed_type_column[parent_col_id] == 1) {
-      // if parent is mixed type column, ignore this column.
-      is_mixed_type_column[this_col_id] = 1;
-      ignore_vals[this_col_id]          = 1;
+    // if parent is mixed type column or this column is pruned, ignore this column.
+    if (parent_col_id != parent_node_sentinel &&
+        (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id])) {
+      ignore_vals[this_col_id] = 1;
+      if (is_mixed_type_column[parent_col_id]) { is_mixed_type_column[this_col_id] = 1; }
       continue;
     }
 
@@ -714,12 +750,13 @@ void make_device_json_column(device_span<SymbolT const> input,
                      "A mix of lists and structs within the same column is not supported");
       }
     }
+
     if (is_enabled_mixed_types_as_string) {
       // get path of this column, check if it is a struct forced as string, and enforce it
-      auto nt                          = tree_path.get_path(this_col_id);
-      std::optional<data_type> user_dt = get_path_data_type(nt, options);
-      if (column_categories[this_col_id] == NC_STRUCT and user_dt.has_value() and
-          user_dt.value().id() == type_id::STRING) {
+      auto const nt                             = tree_path.get_path(this_col_id);
+      std::optional<data_type> const user_dtype = get_path_data_type(nt, options);
+      if (column_categories[this_col_id] == NC_STRUCT and user_dtype.has_value() and
+          user_dtype.value().id() == type_id::STRING) {
         is_mixed_type_column[this_col_id] = 1;
         column_categories[this_col_id]    = NC_STR;
       }
@@ -873,25 +910,27 @@ void make_device_json_column(device_span<SymbolT const> input,
   for (auto& [id, col_ref] : columns) {
     auto& col = col_ref.get();
     if (col.type == json_col_t::StringColumn) {
-      thrust::inclusive_scan(rmm::exec_policy(stream),
+      thrust::inclusive_scan(rmm::exec_policy_nosync(stream),
                              col.string_offsets.begin(),
                              col.string_offsets.end(),
                              col.string_offsets.begin(),
                              thrust::maximum<json_column::row_offset_t>{});
     } else if (col.type == json_col_t::ListColumn) {
-      thrust::inclusive_scan(rmm::exec_policy(stream),
+      thrust::inclusive_scan(rmm::exec_policy_nosync(stream),
                              col.child_offsets.begin(),
                              col.child_offsets.end(),
                              col.child_offsets.begin(),
                              thrust::maximum<json_column::row_offset_t>{});
     }
   }
+  stream.synchronize();
 }
 
 std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_column_to_cudf_column(
   device_json_column& json_col,
   device_span<SymbolT const> d_input,
   cudf::io::parse_options const& options,
+  bool prune_columns,
   std::optional<schema_element> schema,
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr)
@@ -982,13 +1021,16 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
       for (auto const& col_name : json_col.column_order) {
         auto const& col = json_col.child_columns.find(col_name);
         column_names.emplace_back(col->first);
-        auto& child_col            = col->second;
-        auto [child_column, names] = device_json_column_to_cudf_column(
-          child_col, d_input, options, get_child_schema(col_name), stream, mr);
-        CUDF_EXPECTS(num_rows == child_column->size(),
-                     "All children columns must have the same size");
-        child_columns.push_back(std::move(child_column));
-        column_names.back().children = names;
+        auto& child_col           = col->second;
+        auto child_schema_element = get_child_schema(col_name);
+        if (!prune_columns or child_schema_element.has_value()) {
+          auto [child_column, names] = device_json_column_to_cudf_column(
+            child_col, d_input, options, prune_columns, child_schema_element, stream, mr);
+          CUDF_EXPECTS(num_rows == child_column->size(),
+                       "All children columns must have the same size");
+          child_columns.push_back(std::move(child_column));
+          column_names.back().children = names;
+        }
       }
       auto [result_bitmask, null_count] = make_validity(json_col);
       // The null_mask is set after creation of struct column is to skip the superimpose_nulls and
@@ -1011,8 +1053,11 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
                                                      rmm::device_buffer{},
                                                      0);
       // Create children column
+      auto child_schema_element = json_col.child_columns.empty()
+                                    ? std::optional<schema_element>{}
+                                    : get_child_schema(json_col.child_columns.begin()->first);
       auto [child_column, names] =
-        json_col.child_columns.empty()
+        json_col.child_columns.empty() or (prune_columns and !child_schema_element.has_value())
           ? std::pair<std::unique_ptr<column>,
                       // EMPTY type could not used because gather throws exception on EMPTY type.
                       std::vector<column_name_info>>{std::make_unique<column>(
@@ -1022,13 +1067,13 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
                                                        rmm::device_buffer{},
                                                        0),
                                                      std::vector<column_name_info>{}}
-          : device_json_column_to_cudf_column(
-              json_col.child_columns.begin()->second,
-              d_input,
-              options,
-              get_child_schema(json_col.child_columns.begin()->first),
-              stream,
-              mr);
+          : device_json_column_to_cudf_column(json_col.child_columns.begin()->second,
+                                              d_input,
+                                              options,
+                                              prune_columns,
+                                              child_schema_element,
+                                              stream,
+                                              mr);
       column_names.back().children      = names;
       auto [result_bitmask, null_count] = make_validity(json_col);
       auto ret_col                      = make_lists_column(num_rows,
@@ -1140,8 +1185,6 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
   size_type column_index = 0;
   for (auto const& col_name : root_struct_col.column_order) {
     auto& json_col = root_struct_col.child_columns.find(col_name)->second;
-    // Insert this columns name into the schema
-    out_column_names.emplace_back(col_name);
 
     std::optional<schema_element> child_schema_element = std::visit(
       cudf::detail::visitor_overload{
@@ -1184,18 +1227,28 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
     debug_schema_print(child_schema_element);
 #endif
 
-    // Get this JSON column's cudf column and schema info, (modifies json_col)
-    auto [cudf_col, col_name_info] = device_json_column_to_cudf_column(
-      json_col, d_input, parse_opt, child_schema_element, stream, mr);
-    // TODO: RangeIndex as DataFrame.columns names for array of arrays
-    // if (is_array_of_arrays) {
-    //   col_name_info.back().name = "";
-    // }
-
-    out_column_names.back().children = std::move(col_name_info);
-    out_columns.emplace_back(std::move(cudf_col));
-
-    column_index++;
+    if (!options.is_enabled_prune_columns() or child_schema_element.has_value()) {
+      // Get this JSON column's cudf column and schema info, (modifies json_col)
+      auto [cudf_col, col_name_info] =
+        device_json_column_to_cudf_column(json_col,
+                                          d_input,
+                                          parse_opt,
+                                          options.is_enabled_prune_columns(),
+                                          child_schema_element,
+                                          stream,
+                                          mr);
+      // Insert this column's name into the schema
+      out_column_names.emplace_back(col_name);
+      // TODO: RangeIndex as DataFrame.columns names for array of arrays
+      // if (is_array_of_arrays) {
+      //   col_name_info.back().name = "";
+      // }
+
+      out_column_names.back().children = std::move(col_name_info);
+      out_columns.emplace_back(std::move(cudf_col));
+
+      column_index++;
+    }
   }
 
   return table_with_metadata{std::make_unique<table>(std::move(out_columns)), {out_column_names}};
diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu
index eb06ea0177e..ca56a12eb36 100644
--- a/cpp/src/io/json/json_normalization.cu
+++ b/cpp/src/io/json/json_normalization.cu
@@ -298,9 +298,9 @@ struct TransduceToNormalizedWS {
 
 namespace detail {
 
-rmm::device_uvector<SymbolT> normalize_single_quotes(rmm::device_uvector<SymbolT>&& inbuf,
-                                                     rmm::cuda_stream_view stream,
-                                                     rmm::device_async_resource_ref mr)
+void normalize_single_quotes(datasource::owning_buffer<rmm::device_uvector<SymbolT>>& indata,
+                             rmm::cuda_stream_view stream,
+                             rmm::device_async_resource_ref mr)
 {
   auto parser = fst::detail::make_fst(
     fst::detail::make_symbol_group_lut(normalize_quotes::qna_sgs),
@@ -308,10 +308,10 @@ rmm::device_uvector<SymbolT> normalize_single_quotes(rmm::device_uvector<SymbolT
     fst::detail::make_translation_functor(normalize_quotes::TransduceToNormalizedQuotes{}),
     stream);
 
-  rmm::device_uvector<SymbolT> outbuf(inbuf.size() * 2, stream, mr);
+  rmm::device_uvector<SymbolT> outbuf(indata.size() * 2, stream, mr);
   rmm::device_scalar<SymbolOffsetT> outbuf_size(stream, mr);
-  parser.Transduce(inbuf.data(),
-                   static_cast<SymbolOffsetT>(inbuf.size()),
+  parser.Transduce(indata.data(),
+                   static_cast<SymbolOffsetT>(indata.size()),
                    outbuf.data(),
                    thrust::make_discard_iterator(),
                    outbuf_size.data(),
@@ -319,12 +319,13 @@ rmm::device_uvector<SymbolT> normalize_single_quotes(rmm::device_uvector<SymbolT
                    stream);
 
   outbuf.resize(outbuf_size.value(stream), stream);
-  return outbuf;
+  datasource::owning_buffer<rmm::device_uvector<SymbolT>> outdata(std::move(outbuf));
+  std::swap(indata, outdata);
 }
 
-rmm::device_uvector<SymbolT> normalize_whitespace(rmm::device_uvector<SymbolT>&& inbuf,
-                                                  rmm::cuda_stream_view stream,
-                                                  rmm::device_async_resource_ref mr)
+void normalize_whitespace(datasource::owning_buffer<rmm::device_uvector<SymbolT>>& indata,
+                          rmm::cuda_stream_view stream,
+                          rmm::device_async_resource_ref mr)
 {
   auto parser = fst::detail::make_fst(
     fst::detail::make_symbol_group_lut(normalize_whitespace::wna_sgs),
@@ -332,10 +333,10 @@ rmm::device_uvector<SymbolT> normalize_whitespace(rmm::device_uvector<SymbolT>&&
     fst::detail::make_translation_functor(normalize_whitespace::TransduceToNormalizedWS{}),
     stream);
 
-  rmm::device_uvector<SymbolT> outbuf(inbuf.size(), stream, mr);
+  rmm::device_uvector<SymbolT> outbuf(indata.size(), stream, mr);
   rmm::device_scalar<SymbolOffsetT> outbuf_size(stream, mr);
-  parser.Transduce(inbuf.data(),
-                   static_cast<SymbolOffsetT>(inbuf.size()),
+  parser.Transduce(indata.data(),
+                   static_cast<SymbolOffsetT>(indata.size()),
                    outbuf.data(),
                    thrust::make_discard_iterator(),
                    outbuf_size.data(),
@@ -343,7 +344,8 @@ rmm::device_uvector<SymbolT> normalize_whitespace(rmm::device_uvector<SymbolT>&&
                    stream);
 
   outbuf.resize(outbuf_size.value(stream), stream);
-  return outbuf;
+  datasource::owning_buffer<rmm::device_uvector<SymbolT>> outdata(std::move(outbuf));
+  std::swap(indata, outdata);
 }
 
 }  // namespace detail
diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp
index a302785cee8..5817a01c21f 100644
--- a/cpp/src/io/json/nested_json.hpp
+++ b/cpp/src/io/json/nested_json.hpp
@@ -302,9 +302,16 @@ reduce_to_column_tree(tree_meta_t& tree,
 cudf::io::parse_options parsing_options(cudf::io::json_reader_options const& options,
                                         rmm::cuda_stream_view stream);
 
-/** @copydoc host_parse_nested_json
+/**
+ * @brief Parses the given JSON string and generates table from the given input.
+ *
  * All processing is done in device memory.
  *
+ * @param input The JSON input
+ * @param options Parsing options specifying the parsing behaviour
+ * @param stream The CUDA stream to which kernels are dispatched
+ * @param mr Optional, resource with which to allocate
+ * @return The data parsed from the given JSON input
  */
 table_with_metadata device_parse_nested_json(device_span<SymbolT const> input,
                                              cudf::io::json_reader_options const& options,
@@ -319,7 +326,7 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> input,
  * @return data type of the column if present
  */
 std::optional<data_type> get_path_data_type(
-  host_span<std::pair<std::string, cudf::io::json::NodeT>> path,
+  host_span<std::pair<std::string, cudf::io::json::NodeT> const> path,
   cudf::io::json_reader_options const& options);
 
 /**
@@ -337,20 +344,6 @@ struct path_from_tree {
   std::vector<path_rep> get_path(NodeIndexT this_col_id);
 };
 
-/**
- * @brief Parses the given JSON string and generates table from the given input.
- *
- * @param input The JSON input
- * @param options Parsing options specifying the parsing behaviour
- * @param stream The CUDA stream to which kernels are dispatched
- * @param mr Optional, resource with which to allocate
- * @return The data parsed from the given JSON input
- */
-table_with_metadata host_parse_nested_json(device_span<SymbolT const> input,
-                                           cudf::io::json_reader_options const& options,
-                                           rmm::cuda_stream_view stream,
-                                           rmm::device_async_resource_ref mr);
-
 }  // namespace detail
 
 }  // namespace cudf::io::json
diff --git a/cpp/src/io/json/parser_features.cpp b/cpp/src/io/json/parser_features.cpp
index 740b7523cc1..4caa5cd9e24 100644
--- a/cpp/src/io/json/parser_features.cpp
+++ b/cpp/src/io/json/parser_features.cpp
@@ -58,8 +58,15 @@ std::optional<schema_element> child_schema_element(std::string const& col_name,
 // "a": [ null]         {"a", list}, {"element", str}
 // back() is root.
 // front() is leaf.
+/**
+ * @brief Get the path data type of a column by path if present in input schema
+ *
+ * @param path path of the json column
+ * @param root root of input schema element
+ * @return data type of the column if present, otherwise std::nullopt
+ */
 std::optional<data_type> get_path_data_type(
-  host_span<std::pair<std::string, cudf::io::json::NodeT>> path, schema_element const& root)
+  host_span<std::pair<std::string, cudf::io::json::NodeT> const> path, schema_element const& root)
 {
   if (path.empty() || path.size() == 1) {
     return root.type;
@@ -81,7 +88,7 @@ std::optional<data_type> get_path_data_type(
 }
 
 std::optional<data_type> get_path_data_type(
-  host_span<std::pair<std::string, cudf::io::json::NodeT>> path,
+  host_span<std::pair<std::string, cudf::io::json::NodeT> const> path,
   cudf::io::json_reader_options const& options)
 {
   if (path.empty()) return {};
@@ -98,11 +105,11 @@ std::optional<data_type> get_path_data_type(
 std::vector<path_from_tree::path_rep> path_from_tree::get_path(NodeIndexT this_col_id)
 {
   std::vector<path_rep> path;
-  // TODO Need to stop at row root. so, how to find row root?
+  // stops at root.
   while (this_col_id != parent_node_sentinel) {
     auto type        = column_categories[this_col_id];
     std::string name = "";
-    // TODO make this ifelse into a separate lambda function, along with parent_col_id.
+    // code same as name_and_parent_index lambda.
     auto parent_col_id = column_parent_ids[this_col_id];
     if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) {
       if (is_array_of_arrays && parent_col_id == row_array_parent_col_id) {
diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index 81ef3a51afc..0ead5c56264 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -20,10 +20,13 @@
 #include "read_json.hpp"
 
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/stream_pool.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/io/detail/json.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/span.hpp>
 
+#include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 #include <rmm/resource_ref.hpp>
 
@@ -49,17 +52,20 @@ size_t sources_size(host_span<std::unique_ptr<datasource>> const sources,
 /**
  * @brief Read from array of data sources into RMM buffer
  *
+ * @param buffer Device span buffer to which data is read
  * @param sources Array of data sources
  * @param compression Compression format of source
  * @param range_offset Number of bytes to skip from source start
  * @param range_size Number of bytes to read from source
  * @param stream CUDA stream used for device memory operations and kernel launches
+ * @returns A subspan of the input device span containing data read
  */
-rmm::device_uvector<char> ingest_raw_input(host_span<std::unique_ptr<datasource>> sources,
-                                           compression_type compression,
-                                           size_t range_offset,
-                                           size_t range_size,
-                                           rmm::cuda_stream_view stream)
+device_span<char> ingest_raw_input(device_span<char> buffer,
+                                   host_span<std::unique_ptr<datasource>> sources,
+                                   compression_type compression,
+                                   size_t range_offset,
+                                   size_t range_size,
+                                   rmm::cuda_stream_view stream)
 {
   CUDF_FUNC_RANGE();
   // We append a line delimiter between two files to make sure the last line of file i and the first
@@ -68,33 +74,43 @@ rmm::device_uvector<char> ingest_raw_input(host_span<std::unique_ptr<datasource>
   auto constexpr num_delimiter_chars = 1;
   auto const num_extra_delimiters    = num_delimiter_chars * (sources.size() - 1);
 
-  // Iterate through the user defined sources and read the contents into the local buffer
-  auto const total_source_size =
-    sources_size(sources, range_offset, range_size) + num_extra_delimiters;
-
   if (compression == compression_type::NONE) {
     std::vector<size_type> delimiter_map{};
+    std::vector<size_t> prefsum_source_sizes(sources.size());
+    std::vector<std::unique_ptr<datasource::buffer>> h_buffers;
     delimiter_map.reserve(sources.size());
-    auto d_buffer     = rmm::device_uvector<char>(total_source_size, stream);
     size_t bytes_read = 0;
-    std::vector<std::unique_ptr<datasource::buffer>> h_buffers;
-    for (auto const& source : sources) {
-      if (!source->is_empty()) {
-        auto data_size   = (range_size != 0) ? range_size : source->size();
-        auto destination = reinterpret_cast<uint8_t*>(d_buffer.data()) + bytes_read;
-        if (source->is_device_read_preferred(data_size)) {
-          bytes_read += source->device_read(range_offset, data_size, destination, stream);
-        } else {
-          h_buffers.emplace_back(source->host_read(range_offset, data_size));
-          auto const& h_buffer = h_buffers.back();
-          CUDF_CUDA_TRY(cudaMemcpyAsync(
-            destination, h_buffer->data(), h_buffer->size(), cudaMemcpyDefault, stream.value()));
-          bytes_read += h_buffer->size();
-        }
-        delimiter_map.push_back(bytes_read);
-        bytes_read += num_delimiter_chars;
+    std::transform_inclusive_scan(sources.begin(),
+                                  sources.end(),
+                                  prefsum_source_sizes.begin(),
+                                  std::plus<size_t>{},  // int would overflow past 2 GiB
+                                  [](std::unique_ptr<datasource> const& s) { return s->size(); });
+    auto upper =
+      std::upper_bound(prefsum_source_sizes.begin(), prefsum_source_sizes.end(), range_offset);
+    size_t start_source = std::distance(prefsum_source_sizes.begin(), upper);
+
+    auto remaining_bytes_to_read = std::min(range_size, prefsum_source_sizes.back() - range_offset);
+    range_offset -= start_source ? prefsum_source_sizes[start_source - 1] : 0;
+    for (size_t i = start_source; i < sources.size() && remaining_bytes_to_read; i++) {
+      if (sources[i]->is_empty()) continue;
+      auto data_size   = std::min(sources[i]->size() - range_offset, remaining_bytes_to_read);
+      auto destination = reinterpret_cast<uint8_t*>(buffer.data()) + bytes_read;
+      if (sources[i]->is_device_read_preferred(data_size)) {
+        bytes_read += sources[i]->device_read(range_offset, data_size, destination, stream);
+      } else {
+        h_buffers.emplace_back(sources[i]->host_read(range_offset, data_size));
+        auto const& h_buffer = h_buffers.back();
+        CUDF_CUDA_TRY(cudaMemcpyAsync(
+          destination, h_buffer->data(), h_buffer->size(), cudaMemcpyDefault, stream.value()));
+        bytes_read += h_buffer->size();
       }
+      range_offset = 0;
+      remaining_bytes_to_read -= data_size;  // bytes_read is cumulative and counts delimiters
+      delimiter_map.push_back(bytes_read);
+      bytes_read += num_delimiter_chars;
     }
+    // In the case where all sources are empty, bytes_read is zero
+    if (bytes_read) bytes_read -= num_delimiter_chars;
 
     // If this is a multi-file source, we scatter the JSON line delimiters between files
     if (sources.size() > 1) {
@@ -109,23 +125,25 @@ rmm::device_uvector<char> ingest_raw_input(host_span<std::unique_ptr<datasource>
                       delimiter_source,
                       delimiter_source + d_delimiter_map.size(),
                       d_delimiter_map.data(),
-                      d_buffer.data());
+                      buffer.data());
     }
-
     stream.synchronize();
-    return d_buffer;
-
-  } else {
-    auto buffer = std::vector<uint8_t>(total_source_size);
-    // Single read because only a single compressed source is supported
-    // Reading to host because decompression of a single block is much faster on the CPU
-    sources[0]->host_read(range_offset, total_source_size, buffer.data());
-    auto const uncomp_data = decompress(compression, buffer);
-    return cudf::detail::make_device_uvector_sync(
-      host_span<char const>{reinterpret_cast<char const*>(uncomp_data.data()), uncomp_data.size()},
-      stream,
-      rmm::mr::get_current_device_resource());
+    return buffer.first(bytes_read);
   }
+  // TODO: allow byte range reading from multiple compressed files.
+  auto remaining_bytes_to_read = std::min(range_size, sources[0]->size() - range_offset);
+  auto hbuffer                 = std::vector<uint8_t>(remaining_bytes_to_read);
+  // Single read because only a single compressed source is supported
+  // Reading to host because decompression of a single block is much faster on the CPU
+  sources[0]->host_read(range_offset, remaining_bytes_to_read, hbuffer.data());
+  auto uncomp_data = decompress(compression, hbuffer);
+  CUDF_CUDA_TRY(cudaMemcpyAsync(buffer.data(),
+                                reinterpret_cast<char*>(uncomp_data.data()),
+                                uncomp_data.size() * sizeof(char),
+                                cudaMemcpyHostToDevice,
+                                stream.value()));
+  stream.synchronize();
+  return buffer.first(uncomp_data.size());
 }
 
 size_type find_first_delimiter_in_chunk(host_span<std::unique_ptr<cudf::io::datasource>> sources,
@@ -133,21 +151,19 @@ size_type find_first_delimiter_in_chunk(host_span<std::unique_ptr<cudf::io::data
                                         char const delimiter,
                                         rmm::cuda_stream_view stream)
 {
-  auto const buffer = ingest_raw_input(sources,
-                                       reader_opts.get_compression(),
-                                       reader_opts.get_byte_range_offset(),
-                                       reader_opts.get_byte_range_size(),
-                                       stream);
+  auto const total_source_size =
+    sources_size(sources, reader_opts.get_byte_range_offset(), reader_opts.get_byte_range_size()) +
+    (sources.size() - 1);
+  rmm::device_uvector<char> buffer(total_source_size, stream);
+  auto const readbufspan = ingest_raw_input(buffer,
+                                            sources,
+                                            reader_opts.get_compression(),
+                                            reader_opts.get_byte_range_offset(),
+                                            reader_opts.get_byte_range_size(),
+                                            stream);
-  return find_first_delimiter(buffer, delimiter, stream);
+  return find_first_delimiter(readbufspan, delimiter, stream);  // skip uninitialized tail
 }
 
-bool should_load_whole_source(json_reader_options const& opts, size_t source_size)
-{
-  auto const range_offset = opts.get_byte_range_offset();
-  auto const range_size   = opts.get_byte_range_size();
-  return range_offset == 0 and (range_size == 0 or range_size >= source_size);
-}
-
 /**
  * @brief Get the byte range between record starts and ends starting from the given range.
  *
@@ -159,48 +175,90 @@ bool should_load_whole_source(json_reader_options const& opts, size_t source_siz
  * @param sources Data sources to read from
  * @param reader_opts JSON reader options with range offset and range size
  * @param stream CUDA stream used for device memory operations and kernel launches
- * @return Byte range for parsing
+ * @returns Data source owning buffer enclosing the bytes read
  */
-auto get_record_range_raw_input(host_span<std::unique_ptr<datasource>> sources,
-                                json_reader_options const& reader_opts,
-                                rmm::cuda_stream_view stream)
+datasource::owning_buffer<rmm::device_uvector<char>> get_record_range_raw_input(
+  host_span<std::unique_ptr<datasource>> sources,
+  json_reader_options const& reader_opts,
+  rmm::cuda_stream_view stream)
 {
-  auto buffer = ingest_raw_input(sources,
-                                 reader_opts.get_compression(),
-                                 reader_opts.get_byte_range_offset(),
-                                 reader_opts.get_byte_range_size(),
-                                 stream);
-  if (should_load_whole_source(reader_opts, sources[0]->size())) return buffer;
-  auto first_delim_pos =
-    reader_opts.get_byte_range_offset() == 0 ? 0 : find_first_delimiter(buffer, '\n', stream);
+  CUDF_FUNC_RANGE();
+  auto geometric_mean = [](double a, double b) { return std::sqrt(a * b); };
+
+  size_t const total_source_size            = sources_size(sources, 0, 0);
+  auto constexpr num_delimiter_chars        = 1;
+  auto const num_extra_delimiters           = num_delimiter_chars * (sources.size() - 1);
+  compression_type const reader_compression = reader_opts.get_compression();
+  size_t const chunk_offset                 = reader_opts.get_byte_range_offset();
+  size_t chunk_size                         = reader_opts.get_byte_range_size();
+
+  CUDF_EXPECTS(total_source_size ? chunk_offset < total_source_size : !chunk_offset,
+               "Invalid offsetting");
+  auto should_load_all_sources = !chunk_size || chunk_size >= total_source_size - chunk_offset;
+  chunk_size =
+    should_load_all_sources ? total_source_size - chunk_offset + num_extra_delimiters : chunk_size;
+
+  // Some magic numbers
+  constexpr int num_subchunks               = 10;  // per chunk_size
+  constexpr size_t min_subchunk_size        = 10000;
+  int const num_subchunks_prealloced        = should_load_all_sources ? 0 : 3;
+  constexpr int estimated_compression_ratio = 4;
+
+  // NOTE: heuristic for choosing subchunk size: geometric mean of minimum subchunk size (set to
+  // 10kb) and the byte range size divided by num_subchunks
+
+  size_t const size_per_subchunk =
+    geometric_mean(std::ceil((double)chunk_size / num_subchunks), min_subchunk_size);
+
+  // The allocation for single source compressed input is estimated by assuming a ~4:1 compression
+  // ratio; uncompressed inputs use the subchunk estimate plus room for inter-source delimiters.
+  auto constexpr header_size = 4096;
+  size_t const buffer_size =
+    reader_compression != compression_type::NONE
+      ? total_source_size * estimated_compression_ratio + header_size
+      : std::min(total_source_size, chunk_size + num_subchunks_prealloced * size_per_subchunk) +
+          num_extra_delimiters;
+  rmm::device_uvector<char> buffer(buffer_size, stream);
+  device_span<char> bufspan(buffer);
+
+  // Offset within buffer indicating first read position
+  std::int64_t buffer_offset = 0;
+  auto readbufspan =
+    ingest_raw_input(bufspan, sources, reader_compression, chunk_offset, chunk_size, stream);
+
+  auto const shift_for_nonzero_offset = std::min<std::int64_t>(chunk_offset, 1);
+  auto const first_delim_pos =
+    chunk_offset == 0 ? 0 : find_first_delimiter(readbufspan, '\n', stream);
   if (first_delim_pos == -1) {
-    return rmm::device_uvector<char>{0, stream};
-  } else {
-    first_delim_pos = first_delim_pos + reader_opts.get_byte_range_offset();
+    // return empty owning datasource buffer
+    auto empty_buf = rmm::device_uvector<char>(0, stream);
+    return datasource::owning_buffer<rmm::device_uvector<char>>(std::move(empty_buf));
+  } else if (!should_load_all_sources) {
     // Find next delimiter
-    decltype(first_delim_pos) next_delim_pos = -1;
-    auto const total_source_size             = sources_size(sources, 0, 0);
-    auto current_offset = reader_opts.get_byte_range_offset() + reader_opts.get_byte_range_size();
-    while (current_offset < total_source_size and next_delim_pos == -1) {
-      buffer         = ingest_raw_input(sources,
-                                reader_opts.get_compression(),
-                                current_offset,
-                                reader_opts.get_byte_range_size(),
-                                stream);
-      next_delim_pos = find_first_delimiter(buffer, '\n', stream);
-      if (next_delim_pos == -1) { current_offset += reader_opts.get_byte_range_size(); }
+    std::int64_t next_delim_pos = -1;
+    size_t next_subchunk_start  = chunk_offset + chunk_size;
+    while (next_subchunk_start < total_source_size && next_delim_pos < buffer_offset) {
+      buffer_offset += readbufspan.size();
+      readbufspan    = ingest_raw_input(bufspan.last(buffer_size - buffer_offset),
+                                     sources,
+                                     reader_compression,
+                                     next_subchunk_start,
+                                     size_per_subchunk,
+                                     stream);
+      next_delim_pos = find_first_delimiter(readbufspan, '\n', stream) + buffer_offset;
+      if (next_delim_pos < buffer_offset) { next_subchunk_start += size_per_subchunk; }
     }
-    if (next_delim_pos == -1) {
-      next_delim_pos = total_source_size;
-    } else {
-      next_delim_pos = next_delim_pos + current_offset;
-    }
-    return ingest_raw_input(sources,
-                            reader_opts.get_compression(),
-                            first_delim_pos,
-                            next_delim_pos - first_delim_pos,
-                            stream);
+    if (next_delim_pos < buffer_offset) next_delim_pos = buffer_offset + readbufspan.size();
+
+    return datasource::owning_buffer<rmm::device_uvector<char>>(
+      std::move(buffer),
+      reinterpret_cast<uint8_t*>(buffer.data()) + first_delim_pos + shift_for_nonzero_offset,
+      next_delim_pos - first_delim_pos - shift_for_nonzero_offset);
   }
+  return datasource::owning_buffer<rmm::device_uvector<char>>(
+    std::move(buffer),
+    reinterpret_cast<uint8_t*>(buffer.data()) + first_delim_pos + shift_for_nonzero_offset,
+    readbufspan.size() - first_delim_pos - shift_for_nonzero_offset);
 }
 
 table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
@@ -221,8 +279,6 @@ table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
   if (reader_opts.get_byte_range_offset() != 0 or reader_opts.get_byte_range_size() != 0) {
     CUDF_EXPECTS(reader_opts.is_enabled_lines(),
                  "Specifying a byte range is supported only for JSON Lines");
-    CUDF_EXPECTS(sources.size() == 1,
-                 "Specifying a byte range is supported only for a single source");
   }
 
   if (sources.size() > 1) {
@@ -232,24 +288,25 @@ table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
                  "Multiple inputs are supported only for JSON Lines format");
   }
 
-  auto buffer = get_record_range_raw_input(sources, reader_opts, stream);
+  datasource::owning_buffer<rmm::device_uvector<char>> bufview =
+    get_record_range_raw_input(sources, reader_opts, stream);
 
   // If input JSON buffer has single quotes and option to normalize single quotes is enabled,
   // invoke pre-processing FST
   if (reader_opts.is_enabled_normalize_single_quotes()) {
-    buffer =
-      normalize_single_quotes(std::move(buffer), stream, rmm::mr::get_current_device_resource());
+    normalize_single_quotes(bufview, stream, rmm::mr::get_current_device_resource());
   }
 
   // If input JSON buffer has unquoted spaces and tabs and option to normalize whitespaces is
   // enabled, invoke pre-processing FST
   if (reader_opts.is_enabled_normalize_whitespace()) {
-    buffer =
-      normalize_whitespace(std::move(buffer), stream, rmm::mr::get_current_device_resource());
+    normalize_whitespace(bufview, stream, rmm::mr::get_current_device_resource());
   }
 
+  auto buffer =
+    cudf::device_span<char const>(reinterpret_cast<char const*>(bufview.data()), bufview.size());
+  stream.synchronize();
   return device_parse_nested_json(buffer, reader_opts, stream, mr);
-  // For debug purposes, use host_parse_nested_json()
 }
 
 }  // namespace cudf::io::json::detail
diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu
index 596b3381eaf..cac7149dabe 100644
--- a/cpp/src/io/json/write_json.cu
+++ b/cpp/src/io/json/write_json.cu
@@ -36,7 +36,7 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/detail/combine.hpp>
 #include <cudf/strings/detail/converters.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/structs/structs_column_view.hpp>
 #include <cudf/table/table.hpp>
@@ -78,8 +78,9 @@ namespace {
 struct escape_strings_fn {
   column_device_view const d_column;
   bool const append_colon{false};
-  size_type* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   __device__ void write_char(char_utf8 chr, char*& d_buffer, size_type& bytes)
   {
@@ -123,7 +124,7 @@ struct escape_strings_fn {
   __device__ void operator()(size_type idx)
   {
     if (d_column.is_null(idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
 
@@ -163,15 +164,15 @@ struct escape_strings_fn {
     constexpr char_utf8 const colon = ':';  // append colon
     if (append_colon) write_char(colon, d_buffer, bytes);
 
-    if (!d_chars) d_offsets[idx] = bytes;
+    if (!d_chars) { d_sizes[idx] = bytes; }
   }
 
   std::unique_ptr<column> get_escaped_strings(column_view const& column_v,
                                               rmm::cuda_stream_view stream,
                                               rmm::device_async_resource_ref mr)
   {
-    auto [offsets_column, chars] =
-      cudf::strings::detail::make_strings_children(*this, column_v.size(), stream, mr);
+    auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
+      *this, column_v.size(), stream, mr);
 
     return make_strings_column(column_v.size(),
                                std::move(offsets_column),
diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp
index d54524f0f0d..94a4d146b35 100644
--- a/cpp/src/io/orc/aggregate_orc_metadata.cpp
+++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "aggregate_orc_metadata.hpp"
+#include "io/orc/aggregate_orc_metadata.hpp"
 
 #include "io/utilities/row_selection.hpp"
 
@@ -152,22 +152,28 @@ aggregate_orc_metadata::aggregate_orc_metadata(
   }
 }
 
-std::tuple<int64_t, size_type, std::vector<metadata::stripe_source_mapping>>
+std::tuple<int64_t, int64_t, std::vector<metadata::orc_stripe_info>>
 aggregate_orc_metadata::select_stripes(
   std::vector<std::vector<size_type>> const& user_specified_stripes,
   int64_t skip_rows,
-  std::optional<size_type> const& num_rows,
+  std::optional<size_type> const& num_read_rows,
   rmm::cuda_stream_view stream)
 {
-  CUDF_EXPECTS((skip_rows == 0 and not num_rows.has_value()) or user_specified_stripes.empty(),
+  CUDF_EXPECTS((skip_rows == 0 and not num_read_rows.has_value()) or user_specified_stripes.empty(),
                "Can't use both the row selection and the stripe selection");
 
   auto [rows_to_skip, rows_to_read] = [&]() {
-    if (not user_specified_stripes.empty()) { return std::pair<int64_t, size_type>{0, 0}; }
-    return cudf::io::detail::skip_rows_num_rows_from_options(skip_rows, num_rows, get_num_rows());
+    if (not user_specified_stripes.empty()) { return std::pair<int64_t, int64_t>{0, 0}; }
+    return cudf::io::detail::skip_rows_num_rows_from_options(
+      skip_rows, num_read_rows, get_num_rows());
   }();
 
-  std::vector<metadata::stripe_source_mapping> selected_stripes_mapping;
+  struct stripe_source_mapping {
+    int source_idx;
+    std::vector<metadata::orc_stripe_info> stripe_info;
+  };
+
+  std::vector<stripe_source_mapping> selected_stripes_mapping;
 
   if (!user_specified_stripes.empty()) {
     CUDF_EXPECTS(user_specified_stripes.size() == per_file_metadata.size(),
@@ -176,7 +182,8 @@ aggregate_orc_metadata::select_stripes(
     // Each vector entry represents a source file; each nested vector represents the
     // user_defined_stripes to get from that source file
     for (size_t src_file_idx = 0; src_file_idx < user_specified_stripes.size(); ++src_file_idx) {
-      std::vector<OrcStripeInfo> stripe_infos;
+      std::vector<metadata::orc_stripe_info> stripe_infos;
+      stripe_infos.reserve(user_specified_stripes[src_file_idx].size());
 
       // Coalesce stripe info at the source file later since that makes downstream processing much
       // easier in impl::read
@@ -185,11 +192,19 @@ aggregate_orc_metadata::select_stripes(
           stripe_idx >= 0 and stripe_idx < static_cast<decltype(stripe_idx)>(
                                              per_file_metadata[src_file_idx].ff.stripes.size()),
           "Invalid stripe index");
-        stripe_infos.push_back(
-          std::pair(&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], nullptr));
-        rows_to_read += per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows;
+        stripe_infos.push_back({&per_file_metadata[src_file_idx].ff.stripes[stripe_idx],
+                                nullptr,
+                                static_cast<int>(src_file_idx)});
+
+        auto const stripe_rows =
+          per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows;
+        CUDF_EXPECTS(stripe_rows < static_cast<uint64_t>(std::numeric_limits<size_type>::max()),
+                     "The number of rows in one stripe exceeds the column size limit.",
+                     std::overflow_error);
+        rows_to_read += static_cast<int64_t>(stripe_rows);
       }
-      selected_stripes_mapping.push_back({static_cast<int>(src_file_idx), stripe_infos});
+      selected_stripes_mapping.emplace_back(
+        stripe_source_mapping{static_cast<int>(src_file_idx), std::move(stripe_infos)});
     }
   } else {
     int64_t count            = 0;
@@ -198,33 +213,44 @@ aggregate_orc_metadata::select_stripes(
     for (size_t src_file_idx = 0;
          src_file_idx < per_file_metadata.size() && count < rows_to_skip + rows_to_read;
          ++src_file_idx) {
-      std::vector<OrcStripeInfo> stripe_infos;
+      std::vector<metadata::orc_stripe_info> stripe_infos;
+      stripe_infos.reserve(per_file_metadata[src_file_idx].ff.stripes.size());
 
       for (size_t stripe_idx = 0; stripe_idx < per_file_metadata[src_file_idx].ff.stripes.size() &&
                                   count < rows_to_skip + rows_to_read;
            ++stripe_idx) {
-        count += per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows;
+        auto const stripe_rows =
+          per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows;
+        CUDF_EXPECTS(stripe_rows < static_cast<uint64_t>(std::numeric_limits<size_type>::max()),
+                     "The number of rows in one stripe exceeds the column size limit.",
+                     std::overflow_error);
+        count += static_cast<int64_t>(stripe_rows);
+
         if (count > rows_to_skip || count == 0) {
-          stripe_infos.push_back(
-            std::pair(&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], nullptr));
+          stripe_infos.push_back({&per_file_metadata[src_file_idx].ff.stripes[stripe_idx],
+                                  nullptr,
+                                  static_cast<int>(src_file_idx)});
         } else {
           stripe_skip_rows = count;
         }
       }
 
-      selected_stripes_mapping.push_back({static_cast<int>(src_file_idx), stripe_infos});
+      selected_stripes_mapping.emplace_back(
+        stripe_source_mapping{static_cast<int>(src_file_idx), std::move(stripe_infos)});
     }
     // Need to remove skipped rows from the stripes which are not selected.
     rows_to_skip -= stripe_skip_rows;
   }
 
+  std::vector<metadata::orc_stripe_info> output;
+
   // Read each stripe's stripefooter metadata
   for (auto& mapping : selected_stripes_mapping) {
     // Resize to all stripe_info for the source level
     per_file_metadata[mapping.source_idx].stripefooters.resize(mapping.stripe_info.size());
 
     for (size_t i = 0; i < mapping.stripe_info.size(); i++) {
-      auto const stripe         = mapping.stripe_info[i].first;
+      auto const stripe         = mapping.stripe_info[i].stripe_info;
       auto const sf_comp_offset = stripe->offset + stripe->indexLength + stripe->dataLength;
       auto const sf_comp_length = stripe->footerLength;
       CUDF_EXPECTS(
@@ -236,12 +262,17 @@ aggregate_orc_metadata::select_stripes(
         {buffer->data(), buffer->size()}, stream);
       ProtobufReader(sf_data.data(), sf_data.size())
         .read(per_file_metadata[mapping.source_idx].stripefooters[i]);
-      mapping.stripe_info[i].second = &per_file_metadata[mapping.source_idx].stripefooters[i];
+      mapping.stripe_info[i].stripe_footer =
+        &per_file_metadata[mapping.source_idx].stripefooters[i];
       if (stripe->indexLength == 0) { row_grp_idx_present = false; }
     }
+
+    output.insert(output.end(),
+                  std::make_move_iterator(mapping.stripe_info.begin()),
+                  std::make_move_iterator(mapping.stripe_info.end()));
   }
 
-  return {rows_to_skip, rows_to_read, selected_stripes_mapping};
+  return {rows_to_skip, rows_to_read, std::move(output)};
 }
 
 column_hierarchy aggregate_orc_metadata::select_columns(
diff --git a/cpp/src/io/orc/aggregate_orc_metadata.hpp b/cpp/src/io/orc/aggregate_orc_metadata.hpp
index d1e053be481..5da5af58b9b 100644
--- a/cpp/src/io/orc/aggregate_orc_metadata.hpp
+++ b/cpp/src/io/orc/aggregate_orc_metadata.hpp
@@ -45,8 +45,6 @@ struct column_hierarchy {
  * to aggregate that metadata from all the files.
  */
 class aggregate_orc_metadata {
-  using OrcStripeInfo = std::pair<StripeInformation const*, StripeFooter const*>;
-
   /**
    * @brief Sums up the number of rows of each source
    */
@@ -114,12 +112,22 @@ class aggregate_orc_metadata {
    * @brief Selects the stripes to read, based on the row/stripe selection parameters.
    *
    * Stripes are potentially selected from multiple files.
+   *
+   * While parsing the stripes' information, the numbers of rows to skip and rows to read are also
+   * adjusted to match the actual numbers used when reading the selected stripes from the sources.
+   *
+   * @param user_specified_stripes The specified stripe indices to read
+   * @param skip_rows Number of rows to skip from reading
+   * @param num_read_rows Number of rows to read
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   * @return A tuple of the corrected `skip_rows` and `num_read_rows` values, along with a vector
+   *         of the selected stripes' metadata (stripe information, stripe footer, source index)
    */
-  [[nodiscard]] std::tuple<int64_t, size_type, std::vector<metadata::stripe_source_mapping>>
-  select_stripes(std::vector<std::vector<size_type>> const& user_specified_stripes,
-                 int64_t skip_rows,
-                 std::optional<size_type> const& num_rows,
-                 rmm::cuda_stream_view stream);
+  [[nodiscard]] std::tuple<int64_t, int64_t, std::vector<metadata::orc_stripe_info>> select_stripes(
+    std::vector<std::vector<size_type>> const& user_specified_stripes,
+    int64_t skip_rows,
+    std::optional<size_type> const& num_read_rows,
+    rmm::cuda_stream_view stream);
 
   /**
    * @brief Filters ORC file to a selection of columns, based on their paths in the file.
diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp
index 88bd260a598..fd55cbb6846 100644
--- a/cpp/src/io/orc/orc.hpp
+++ b/cpp/src/io/orc/orc.hpp
@@ -602,13 +602,13 @@ struct column_validity_info {
  * convenience methods for initializing and accessing metadata.
  */
 class metadata {
-  using OrcStripeInfo = std::pair<StripeInformation const*, StripeFooter const*>;
-
  public:
-  struct stripe_source_mapping {
+  struct orc_stripe_info {
+    StripeInformation const* stripe_info;  // element of the source's `ff.stripes`
+    StripeFooter const* stripe_footer;     // element of the source's `stripefooters`
     int source_idx;
-    std::vector<OrcStripeInfo> stripe_info;
   };
+  std::vector<orc_stripe_info> stripe_info;
 
  public:
   explicit metadata(datasource* const src, rmm::cuda_stream_view stream);
diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index 77151f5b7b8..621d4c67691 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -14,42 +14,100 @@
  * limitations under the License.
  */
 
-#include "reader_impl.hpp"
-#include "reader_impl_chunking.hpp"
-#include "reader_impl_helpers.hpp"
+#include "io/orc/reader_impl.hpp"
+#include "io/orc/reader_impl_chunking.hpp"
+#include "io/orc/reader_impl_helpers.hpp"
 
-#include <rmm/resource_ref.hpp>
+#include <cudf/detail/copy.hpp>
+
+#include <algorithm>
 
 namespace cudf::io::orc::detail {
 
-reader::impl::impl(std::vector<std::unique_ptr<datasource>>&& sources,
-                   orc_reader_options const& options,
-                   rmm::cuda_stream_view stream,
-                   rmm::device_async_resource_ref mr)
-  : _stream(stream),
-    _mr(mr),
-    _timestamp_type{options.get_timestamp_type()},
-    _use_index{options.is_enabled_use_index()},
-    _use_np_dtypes{options.is_enabled_use_np_dtypes()},
-    _decimal128_columns{options.get_decimal128_columns()},
-    _col_meta{std::make_unique<reader_column_meta>()},
-    _sources(std::move(sources)),
-    _metadata{_sources, stream},
-    _selected_columns{_metadata.select_columns(options.get_columns())}
+// Entry point that runs whichever data preprocessing steps are needed before producing output.
+void reader_impl::prepare_data(read_mode mode)
 {
+  // There are no columns in the table, so there is nothing to prepare.
+  if (_selected_columns.num_levels() == 0) { return; }
+
+  // This will be a no-op if it was called before.
+  preprocess_file(mode);
+
+  if (!_chunk_read_data.more_table_chunks_to_output()) {
+    if (!_chunk_read_data.more_stripes_to_decode() && _chunk_read_data.more_stripes_to_load()) {
+      // Only load stripe data if:
+      //  - There are more stripes to load, and
+      //  - All loaded stripes were decoded, and
+      //  - All the decoded results were output.
+      load_next_stripe_data(mode);
+    }
+    if (_chunk_read_data.more_stripes_to_decode()) {
+      // Only decompress/decode the loaded stripes if:
+      //  - There are loaded stripes that were not decoded yet, and
+      //  - All the decoded results were output.
+      decompress_and_decode_stripes(mode);
+    }
+  }
 }
 
-table_with_metadata reader::impl::read(int64_t skip_rows,
-                                       std::optional<size_type> const& num_rows_opt,
-                                       std::vector<std::vector<size_type>> const& stripes)
+table_with_metadata reader_impl::make_output_chunk()
 {
-  prepare_data(skip_rows, num_rows_opt, stripes);
-  return read_chunk_internal();
+  // There are no columns in the table: return an empty table with default metadata.
+  if (_selected_columns.num_levels() == 0) { return {std::make_unique<table>(), table_metadata{}}; }
+
+  // If there is no table chunk to output, return a table of empty columns.
+  if (!_chunk_read_data.more_table_chunks_to_output()) {
+    std::vector<std::unique_ptr<column>> out_columns;
+    auto out_metadata = get_meta_with_user_data();
+    std::transform(_selected_columns.levels[0].begin(),
+                   _selected_columns.levels[0].end(),
+                   std::back_inserter(out_columns),
+                   [&](auto const& col_meta) {
+                     out_metadata.schema_info.emplace_back("");
+                     return create_empty_column(col_meta.id,
+                                                _metadata,
+                                                _options.decimal128_columns,
+                                                _options.use_np_dtypes,
+                                                _options.timestamp_type,
+                                                out_metadata.schema_info.back(),
+                                                _stream);
+                   });
+    return {std::make_unique<table>(std::move(out_columns)), std::move(out_metadata)};
+  }
+
+  auto const make_output_table = [&] {
+    if (_chunk_read_data.output_table_ranges.size() == 1) {
+      // Must advance the index of the current output range so that calling `has_next()` after
+      // this will return the correct answer (`false`, since there is only one range).
+      _chunk_read_data.curr_output_table_range++;
+
+      // Just hand over the decoded table without slicing.
+      return std::move(_chunk_read_data.decoded_table);
+    }
+
+    // The range of rows in the decoded table to output; also advances to the next range.
+    auto const out_range =
+      _chunk_read_data.output_table_ranges[_chunk_read_data.curr_output_table_range++];
+    auto const out_tview = cudf::detail::slice(
+      _chunk_read_data.decoded_table->view(),
+      {static_cast<size_type>(out_range.begin), static_cast<size_type>(out_range.end)},
+      _stream)[0];
+    auto output = std::make_unique<table>(out_tview, _stream, _mr);
+
+    // If this is the last slice, we also delete the decoded table to free up memory.
+    if (!_chunk_read_data.more_table_chunks_to_output()) {
+      _chunk_read_data.decoded_table.reset(nullptr);
+    }
+
+    return output;
+  };
+
+  return {make_output_table(), table_metadata{_out_metadata} /*copy cached metadata*/};
 }
 
-table_metadata reader::impl::make_output_metadata()
+table_metadata reader_impl::get_meta_with_user_data()
 {
-  if (_output_metadata) { return table_metadata{*_output_metadata}; }
+  if (_meta_with_user_data) { return table_metadata{*_meta_with_user_data}; }
 
   // Copy user data to the output metadata.
   table_metadata out_metadata;
@@ -70,69 +128,126 @@ table_metadata reader::impl::make_output_metadata()
   out_metadata.user_data = {out_metadata.per_file_user_data[0].begin(),
                             out_metadata.per_file_user_data[0].end()};
 
-  // Save the output table metadata into `_output_metadata` for reuse next time.
-  _output_metadata = std::make_unique<table_metadata>(out_metadata);
+  // Save the output table metadata into `_meta_with_user_data` for reuse next time.
+  _meta_with_user_data = std::make_unique<table_metadata>(out_metadata);
 
   return out_metadata;
 }
 
-table_with_metadata reader::impl::read_chunk_internal()
+reader_impl::reader_impl(std::vector<std::unique_ptr<datasource>>&& sources,
+                         orc_reader_options const& options,
+                         rmm::cuda_stream_view stream,
+                         rmm::device_async_resource_ref mr)
+  : reader_impl::reader_impl(0UL, 0UL, std::move(sources), options, stream, mr)
 {
-  // There is no columns in the table.
-  if (_selected_columns.num_levels() == 0) { return {std::make_unique<table>(), table_metadata{}}; }
+}
 
-  std::vector<std::unique_ptr<column>> out_columns;
-  auto out_metadata = make_output_metadata();
+reader_impl::reader_impl(std::size_t chunk_read_limit,
+                         std::size_t pass_read_limit,
+                         std::vector<std::unique_ptr<datasource>>&& sources,
+                         orc_reader_options const& options,
+                         rmm::cuda_stream_view stream,
+                         rmm::device_async_resource_ref mr)
+  : reader_impl::reader_impl(chunk_read_limit,
+                             pass_read_limit,
+                             DEFAULT_OUTPUT_ROW_GRANULARITY,
+                             std::move(sources),
+                             options,
+                             stream,
+                             mr)
+{
+}
 
-  // If no rows or stripes to read, return empty columns
-  if (_file_itm_data->rows_to_read == 0 || _file_itm_data->selected_stripes.empty()) {
-    std::transform(_selected_columns.levels[0].begin(),
-                   _selected_columns.levels[0].end(),
-                   std::back_inserter(out_columns),
-                   [&](auto const col_meta) {
-                     out_metadata.schema_info.emplace_back("");
-                     return create_empty_column(col_meta.id,
-                                                _metadata,
-                                                _decimal128_columns,
-                                                _use_np_dtypes,
-                                                _timestamp_type,
-                                                out_metadata.schema_info.back(),
-                                                _stream);
-                   });
-    return {std::make_unique<table>(std::move(out_columns)), std::move(out_metadata)};
-  }
+reader_impl::reader_impl(std::size_t chunk_read_limit,
+                         std::size_t pass_read_limit,
+                         size_type output_row_granularity,
+                         std::vector<std::unique_ptr<datasource>>&& sources,
+                         orc_reader_options const& options,
+                         rmm::cuda_stream_view stream,
+                         rmm::device_async_resource_ref mr)
+  : _stream(stream),
+    _mr(mr),
+    _options{options.get_timestamp_type(),
+             options.is_enabled_use_index(),
+             options.is_enabled_use_np_dtypes(),
+             options.get_decimal128_columns(),
+             options.get_skip_rows(),
+             options.get_num_rows(),
+             options.get_stripes()},
+    _col_meta{std::make_unique<reader_column_meta>()},
+    _sources(std::move(sources)),
+    _metadata{_sources, stream},
+    _selected_columns{_metadata.select_columns(options.get_columns())},
+    _chunk_read_data{chunk_read_limit, pass_read_limit, output_row_granularity}
+{
+  // Selected columns at different levels of nesting are stored in different elements
+  // of `_selected_columns`; thus, `num_levels() == 1` means no nested columns.
+  CUDF_EXPECTS(_options.skip_rows == 0 or _selected_columns.num_levels() == 1,
+               "skip_rows is not supported by nested column");
+}
+
+table_with_metadata reader_impl::read()
+{
+  prepare_data(read_mode::READ_ALL);
+  return make_output_chunk();
+}
 
-  // Create columns from buffer with respective schema information.
-  std::transform(
-    _selected_columns.levels[0].begin(),
-    _selected_columns.levels[0].end(),
-    std::back_inserter(out_columns),
-    [&](auto const& orc_col_meta) {
-      out_metadata.schema_info.emplace_back("");
-      auto col_buffer = assemble_buffer(
-        orc_col_meta.id, 0, *_col_meta, _metadata, _selected_columns, _out_buffers, _stream, _mr);
-      return make_column(col_buffer, &out_metadata.schema_info.back(), std::nullopt, _stream);
-    });
-
-  return {std::make_unique<table>(std::move(out_columns)), std::move(out_metadata)};
+bool reader_impl::has_next()
+{
+  prepare_data(read_mode::CHUNKED_READ);
+  return _chunk_read_data.has_next();
+}
+
+table_with_metadata reader_impl::read_chunk()
+{
+  prepare_data(read_mode::CHUNKED_READ);
+  return make_output_chunk();
 }
 
-// Forward to implementation
+chunked_reader::chunked_reader(std::size_t chunk_read_limit,
+                               std::size_t pass_read_limit,
+                               std::vector<std::unique_ptr<datasource>>&& sources,
+                               orc_reader_options const& options,
+                               rmm::cuda_stream_view stream,
+                               rmm::device_async_resource_ref mr)
+  : _impl{std::make_unique<reader_impl>(
+      chunk_read_limit, pass_read_limit, std::move(sources), options, stream, mr)}
+{
+}
+
+chunked_reader::chunked_reader(std::size_t chunk_read_limit,
+                               std::size_t pass_read_limit,
+                               size_type output_row_granularity,
+                               std::vector<std::unique_ptr<datasource>>&& sources,
+                               orc_reader_options const& options,
+                               rmm::cuda_stream_view stream,
+                               rmm::device_async_resource_ref mr)
+  : _impl{std::make_unique<reader_impl>(chunk_read_limit,
+                                        pass_read_limit,
+                                        output_row_granularity,
+                                        std::move(sources),
+                                        options,
+                                        stream,
+                                        mr)}
+{
+}
+
+chunked_reader::~chunked_reader() = default;
+
+bool chunked_reader::has_next() const { return _impl->has_next(); }
+
+table_with_metadata chunked_reader::read_chunk() const { return _impl->read_chunk(); }
+
 reader::reader(std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
                orc_reader_options const& options,
                rmm::cuda_stream_view stream,
                rmm::device_async_resource_ref mr)
-  : _impl{std::make_unique<impl>(std::move(sources), options, stream, mr)}
+  : _impl{std::make_unique<reader_impl>(std::move(sources), options, stream, mr)}
 {
 }
 
-// Destructor within this translation unit
 reader::~reader() = default;
 
-// Forward to implementation
-table_with_metadata reader::read(orc_reader_options const& options)
-{
-  return _impl->read(options.get_skip_rows(), options.get_num_rows(), options.get_stripes());
-}
+table_with_metadata reader::read() { return _impl->read(); }
 
 }  // namespace cudf::io::orc::detail
diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp
index 8b859da07e9..94b294087b8 100644
--- a/cpp/src/io/orc/reader_impl.hpp
+++ b/cpp/src/io/orc/reader_impl.hpp
@@ -16,8 +16,8 @@
 
 #pragma once
 
-#include "aggregate_orc_metadata.hpp"
-#include "io/utilities/column_buffer.hpp"
+#include "io/orc/aggregate_orc_metadata.hpp"
+#include "io/orc/reader_impl_chunking.hpp"
 
 #include <cudf/io/datasource.hpp>
 #include <cudf/io/detail/orc.hpp>
@@ -26,6 +26,8 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
+#include <io/utilities/column_buffer.hpp>
+
 #include <memory>
 #include <optional>
 #include <vector>
@@ -33,83 +35,169 @@
 namespace cudf::io::orc::detail {
 
 struct reader_column_meta;
-struct file_intermediate_data;
 
 /**
  * @brief Implementation for ORC reader.
  */
-class reader::impl {
+class reader_impl {
  public:
   /**
    * @brief Constructor from a dataset source with reader options.
    *
+   * This constructor will call the other constructor with `chunk_read_limit` and `pass_read_limit`
+   * set to `0` and `output_row_granularity` set to `DEFAULT_OUTPUT_ROW_GRANULARITY`.
+   *
    * @param sources Dataset sources
    * @param options Settings for controlling reading behavior
    * @param stream CUDA stream used for device memory operations and kernel launches
    * @param mr Device memory resource to use for device memory allocation
    */
-  explicit impl(std::vector<std::unique_ptr<datasource>>&& sources,
-                orc_reader_options const& options,
-                rmm::cuda_stream_view stream,
-                rmm::device_async_resource_ref mr);
+  explicit reader_impl(std::vector<std::unique_ptr<datasource>>&& sources,
+                       orc_reader_options const& options,
+                       rmm::cuda_stream_view stream,
+                       rmm::device_async_resource_ref mr);
 
   /**
-   * @brief Read an entire set or a subset of data and returns a set of columns
-   *
-   * @param skip_rows Number of rows to skip from the start
-   * @param num_rows_opt Optional number of rows to read, or `std::nullopt` to read all rows
-   * @param stripes Indices of individual stripes to load if non-empty
-   * @return The set of columns along with metadata
+   * @copydoc cudf::io::orc::detail::chunked_reader::chunked_reader(std::size_t, std::size_t,
+   * orc_reader_options const&, rmm::cuda_stream_view, rmm::device_async_resource_ref)
+   */
+  explicit reader_impl(std::size_t chunk_read_limit,
+                       std::size_t pass_read_limit,
+                       std::vector<std::unique_ptr<datasource>>&& sources,
+                       orc_reader_options const& options,
+                       rmm::cuda_stream_view stream,
+                       rmm::device_async_resource_ref mr);
+
+  /**
+   * @copydoc cudf::io::orc::detail::chunked_reader::chunked_reader(std::size_t, std::size_t,
+   * size_type, orc_reader_options const&, rmm::cuda_stream_view, rmm::device_async_resource_ref)
    */
-  table_with_metadata read(int64_t skip_rows,
-                           std::optional<size_type> const& num_rows_opt,
-                           std::vector<std::vector<size_type>> const& stripes);
+  explicit reader_impl(std::size_t chunk_read_limit,
+                       std::size_t pass_read_limit,
+                       size_type output_row_granularity,
+                       std::vector<std::unique_ptr<datasource>>&& sources,
+                       orc_reader_options const& options,
+                       rmm::cuda_stream_view stream,
+                       rmm::device_async_resource_ref mr);
+
+  /**
+   * @copydoc cudf::io::orc::detail::reader::read
+   */
+  table_with_metadata read();
+
+  /**
+   * @copydoc cudf::io::chunked_orc_reader::has_next
+   */
+  bool has_next();
+
+  /**
+   * @copydoc cudf::io::chunked_orc_reader::read_chunk
+   */
+  table_with_metadata read_chunk();
 
  private:
+  /**
+   * @brief The enum indicating whether the data sources are read all at once or chunk by chunk.
+   */
+  enum class read_mode { READ_ALL, CHUNKED_READ };
+
   /**
    * @brief Perform all the necessary data preprocessing before creating an output table.
    *
-   * @param skip_rows Number of rows to skip from the start
-   * @param num_rows_opt Optional number of rows to read, or `std::nullopt` to read all rows
-   * @param stripes Indices of individual stripes to load if non-empty
+   * This is the proxy to call all other data preprocessing functions, which are prerequisite
+   * for generating the output.
+   *
+   * @param mode Value indicating if the data sources are read all at once or chunk by chunk
    */
-  void prepare_data(int64_t skip_rows,
-                    std::optional<size_type> const& num_rows_opt,
-                    std::vector<std::vector<size_type>> const& stripes);
+  void prepare_data(read_mode mode);
 
   /**
-   * @brief Create the output table metadata from file metadata.
+   * @brief Perform a preprocessing step on the input data sources that executes exactly once
+   * for the entire duration of the reader.
    *
-   * @return Columns' metadata to output with the table read from file
+   * In this step, the metadata of all stripes in the data sources is parsed, and information about
+   * data streams of the selected columns in all stripes are generated. If the reader has a data
+   * read limit, sizes of these streams are used to split the list of all stripes into multiple
+   * subsets, each of which will be loaded into memory in the `load_next_stripe_data()` step. These
+   * subsets are computed such that memory usage will be kept around a fixed size limit.
+   *
+   * @param mode Value indicating if the data sources are read all at once or chunk by chunk
+   */
+  void preprocess_file(read_mode mode);
+
+  /**
+   * @brief Load stripes from the input data sources into memory.
+   *
+   * If there is a data read limit, only a subset of stripes are read at a time such that
+   * their total data size does not exceed a fixed size limit. Then, the data is probed to
+   * estimate its uncompressed sizes, which are in turn used to split that stripe subset into
+   * smaller subsets, each of which is to be decompressed and decoded in the next step
+   * `decompress_and_decode_stripes()`. This is to ensure that loading data from data sources
+   * together with decompression and decoding will be capped around the given data read limit.
+   *
+   * @param mode Value indicating if the data sources are read all at once or chunk by chunk
    */
-  table_metadata make_output_metadata();
+  void load_next_stripe_data(read_mode mode);
 
   /**
-   * @brief Read a chunk of data from the input source and return an output table with metadata.
+   * @brief Decompress and decode stripe data in the internal buffers, and store the result into
+   * an intermediate table.
+   *
+   * This function expects that the other preprocessing steps (`preprocess_file()` and
+   * `load_next_stripe_data()`) have already been done.
    *
-   * This function is called internally and expects all preprocessing steps have already been done.
+   * @param mode Value indicating if the data sources are read all at once or chunk by chunk
+   */
+  void decompress_and_decode_stripes(read_mode mode);
+
+  /**
+   * @brief Create the output table from the intermediate table and return it along with metadata.
    *
    * @return The output table along with columns' metadata
    */
-  table_with_metadata read_chunk_internal();
+  table_with_metadata make_output_chunk();
+
+  /**
+   * @brief Create the output table metadata storing user data in source metadata.
+   *
+   * @return Columns' user data to output with the table read from file
+   */
+  table_metadata get_meta_with_user_data();
 
   rmm::cuda_stream_view const _stream;
   rmm::device_async_resource_ref const _mr;
 
-  // Reader configs
-  data_type const _timestamp_type;  // Override output timestamp resolution
-  bool const _use_index;            // Enable or disable attempt to use row index for parsing
-  bool const _use_np_dtypes;        // Enable or disable the conversion to numpy-compatible dtypes
-  std::vector<std::string> const _decimal128_columns;   // Control decimals conversion
-  std::unique_ptr<reader_column_meta> const _col_meta;  // Track of orc mapping and child details
+  // Reader configs.
+  struct {
+    data_type timestamp_type;  // override output timestamp resolution
+    bool use_index;            // enable or disable attempt to use row index for parsing
+    bool use_np_dtypes;        // enable or disable the conversion to numpy-compatible dtypes
+    std::vector<std::string> decimal128_columns;  // control decimals conversion
 
-  // Intermediate data for internal processing.
+    // User specified reading rows/stripes selection.
+    int64_t const skip_rows;
+    std::optional<int64_t> num_read_rows;
+    std::vector<std::vector<size_type>> const selected_stripes;
+  } const _options;
+
+  // Intermediate data for reading.
+  std::unique_ptr<reader_column_meta> const _col_meta;  // Track of orc mapping and child details
   std::vector<std::unique_ptr<datasource>> const _sources;  // Unused but owns data for `_metadata`
   aggregate_orc_metadata _metadata;
   column_hierarchy const _selected_columns;  // Construct from `_metadata` thus declare after it
-  std::unique_ptr<file_intermediate_data> _file_itm_data;
-  std::unique_ptr<table_metadata> _output_metadata;
+  file_intermediate_data _file_itm_data;
+  chunk_read_data _chunk_read_data;
+
+  // Intermediate data for output.
+  std::unique_ptr<table_metadata> _meta_with_user_data;
+  table_metadata _out_metadata;
   std::vector<std::vector<cudf::io::detail::column_buffer>> _out_buffers;
+
+  // The default value used for subdividing the decoded table for final output.
+  // Larger values will reduce the computation time but will make the output table less granular.
+  // Smaller values (minimum is `1`) will increase the computation time but the output table will
+  // have size closer to the given `chunk_read_limit`.
+  static inline constexpr size_type DEFAULT_OUTPUT_ROW_GRANULARITY = 10'000;
 };
 
 }  // namespace cudf::io::orc::detail
diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu
new file mode 100644
index 00000000000..5034aa14a95
--- /dev/null
+++ b/cpp/src/io/orc/reader_impl_chunking.cu
@@ -0,0 +1,723 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "io/comp/gpuinflate.hpp"
+#include "io/orc/reader_impl.hpp"
+#include "io/orc/reader_impl_chunking.hpp"
+#include "io/orc/reader_impl_helpers.hpp"
+#include "io/utilities/hostdevice_span.hpp"
+
+#include <cudf/detail/timezone.hpp>
+#include <cudf/detail/utilities/integer_utils.hpp>
+#include <cudf/utilities/error.hpp>
+
+#include <rmm/device_buffer.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/binary_search.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/scan.h>
+
+#include <algorithm>
+#include <tuple>
+
+namespace cudf::io::orc::detail {
+
+std::size_t gather_stream_info_and_column_desc(
+  std::size_t stripe_id,
+  std::size_t level,
+  orc::StripeInformation const* stripeinfo,
+  orc::StripeFooter const* stripefooter,
+  host_span<int const> orc2gdf,
+  host_span<orc::SchemaType const> types,
+  bool use_index,
+  bool apply_struct_map,
+  int64_t* num_dictionary_entries,
+  std::size_t* local_stream_order,
+  std::vector<orc_stream_info>* stream_info,
+  cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>* chunks)
+{
+  CUDF_EXPECTS((stream_info == nullptr) ^ (chunks == nullptr),
+               "Either stream_info or chunks must be provided, but not both.");
+
+  std::size_t src_offset = 0;  // running offset over all streams listed in the stripe footer
+  std::size_t dst_offset = 0;  // running offset over the streams of mapped columns only
+
+  auto const get_stream_index_type = [](orc::StreamKind kind) {
+    switch (kind) {
+      case orc::DATA: return gpu::CI_DATA;
+      case orc::LENGTH:
+      case orc::SECONDARY: return gpu::CI_DATA2;
+      case orc::DICTIONARY_DATA: return gpu::CI_DICTIONARY;
+      case orc::PRESENT: return gpu::CI_PRESENT;
+      case orc::ROW_INDEX: return gpu::CI_INDEX;
+      default:
+        // Skip this stream as it's not strictly required
+        return gpu::CI_NUM_STREAMS;
+    }
+  };
+
+  for (auto const& stream : stripefooter->streams) {
+    if (!stream.column_id || *stream.column_id >= orc2gdf.size()) {
+      // Ignore reading this stream from source.
+      CUDF_LOG_WARN("Unexpected stream in the input ORC source. The stream will be ignored.");
+      src_offset += stream.length;
+      continue;
+    }
+
+    auto const column_id = *stream.column_id;
+    auto col             = orc2gdf[column_id];
+
+    if (col == -1 and apply_struct_map) {
+      // A struct-type column has no data itself, but rather child columns
+      // for each of its fields. There is only a PRESENT stream, which
+      // needs to be included for the reader.
+      auto const& schema_type = types[column_id];  // bind by reference to avoid copying the schema
+      if (!schema_type.subtypes.empty() && schema_type.kind == orc::STRUCT &&
+          stream.kind == orc::PRESENT) {
+        for (auto const& idx : schema_type.subtypes) {
+          auto const child_idx = (idx < orc2gdf.size()) ? orc2gdf[idx] : -1;
+          if (child_idx >= 0) {
+            col = child_idx;
+            if (chunks) {
+              auto& chunk                     = (*chunks)[stripe_id][col];
+              chunk.strm_id[gpu::CI_PRESENT]  = *local_stream_order;
+              chunk.strm_len[gpu::CI_PRESENT] = stream.length;
+            }
+          }
+        }
+      }
+    } else if (col != -1) {
+      if (chunks) {
+        if (src_offset >= stripeinfo->indexLength || use_index) {
+          auto const index_type = get_stream_index_type(stream.kind);
+          if (index_type < gpu::CI_NUM_STREAMS) {
+            auto& chunk                = (*chunks)[stripe_id][col];
+            chunk.strm_id[index_type]  = *local_stream_order;
+            chunk.strm_len[index_type] = stream.length;
+            // NOTE: skip_count field is temporarily used to track the presence of index streams
+            chunk.skip_count |= 1 << index_type;
+
+            if (index_type == gpu::CI_DICTIONARY) {
+              chunk.dictionary_start = *num_dictionary_entries;
+              chunk.dict_len         = stripefooter->columns[column_id].dictionarySize;
+              *num_dictionary_entries +=
+                static_cast<int64_t>(stripefooter->columns[column_id].dictionarySize);
+            }
+          }
+        }
+
+        (*local_stream_order)++;
+      } else {  // chunks == nullptr
+        stream_info->emplace_back(
+          orc_stream_info{stripeinfo->offset + src_offset,
+                          dst_offset,
+                          stream.length,
+                          stream_source_info{stripe_id, level, column_id, stream.kind}});
+      }
+
+      dst_offset += stream.length;
+    }
+    src_offset += stream.length;
+  }
+
+  return dst_offset;  // total length of the streams that belong to mapped columns
+}
+
+template <typename T>
+std::vector<range> find_splits(host_span<T const> cumulative_sizes,
+                               std::size_t total_count,
+                               std::size_t size_limit)
+{
+  CUDF_EXPECTS(size_limit > 0, "Invalid size limit", std::invalid_argument);
+
+  std::vector<range> splits;
+  std::size_t cur_count{0};            // count covered by the splits found so far
+  int64_t cur_pos{0};                  // position in `cumulative_sizes` to resume searching from
+  std::size_t cur_cumulative_size{0};  // cumulative `size_bytes` at the last split point
+
+  [[maybe_unused]] std::size_t cur_cumulative_rows{0};  // rows at the last split point
+
+  auto const start = thrust::make_transform_iterator(
+    cumulative_sizes.begin(),
+    [&](auto const& size) { return size.size_bytes - cur_cumulative_size; });
+  auto const end = start + cumulative_sizes.size();
+
+  while (cur_count < total_count) {
+    int64_t split_pos = static_cast<int64_t>(
+      thrust::distance(start, thrust::lower_bound(thrust::seq, start + cur_pos, end, size_limit)));
+
+    // If we're past the end, or if the returned range has size exceeding the given size limit,
+    // move back one position.
+    if (split_pos >= static_cast<int64_t>(cumulative_sizes.size()) ||
+        (cumulative_sizes[split_pos].size_bytes > cur_cumulative_size + size_limit)) {
+      split_pos--;
+    }
+
+    if constexpr (std::is_same_v<T, cumulative_size_and_row>) {
+      // Similarly, while the total number of rows in the returned range exceeds the column size
+      // limit, move back one position.
+      while (split_pos > 0 && cumulative_sizes[split_pos].num_rows >
+                                cur_cumulative_rows +
+                                  static_cast<std::size_t>(std::numeric_limits<size_type>::max())) {
+        split_pos--;
+      }
+    }
+
+    // In case we have moved back too much in the steps above, far beyond the last split point, that
+    // means we could not find any range whose size fits within the given size limit.
+    // In such situations, we need to move forward until we move past the last output range.
+    while (split_pos < (static_cast<int64_t>(cumulative_sizes.size()) - 1) &&
+           (split_pos < 0 || cumulative_sizes[split_pos].count <= cur_count)) {
+      split_pos++;
+    }
+
+    auto const start_count = cur_count;
+    cur_count              = cumulative_sizes[split_pos].count;
+    splits.emplace_back(range{start_count, cur_count});
+    cur_pos             = split_pos;
+    cur_cumulative_size = cumulative_sizes[split_pos].size_bytes;
+
+    if constexpr (std::is_same_v<T, cumulative_size_and_row>) {
+      cur_cumulative_rows = cumulative_sizes[split_pos].num_rows;
+    }
+  }
+
+  // If the size of the last range is at most `merge_threshold` times the size of the second last
+  // one, merge it into the second last one.
+  // This is to prevent having the last range too small.
+  if (splits.size() > 1) {
+    double constexpr merge_threshold = 0.15;
+    if (auto const last = splits.back(), second_last = splits[splits.size() - 2];
+        last.size() <= static_cast<std::size_t>(merge_threshold * second_last.size())) {
+      splits.pop_back();
+      splits.back().end = last.end;
+    }
+  }
+
+  return splits;
+}
+
+// `find_splits` is a function template defined in this TU, so it is explicitly instantiated here
+// with the types below so that it can be used outside of this TU.
+template std::vector<range> find_splits<cumulative_size>(host_span<cumulative_size const> sizes,
+                                                         std::size_t total_count,
+                                                         std::size_t size_limit);
+template std::vector<range> find_splits<cumulative_size_and_row>(
+  host_span<cumulative_size_and_row const> sizes, std::size_t total_count, std::size_t size_limit);
+
+// In this step, the metadata of all stripes in the data sources is parsed, and information about
+// data streams of the selected columns in all stripes are generated. If the reader has a data
+// read limit, sizes of these streams are used to split the list of all stripes into multiple
+// subsets, each of which will be loaded into memory in the `load_next_stripe_data()` step. These
+// subsets are computed such that memory usage will be kept to be around a fixed size limit.
+void reader_impl::preprocess_file(read_mode mode)
+{
+  if (_file_itm_data.global_preprocessed) { return; }
+  _file_itm_data.global_preprocessed = true;
+
+  //
+  // Load stripes' metadata:
+  //
+  std::tie(
+    _file_itm_data.rows_to_skip, _file_itm_data.rows_to_read, _file_itm_data.selected_stripes) =
+    _metadata.select_stripes(
+      _options.selected_stripes, _options.skip_rows, _options.num_read_rows, _stream);
+  if (!_file_itm_data.has_data()) { return; }
+
+  CUDF_EXPECTS(
+    mode == read_mode::CHUNKED_READ ||
+      _file_itm_data.rows_to_read <= static_cast<int64_t>(std::numeric_limits<size_type>::max()),
+    "READ_ALL mode does not support reading number of rows more than cudf's column size limit. "
+    "For reading large number of rows, please use chunked_reader.",
+    std::overflow_error);
+
+  auto const& selected_stripes = _file_itm_data.selected_stripes;
+  auto const num_total_stripes = selected_stripes.size();
+  auto const num_levels        = _selected_columns.num_levels();
+
+  // Set up table for converting timestamp columns from local to UTC time
+  _file_itm_data.tz_table = [&] {
+    auto const has_timestamp_column = std::any_of(
+      _selected_columns.levels.cbegin(), _selected_columns.levels.cend(), [&](auto const& col_lvl) {
+        return std::any_of(col_lvl.cbegin(), col_lvl.cend(), [&](auto const& col_meta) {
+          return _metadata.get_col_type(col_meta.id).kind == TypeKind::TIMESTAMP;
+        });
+      });
+
+    return has_timestamp_column ? cudf::detail::make_timezone_transition_table(
+                                    {}, selected_stripes[0].stripe_footer->writerTimezone, _stream)
+                                : std::make_unique<cudf::table>();
+  }();
+
+  //
+  // Pre allocate necessary memory for data processed in the other reading steps:
+  //
+  auto& stripe_data_read_ranges = _file_itm_data.stripe_data_read_ranges;
+  stripe_data_read_ranges.resize(num_total_stripes);
+
+  auto& lvl_stripe_data          = _file_itm_data.lvl_stripe_data;
+  auto& lvl_stripe_sizes         = _file_itm_data.lvl_stripe_sizes;
+  auto& lvl_stream_info          = _file_itm_data.lvl_stream_info;
+  auto& lvl_stripe_stream_ranges = _file_itm_data.lvl_stripe_stream_ranges;
+  auto& lvl_column_types         = _file_itm_data.lvl_column_types;
+  auto& lvl_nested_cols          = _file_itm_data.lvl_nested_cols;
+
+  lvl_stripe_data.resize(num_levels);
+  lvl_stripe_sizes.resize(num_levels);
+  lvl_stream_info.resize(num_levels);
+  lvl_stripe_stream_ranges.resize(num_levels);
+  lvl_column_types.resize(num_levels);
+  lvl_nested_cols.resize(num_levels);
+  _out_buffers.resize(num_levels);
+
+  auto& read_info = _file_itm_data.data_read_info;
+  auto& col_meta  = *_col_meta;
+
+  //
+  // Collect columns' types:
+  //
+  for (std::size_t level = 0; level < num_levels; ++level) {
+    lvl_stripe_sizes[level].resize(num_total_stripes);
+    lvl_stripe_stream_ranges[level].resize(num_total_stripes);
+
+    // Association between each ORC column and its cudf::column
+    col_meta.orc_col_map.emplace_back(_metadata.get_num_cols(), -1);
+
+    auto const& columns_level = _selected_columns.levels[level];
+    size_type col_id{0};
+
+    for (auto const& col : columns_level) {
+      // Map each ORC column id to the index of its column at this level
+      col_meta.orc_col_map[level][col.id] = col_id++;
+
+      auto const col_type =
+        to_cudf_type(_metadata.get_col_type(col.id).kind,
+                     _options.use_np_dtypes,
+                     _options.timestamp_type.id(),
+                     to_cudf_decimal_type(_options.decimal128_columns, _metadata, col.id));
+      CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type");
+
+      auto& column_types = lvl_column_types[level];
+      auto& nested_cols  = lvl_nested_cols[level];
+
+      if (col_type == type_id::DECIMAL32 or col_type == type_id::DECIMAL64 or
+          col_type == type_id::DECIMAL128) {
+        // sign of the scale is changed since cuDF follows c++ libraries like CNL
+        // which uses negative scaling, but liborc and other libraries
+        // follow positive scaling.
+        auto const scale =
+          -static_cast<size_type>(_metadata.get_col_type(col.id).scale.value_or(0));
+        column_types.emplace_back(col_type, scale);
+      } else {
+        column_types.emplace_back(col_type);
+      }
+
+      // Keep track of the nested (list/struct) columns at this level.
+      if (col_type == type_id::LIST or col_type == type_id::STRUCT) {
+        nested_cols.emplace_back(col);
+      }
+    }
+
+    // Try to reserve some memory, but the final size is unknown,
+    // since each column may have more than one stream.
+    auto const num_columns = columns_level.size();
+    lvl_stream_info[level].reserve(num_total_stripes * num_columns);
+    if (read_info.capacity() < num_total_stripes * num_columns) {
+      read_info.reserve(num_total_stripes * num_columns);
+    }
+  }
+
+  //
+  // Collect all data streams' information:
+  //
+
+  // Load all stripes if we are in READ_ALL mode or there is no read limit.
+  auto const load_all_stripes =
+    mode == read_mode::READ_ALL || _chunk_read_data.pass_read_limit == 0;
+
+  // Accumulate data size for data streams in each stripe, used for chunking.
+  // This will be used only for CHUNKED_READ mode when there is a read limit.
+  // Otherwise, we do not need this since we just load all stripes.
+  cudf::detail::hostdevice_vector<cumulative_size> total_stripe_sizes(
+    load_all_stripes ? std::size_t{0} : num_total_stripes, _stream);
+
+  for (std::size_t stripe_global_idx = 0; stripe_global_idx < num_total_stripes;
+       ++stripe_global_idx) {
+    auto const& stripe       = selected_stripes[stripe_global_idx];
+    auto const stripe_info   = stripe.stripe_info;
+    auto const stripe_footer = stripe.stripe_footer;
+
+    std::size_t this_stripe_size{0};
+    auto const last_read_size = read_info.size();
+    for (std::size_t level = 0; level < num_levels; ++level) {
+      auto& stream_info = _file_itm_data.lvl_stream_info[level];
+
+      auto stream_level_count = stream_info.size();
+      auto const stripe_level_size =
+        gather_stream_info_and_column_desc(stripe_global_idx,
+                                           level,
+                                           stripe_info,
+                                           stripe_footer,
+                                           col_meta.orc_col_map[level],
+                                           _metadata.get_types(),
+                                           false,  // use_index,
+                                           level == 0,
+                                           nullptr,  // num_dictionary_entries
+                                           nullptr,  // local_stream_order
+                                           &stream_info,
+                                           nullptr  // chunks
+        );
+
+      auto const is_stripe_data_empty = stripe_level_size == 0;
+      CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0,
+                   "Invalid index rowgroup stream data");
+
+      lvl_stripe_sizes[level][stripe_global_idx] = stripe_level_size;
+      this_stripe_size += stripe_level_size;
+
+      // Range of the streams in `stream_info` corresponding to this stripe at the current level.
+      lvl_stripe_stream_ranges[level][stripe_global_idx] =
+        range{stream_level_count, stream_info.size()};
+
+      // Coalesce consecutive streams into one read.
+      while (not is_stripe_data_empty and stream_level_count < stream_info.size()) {
+        auto const d_dst  = stream_info[stream_level_count].dst_pos;
+        auto const offset = stream_info[stream_level_count].offset;
+        auto len          = stream_info[stream_level_count].length;
+        stream_level_count++;
+
+        while (stream_level_count < stream_info.size() &&
+               stream_info[stream_level_count].offset == offset + len) {
+          len += stream_info[stream_level_count].length;
+          stream_level_count++;
+        }
+        read_info.emplace_back(stream_data_read_info{offset,
+                                                     d_dst,
+                                                     len,
+                                                     static_cast<std::size_t>(stripe.source_idx),
+                                                     stripe_global_idx,
+                                                     level});
+      }
+    }  // end loop level
+
+    if (!load_all_stripes) { total_stripe_sizes[stripe_global_idx] = {1, this_stripe_size}; }
+
+    // Range of all stream reads in `read_info` corresponding to this stripe, in all levels.
+    stripe_data_read_ranges[stripe_global_idx] = range{last_read_size, read_info.size()};
+  }
+
+  //
+  // Split range of all stripes into subranges that can be loaded separately while maintaining
+  // the memory usage under the given pass limit:
+  //
+
+  // Load range is reset to start from the first position in `load_stripe_ranges`.
+  _chunk_read_data.curr_load_stripe_range = 0;
+
+  if (load_all_stripes) {
+    _chunk_read_data.load_stripe_ranges = {range{std::size_t{0}, num_total_stripes}};
+    return;
+  }
+
+  // Compute the prefix sum of stripes' data sizes.
+  total_stripe_sizes.host_to_device_async(_stream);
+  thrust::inclusive_scan(rmm::exec_policy_nosync(_stream),
+                         total_stripe_sizes.d_begin(),
+                         total_stripe_sizes.d_end(),
+                         total_stripe_sizes.d_begin(),
+                         cumulative_size_plus{});
+  total_stripe_sizes.device_to_host_sync(_stream);
+
+  auto const load_limit = [&] {
+    auto const tmp = static_cast<std::size_t>(_chunk_read_data.pass_read_limit *
+                                              chunk_read_data::load_limit_ratio);
+    // Make sure not to pass 0 byte limit (due to round-off) to `find_splits`.
+    return std::max(tmp, std::size_t{1});  // same type for both arguments of `std::max`
+  }();
+
+  _chunk_read_data.load_stripe_ranges =
+    find_splits<cumulative_size>(total_stripe_sizes, num_total_stripes, load_limit);
+}
+
+// If there is a data read limit, only a subset of stripes are read at a time such that
+// their total data size does not exceed a fixed size limit. Then, the data is probed to
+// estimate its uncompressed sizes, which are in turn used to split that stripe subset into
+// smaller subsets, each of which will be decompressed and decoded in the next step
+// `decompress_and_decode_stripes()`. This is to ensure that loading data from data sources
+// together with decompression and decoding will be capped around the given data read limit.
+void reader_impl::load_next_stripe_data(read_mode mode)
+{
+  if (!_file_itm_data.has_data()) { return; }
+
+  auto const load_stripe_range =
+    _chunk_read_data.load_stripe_ranges[_chunk_read_data.curr_load_stripe_range++];
+  auto const stripe_start = load_stripe_range.begin;
+  auto const stripe_count = load_stripe_range.size();
+
+  auto& lvl_stripe_data = _file_itm_data.lvl_stripe_data;
+  auto const num_levels = _selected_columns.num_levels();
+
+  // Prepare the buffer to read raw data onto.
+  for (std::size_t level = 0; level < num_levels; ++level) {
+    auto& stripe_data = lvl_stripe_data[level];
+    stripe_data.resize(stripe_count);
+
+    for (std::size_t idx = 0; idx < stripe_count; ++idx) {
+      auto const stripe_size = _file_itm_data.lvl_stripe_sizes[level][idx + stripe_start];
+      stripe_data[idx]       = rmm::device_buffer(
+        cudf::util::round_up_safe(stripe_size, BUFFER_PADDING_MULTIPLE), _stream);
+    }
+  }
+
+  //
+  // Load stripe data into memory:
+  //
+
+  // If we load data from sources into host buffers, we need to transfer (async) data to device
+  // memory. Such host buffers need to be kept alive until we sync the transfers.
+  std::vector<std::unique_ptr<cudf::io::datasource::buffer>> host_read_buffers;
+
+  // If we load data directly from sources into device memory, the loads are also async.
+  // Thus, we need to make sure to sync all them at the end.
+  std::vector<std::pair<std::future<std::size_t>, std::size_t>> device_read_tasks;
+
+  // Range of the read info (offset, length) to read for the current being loaded stripes.
+  auto const [read_begin, read_end] =
+    merge_selected_ranges(_file_itm_data.stripe_data_read_ranges, load_stripe_range);
+
+  for (auto read_idx = read_begin; read_idx < read_end; ++read_idx) {
+    auto const& read_info = _file_itm_data.data_read_info[read_idx];
+    auto const source_ptr = _metadata.per_file_metadata[read_info.source_idx].source;
+    auto const dst_base   = static_cast<uint8_t*>(
+      lvl_stripe_data[read_info.level][read_info.stripe_idx - stripe_start].data());
+
+    if (source_ptr->is_device_read_preferred(read_info.length)) {
+      device_read_tasks.push_back(
+        std::pair(source_ptr->device_read_async(
+                    read_info.offset, read_info.length, dst_base + read_info.dst_pos, _stream),
+                  read_info.length));
+
+    } else {
+      auto buffer = source_ptr->host_read(read_info.offset, read_info.length);
+      CUDF_EXPECTS(buffer->size() == read_info.length, "Unexpected discrepancy in bytes read.");
+      CUDF_CUDA_TRY(cudaMemcpyAsync(dst_base + read_info.dst_pos,
+                                    buffer->data(),
+                                    read_info.length,
+                                    cudaMemcpyDefault,
+                                    _stream.value()));
+      host_read_buffers.emplace_back(std::move(buffer));
+    }
+  }
+
+  if (host_read_buffers.size() > 0) {  // if there was host read
+    _stream.synchronize();
+    host_read_buffers.clear();  // its data was copied to device memory after stream sync
+  }
+  for (auto& task : device_read_tasks) {  // if there was device read
+    CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read.");
+  }
+
+  // Compute number of rows in the loading stripes.
+  auto const num_loading_rows = std::accumulate(
+    _file_itm_data.selected_stripes.begin() + stripe_start,
+    _file_itm_data.selected_stripes.begin() + stripe_start + stripe_count,
+    std::size_t{0},
+    [](std::size_t count, auto const& stripe) { return count + stripe.stripe_info->numberOfRows; });
+
+  // Decoding range needs to be reset to start from the first position in `decode_stripe_ranges`.
+  _chunk_read_data.curr_decode_stripe_range = 0;
+
+  // The cudf's column size limit.
+  auto constexpr column_size_limit =
+    static_cast<std::size_t>(std::numeric_limits<size_type>::max());
+
+  // Decode all loaded stripes if there is no read limit, or if we are in READ_ALL mode,
+  // and the number of loading rows is less than the column size limit.
+  // In theory, we should just decode 'enough' stripes for output one table chunk, instead of
+  // decoding all stripes like this, for better load-balancing and reduce memory usage.
+  // However, we do not have any good way to know how many stripes are 'enough'.
+  if ((mode == read_mode::READ_ALL || _chunk_read_data.pass_read_limit == 0) &&
+      // In addition to read limit, we also need to check if the total number of
+      // rows in the loaded stripes exceeds the column size limit.
+      // If that is the case, we cannot decode all stripes at once into a cudf table.
+      num_loading_rows <= column_size_limit) {
+    _chunk_read_data.decode_stripe_ranges = {load_stripe_range};
+    return;
+  }
+
+  // From here, we have reading mode that is either:
+  // - CHUNKED_READ without read limit but the number of reading rows exceeds column size limit, or
+  // - CHUNKED_READ with a pass read limit.
+  // READ_ALL mode with number of rows more than cudf's column size limit should be handled early in
+  // `preprocess_file`. We just check again to make sure such situations never happen here.
+  CUDF_EXPECTS(
+    mode != read_mode::READ_ALL,
+    "READ_ALL mode does not support reading number of rows more than cudf's column size limit.");
+
+  // This is the post-processing step after we've done with splitting `load_stripe_range` into
+  // `decode_stripe_ranges`.
+  auto const add_range_offset = [stripe_start](std::vector<range>& new_ranges) {
+    // The split ranges always start from zero.
+    // We need to change these ranges to start from `stripe_start` which are the correct subranges
+    // of the current loaded stripe range.
+    for (auto& range : new_ranges) {
+      range.begin += stripe_start;
+      range.end += stripe_start;
+    }
+  };
+
+  // Optimized code path when we do not have any read limit but the number of rows in the
+  // loaded stripes exceeds column size limit.
+  // Note that the values `max_uncompressed_size` for each stripe are not computed here.
+  // Instead, they will be computed on the fly during decoding to avoid the overhead of
+  // storing and retrieving from memory.
+  if (_chunk_read_data.pass_read_limit == 0 && num_loading_rows > column_size_limit) {
+    std::vector<cumulative_size_and_row> cumulative_stripe_rows(stripe_count);
+    std::size_t rows{0};
+
+    for (std::size_t idx = 0; idx < stripe_count; ++idx) {
+      auto const& stripe     = _file_itm_data.selected_stripes[idx + stripe_start];
+      auto const stripe_info = stripe.stripe_info;
+      rows += stripe_info->numberOfRows;
+
+      // We will split stripe ranges based only on stripes' number of rows, not data size.
+      // Thus, we override the cumulative `size_bytes` using the prefix sum of rows in stripes and
+      // will use the column size limit as the split size limit.
+      cumulative_stripe_rows[idx] =
+        cumulative_size_and_row{idx + 1UL /*count*/, rows /*size_bytes*/, rows};
+    }
+
+    _chunk_read_data.decode_stripe_ranges =
+      find_splits<cumulative_size_and_row>(cumulative_stripe_rows, stripe_count, column_size_limit);
+    add_range_offset(_chunk_read_data.decode_stripe_ranges);
+    return;
+  }
+
+  //
+  // Split range of loaded stripes into subranges that can be decoded separately such that the
+  // memory usage is maintained around the given limit:
+  //
+
+  // This is for estimating the decompressed sizes of the loaded stripes.
+  cudf::detail::hostdevice_vector<cumulative_size_and_row> stripe_decomp_sizes(stripe_count,
+                                                                               _stream);
+
+  // Fill up the `cumulative_size_and_row` array with initial values.
+  // Note: `hostdevice_vector::begin()` mirrors `std::vector::data()` using incorrect API name.
+  for (std::size_t idx = 0; idx < stripe_count; ++idx) {
+    auto const& stripe     = _file_itm_data.selected_stripes[idx + stripe_start];
+    auto const stripe_info = stripe.stripe_info;
+    stripe_decomp_sizes[idx] =
+      cumulative_size_and_row{1UL /*count*/, 0UL /*size_bytes*/, stripe_info->numberOfRows};
+  }
+
+  auto& compinfo_map = _file_itm_data.compinfo_map;
+  compinfo_map.clear();  // clear cache of the last load
+
+  // For parsing decompression data.
+  // We create an array that is large enough to use for all levels, thus only need to allocate
+  // memory once.
+  auto hd_compinfo = [&] {
+    std::size_t max_num_streams{0};
+    if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) {
+      // Find the maximum number of streams in all levels of the loaded stripes.
+      for (std::size_t level = 0; level < num_levels; ++level) {
+        auto const stream_range =
+          merge_selected_ranges(_file_itm_data.lvl_stripe_stream_ranges[level], load_stripe_range);
+        max_num_streams = std::max(max_num_streams, stream_range.size());
+      }
+    }
+    return cudf::detail::hostdevice_vector<gpu::CompressedStreamInfo>(max_num_streams, _stream);
+  }();
+
+  for (std::size_t level = 0; level < num_levels; ++level) {
+    auto const& stream_info = _file_itm_data.lvl_stream_info[level];
+    auto const num_columns  = _selected_columns.levels[level].size();
+
+    auto& stripe_data = lvl_stripe_data[level];
+    if (stripe_data.empty()) { continue; }
+
+    // Range of all streams in the loaded stripes.
+    auto const stream_range =
+      merge_selected_ranges(_file_itm_data.lvl_stripe_stream_ranges[level], load_stripe_range);
+
+    if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) {
+      auto const& decompressor = *_metadata.per_file_metadata[0].decompressor;
+
+      auto compinfo = cudf::detail::hostdevice_span<gpu::CompressedStreamInfo>(
+        hd_compinfo.begin(), hd_compinfo.d_begin(), stream_range.size());
+      for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) {
+        auto const& info = stream_info[stream_idx];
+        auto const dst_base =
+          static_cast<uint8_t const*>(stripe_data[info.source.stripe_idx - stripe_start].data());
+        compinfo[stream_idx - stream_range.begin] =
+          gpu::CompressedStreamInfo(dst_base + info.dst_pos, info.length);
+      }
+
+      // Estimate the uncompressed data.
+      compinfo.host_to_device_async(_stream);
+      gpu::ParseCompressedStripeData(compinfo.device_ptr(),
+                                     compinfo.size(),
+                                     decompressor.GetBlockSize(),
+                                     decompressor.GetLog2MaxCompressionRatio(),
+                                     _stream);
+      compinfo.device_to_host_sync(_stream);
+
+      for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) {
+        auto const& info           = stream_info[stream_idx];
+        auto const stream_compinfo = compinfo[stream_idx - stream_range.begin];
+
+        // Cache these parsed numbers so they can be reused in the decompression/decoding step.
+        compinfo_map[info.source] = {stream_compinfo.num_compressed_blocks,
+                                     stream_compinfo.num_uncompressed_blocks,
+                                     stream_compinfo.max_uncompressed_size};
+        stripe_decomp_sizes[info.source.stripe_idx - stripe_start].size_bytes +=
+          stream_compinfo.max_uncompressed_size;
+      }
+
+    } else {  // no decompression
+      // Set decompression sizes equal to the input sizes.
+      for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) {
+        auto const& info = stream_info[stream_idx];
+        stripe_decomp_sizes[info.source.stripe_idx - stripe_start].size_bytes += info.length;
+      }
+    }
+  }  // end loop level
+
+  // Compute the prefix sum of stripe data sizes and rows.
+  stripe_decomp_sizes.host_to_device_async(_stream);
+  thrust::inclusive_scan(rmm::exec_policy_nosync(_stream),
+                         stripe_decomp_sizes.d_begin(),
+                         stripe_decomp_sizes.d_end(),
+                         stripe_decomp_sizes.d_begin(),
+                         cumulative_size_plus{});
+  stripe_decomp_sizes.device_to_host_sync(_stream);
+
+  auto const decode_limit = [&] {
+    auto const tmp = static_cast<std::size_t>(_chunk_read_data.pass_read_limit *
+                                              chunk_read_data::decompress_and_decode_limit_ratio);
+    // Make sure not to pass 0 byte limit to `find_splits`.
+    return std::max(tmp, std::size_t{1});  // same type for both arguments of `std::max`
+  }();
+
+  _chunk_read_data.decode_stripe_ranges =
+    find_splits<cumulative_size_and_row>(stripe_decomp_sizes, stripe_count, decode_limit);
+
+  add_range_offset(_chunk_read_data.decode_stripe_ranges);
+}
+
+}  // namespace cudf::io::orc::detail
diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp
index 0ad0f9af589..4ef68ee8d86 100644
--- a/cpp/src/io/orc/reader_impl_chunking.hpp
+++ b/cpp/src/io/orc/reader_impl_chunking.hpp
@@ -24,18 +24,298 @@
 #include <rmm/device_buffer.hpp>
 #include <rmm/device_uvector.hpp>
 
+#include <unordered_map>
+
 namespace cudf::io::orc::detail {
 
 /**
- * @brief Struct to store file-level data that remains constant for all chunks being read.
+ * @brief Struct representing a range of data offsets.
+ */
+struct range {
+  std::size_t begin{0};  // position of the first element (inclusive)
+  std::size_t end{0};    // position one past the last element (exclusive)
+
+  [[nodiscard]] auto size() const { return end - begin; }  // number of elements in [begin, end)
+};
+
+/**
+ * @brief Collapse a selection of consecutive ranges into the single data range they cover.
+ *
+ * @param input_ranges The list of all data ranges
+ * @param selected_ranges Range of indices into `input_ranges`; expected to be non-empty
+ * @return The data range from the start of the first selected range to the end of the last one
+ */
+inline range merge_selected_ranges(host_span<range const> input_ranges,
+                                   range const& selected_ranges)
+{
+  // `selected_ranges.end` is exclusive, so the last selected entry sits one position before it.
+  auto const first_idx = selected_ranges.begin;
+  auto const last_idx  = selected_ranges.end - 1;
+
+  // Everything between the start of the first entry and the end of the last entry is covered.
+  return {input_ranges[first_idx].begin, input_ranges[last_idx].end};
+}
+
+// Describes one read from a data source: where the data is located in the source and where it
+// is stored in memory. Each read corresponds to one or more consecutive streams combined.
+struct stream_data_read_info {
+  uint64_t offset;         // offset in data source where the read starts
+  std::size_t dst_pos;     // offset to store data in memory relative to start of raw stripe data
+  std::size_t length;      // length of the data to read
+  std::size_t source_idx;  // id of the data source to read from
+  std::size_t stripe_idx;  // global stripe index that the data belongs to
+  std::size_t level;       // nested level that the data belongs to
+};
+
+/**
+ * @brief Compression information cached per data stream (stored as values of `compinfo_map`).
+ */
+struct stripe_level_comp_info {
+  std::size_t num_compressed_blocks{0};    // number of compressed blocks
+  std::size_t num_uncompressed_blocks{0};  // number of uncompressed blocks
+  std::size_t total_decomp_size{0};        // maximum size (in bytes) after decompression
+};
+
+/**
+ * @brief Struct that stores source information of an ORC stream, usable as a hash map key.
+ */
+struct stream_source_info {
+  std::size_t stripe_idx;  // global stripe id throughout all data sources
+  std::size_t level;       // level of the nested column
+  uint32_t orc_col_idx;    // orc column id
+  StreamKind kind;         // stream kind
+
+  struct hash {  // boost-style hash_combine; a plain XOR collides when fields swap or cancel
+    std::size_t operator()(stream_source_info const& id) const
+    {
+      auto const col_kind = (static_cast<std::size_t>(id.kind) << 32) | id.orc_col_idx;
+      auto seed           = std::hash<std::size_t>{}(id.stripe_idx);
+      seed ^= std::hash<std::size_t>{}(id.level) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+      return seed ^ (std::hash<std::size_t>{}(col_kind) + 0x9e3779b9 + (seed << 6) + (seed >> 2));
+    }
+  };
+  struct equal_to {
+    bool operator()(stream_source_info const& lhs, stream_source_info const& rhs) const
+    {
+      return lhs.stripe_idx == rhs.stripe_idx && lhs.level == rhs.level &&
+             lhs.orc_col_idx == rhs.orc_col_idx && lhs.kind == rhs.kind;
+    }
+  };
+};
+
+/**
+ * @brief Hash map keyed by `stream_source_info`, to look up a value of type `T` for a stream.
+ */
+template <typename T>
+using stream_source_map =
+  std::unordered_map<stream_source_info, T, stream_source_info::hash, stream_source_info::equal_to>;
+
+/**
+ * @brief Struct that stores the location, size, and source of a single ORC stream.
+ */
+struct orc_stream_info {
+  // Data info:
+  uint64_t offset;      // offset in data source
+  std::size_t dst_pos;  // offset to store data in memory relative to start of raw stripe data
+  std::size_t length;   // stream length (in bytes) to read
+
+  // Source of the stream (stripe, level, column, kind); also the key into `compinfo_map`.
+  stream_source_info source;
+};
+
+/**
+ * @brief Struct storing intermediate processing data loaded from data sources.
  */
 struct file_intermediate_data {
+  int64_t rows_to_skip;
+  int64_t rows_to_read;
+  std::vector<metadata::orc_stripe_info> selected_stripes;
+
+  // There is data to read only if some rows are requested and at least one stripe is selected.
+  bool has_data() const { return rows_to_read > 0 && !selected_stripes.empty(); }
+
+  // For each stripe, we perform a number of reads for its streams.
+  // Those reads are identified by a chunk of consecutive read info stored in `data_read_info`.
+  std::vector<range> stripe_data_read_ranges;
+
+  // Identify what data to read from source.
+  std::vector<stream_data_read_info> data_read_info;
+
+  // Store the compression information for each data stream.
+  stream_source_map<stripe_level_comp_info> compinfo_map;
+
+  // Store info for each ORC stream at each nested level.
+  std::vector<std::vector<orc_stream_info>> lvl_stream_info;
+
+  // At each nested level, the streams for each stripe are stored consecutively in lvl_stream_info.
+  // This is used to identify the range of streams for each stripe from that vector.
+  std::vector<std::vector<range>> lvl_stripe_stream_ranges;
+
+  // The buffers to store raw data read from disk, initialized for each chunk of stripes being read.
+  // After decoding, such buffers can be released.
+  // This can only be implemented after chunked output is ready.
   std::vector<std::vector<rmm::device_buffer>> lvl_stripe_data;
-  std::vector<std::vector<rmm::device_uvector<uint32_t>>> null_count_prefix_sums;
 
-  int64_t rows_to_skip;
-  size_type rows_to_read;
-  std::vector<metadata::stripe_source_mapping> selected_stripes;
+  // Store the size of each stripe at each nested level.
+  // This is used to initialize the `lvl_stripe_data` buffers.
+  std::vector<std::vector<std::size_t>> lvl_stripe_sizes;
+
+  // List of column data types at each nested level.
+  std::vector<std::vector<data_type>> lvl_column_types;
+
+  // List of nested type columns at each nested level.
+  std::vector<std::vector<orc_column_meta>> lvl_nested_cols;
+
+  // Table for converting timestamp columns from local to UTC time.
+  std::unique_ptr<cudf::table> tz_table;
+
+  bool global_preprocessed{false};
+};
+
+/**
+ * @brief Struct collecting the limits and the progress state necessary for chunked reading.
+ */
+struct chunk_read_data {
+  explicit chunk_read_data(std::size_t output_size_limit_,
+                           std::size_t data_read_limit_,
+                           size_type output_row_granularity_)
+    : chunk_read_limit{output_size_limit_},
+      pass_read_limit{data_read_limit_},
+      output_row_granularity{output_row_granularity_}
+  {
+    CUDF_EXPECTS(output_row_granularity > 0,
+                 "The value of `output_row_granularity` must be positive.");
+  }
+
+  std::size_t const
+    chunk_read_limit;  // maximum size (in bytes) of an output chunk, or 0 for no limit
+  std::size_t const pass_read_limit;  // approximate maximum size (in bytes) used for storing
+                                      // intermediate data, or 0 for no limit
+  size_type const output_row_granularity;  // always positive; enforced by the constructor
+
+  // Memory limits for loading data and decoding are computed as
+  // `*_limit_ratio * pass_read_limit`.
+  // This is to maintain the total memory usage to be **around** the given `pass_read_limit`.
+  // Note that the sum of these ratios may not be `1.0`, and their values are set empirically.
+  static double constexpr load_limit_ratio{0.25};
+  static double constexpr decompress_and_decode_limit_ratio{0.6};
+
+  // Chunks of stripes that can be loaded into memory such that their data size is within the user
+  // specified limit.
+  std::vector<range> load_stripe_ranges;
+  std::size_t curr_load_stripe_range{0};
+  bool more_stripes_to_load() const { return curr_load_stripe_range < load_stripe_ranges.size(); }
+
+  // Chunks of stripes such that their decompression size is within the user specified size limit.
+  std::vector<range> decode_stripe_ranges;
+  std::size_t curr_decode_stripe_range{0};
+  bool more_stripes_to_decode() const
+  {
+    return curr_decode_stripe_range < decode_stripe_ranges.size();
+  }
+
+  // Chunks of rows in the internal decoded table to output for each `read_chunk()`.
+  std::vector<range> output_table_ranges;
+  std::size_t curr_output_table_range{0};
+  std::unique_ptr<cudf::table> decoded_table;
+  bool more_table_chunks_to_output() const
+  {
+    return curr_output_table_range < output_table_ranges.size();
+  }
+
+  bool has_next() const
+  {
+    // There is more to output as long as any of the three stages still has pending ranges.
+    return more_stripes_to_load() || more_stripes_to_decode() || more_table_chunks_to_output();
+  }
+};
+
+/**
+ * @brief Struct to accumulate the count and the total size of items such as stripes or rows.
+ */
+struct cumulative_size {
+  std::size_t count{0};       // number of items accumulated
+  std::size_t size_bytes{0};  // total size (in bytes) of the accumulated items
 };
 
+/**
+ * @brief Struct to accumulate the count, the total size, and the number of rows of items such as
+ * stripes or rows in tables.
+ */
+struct cumulative_size_and_row : public cumulative_size {
+  std::size_t num_rows{0};  // number of rows accumulated
+};
+
+/**
+ * @brief Binary functor that adds two cumulative records field by field.
+ */
+struct cumulative_size_plus {
+  __device__ cumulative_size operator()(cumulative_size const& lhs,
+                                        cumulative_size const& rhs) const
+  {
+    return {lhs.count + rhs.count, lhs.size_bytes + rhs.size_bytes};
+  }
+
+  __device__ cumulative_size_and_row operator()(cumulative_size_and_row const& lhs,
+                                                cumulative_size_and_row const& rhs) const
+  {
+    return {lhs.count + rhs.count, lhs.size_bytes + rhs.size_bytes, lhs.num_rows + rhs.num_rows};
+  }
+};
+
+/**
+ * @brief Split the input into ranges whose cumulative sizes are each less than `size_limit`.
+ *
+ * Note that the given limit is just a soft limit. The function will always output ranges that
+ * have at least one count, even if the sizes of such ranges exceed the value of `size_limit`.
+ *
+ * @tparam T Type of the cumulative data, e.g. `cumulative_size_and_row`
+ * @param cumulative_sizes The input cumulative sizes to compute split ranges
+ * @param total_count The total count in the entire input
+ * @param size_limit The given soft limit to compute splits; must be positive
+ * @return A vector of ranges as splits of the input
+ */
+template <typename T>
+std::vector<range> find_splits(host_span<T const> cumulative_sizes,
+                               std::size_t total_count,
+                               std::size_t size_limit);
+
+/**
+ * @brief Function that populates descriptors for either individual streams or chunks of column
+ * data, but not both.
+ *
+ * This function is first used in the global step, to gather information for streams of all
+ * stripes in the data sources (when `stream_info` is present). Later on, it is used again to
+ * populate column descriptors (`chunks` is present) during decompression and decoding. The two
+ * steps share most of the execution path, thus this function takes mutually exclusive parameters
+ * `stream_info` or `chunks` depending on each use case.
+ *
+ * @param stripe_id The index of the current stripe, can be global index or local decoding index
+ * @param level The current processing nested level
+ * @param stripeinfo The pointer to current stripe's information
+ * @param stripefooter The pointer to current stripe's footer
+ * @param orc2gdf The mapping from ORC column ids to gdf column ids
+ * @param types The schema type
+ * @param use_index Whether to use the row index for parsing
+ * @param apply_struct_map Indicating if this is the root level
+ * @param num_dictionary_entries The number of dictionary entries
+ * @param local_stream_order For retrieving 0-based orders of streams in the decoding step
+ * @param stream_info The vector of streams' information, populated in the global step
+ * @param chunks The vector of column descriptors, populated in the decoding step
+ * @return The number of bytes in the gathered streams
+ */
+std::size_t gather_stream_info_and_column_desc(
+  std::size_t stripe_id,
+  std::size_t level,
+  orc::StripeInformation const* stripeinfo,
+  orc::StripeFooter const* stripefooter,
+  host_span<int const> orc2gdf,
+  host_span<orc::SchemaType const> types,
+  bool use_index,
+  bool apply_struct_map,
+  int64_t* num_dictionary_entries,
+  std::size_t* local_stream_order,
+  std::vector<orc_stream_info>* stream_info,
+  cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>* chunks);
+
 }  // namespace cudf::io::orc::detail
diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_decode.cu
similarity index 56%
rename from cpp/src/io/orc/reader_impl_preprocess.cu
rename to cpp/src/io/orc/reader_impl_decode.cu
index 04cb223c696..ec936b85761 100644
--- a/cpp/src/io/orc/reader_impl_preprocess.cu
+++ b/cpp/src/io/orc/reader_impl_decode.cu
@@ -16,17 +16,17 @@
 
 #include "io/comp/gpuinflate.hpp"
 #include "io/comp/nvcomp_adapter.hpp"
+#include "io/orc/reader_impl.hpp"
+#include "io/orc/reader_impl_chunking.hpp"
+#include "io/orc/reader_impl_helpers.hpp"
 #include "io/utilities/config_utils.hpp"
-#include "reader_impl.hpp"
-#include "reader_impl_chunking.hpp"
-#include "reader_impl_helpers.hpp"
+#include "io/utilities/hostdevice_span.hpp"
 
-#include <cudf/detail/timezone.hpp>
+#include <cudf/detail/copy.hpp>
+#include <cudf/detail/transform.hpp>
 #include <cudf/detail/utilities/integer_utils.hpp>
-#include <cudf/detail/utilities/logger.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/table/table.hpp>
-#include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/error.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -45,175 +45,104 @@
 #include <thrust/transform.h>
 
 #include <algorithm>
-#include <iterator>
+#include <numeric>
 
 namespace cudf::io::orc::detail {
 
 namespace {
 
 /**
- * @brief Struct that maps ORC streams to columns
- */
-struct orc_stream_info {
-  explicit orc_stream_info(uint64_t offset_,
-                           std::size_t dst_pos_,
-                           uint32_t length_,
-                           uint32_t stripe_idx_)
-    : offset(offset_), dst_pos(dst_pos_), length(length_), stripe_idx(stripe_idx_)
-  {
-  }
-  uint64_t offset;      // offset in file
-  std::size_t dst_pos;  // offset in memory relative to start of compressed stripe data
-  std::size_t length;   // length in file
-  uint32_t stripe_idx;  // stripe index
-};
-
-/**
- * @brief Function that populates column descriptors stream/chunk
- */
-std::size_t gather_stream_info(std::size_t stripe_index,
-                               orc::StripeInformation const* stripeinfo,
-                               orc::StripeFooter const* stripefooter,
-                               host_span<int const> orc2gdf,
-                               host_span<orc::SchemaType const> types,
-                               bool use_index,
-                               bool apply_struct_map,
-                               int64_t* num_dictionary_entries,
-                               std::vector<orc_stream_info>& stream_info,
-                               cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>& chunks)
-{
-  uint64_t src_offset = 0;
-  uint64_t dst_offset = 0;
-
-  auto const get_stream_index_type = [](orc::StreamKind kind) {
-    switch (kind) {
-      case orc::DATA: return gpu::CI_DATA;
-      case orc::LENGTH:
-      case orc::SECONDARY: return gpu::CI_DATA2;
-      case orc::DICTIONARY_DATA: return gpu::CI_DICTIONARY;
-      case orc::PRESENT: return gpu::CI_PRESENT;
-      case orc::ROW_INDEX: return gpu::CI_INDEX;
-      default:
-        // Skip this stream as it's not strictly required
-        return gpu::CI_NUM_STREAMS;
-    }
-  };
-
-  for (auto const& stream : stripefooter->streams) {
-    if (!stream.column_id || *stream.column_id >= orc2gdf.size()) {
-      // Ignore reading this stream from source.
-      cudf::logger().warn("Unexpected stream in the input ORC source. The stream will be ignored.");
-      src_offset += stream.length;
-      continue;
-    }
-
-    auto const column_id = *stream.column_id;
-    auto col             = orc2gdf[column_id];
-
-    if (col == -1 and apply_struct_map) {
-      // A struct-type column has no data itself, but rather child columns
-      // for each of its fields. There is only a PRESENT stream, which
-      // needs to be included for the reader.
-      auto const schema_type = types[column_id];
-      if (not schema_type.subtypes.empty()) {
-        if (schema_type.kind == orc::STRUCT && stream.kind == orc::PRESENT) {
-          for (auto const& idx : schema_type.subtypes) {
-            auto child_idx = (idx < orc2gdf.size()) ? orc2gdf[idx] : -1;
-            if (child_idx >= 0) {
-              col                             = child_idx;
-              auto& chunk                     = chunks[stripe_index][col];
-              chunk.strm_id[gpu::CI_PRESENT]  = stream_info.size();
-              chunk.strm_len[gpu::CI_PRESENT] = stream.length;
-            }
-          }
-        }
-      }
-    } else if (col != -1) {
-      if (src_offset >= stripeinfo->indexLength || use_index) {
-        auto& chunk           = chunks[stripe_index][col];
-        auto const index_type = get_stream_index_type(stream.kind);
-        if (index_type < gpu::CI_NUM_STREAMS) {
-          chunk.strm_id[index_type]  = stream_info.size();
-          chunk.strm_len[index_type] = stream.length;
-          // NOTE: skip_count field is temporarily used to track the presence of index streams
-          chunk.skip_count |= 1 << index_type;
-
-          if (index_type == gpu::CI_DICTIONARY) {
-            chunk.dictionary_start = *num_dictionary_entries;
-            chunk.dict_len         = stripefooter->columns[column_id].dictionarySize;
-            *num_dictionary_entries += stripefooter->columns[column_id].dictionarySize;
-          }
-        }
-      }
-      stream_info.emplace_back(
-        stripeinfo->offset + src_offset, dst_offset, stream.length, stripe_index);
-      dst_offset += stream.length;
-    }
-    src_offset += stream.length;
-  }
-
-  return dst_offset;
-}
-
-/**
- * @brief Decompresses the stripe data, at stream granularity.
+ * @brief  Decompresses the stripe data, at stream granularity.
+ *
+ * Only the streams in the provided `stream_range` are decoded. That range is determined in
+ * the previous steps, after splitting stripes into ranges to maintain memory usage to be
+ * under data read limit.
  *
+ * @param loaded_stripe_range Range of stripes that are already loaded in memory
+ * @param stream_range Range of streams to be decoded
+ * @param num_decode_stripes Number of stripes that the decoding streams belong to
+ * @param compinfo_map A map to lookup compression info of streams
  * @param decompressor Block decompressor
  * @param stripe_data List of source stripe column data
  * @param stream_info List of stream to column mappings
  * @param chunks Vector of list of column chunk descriptors
  * @param row_groups Vector of list of row index descriptors
- * @param num_stripes Number of stripes making up column chunks
  * @param row_index_stride Distance between each row index
  * @param use_base_stride Whether to use base stride obtained from meta or use the computed value
  * @param stream CUDA stream used for device memory operations and kernel launches
- * @return Device buffer to decompressed page data
+ * @return Device buffer to decompressed data
  */
 rmm::device_buffer decompress_stripe_data(
+  range const& loaded_stripe_range,
+  range const& stream_range,
+  std::size_t num_decode_stripes,
+  cudf::detail::hostdevice_span<gpu::CompressedStreamInfo> compinfo,
+  stream_source_map<stripe_level_comp_info> const& compinfo_map,
   OrcDecompressor const& decompressor,
   host_span<rmm::device_buffer const> stripe_data,
-  host_span<orc_stream_info> stream_info,
+  host_span<orc_stream_info const> stream_info,
   cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>& chunks,
   cudf::detail::hostdevice_2dvector<gpu::RowGroup>& row_groups,
-  size_type num_stripes,
   size_type row_index_stride,
   bool use_base_stride,
   rmm::cuda_stream_view stream)
 {
-  // Parse the columns' compressed info
-  cudf::detail::hostdevice_vector<gpu::CompressedStreamInfo> compinfo(
-    0, stream_info.size(), stream);
-  for (auto const& info : stream_info) {
-    compinfo.push_back(gpu::CompressedStreamInfo(
-      static_cast<uint8_t const*>(stripe_data[info.stripe_idx].data()) + info.dst_pos,
-      info.length));
-  }
-  compinfo.host_to_device_async(stream);
-
-  gpu::ParseCompressedStripeData(compinfo.device_ptr(),
-                                 compinfo.size(),
-                                 decompressor.GetBlockSize(),
-                                 decompressor.GetLog2MaxCompressionRatio(),
-                                 stream);
-  compinfo.device_to_host_sync(stream);
+  // Whether we have the compression info precomputed.
+  auto const compinfo_ready = not compinfo_map.empty();
 
   // Count the exact number of compressed blocks
   std::size_t num_compressed_blocks   = 0;
   std::size_t num_uncompressed_blocks = 0;
   std::size_t total_decomp_size       = 0;
-  for (std::size_t i = 0; i < compinfo.size(); ++i) {
-    num_compressed_blocks += compinfo[i].num_compressed_blocks;
-    num_uncompressed_blocks += compinfo[i].num_uncompressed_blocks;
-    total_decomp_size += compinfo[i].max_uncompressed_size;
+
+  for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) {
+    auto const& info = stream_info[stream_idx];
+
+    auto& stream_comp_info = compinfo[stream_idx - stream_range.begin];
+    stream_comp_info       = gpu::CompressedStreamInfo(
+      static_cast<uint8_t const*>(
+        stripe_data[info.source.stripe_idx - loaded_stripe_range.begin].data()) +
+        info.dst_pos,
+      info.length);
+
+    if (compinfo_ready) {
+      auto const& cached_comp_info             = compinfo_map.at(info.source);
+      stream_comp_info.num_compressed_blocks   = cached_comp_info.num_compressed_blocks;
+      stream_comp_info.num_uncompressed_blocks = cached_comp_info.num_uncompressed_blocks;
+      stream_comp_info.max_uncompressed_size   = cached_comp_info.total_decomp_size;
+
+      num_compressed_blocks += cached_comp_info.num_compressed_blocks;
+      num_uncompressed_blocks += cached_comp_info.num_uncompressed_blocks;
+      total_decomp_size += cached_comp_info.total_decomp_size;
+    }
   }
+
+  if (!compinfo_ready) {
+    compinfo.host_to_device_async(stream);
+    gpu::ParseCompressedStripeData(compinfo.device_ptr(),
+                                   compinfo.size(),
+                                   decompressor.GetBlockSize(),
+                                   decompressor.GetLog2MaxCompressionRatio(),
+                                   stream);
+    compinfo.device_to_host_sync(stream);
+
+    for (std::size_t i = 0; i < compinfo.size(); ++i) {
+      num_compressed_blocks += compinfo[i].num_compressed_blocks;
+      num_uncompressed_blocks += compinfo[i].num_uncompressed_blocks;
+      total_decomp_size += compinfo[i].max_uncompressed_size;
+    }
+  }
+
   CUDF_EXPECTS(
     not((num_uncompressed_blocks + num_compressed_blocks > 0) and (total_decomp_size == 0)),
     "Inconsistent info on compression blocks");
 
-  // Buffer needs to be padded.
-  // Required by `gpuDecodeOrcColumnData`.
+  // Buffer needs to be padded. This is required by `gpuDecodeOrcColumnData`.
   rmm::device_buffer decomp_data(
     cudf::util::round_up_safe(total_decomp_size, BUFFER_PADDING_MULTIPLE), stream);
+
+  // If total_decomp_size is zero, the input data may be just empty.
+  // This is still a valid input, thus do not panic.
   if (decomp_data.is_empty()) { return decomp_data; }
 
   rmm::device_uvector<device_span<uint8_t const>> inflate_in(
@@ -221,7 +150,7 @@ rmm::device_buffer decompress_stripe_data(
   rmm::device_uvector<device_span<uint8_t>> inflate_out(
     num_compressed_blocks + num_uncompressed_blocks, stream);
   rmm::device_uvector<compression_result> inflate_res(num_compressed_blocks, stream);
-  thrust::fill(rmm::exec_policy(stream),
+  thrust::fill(rmm::exec_policy_nosync(stream),
                inflate_res.begin(),
                inflate_res.end(),
                compression_result{0, compression_status::FAILURE});
@@ -240,13 +169,13 @@ rmm::device_buffer decompress_stripe_data(
     compinfo[i].copy_in_ctl  = inflate_in.data() + start_pos_uncomp;
     compinfo[i].copy_out_ctl = inflate_out.data() + start_pos_uncomp;
 
-    stream_info[i].dst_pos = decomp_offset;
     decomp_offset += compinfo[i].max_uncompressed_size;
     start_pos += compinfo[i].num_compressed_blocks;
     start_pos_uncomp += compinfo[i].num_uncompressed_blocks;
     max_uncomp_block_size =
       std::max(max_uncomp_block_size, compinfo[i].max_uncompressed_block_size);
   }
+
   compinfo.host_to_device_async(stream);
   gpu::ParseCompressedStripeData(compinfo.device_ptr(),
                                  compinfo.size(),
@@ -325,7 +254,7 @@ rmm::device_buffer decompress_stripe_data(
     // Check if any block has been failed to decompress.
     // Not using `thrust::any` or `thrust::count_if` to defer stream sync.
     thrust::for_each(
-      rmm::exec_policy(stream),
+      rmm::exec_policy_nosync(stream),
       thrust::make_counting_iterator(std::size_t{0}),
       thrust::make_counting_iterator(inflate_res.size()),
       [results           = inflate_res.begin(),
@@ -351,15 +280,15 @@ rmm::device_buffer decompress_stripe_data(
   // We can check on host after stream synchronize
   CUDF_EXPECTS(not any_block_failure[0], "Error during decompression");
 
-  size_type const num_columns = chunks.size().second;
+  auto const num_columns = chunks.size().second;
 
   // Update the stream information with the updated uncompressed info
   // TBD: We could update the value from the information we already
   // have in stream_info[], but using the gpu results also updates
   // max_uncompressed_size to the actual uncompressed size, or zero if
   // decompression failed.
-  for (size_type i = 0; i < num_stripes; ++i) {
-    for (size_type j = 0; j < num_columns; ++j) {
+  for (std::size_t i = 0; i < num_decode_stripes; ++i) {
+    for (std::size_t j = 0; j < num_columns; ++j) {
       auto& chunk = chunks[i][j];
       for (int k = 0; k < gpu::CI_NUM_STREAMS; ++k) {
         if (chunk.strm_len[k] > 0 && chunk.strm_id[k] < compinfo.size()) {
@@ -377,7 +306,7 @@ rmm::device_buffer decompress_stripe_data(
                             compinfo.device_ptr(),
                             chunks.base_device_ptr(),
                             num_columns,
-                            num_stripes,
+                            num_decode_stripes,
                             row_index_stride,
                             use_base_stride,
                             stream);
@@ -424,7 +353,7 @@ void update_null_mask(cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>& chunks
       if (child_valid_map_base != nullptr) {
         rmm::device_uvector<uint32_t> dst_idx(child_mask_len, stream);
         // Copy indexes at which the parent has valid value.
-        thrust::copy_if(rmm::exec_policy(stream),
+        thrust::copy_if(rmm::exec_policy_nosync(stream),
                         thrust::make_counting_iterator(0),
                         thrust::make_counting_iterator(0) + parent_mask_len,
                         dst_idx.begin(),
@@ -438,7 +367,7 @@ void update_null_mask(cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>& chunks
         uint32_t* dst_idx_ptr = dst_idx.data();
         // Copy child valid bits from child column to valid indexes, this will merge both child
         // and parent null masks
-        thrust::for_each(rmm::exec_policy(stream),
+        thrust::for_each(rmm::exec_policy_nosync(stream),
                          thrust::make_counting_iterator(0),
                          thrust::make_counting_iterator(0) + dst_idx.size(),
                          [child_valid_map_base, dst_idx_ptr, merged_mask] __device__(auto idx) {
@@ -484,11 +413,11 @@ void update_null_mask(cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>& chunks
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource to use for device memory allocation
  */
-void decode_stream_data(std::size_t num_dicts,
+void decode_stream_data(int64_t num_dicts,
                         int64_t skip_rows,
                         size_type row_index_stride,
                         std::size_t level,
-                        table_view const& tz_table,
+                        table_device_view const& d_tz_table,
                         cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>& chunks,
                         cudf::detail::device_2dspan<gpu::RowGroup> row_groups,
                         std::vector<column_buffer>& out_buffers,
@@ -497,6 +426,7 @@ void decode_stream_data(std::size_t num_dicts,
 {
   auto const num_stripes = chunks.size().first;
   auto const num_columns = chunks.size().second;
+
   thrust::counting_iterator<int> col_idx_it(0);
   thrust::counting_iterator<int> stripe_idx_it(0);
 
@@ -512,7 +442,7 @@ void decode_stream_data(std::size_t num_dicts,
   // Allocate global dictionary for deserializing
   rmm::device_uvector<gpu::DictionaryEntry> global_dict(num_dicts, stream);
 
-  chunks.host_to_device_sync(stream);
+  chunks.host_to_device_async(stream);
   gpu::DecodeNullsAndStringDictionaries(
     chunks.base_device_ptr(), global_dict.data(), num_columns, num_stripes, skip_rows, stream);
 
@@ -521,16 +451,14 @@ void decode_stream_data(std::size_t num_dicts,
     update_null_mask(chunks, out_buffers, stream, mr);
   }
 
-  auto const tz_table_dptr = table_device_view::create(tz_table, stream);
   rmm::device_scalar<size_type> error_count(0, stream);
-  // Update the null map for child columns
   gpu::DecodeOrcColumnData(chunks.base_device_ptr(),
                            global_dict.data(),
                            row_groups,
                            num_columns,
                            num_stripes,
                            skip_rows,
-                           *tz_table_dptr,
+                           d_tz_table,
                            row_groups.size().first,
                            row_index_stride,
                            level,
@@ -557,40 +485,38 @@ void decode_stream_data(std::size_t num_dicts,
  * layer.
  */
 void scan_null_counts(cudf::detail::hostdevice_2dvector<gpu::ColumnDesc> const& chunks,
-                      cudf::host_span<rmm::device_uvector<uint32_t>> prefix_sums,
+                      uint32_t* d_prefix_sums,  // holds `num_stripes` entries per column
                       rmm::cuda_stream_view stream)
 {
   auto const num_stripes = chunks.size().first;
   if (num_stripes == 0) return;
 
   auto const num_columns = chunks.size().second;
-  std::vector<thrust::pair<size_type, cudf::device_span<uint32_t>>> prefix_sums_to_update;
+  std::vector<thrust::pair<size_type, uint32_t*>> prefix_sums_to_update;  // (column, output)
   for (auto col_idx = 0ul; col_idx < num_columns; ++col_idx) {
     // Null counts sums are only needed for children of struct columns
     if (chunks[0][col_idx].type_kind == STRUCT) {
-      prefix_sums_to_update.emplace_back(col_idx, prefix_sums[col_idx]);
+      prefix_sums_to_update.emplace_back(col_idx, d_prefix_sums + num_stripes * col_idx);
     }
   }
   auto const d_prefix_sums_to_update = cudf::detail::make_device_uvector_async(
     prefix_sums_to_update, stream, rmm::mr::get_current_device_resource());
 
-  thrust::for_each(rmm::exec_policy(stream),
-                   d_prefix_sums_to_update.begin(),
-                   d_prefix_sums_to_update.end(),
-                   [chunks = cudf::detail::device_2dspan<gpu::ColumnDesc const>{chunks}] __device__(
-                     auto const& idx_psums) {
-                     auto const col_idx = idx_psums.first;
-                     auto const psums   = idx_psums.second;
-
-                     thrust::transform(
-                       thrust::seq,
-                       thrust::make_counting_iterator(0),
-                       thrust::make_counting_iterator(0) + psums.size(),
-                       psums.begin(),
-                       [&](auto stripe_idx) { return chunks[stripe_idx][col_idx].null_count; });
-
-                     thrust::inclusive_scan(thrust::seq, psums.begin(), psums.end(), psums.begin());
-                   });
+  thrust::for_each(
+    rmm::exec_policy_nosync(stream),
+    d_prefix_sums_to_update.begin(),
+    d_prefix_sums_to_update.end(),
+    [num_stripes, chunks = cudf::detail::device_2dspan<gpu::ColumnDesc const>{chunks}] __device__(
+      auto const& idx_psums) {
+      auto const col_idx = idx_psums.first;
+      auto const psums   = idx_psums.second;
+      thrust::transform(thrust::seq,  // gather the null count of this column from each stripe
+                        thrust::make_counting_iterator<std::size_t>(0ul),
+                        thrust::make_counting_iterator<std::size_t>(num_stripes),
+                        psums,
+                        [&](auto stripe_idx) { return chunks[stripe_idx][col_idx].null_count; });
+      thrust::inclusive_scan(thrust::seq, psums, psums + num_stripes, psums);  // in-place scan
+    });
   // `prefix_sums_to_update` goes out of scope, copy has to be done before we return
   stream.synchronize();
 }
@@ -634,6 +560,7 @@ void aggregate_child_meta(std::size_t level,
   // For each parent column, update its child column meta for each stripe.
   std::for_each(nested_cols.begin(), nested_cols.end(), [&](auto const p_col) {
     auto const parent_col_idx = col_meta.orc_col_map[level][p_col.id];
+
     int64_t start_row         = 0;
     auto processed_row_groups = 0;
 
@@ -657,10 +584,19 @@ void aggregate_child_meta(std::size_t level,
 
       // Aggregate start row, number of rows per chunk and total number of rows in a column
       auto const child_rows = chunks[stripe_id][parent_col_idx].num_child_rows;
+
       for (size_type id = 0; id < p_col.num_children; id++) {
         auto const child_col_idx = index + id;
 
         num_child_rows[child_col_idx] += child_rows;
+
+        // The number of rows in child column should not be very large otherwise we will have
+        // size overflow.
+        // If that is the case, we need to set a read limit to reduce number of decoding stripes.
+        CUDF_EXPECTS(num_child_rows[child_col_idx] <=
+                       static_cast<int64_t>(std::numeric_limits<size_type>::max()),
+                     "Number of rows in the child column exceeds column size limit.");
+
         num_child_rows_per_stripe[stripe_id][child_col_idx] = child_rows;
         // start row could be different for each column when there is nesting at each stripe level
         child_start_row[stripe_id][child_col_idx] = (stripe_id == 0) ? 0 : start_row;
@@ -709,264 +645,291 @@ void generate_offsets_for_list(host_span<list_buffer_data> buff_data, rmm::cuda_
   }
 }
 
+/**
+ * @brief Find the splits of the input table such that each split range of rows has data size less
+ * than a given `size_limit`.
+ *
+ * The parameter `segment_length` controls the granularity of splits. Each output range will have
+ * a number of rows that is a multiple of this value, except the last range, which contains the
+ * remaining rows.
+ *
+ * Similar to `find_splits`, the given limit is just a soft limit. This function will never output
+ * empty ranges, even if that makes a range's size exceed the value of `size_limit`.
+ *
+ * @param input The input table to find splits
+ * @param segment_length Granularity of the output ranges; must be positive when `size_limit` != 0
+ * @param size_limit Soft limit on the size of each split range; 0 returns all rows as one range
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return A vector of ranges as splits of the input
+ */
+std::vector<range> find_table_splits(table_view const& input,
+                                     size_type segment_length,
+                                     std::size_t size_limit,
+                                     rmm::cuda_stream_view stream)
+{
+  if (size_limit == 0) {
+    return std::vector<range>{range{0, static_cast<std::size_t>(input.num_rows())}};
+  }
+
+  CUDF_EXPECTS(segment_length > 0, "Invalid segment_length", std::invalid_argument);
+
+  // `segmented_row_bit_count` requires `segment_length` to not exceed the number of rows.
+  segment_length = std::min(segment_length, input.num_rows());
+
+  auto const d_segmented_sizes = cudf::detail::segmented_row_bit_count(
+    input, segment_length, stream, rmm::mr::get_current_device_resource());
+
+  auto segmented_sizes =
+    cudf::detail::hostdevice_vector<cumulative_size>(d_segmented_sizes->size(), stream);
+
+  thrust::transform(
+    rmm::exec_policy_nosync(stream),
+    thrust::make_counting_iterator(0),
+    thrust::make_counting_iterator(d_segmented_sizes->size()),
+    segmented_sizes.d_begin(),
+    [segment_length,
+     num_rows = input.num_rows(),
+     d_sizes  = d_segmented_sizes->view().begin<size_type>()] __device__(auto const segment_idx) {
+      // Since the number of rows may not be divisible by segment_length,
+      // the last segment may be shorter than the others.
+      auto const current_length =
+        cuda::std::min(segment_length, num_rows - segment_length * segment_idx);
+      auto const size = d_sizes[segment_idx] / CHAR_BIT;  // divide by CHAR_BIT to get size in bytes
+      return cumulative_size{static_cast<std::size_t>(current_length),
+                             static_cast<std::size_t>(size)};
+    });
+
+  thrust::inclusive_scan(rmm::exec_policy_nosync(stream),
+                         segmented_sizes.d_begin(),
+                         segmented_sizes.d_end(),
+                         segmented_sizes.d_begin(),
+                         cumulative_size_plus{});
+  segmented_sizes.device_to_host_sync(stream);
+
+  return find_splits<cumulative_size>(segmented_sizes, input.num_rows(), size_limit);
+}
+
 }  // namespace
 
-void reader::impl::prepare_data(int64_t skip_rows,
-                                std::optional<size_type> const& num_rows_opt,
-                                std::vector<std::vector<size_type>> const& stripes)
+void reader_impl::decompress_and_decode_stripes(read_mode mode)
 {
-  // Selected columns at different levels of nesting are stored in different elements
-  // of `selected_columns`; thus, size == 1 means no nested columns
-  CUDF_EXPECTS(skip_rows == 0 or _selected_columns.num_levels() == 1,
-               "skip_rows is not supported by nested columns");
-
-  // There are no columns in the table
-  if (_selected_columns.num_levels() == 0) { return; }
-
-  _file_itm_data = std::make_unique<file_intermediate_data>();
-
-  // Select only stripes required (aka row groups)
-  std::tie(
-    _file_itm_data->rows_to_skip, _file_itm_data->rows_to_read, _file_itm_data->selected_stripes) =
-    _metadata.select_stripes(stripes, skip_rows, num_rows_opt, _stream);
-  auto const rows_to_skip      = _file_itm_data->rows_to_skip;
-  auto const rows_to_read      = _file_itm_data->rows_to_read;
-  auto const& selected_stripes = _file_itm_data->selected_stripes;
-
-  // If no rows or stripes to read, return empty columns
-  if (rows_to_read == 0 || selected_stripes.empty()) { return; }
-
-  // Set up table for converting timestamp columns from local to UTC time
-  auto const tz_table = [&, &selected_stripes = selected_stripes] {
-    auto const has_timestamp_column = std::any_of(
-      _selected_columns.levels.cbegin(), _selected_columns.levels.cend(), [&](auto const& col_lvl) {
-        return std::any_of(col_lvl.cbegin(), col_lvl.cend(), [&](auto const& col_meta) {
-          return _metadata.get_col_type(col_meta.id).kind == TypeKind::TIMESTAMP;
-        });
-      });
+  if (!_file_itm_data.has_data()) { return; }
+
+  CUDF_EXPECTS(_chunk_read_data.curr_load_stripe_range > 0, "There is not any stripe loaded.");
+
+  auto const stripe_range =
+    _chunk_read_data.decode_stripe_ranges[_chunk_read_data.curr_decode_stripe_range++];
+  auto const stripe_start = stripe_range.begin;
+  auto const stripe_end   = stripe_range.end;
+  auto const stripe_count = stripe_range.size();
+
+  // The start index of loaded stripes. They are different from decoding stripes.
+  auto const load_stripe_range =
+    _chunk_read_data.load_stripe_ranges[_chunk_read_data.curr_load_stripe_range - 1];
+  auto const load_stripe_start = load_stripe_range.begin;
+
+  auto const rows_to_skip      = _file_itm_data.rows_to_skip;
+  auto const& selected_stripes = _file_itm_data.selected_stripes;
+
+  // Number of rows to decode in this decompressing/decoding step.
+  int64_t rows_to_decode = 0;
+  for (auto stripe_idx = stripe_start; stripe_idx < stripe_end; ++stripe_idx) {
+    auto const& stripe     = selected_stripes[stripe_idx];
+    auto const stripe_rows = static_cast<int64_t>(stripe.stripe_info->numberOfRows);
+    rows_to_decode += stripe_rows;
+  }
 
-    return has_timestamp_column
-             ? cudf::detail::make_timezone_transition_table(
-                 {}, selected_stripes[0].stripe_info[0].second->writerTimezone, _stream)
-             : std::make_unique<cudf::table>();
+  CUDF_EXPECTS(rows_to_decode > rows_to_skip, "Invalid rows_to_decode computation.");
+  rows_to_decode = std::min<int64_t>(rows_to_decode - rows_to_skip, _file_itm_data.rows_to_read);
+
+  // After this step, we no longer have any rows to skip.
+  // The number of rows remaining to be read in later steps is also reduced.
+  _file_itm_data.rows_to_skip = 0;
+  _file_itm_data.rows_to_read -= rows_to_decode;
+
+  // Technically, overflow here should never happen because the `load_next_stripe_data()` step
+  // already handled it by splitting the loaded stripe range into multiple decode ranges.
+  CUDF_EXPECTS(rows_to_decode <= static_cast<int64_t>(std::numeric_limits<size_type>::max()),
+               "Number or rows to decode exceeds the column size limit.",
+               std::overflow_error);
+
+  auto const tz_table_dptr = table_device_view::create(_file_itm_data.tz_table->view(), _stream);
+  auto const num_levels    = _selected_columns.num_levels();
+  _out_buffers.resize(num_levels);
+
+  // Column descriptors ('chunks').
+  // Each 'chunk' of data here corresponds to an orc column, in a stripe, at a nested level.
+  // Unfortunately we cannot create one hostdevice_vector to use for all levels because
+  // currently we do not have a hostdevice_2dspan class.
+  std::vector<cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>> lvl_chunks(num_levels);
+
+  // For computing null count.
+  auto null_count_prefix_sums = [&] {
+    auto const num_total_cols = std::accumulate(
+      _selected_columns.levels.begin(),
+      _selected_columns.levels.end(),
+      std::size_t{0},
+      [](auto const& sum, auto const& cols_level) { return sum + cols_level.size(); });
+
+    return cudf::detail::make_zeroed_device_uvector_async<uint32_t>(
+      num_total_cols * stripe_count, _stream, rmm::mr::get_current_device_resource());
+  }();
+  std::size_t num_processed_lvl_columns      = 0;
+  std::size_t num_processed_prev_lvl_columns = 0;
+
+  // For parsing decompression data.
+  // We create one hostdevice_vector that is large enough to use for all levels,
+  // thus only need to allocate memory once.
+  auto hd_compinfo = [&] {
+    std::size_t max_num_streams{0};
+    if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) {
+      // Find the maximum number of streams in all levels of the decoding stripes.
+      for (std::size_t level = 0; level < num_levels; ++level) {
+        auto const stream_range =
+          merge_selected_ranges(_file_itm_data.lvl_stripe_stream_ranges[level], stripe_range);
+        max_num_streams = std::max(max_num_streams, stream_range.size());
+      }
+    }
+    return cudf::detail::hostdevice_vector<gpu::CompressedStreamInfo>{max_num_streams, _stream};
   }();
 
-  auto& lvl_stripe_data        = _file_itm_data->lvl_stripe_data;
-  auto& null_count_prefix_sums = _file_itm_data->null_count_prefix_sums;
-  lvl_stripe_data.resize(_selected_columns.num_levels());
-
-  _out_buffers.resize(_selected_columns.num_levels());
-
-  // Iterates through levels of nested columns, child column will be one level down
-  // compared to parent column.
   auto& col_meta = *_col_meta;
   for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) {
-    auto& columns_level = _selected_columns.levels[level];
-    // Association between each ORC column and its cudf::column
-    col_meta.orc_col_map.emplace_back(_metadata.get_num_cols(), -1);
-    std::vector<orc_column_meta> nested_cols;
-
-    // Get a list of column data types
-    std::vector<data_type> column_types;
-    for (auto& col : columns_level) {
-      auto col_type = to_cudf_type(_metadata.get_col_type(col.id).kind,
-                                   _use_np_dtypes,
-                                   _timestamp_type.id(),
-                                   to_cudf_decimal_type(_decimal128_columns, _metadata, col.id));
-      CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type");
-      if (col_type == type_id::DECIMAL32 or col_type == type_id::DECIMAL64 or
-          col_type == type_id::DECIMAL128) {
-        // sign of the scale is changed since cuDF follows c++ libraries like CNL
-        // which uses negative scaling, but liborc and other libraries
-        // follow positive scaling.
-        auto const scale =
-          -static_cast<size_type>(_metadata.get_col_type(col.id).scale.value_or(0));
-        column_types.emplace_back(col_type, scale);
-      } else {
-        column_types.emplace_back(col_type);
-      }
+    auto const& stripe_stream_ranges = _file_itm_data.lvl_stripe_stream_ranges[level];
+    auto const stream_range          = merge_selected_ranges(stripe_stream_ranges, stripe_range);
 
-      // Map each ORC column to its column
-      col_meta.orc_col_map[level][col.id] = column_types.size() - 1;
-      if (col_type == type_id::LIST or col_type == type_id::STRUCT) {
-        nested_cols.emplace_back(col);
-      }
-    }
+    auto const& columns_level = _selected_columns.levels[level];
+    auto const& stream_info   = _file_itm_data.lvl_stream_info[level];
+    auto const& column_types  = _file_itm_data.lvl_column_types[level];
+    auto const& nested_cols   = _file_itm_data.lvl_nested_cols[level];
 
-    // Get the total number of stripes across all input files.
-    std::size_t total_num_stripes =
-      std::accumulate(selected_stripes.begin(),
-                      selected_stripes.end(),
-                      0,
-                      [](std::size_t sum, auto& stripe_source_mapping) {
-                        return sum + stripe_source_mapping.stripe_info.size();
-                      });
-    auto const num_columns = columns_level.size();
-    cudf::detail::hostdevice_2dvector<gpu::ColumnDesc> chunks(
-      total_num_stripes, num_columns, _stream);
+    auto& stripe_data = _file_itm_data.lvl_stripe_data[level];
+    auto& chunks      = lvl_chunks[level];
+
+    auto const num_lvl_columns = columns_level.size();
+    chunks =
+      cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>(stripe_count, num_lvl_columns, _stream);
     memset(chunks.base_host_ptr(), 0, chunks.size_bytes());
 
     const bool use_index =
-      _use_index &&
+      _options.use_index &&
       // Do stripes have row group index
       _metadata.is_row_grp_idx_present() &&
       // Only use if we don't have much work with complete columns & stripes
       // TODO: Consider nrows, gpu, and tune the threshold
-      (rows_to_read > _metadata.get_row_index_stride() && !(_metadata.get_row_index_stride() & 7) &&
-       _metadata.get_row_index_stride() != 0 && num_columns * total_num_stripes < 8 * 128) &&
+      (rows_to_decode > _metadata.get_row_index_stride() &&
+       !(_metadata.get_row_index_stride() & 7) && _metadata.get_row_index_stride() != 0 &&
+       num_lvl_columns * stripe_count < 8 * 128) &&
       // Only use if first row is aligned to a stripe boundary
       // TODO: Fix logic to handle unaligned rows
       (rows_to_skip == 0);
 
-    // Logically view streams as columns
-    std::vector<orc_stream_info> stream_info;
-
-    null_count_prefix_sums.emplace_back();
-    null_count_prefix_sums.back().reserve(_selected_columns.levels[level].size());
-    std::generate_n(std::back_inserter(null_count_prefix_sums.back()),
-                    _selected_columns.levels[level].size(),
-                    [&]() {
-                      return cudf::detail::make_zeroed_device_uvector_async<uint32_t>(
-                        total_num_stripes, _stream, rmm::mr::get_current_device_resource());
-                    });
-
-    // Tracker for eventually deallocating compressed and uncompressed data
-    auto& stripe_data = lvl_stripe_data[level];
-
-    int64_t stripe_start_row = 0;
-    int64_t num_dict_entries = 0;
-    int64_t num_rowgroups    = 0;
-    size_type stripe_idx     = 0;
-
-    std::vector<std::pair<std::future<std::size_t>, std::size_t>> read_tasks;
-    for (auto const& stripe_source_mapping : selected_stripes) {
-      // Iterate through the source files selected stripes
-      for (auto const& stripe : stripe_source_mapping.stripe_info) {
-        auto const stripe_info   = stripe.first;
-        auto const stripe_footer = stripe.second;
-
-        auto stream_count          = stream_info.size();
-        auto const total_data_size = gather_stream_info(stripe_idx,
-                                                        stripe_info,
-                                                        stripe_footer,
-                                                        col_meta.orc_col_map[level],
-                                                        _metadata.get_types(),
-                                                        use_index,
-                                                        level == 0,
-                                                        &num_dict_entries,
-                                                        stream_info,
-                                                        chunks);
-
-        auto const is_stripe_data_empty = total_data_size == 0;
-        CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0,
-                     "Invalid index rowgroup stream data");
-
-        // Buffer needs to be padded.
-        // Required by `copy_uncompressed_kernel`.
-        stripe_data.emplace_back(
-          cudf::util::round_up_safe(total_data_size, BUFFER_PADDING_MULTIPLE), _stream);
-        auto dst_base = static_cast<uint8_t*>(stripe_data.back().data());
-
-        // Coalesce consecutive streams into one read
-        while (not is_stripe_data_empty and stream_count < stream_info.size()) {
-          auto const d_dst  = dst_base + stream_info[stream_count].dst_pos;
-          auto const offset = stream_info[stream_count].offset;
-          auto len          = stream_info[stream_count].length;
-          stream_count++;
-
-          while (stream_count < stream_info.size() &&
-                 stream_info[stream_count].offset == offset + len) {
-            len += stream_info[stream_count].length;
-            stream_count++;
-          }
-          if (_metadata.per_file_metadata[stripe_source_mapping.source_idx]
-                .source->is_device_read_preferred(len)) {
-            read_tasks.push_back(
-              std::pair(_metadata.per_file_metadata[stripe_source_mapping.source_idx]
-                          .source->device_read_async(offset, len, d_dst, _stream),
-                        len));
-
-          } else {
-            auto const buffer =
-              _metadata.per_file_metadata[stripe_source_mapping.source_idx].source->host_read(
-                offset, len);
-            CUDF_EXPECTS(buffer->size() == len, "Unexpected discrepancy in bytes read.");
-            CUDF_CUDA_TRY(
-              cudaMemcpyAsync(d_dst, buffer->data(), len, cudaMemcpyDefault, _stream.value()));
-            _stream.synchronize();
-          }
-        }
-
-        auto const num_rows_per_stripe = stripe_info->numberOfRows;
-        auto const rowgroup_id         = num_rowgroups;
-        auto stripe_num_rowgroups      = 0;
-        if (use_index) {
-          stripe_num_rowgroups = (num_rows_per_stripe + _metadata.get_row_index_stride() - 1) /
-                                 _metadata.get_row_index_stride();
+    // 0-based counters, used across all decoding stripes in this step.
+    int64_t stripe_start_row{0};
+    int64_t num_dict_entries{0};
+    uint32_t num_rowgroups{0};
+    std::size_t local_stream_order{0};
+
+    for (auto stripe_idx = stripe_start; stripe_idx < stripe_end; ++stripe_idx) {
+      auto const& stripe       = selected_stripes[stripe_idx];
+      auto const stripe_info   = stripe.stripe_info;
+      auto const stripe_footer = stripe.stripe_footer;
+
+      // Normalize stripe_idx to 0-based.
+      auto const stripe_local_idx = stripe_idx - stripe_start;
+
+      // The first parameter (`stripe_order`) must be normalized to 0-based.
+      auto const total_data_size = gather_stream_info_and_column_desc(stripe_local_idx,
+                                                                      level,
+                                                                      stripe_info,
+                                                                      stripe_footer,
+                                                                      col_meta.orc_col_map[level],
+                                                                      _metadata.get_types(),
+                                                                      use_index,
+                                                                      level == 0,
+                                                                      &num_dict_entries,
+                                                                      &local_stream_order,
+                                                                      nullptr,  // stream_info
+                                                                      &chunks);
+
+      auto const is_stripe_data_empty = total_data_size == 0;
+      CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0,
+                   "Invalid index rowgroup stream data");
+
+      auto const dst_base =
+        static_cast<uint8_t*>(stripe_data[stripe_idx - load_stripe_start].data());
+      auto const num_rows_in_stripe = static_cast<int64_t>(stripe_info->numberOfRows);
+
+      uint32_t const rowgroup_id = num_rowgroups;
+      uint32_t const stripe_num_rowgroups =
+        use_index ? (num_rows_in_stripe + _metadata.get_row_index_stride() - 1) /
+                      _metadata.get_row_index_stride()
+                  : 0;
+
+      // Update chunks to reference streams pointers.
+      for (std::size_t col_idx = 0; col_idx < num_lvl_columns; col_idx++) {
+        auto& chunk = chunks[stripe_local_idx][col_idx];
+        // start row, number of rows in each stripe and total number of rows
+        // may change in lower levels of nesting
+        chunk.start_row =
+          (level == 0) ? stripe_start_row
+                       : col_meta.child_start_row[stripe_local_idx * num_lvl_columns + col_idx];
+        chunk.num_rows =
+          (level == 0)
+            ? num_rows_in_stripe
+            : col_meta.num_child_rows_per_stripe[stripe_local_idx * num_lvl_columns + col_idx];
+        chunk.column_num_rows = (level == 0) ? rows_to_decode : col_meta.num_child_rows[col_idx];
+        chunk.parent_validity_info =
+          (level == 0) ? column_validity_info{} : col_meta.parent_column_data[col_idx];
+        chunk.parent_null_count_prefix_sums =
+          (level == 0) ? nullptr
+                       : null_count_prefix_sums.data() + (num_processed_prev_lvl_columns +
+                                                          col_meta.parent_column_index[col_idx]) *
+                                                           stripe_count;
+        chunk.encoding_kind = stripe_footer->columns[columns_level[col_idx].id].kind;
+        chunk.type_kind =
+          _metadata.per_file_metadata[stripe.source_idx].ff.types[columns_level[col_idx].id].kind;
+
+        // num_child_rows for a struct column will be same, for other nested types it will be
+        // calculated.
+        chunk.num_child_rows = (chunk.type_kind != orc::STRUCT) ? 0 : chunk.num_rows;
+        chunk.dtype_id       = column_types[col_idx].id();
+        chunk.decimal_scale  = _metadata.per_file_metadata[stripe.source_idx]
+                                .ff.types[columns_level[col_idx].id]
+                                .scale.value_or(0);
+
+        chunk.rowgroup_id   = rowgroup_id;
+        chunk.dtype_len     = (column_types[col_idx].id() == type_id::STRING)
+                                ? sizeof(string_index_pair)
+                              : ((column_types[col_idx].id() == type_id::LIST) or
+                             (column_types[col_idx].id() == type_id::STRUCT))
+                                ? sizeof(size_type)
+                                : cudf::size_of(column_types[col_idx]);
+        chunk.num_rowgroups = stripe_num_rowgroups;
+
+        if (chunk.type_kind == orc::TIMESTAMP) {
+          chunk.timestamp_type_id = _options.timestamp_type.id();
         }
-        // Update chunks to reference streams pointers
-        for (std::size_t col_idx = 0; col_idx < num_columns; col_idx++) {
-          auto& chunk = chunks[stripe_idx][col_idx];
-          // start row, number of rows in a each stripe and total number of rows
-          // may change in lower levels of nesting
-          chunk.start_row = (level == 0)
-                              ? stripe_start_row
-                              : col_meta.child_start_row[stripe_idx * num_columns + col_idx];
-          chunk.num_rows =
-            (level == 0) ? stripe_info->numberOfRows
-                         : col_meta.num_child_rows_per_stripe[stripe_idx * num_columns + col_idx];
-          chunk.column_num_rows = (level == 0) ? rows_to_read : col_meta.num_child_rows[col_idx];
-          chunk.parent_validity_info =
-            (level == 0) ? column_validity_info{} : col_meta.parent_column_data[col_idx];
-          chunk.parent_null_count_prefix_sums =
-            (level == 0)
-              ? nullptr
-              : null_count_prefix_sums[level - 1][col_meta.parent_column_index[col_idx]].data();
-          chunk.encoding_kind = stripe_footer->columns[columns_level[col_idx].id].kind;
-          chunk.type_kind     = _metadata.per_file_metadata[stripe_source_mapping.source_idx]
-                              .ff.types[columns_level[col_idx].id]
-                              .kind;
-          // num_child_rows for a struct column will be same, for other nested types it will be
-          // calculated.
-          chunk.num_child_rows = (chunk.type_kind != orc::STRUCT) ? 0 : chunk.num_rows;
-          chunk.dtype_id       = column_types[col_idx].id();
-          chunk.decimal_scale  = _metadata.per_file_metadata[stripe_source_mapping.source_idx]
-                                  .ff.types[columns_level[col_idx].id]
-                                  .scale.value_or(0);
-
-          chunk.rowgroup_id   = rowgroup_id;
-          chunk.dtype_len     = (column_types[col_idx].id() == type_id::STRING)
-                                  ? sizeof(string_index_pair)
-                                : ((column_types[col_idx].id() == type_id::LIST) or
-                               (column_types[col_idx].id() == type_id::STRUCT))
-                                  ? sizeof(size_type)
-                                  : cudf::size_of(column_types[col_idx]);
-          chunk.num_rowgroups = stripe_num_rowgroups;
-          if (chunk.type_kind == orc::TIMESTAMP) { chunk.timestamp_type_id = _timestamp_type.id(); }
-          if (not is_stripe_data_empty) {
-            for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) {
-              chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos;
-            }
+        if (not is_stripe_data_empty) {
+          for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) {
+            chunk.streams[k] =
+              dst_base + stream_info[chunk.strm_id[k] + stream_range.begin].dst_pos;
           }
         }
-        stripe_start_row += num_rows_per_stripe;
-        num_rowgroups += stripe_num_rowgroups;
-
-        stripe_idx++;
       }
-    }
-    for (auto& task : read_tasks) {
-      CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read.");
+
+      stripe_start_row += num_rows_in_stripe;
+      num_rowgroups += stripe_num_rowgroups;
     }
 
     if (stripe_data.empty()) { continue; }
 
-    // Process dataset chunk pages into output columns
+    // Process dataset chunks into output columns.
     auto row_groups =
-      cudf::detail::hostdevice_2dvector<gpu::RowGroup>(num_rowgroups, num_columns, _stream);
+      cudf::detail::hostdevice_2dvector<gpu::RowGroup>(num_rowgroups, num_lvl_columns, _stream);
     if (level > 0 and row_groups.size().first) {
       cudf::host_span<gpu::RowGroup> row_groups_span(row_groups.base_host_ptr(),
-                                                     num_rowgroups * num_columns);
+                                                     num_rowgroups * num_lvl_columns);
       auto& rw_grp_meta = col_meta.rwgrp_meta;
 
       // Update start row and num rows per row group
@@ -980,19 +943,31 @@ void reader::impl::prepare_data(int64_t skip_rows,
                        return meta;
                      });
     }
-    // Setup row group descriptors if using indexes
+
+    // Setup row group descriptors if using indexes.
     if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) {
-      auto decomp_data = decompress_stripe_data(*_metadata.per_file_metadata[0].decompressor,
+      auto compinfo = cudf::detail::hostdevice_span<gpu::CompressedStreamInfo>(
+        hd_compinfo.begin(), hd_compinfo.d_begin(), stream_range.size());
+      auto decomp_data = decompress_stripe_data(load_stripe_range,
+                                                stream_range,
+                                                stripe_count,
+                                                compinfo,
+                                                _file_itm_data.compinfo_map,
+                                                *_metadata.per_file_metadata[0].decompressor,
                                                 stripe_data,
                                                 stream_info,
                                                 chunks,
                                                 row_groups,
-                                                total_num_stripes,
                                                 _metadata.get_row_index_stride(),
                                                 level == 0,
                                                 _stream);
-      stripe_data.clear();
-      stripe_data.push_back(std::move(decomp_data));
+
+      // Just save the decompressed data and clear out the raw data to free up memory.
+      stripe_data[stripe_start - load_stripe_start] = std::move(decomp_data);
+      for (std::size_t i = 1; i < stripe_count; ++i) {
+        stripe_data[i + stripe_start - load_stripe_start] = {};
+      }
+
     } else {
       if (row_groups.size().first) {
         chunks.host_to_device_async(_stream);
@@ -1001,34 +976,38 @@ void reader::impl::prepare_data(int64_t skip_rows,
         gpu::ParseRowGroupIndex(row_groups.base_device_ptr(),
                                 nullptr,
                                 chunks.base_device_ptr(),
-                                num_columns,
-                                total_num_stripes,
+                                num_lvl_columns,
+                                stripe_count,
                                 _metadata.get_row_index_stride(),
                                 level == 0,
                                 _stream);
       }
     }
 
+    _out_buffers[level].resize(0);
+
     for (std::size_t i = 0; i < column_types.size(); ++i) {
       bool is_nullable = false;
-      for (std::size_t j = 0; j < total_num_stripes; ++j) {
+      for (std::size_t j = 0; j < stripe_count; ++j) {
         if (chunks[j][i].strm_len[gpu::CI_PRESENT] != 0) {
           is_nullable = true;
           break;
         }
       }
-      auto is_list_type = (column_types[i].id() == type_id::LIST);
-      auto n_rows       = (level == 0) ? rows_to_read : col_meta.num_child_rows[i];
-      // For list column, offset column will be always size + 1
-      if (is_list_type) n_rows++;
-      _out_buffers[level].emplace_back(column_types[i], n_rows, is_nullable, _stream, _mr);
+
+      auto const is_list_type = (column_types[i].id() == type_id::LIST);
+      auto const n_rows       = (level == 0) ? rows_to_decode : col_meta.num_child_rows[i];
+
+      // For list column, offset column will be always size + 1.
+      _out_buffers[level].emplace_back(
+        column_types[i], is_list_type ? n_rows + 1 : n_rows, is_nullable, _stream, _mr);
     }
 
     decode_stream_data(num_dict_entries,
                        rows_to_skip,
                        _metadata.get_row_index_stride(),
                        level,
-                       tz_table->view(),
+                       *tz_table_dptr,
                        chunks,
                        row_groups,
                        _out_buffers[level],
@@ -1036,8 +1015,9 @@ void reader::impl::prepare_data(int64_t skip_rows,
                        _mr);
 
     if (nested_cols.size()) {
-      // Extract information to process nested child columns
-      scan_null_counts(chunks, null_count_prefix_sums[level], _stream);
+      // Extract information to process nested child columns.
+      scan_null_counts(
+        chunks, null_count_prefix_sums.data() + num_processed_lvl_columns * stripe_count, _stream);
 
       row_groups.device_to_host_sync(_stream);
       aggregate_child_meta(
@@ -1055,7 +1035,48 @@ void reader::impl::prepare_data(int64_t skip_rows,
 
       if (not buff_data.empty()) { generate_offsets_for_list(buff_data, _stream); }
     }
+    num_processed_prev_lvl_columns = num_processed_lvl_columns;
+    num_processed_lvl_columns += num_lvl_columns;
   }  // end loop level
+
+  // Now generate a table from the decoded result.
+  std::vector<std::unique_ptr<column>> out_columns;
+  _out_metadata = get_meta_with_user_data();
+  std::transform(
+    _selected_columns.levels[0].begin(),
+    _selected_columns.levels[0].end(),
+    std::back_inserter(out_columns),
+    [&](auto const& orc_col_meta) {
+      _out_metadata.schema_info.emplace_back("");
+      auto col_buffer = assemble_buffer(
+        orc_col_meta.id, 0, *_col_meta, _metadata, _selected_columns, _out_buffers, _stream, _mr);
+      return make_column(col_buffer, &_out_metadata.schema_info.back(), std::nullopt, _stream);
+    });
+  _chunk_read_data.decoded_table = std::make_unique<table>(std::move(out_columns));
+
+  // Free up temp memory used for decoding.
+  for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) {
+    _out_buffers[level].resize(0);
+
+    auto& stripe_data = _file_itm_data.lvl_stripe_data[level];
+    if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) {
+      stripe_data[stripe_start - load_stripe_start] = {};
+    } else {
+      for (std::size_t i = 0; i < stripe_count; ++i) {
+        stripe_data[i + stripe_start - load_stripe_start] = {};
+      }
+    }
+  }
+
+  // Output table range is reset to start from the first position.
+  _chunk_read_data.curr_output_table_range = 0;
+
+  // Split the decoded table into ranges that can be output as chunks having size within the given
+  // output size limit.
+  _chunk_read_data.output_table_ranges = find_table_splits(_chunk_read_data.decoded_table->view(),
+                                                           _chunk_read_data.output_row_granularity,
+                                                           _chunk_read_data.chunk_read_limit,
+                                                           _stream);
 }
 
 }  // namespace cudf::io::orc::detail
diff --git a/cpp/src/io/orc/reader_impl_helpers.hpp b/cpp/src/io/orc/reader_impl_helpers.hpp
index 6645eecbd29..a563fb19e15 100644
--- a/cpp/src/io/orc/reader_impl_helpers.hpp
+++ b/cpp/src/io/orc/reader_impl_helpers.hpp
@@ -16,9 +16,9 @@
 
 #pragma once
 
-#include "aggregate_orc_metadata.hpp"
+#include "io/orc/aggregate_orc_metadata.hpp"
+#include "io/orc/orc.hpp"
 #include "io/utilities/column_buffer.hpp"
-#include "orc.hpp"
 
 #include <cudf/io/orc.hpp>
 
diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp
index 04a22b41247..c9212334a96 100644
--- a/cpp/src/io/parquet/compact_protocol_reader.cpp
+++ b/cpp/src/io/parquet/compact_protocol_reader.cpp
@@ -17,6 +17,7 @@
 #include "compact_protocol_reader.hpp"
 
 #include "parquet.hpp"
+#include "parquet_common.hpp"
 
 #include <cudf/utilities/error.hpp>
 
@@ -652,6 +653,9 @@ void CompactProtocolReader::read(ColumnChunkMetaData* c)
 {
   using optional_size_statistics =
     parquet_field_optional<SizeStatistics, parquet_field_struct<SizeStatistics>>;
+  using optional_list_enc_stats =
+    parquet_field_optional<std::vector<PageEncodingStats>,
+                           parquet_field_struct_list<PageEncodingStats>>;
   auto op = std::make_tuple(parquet_field_enum<Type>(1, c->type),
                             parquet_field_enum_list(2, c->encodings),
                             parquet_field_string_list(3, c->path_in_schema),
@@ -663,6 +667,7 @@ void CompactProtocolReader::read(ColumnChunkMetaData* c)
                             parquet_field_int64(10, c->index_page_offset),
                             parquet_field_int64(11, c->dictionary_page_offset),
                             parquet_field_struct(12, c->statistics),
+                            optional_list_enc_stats(13, c->encoding_stats),
                             optional_size_statistics(16, c->size_statistics));
   function_builder(this, op);
 }
@@ -758,13 +763,16 @@ void CompactProtocolReader::read(Statistics* s)
 {
   using optional_binary = parquet_field_optional<std::vector<uint8_t>, parquet_field_binary>;
   using optional_int64  = parquet_field_optional<int64_t, parquet_field_int64>;
+  using optional_bool   = parquet_field_optional<bool, parquet_field_bool>;
 
   auto op = std::make_tuple(optional_binary(1, s->max),
                             optional_binary(2, s->min),
                             optional_int64(3, s->null_count),
                             optional_int64(4, s->distinct_count),
                             optional_binary(5, s->max_value),
-                            optional_binary(6, s->min_value));
+                            optional_binary(6, s->min_value),
+                            optional_bool(7, s->is_max_value_exact),
+                            optional_bool(8, s->is_min_value_exact));
   function_builder(this, op);
 }
 
@@ -774,6 +782,14 @@ void CompactProtocolReader::read(ColumnOrder* c)
   function_builder(this, op);
 }
 
+void CompactProtocolReader::read(PageEncodingStats* s)
+{
+  auto op = std::make_tuple(parquet_field_enum<PageType>(1, s->page_type),
+                            parquet_field_enum<Encoding>(2, s->encoding),
+                            parquet_field_int32(3, s->count));
+  function_builder(this, op);
+}
+
 void CompactProtocolReader::read(SortingColumn* s)
 {
   auto op = std::make_tuple(parquet_field_int32(1, s->column_idx),
diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp
index 2ad336a3052..bcc9adfc8c0 100644
--- a/cpp/src/io/parquet/compact_protocol_reader.hpp
+++ b/cpp/src/io/parquet/compact_protocol_reader.hpp
@@ -120,6 +120,7 @@ class CompactProtocolReader {
   void read(ColumnIndex* c);
   void read(Statistics* s);
   void read(ColumnOrder* c);
+  void read(PageEncodingStats* s);
   void read(SortingColumn* s);
 
  public:
diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp
index 1262ca1926d..14c99f728de 100644
--- a/cpp/src/io/parquet/compact_protocol_writer.cpp
+++ b/cpp/src/io/parquet/compact_protocol_writer.cpp
@@ -188,6 +188,7 @@ size_t CompactProtocolWriter::write(ColumnChunkMetaData const& s)
   if (s.index_page_offset != 0) { c.field_int(10, s.index_page_offset); }
   if (s.dictionary_page_offset != 0) { c.field_int(11, s.dictionary_page_offset); }
   c.field_struct(12, s.statistics);
+  if (s.encoding_stats.has_value()) { c.field_struct_list(13, s.encoding_stats.value()); }
   if (s.size_statistics.has_value()) { c.field_struct(16, s.size_statistics.value()); }
   return c.value();
 }
@@ -201,6 +202,8 @@ size_t CompactProtocolWriter::write(Statistics const& s)
   if (s.distinct_count.has_value()) { c.field_int(4, s.distinct_count.value()); }
   if (s.max_value.has_value()) { c.field_binary(5, s.max_value.value()); }
   if (s.min_value.has_value()) { c.field_binary(6, s.min_value.value()); }
+  if (s.is_max_value_exact.has_value()) { c.field_bool(7, s.is_max_value_exact.value()); }
+  if (s.is_min_value_exact.has_value()) { c.field_bool(8, s.is_min_value_exact.value()); }
   return c.value();
 }
 
@@ -248,6 +251,15 @@ size_t CompactProtocolWriter::write(ColumnOrder const& co)
   return c.value();
 }
 
+size_t CompactProtocolWriter::write(PageEncodingStats const& enc)
+{
+  CompactProtocolFieldWriter c(*this);
+  c.field_int(1, static_cast<int32_t>(enc.page_type));
+  c.field_int(2, static_cast<int32_t>(enc.encoding));
+  c.field_int(3, enc.count);
+  return c.value();
+}
+
 size_t CompactProtocolWriter::write(SortingColumn const& sc)
 {
   CompactProtocolFieldWriter c(*this);
diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp b/cpp/src/io/parquet/compact_protocol_writer.hpp
index 2e39abadd24..c2e6178acbf 100644
--- a/cpp/src/io/parquet/compact_protocol_writer.hpp
+++ b/cpp/src/io/parquet/compact_protocol_writer.hpp
@@ -53,6 +53,7 @@ class CompactProtocolWriter {
   size_t write(OffsetIndex const&);
   size_t write(SizeStatistics const&);
   size_t write(ColumnOrder const&);
+  size_t write(PageEncodingStats const&);
   size_t write(SortingColumn const&);
 
  protected:
diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh
index 0c139fced24..b1f8e6dd5fe 100644
--- a/cpp/src/io/parquet/page_decode.cuh
+++ b/cpp/src/io/parquet/page_decode.cuh
@@ -122,7 +122,7 @@ struct null_count_back_copier {
  */
 constexpr bool is_string_col(PageInfo const& page, device_span<ColumnChunkDesc const> chunks)
 {
-  if (page.flags & PAGEINFO_FLAGS_DICTIONARY != 0) { return false; }
+  if ((page.flags & PAGEINFO_FLAGS_DICTIONARY) != 0) { return false; }
   auto const& col = chunks[page.chunk_idx];
   return is_string_col(col);
 }
@@ -1298,9 +1298,13 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s,
       // be made to is_supported_encoding() in reader_impl_preprocess.cu
       switch (s->page.encoding) {
         case Encoding::PLAIN_DICTIONARY:
-        case Encoding::RLE_DICTIONARY:
+        case Encoding::RLE_DICTIONARY: {
           // RLE-packed dictionary indices, first byte indicates index length in bits
-          if (s->col.physical_type == BYTE_ARRAY && s->col.str_dict_index != nullptr) {
+          auto const is_decimal =
+            s->col.logical_type.has_value() and s->col.logical_type->type == LogicalType::DECIMAL;
+          if ((s->col.physical_type == BYTE_ARRAY or
+               s->col.physical_type == FIXED_LEN_BYTE_ARRAY) and
+              not is_decimal and s->col.str_dict_index != nullptr) {
             // String dictionary: use index
             s->dict_base = reinterpret_cast<uint8_t const*>(s->col.str_dict_index);
             s->dict_size = s->col.dict_page->num_input_values * sizeof(string_index_pair);
@@ -1314,7 +1318,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s,
           if (s->dict_bits > 32 || (!s->dict_base && s->col.dict_page->num_input_values > 0)) {
             s->set_error_code(decode_error::INVALID_DICT_WIDTH);
           }
-          break;
+        } break;
         case Encoding::PLAIN:
         case Encoding::BYTE_STREAM_SPLIT:
           s->dict_size = static_cast<int32_t>(end - cur);
diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu
index da1bbaebd73..0c9d4e77f0c 100644
--- a/cpp/src/io/parquet/page_delta_decode.cu
+++ b/cpp/src/io/parquet/page_delta_decode.cu
@@ -579,15 +579,18 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
     __syncthreads();
   }
 
-  // now turn array of lengths into offsets
-  int value_count = nesting_info_base[leaf_level_index].value_count;
+  // Now turn the array of lengths into offsets, but skip if this is a large string column. In the
+  // latter case, offsets will be computed during string column creation.
+  if (not s->col.is_large_string_col) {
+    int value_count = nesting_info_base[leaf_level_index].value_count;
 
-  // if no repetition we haven't calculated start/end bounds and instead just skipped
-  // values until we reach first_row. account for that here.
-  if (!has_repetition) { value_count -= s->first_row; }
+    // if no repetition we haven't calculated start/end bounds and instead just skipped
+    // values until we reach first_row. account for that here.
+    if (!has_repetition) { value_count -= s->first_row; }
 
-  auto const offptr = reinterpret_cast<size_type*>(nesting_info_base[leaf_level_index].data_out);
-  block_excl_sum<decode_block_size>(offptr, value_count, s->page.str_offset);
+    auto const offptr = reinterpret_cast<size_type*>(nesting_info_base[leaf_level_index].data_out);
+    block_excl_sum<decode_block_size>(offptr, value_count, s->page.str_offset);
+  }
 
   if (t == 0 and s->error != 0) { set_error(s->error, error_code); }
 }
@@ -738,15 +741,18 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
     __syncthreads();
   }
 
-  // now turn array of lengths into offsets
-  int value_count = nesting_info_base[leaf_level_index].value_count;
+  // Now turn the array of lengths into offsets, but skip if this is a large string column. In the
+  // latter case, offsets will be computed during string column creation.
+  if (not s->col.is_large_string_col) {
+    int value_count = nesting_info_base[leaf_level_index].value_count;
 
-  // if no repetition we haven't calculated start/end bounds and instead just skipped
-  // values until we reach first_row. account for that here.
-  if (!has_repetition) { value_count -= s->first_row; }
+    // if no repetition we haven't calculated start/end bounds and instead just skipped
+    // values until we reach first_row. account for that here.
+    if (!has_repetition) { value_count -= s->first_row; }
 
-  auto const offptr = reinterpret_cast<size_type*>(nesting_info_base[leaf_level_index].data_out);
-  block_excl_sum<decode_block_size>(offptr, value_count, s->page.str_offset);
+    auto const offptr = reinterpret_cast<size_type*>(nesting_info_base[leaf_level_index].data_out);
+    block_excl_sum<decode_block_size>(offptr, value_count, s->page.str_offset);
+  }
 
   // finally, copy the string data into place
   auto const dst = nesting_info_base[leaf_level_index].string_out;
diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu
index 227f13db60e..11b18579c58 100644
--- a/cpp/src/io/parquet/page_enc.cu
+++ b/cpp/src/io/parquet/page_enc.cu
@@ -2944,6 +2944,9 @@ __device__ uint8_t* EncodeStatistics(uint8_t* start,
     auto const [min_ptr, min_size] =
       get_extremum(&s->min_value, dtype, scratch, true, NO_TRUNC_STATS);
     encoder.field_binary(6, min_ptr, min_size);
+    // cudf min/max statistics are always exact (i.e. not truncated)
+    encoder.field_bool(7, true);
+    encoder.field_bool(8, true);
   }
   encoder.end(&end);
   return end;
diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu
index 6c6afde29e4..cf0dd85e490 100644
--- a/cpp/src/io/parquet/page_hdr.cu
+++ b/cpp/src/io/parquet/page_hdr.cu
@@ -538,17 +538,28 @@ CUDF_KERNEL void __launch_bounds__(128)
     int pos = 0, cur = 0;
     for (int i = 0; i < num_entries; i++) {
       int len = 0;
-      if (cur + 4 <= dict_size) {
-        len = dict[cur + 0] | (dict[cur + 1] << 8) | (dict[cur + 2] << 16) | (dict[cur + 3] << 24);
-        if (len >= 0 && cur + 4 + len <= dict_size) {
+      if (ck->physical_type == FIXED_LEN_BYTE_ARRAY) {
+        if (cur + ck->type_length <= dict_size) {
+          len = ck->type_length;
           pos = cur;
-          cur = cur + 4 + len;
+          cur += len;
         } else {
           cur = dict_size;
         }
+      } else {
+        if (cur + 4 <= dict_size) {
+          len =
+            dict[cur + 0] | (dict[cur + 1] << 8) | (dict[cur + 2] << 16) | (dict[cur + 3] << 24);
+          if (len >= 0 && cur + 4 + len <= dict_size) {
+            pos = cur + 4;
+            cur = pos + len;
+          } else {
+            cur = dict_size;
+          }
+        }
       }
       // TODO: Could store 8 entries in shared mem, then do a single warp-wide store
-      dict_index[i].first  = reinterpret_cast<char const*>(dict + pos + 4);
+      dict_index[i].first  = reinterpret_cast<char const*>(dict + pos);
       dict_index[i].second = len;
     }
   }
diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu
index 5ba813f518f..cf1dc58b06a 100644
--- a/cpp/src/io/parquet/page_string_decode.cu
+++ b/cpp/src/io/parquet/page_string_decode.cu
@@ -955,7 +955,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
 {
   using cudf::detail::warp_size;
   __shared__ __align__(16) page_state_s state_g;
-  __shared__ __align__(4) size_type last_offset;
+  __shared__ size_t last_offset;
   __shared__ __align__(16)
     page_state_buffers_s<rolling_buf_size, rolling_buf_size, rolling_buf_size>
       state_buffers;
@@ -1054,9 +1054,9 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
                               ? gpuGetStringData(s, sb, src_pos + skipped_leaf_values + i)
                               : cuda::std::pair<char const*, size_t>{nullptr, 0};
 
-          __shared__ cub::WarpScan<size_type>::TempStorage temp_storage;
-          size_type offset, warp_total;
-          cub::WarpScan<size_type>(temp_storage).ExclusiveSum(len, offset, warp_total);
+          __shared__ cub::WarpScan<size_t>::TempStorage temp_storage;
+          size_t offset, warp_total;
+          cub::WarpScan<size_t>(temp_storage).ExclusiveSum(len, offset, warp_total);
           offset += last_offset;
 
           // choose a character parallel string copy when the average string is longer than a warp
@@ -1075,10 +1075,10 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
             }
             __syncwarp();
           } else if (use_char_ll) {
-            __shared__ __align__(8) uint8_t const* pointers[warp_size];
-            __shared__ __align__(4) size_type offsets[warp_size];
-            __shared__ __align__(4) int dsts[warp_size];
-            __shared__ __align__(4) int lengths[warp_size];
+            __shared__ uint8_t const* pointers[warp_size];
+            __shared__ size_t offsets[warp_size];
+            __shared__ int dsts[warp_size];
+            __shared__ int lengths[warp_size];
 
             offsets[me]  = offset;
             pointers[me] = reinterpret_cast<uint8_t const*>(ptr);
@@ -1119,15 +1119,18 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
     __syncthreads();
   }
 
-  // now turn array of lengths into offsets
-  int value_count = nesting_info_base[leaf_level_index].value_count;
+  // Now turn the array of lengths into offsets, but skip if this is a large string column. In the
+  // latter case, offsets will be computed during string column creation.
+  if (not s->col.is_large_string_col) {
+    int value_count = nesting_info_base[leaf_level_index].value_count;
 
-  // if no repetition we haven't calculated start/end bounds and instead just skipped
-  // values until we reach first_row. account for that here.
-  if (!has_repetition) { value_count -= s->first_row; }
+    // if no repetition we haven't calculated start/end bounds and instead just skipped
+    // values until we reach first_row. account for that here.
+    if (!has_repetition) { value_count -= s->first_row; }
 
-  auto const offptr = reinterpret_cast<size_type*>(nesting_info_base[leaf_level_index].data_out);
-  block_excl_sum<decode_block_size>(offptr, value_count, s->page.str_offset);
+    auto const offptr = reinterpret_cast<size_type*>(nesting_info_base[leaf_level_index].data_out);
+    block_excl_sum<decode_block_size>(offptr, value_count, s->page.str_offset);
+  }
 
   if (t == 0 and s->error != 0) { set_error(s->error, error_code); }
 }
diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp
index 7f00d63b9c2..756726945cf 100644
--- a/cpp/src/io/parquet/parquet.hpp
+++ b/cpp/src/io/parquet/parquet.hpp
@@ -259,6 +259,10 @@ struct Statistics {
   thrust::optional<std::vector<uint8_t>> max_value;
   // min value for column determined by ColumnOrder
   thrust::optional<std::vector<uint8_t>> min_value;
+  // If true, max_value is the actual maximum value for a column
+  thrust::optional<bool> is_max_value_exact;
+  // If true, min_value is the actual minimum value for a column
+  thrust::optional<bool> is_min_value_exact;
 };
 
 /**
@@ -322,6 +326,15 @@ struct ColumnIndex {
   thrust::optional<std::vector<int64_t>> definition_level_histogram;
 };
 
+/**
+ * @brief Thrift-derived struct describing page encoding statistics
+ */
+struct PageEncodingStats {
+  PageType page_type;  // The page type (data/dic/...)
+  Encoding encoding;   // Encoding of the page
+  int32_t count;       // Number of pages of this type with this encoding
+};
+
 /**
  * @brief Thrift-derived struct describing column sort order
  */
@@ -335,21 +348,36 @@ struct SortingColumn {
  * @brief Thrift-derived struct describing a column chunk
  */
 struct ColumnChunkMetaData {
+  // Type of this column
   Type type = BOOLEAN;
+  // Set of all encodings used for this column. The purpose is to validate
+  // whether we can decode those pages.
   std::vector<Encoding> encodings;
+  // Path in schema
   std::vector<std::string> path_in_schema;
-  Compression codec  = UNCOMPRESSED;
+  // Compression codec
+  Compression codec = UNCOMPRESSED;
+  // Number of values in this column
   int64_t num_values = 0;
-  int64_t total_uncompressed_size =
-    0;  // total byte size of all uncompressed pages in this column chunk (including the headers)
-  int64_t total_compressed_size =
-    0;  // total byte size of all compressed pages in this column chunk (including the headers)
-  int64_t data_page_offset  = 0;  // Byte offset from beginning of file to first data page
-  int64_t index_page_offset = 0;  // Byte offset from beginning of file to root index page
-  int64_t dictionary_page_offset =
-    0;                    // Byte offset from the beginning of file to first (only) dictionary page
-  Statistics statistics;  // Encoded chunk-level statistics
-  thrust::optional<SizeStatistics> size_statistics;  // Size statistics for the chunk
+  // Total byte size of all uncompressed pages in this column chunk (including the headers)
+  int64_t total_uncompressed_size = 0;
+  // Total byte size of all compressed pages in this column chunk (including the headers)
+  int64_t total_compressed_size = 0;
+  // Byte offset from beginning of file to first data page
+  int64_t data_page_offset = 0;
+  // Byte offset from beginning of file to root index page
+  int64_t index_page_offset = 0;
+  // Byte offset from the beginning of file to first (only) dictionary page
+  int64_t dictionary_page_offset = 0;
+  // Optional statistics for this column chunk
+  Statistics statistics;
+  // Set of all encodings used for pages in this column chunk. This information can be used to
+  // determine if all data pages are dictionary encoded for example.
+  thrust::optional<std::vector<PageEncodingStats>> encoding_stats;
+  // Optional statistics to help estimate total memory when converted to in-memory representations.
+  // The histograms contained in these statistics can also be useful in some cases for more
+  // fine-grained nullability/list length filter pushdown.
+  thrust::optional<SizeStatistics> size_statistics;
 };
 
 /**
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index c06fb63acda..3b18175dccd 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -326,8 +326,8 @@ struct PageInfo {
   int32_t skipped_leaf_values;
   // for string columns only, the size of all the chars in the string for
   // this page. only valid/computed during the base preprocess pass
+  size_t str_offset;  // offset into string data for this page
   int32_t str_bytes;
-  int32_t str_offset;   // offset into string data for this page
   bool has_page_index;  // true if str_bytes, num_valids, etc are derivable from page indexes
 
   // nesting information (input/output) for each page. this array contains
@@ -420,7 +420,8 @@ struct ColumnChunkDesc {
       src_col_schema(src_col_schema_),
       h_chunk_info(chunk_info_),
       list_bytes_per_row_est(list_bytes_per_row_est_),
-      is_strings_to_cat(strings_to_categorical_)
+      is_strings_to_cat(strings_to_categorical_),
+      is_large_string_col(false)
   {
   }
 
@@ -454,7 +455,8 @@ struct ColumnChunkDesc {
 
   float list_bytes_per_row_est{};  // for LIST columns, an estimate on number of bytes per row
 
-  bool is_strings_to_cat{};  // convert strings to hashes
+  bool is_strings_to_cat{};    // convert strings to hashes
+  bool is_large_string_col{};  // `true` if string data uses 64-bit offsets
 };
 
 /**
diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp
index b7172f5ba67..0602b5ec007 100644
--- a/cpp/src/io/parquet/reader_impl.cpp
+++ b/cpp/src/io/parquet/reader_impl.cpp
@@ -22,6 +22,7 @@
 #include <cudf/detail/transform.hpp>
 #include <cudf/detail/utilities/stream_pool.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/strings/detail/utilities.hpp>
 
 #include <rmm/resource_ref.hpp>
 
@@ -99,11 +100,21 @@ void reader::impl::decode_page_data(bool uses_custom_row_bounds, size_t skip_row
     col_string_sizes = calculate_page_string_offsets();
 
     // check for overflow
-    if (std::any_of(col_string_sizes.cbegin(), col_string_sizes.cend(), [](std::size_t sz) {
-          return sz > std::numeric_limits<size_type>::max();
-        })) {
+    auto const threshold         = static_cast<size_t>(strings::detail::get_offset64_threshold());
+    auto const has_large_strings = std::any_of(col_string_sizes.cbegin(),
+                                               col_string_sizes.cend(),
+                                               [=](std::size_t sz) { return sz > threshold; });
+    if (has_large_strings and not strings::detail::is_large_strings_enabled()) {
       CUDF_FAIL("String column exceeds the column size limit", std::overflow_error);
     }
+
+    // mark any chunks that are large string columns
+    if (has_large_strings) {
+      for (auto& chunk : pass.chunks) {
+        auto const idx = chunk.src_col_index;
+        if (col_string_sizes[idx] > threshold) { chunk.is_large_string_col = true; }
+      }
+    }
   }
 
   // In order to reduce the number of allocations of hostdevice_vector, we allocate a single vector
@@ -348,11 +359,13 @@ void reader::impl::decode_page_data(bool uses_custom_row_bounds, size_t skip_row
       } else if (out_buf.type.id() == type_id::STRING) {
         // need to cap off the string offsets column
         auto const sz = static_cast<size_type>(col_string_sizes[idx]);
-        CUDF_CUDA_TRY(cudaMemcpyAsync(static_cast<size_type*>(out_buf.data()) + out_buf.size,
-                                      &sz,
-                                      sizeof(size_type),
-                                      cudaMemcpyDefault,
-                                      _stream.value()));
+        if (sz <= strings::detail::get_offset64_threshold()) {
+          CUDF_CUDA_TRY(cudaMemcpyAsync(static_cast<size_type*>(out_buf.data()) + out_buf.size,
+                                        &sz,
+                                        sizeof(size_type),
+                                        cudaMemcpyDefault,
+                                        _stream.value()));
+        }
       }
     }
   }
diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp
index 402ccef7a15..c47beb8d7ed 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.cpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.cpp
@@ -205,7 +205,6 @@ void metadata::sanitize_schema()
   // This code attempts to make this less messy for the code that follows.
 
   std::function<void(size_t)> process = [&](size_t schema_idx) -> void {
-    if (schema_idx < 0) { return; }
     auto& schema_elem = schema[schema_idx];
     if (schema_idx != 0 && schema_elem.type == UNDEFINED_TYPE) {
       auto const parent_type = schema[schema_elem.parent_idx].converted_type;
@@ -651,7 +650,10 @@ aggregate_reader_metadata::select_row_groups(
     if (not row_group_indices.empty()) { return std::pair<int64_t, size_type>{}; }
     auto const from_opts = cudf::io::detail::skip_rows_num_rows_from_options(
       skip_rows_opt, num_rows_opt, get_num_rows());
-    return std::pair{static_cast<int64_t>(from_opts.first), from_opts.second};
+    CUDF_EXPECTS(from_opts.second <= static_cast<int64_t>(std::numeric_limits<size_type>::max()),
+                 "Number of reading rows exceeds cudf's column size limit.");
+    return std::pair{static_cast<int64_t>(from_opts.first),
+                     static_cast<size_type>(from_opts.second)};
   }();
 
   if (!row_group_indices.empty()) {
@@ -723,7 +725,6 @@ aggregate_reader_metadata::select_columns(std::optional<std::vector<std::string>
                        int schema_idx,
                        std::vector<cudf::io::detail::inline_column_buffer>& out_col_array,
                        bool has_list_parent) {
-      if (schema_idx < 0) { return false; }
       auto const& schema_elem = get_schema(schema_idx);
 
       // if schema_elem is a stub then it does not exist in the column_name_info and column_buffer
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index 4b7a64ac6ab..55633b97cf4 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -636,6 +636,15 @@ void decode_page_headers(pass_intermediate_data& pass,
   stream.synchronize();
 }
 
+constexpr bool is_string_chunk(ColumnChunkDesc const& chunk)
+{
+  auto const is_decimal =
+    chunk.logical_type.has_value() and chunk.logical_type->type == LogicalType::DECIMAL;
+  auto const is_binary =
+    chunk.physical_type == BYTE_ARRAY or chunk.physical_type == FIXED_LEN_BYTE_ARRAY;
+  return is_binary and not is_decimal;
+}
+
 struct set_str_dict_index_count {
   device_span<size_t> str_dict_index_count;
   device_span<const ColumnChunkDesc> chunks;
@@ -643,8 +652,8 @@ struct set_str_dict_index_count {
   __device__ void operator()(PageInfo const& page)
   {
     auto const& chunk = chunks[page.chunk_idx];
-    if ((page.flags & PAGEINFO_FLAGS_DICTIONARY) && chunk.physical_type == BYTE_ARRAY &&
-        (chunk.num_dict_pages > 0)) {
+    if ((page.flags & PAGEINFO_FLAGS_DICTIONARY) != 0 and chunk.num_dict_pages > 0 and
+        is_string_chunk(chunk)) {
       // there is only ever one dictionary page per chunk, so this is safe to do in parallel.
       str_dict_index_count[page.chunk_idx] = page.num_input_values;
     }
@@ -659,7 +668,7 @@ struct set_str_dict_index_ptr {
   __device__ void operator()(size_t i)
   {
     auto& chunk = chunks[i];
-    if (chunk.physical_type == BYTE_ARRAY && (chunk.num_dict_pages > 0)) {
+    if (chunk.num_dict_pages > 0 and is_string_chunk(chunk)) {
       chunk.str_dict_index = base + str_dict_index_offsets[i];
     }
   }
@@ -1169,10 +1178,10 @@ struct page_to_string_size {
 struct page_offset_output_iter {
   PageInfo* p;
 
-  using value_type        = size_type;
-  using difference_type   = size_type;
-  using pointer           = size_type*;
-  using reference         = size_type&;
+  using value_type        = size_t;
+  using difference_type   = size_t;
+  using pointer           = size_t*;
+  using reference         = size_t&;
   using iterator_category = thrust::output_device_iterator_tag;
 
   __host__ __device__ page_offset_output_iter operator+(int i) { return {p + i}; }
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 5509a33f9f0..24aa630a05f 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -22,6 +22,8 @@
 #include "compact_protocol_reader.hpp"
 #include "compact_protocol_writer.hpp"
 #include "io/comp/nvcomp_adapter.hpp"
+#include "io/parquet/parquet.hpp"
+#include "io/parquet/parquet_gpu.hpp"
 #include "io/statistics/column_statistics.cuh"
 #include "io/utilities/column_utils.cuh"
 #include "io/utilities/config_utils.hpp"
@@ -38,6 +40,7 @@
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/lists/detail/dremel.hpp>
 #include <cudf/lists/lists_column_view.hpp>
+#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/structs/structs_column_view.hpp>
 #include <cudf/table/table_device_view.cuh>
@@ -214,6 +217,53 @@ void update_chunk_encodings(std::vector<Encoding>& encodings, uint32_t enc_mask)
   }
 }
 
+/**
+ * @brief Populate the `encoding_stats` field of a column chunk's metadata.
+ *
+ * @param chunk_meta Metadata to update; its `encodings` list is consulted for the data encoding
+ * @param ck The encoded column chunk whose pages are being summarized
+ * @param is_v2 True if V2 page headers are used
+ */
+void update_chunk_encoding_stats(ColumnChunkMetaData& chunk_meta,
+                                 EncColumnChunk const& ck,
+                                 bool is_v2)
+{
+  // A chunk without pages has nothing to summarize.
+  if (ck.num_pages == 0) { return; }
+
+  // cudf does not mix encodings within a chunk, so the summary is at most one dictionary-page
+  // entry plus one data-page entry. Both are derived from chunk-level encoding information
+  // instead of examining the page data.
+  auto const page_type  = is_v2 ? PageType::DATA_PAGE_V2 : PageType::DATA_PAGE;
+  auto const page_count = static_cast<int32_t>(ck.num_data_pages());
+
+  std::vector<PageEncodingStats> stats;
+  if (not ck.use_dictionary) {
+    // No dictionary page: report the first listed encoding other than RLE (boolean columns aside).
+    for (auto const encoding : chunk_meta.encodings) {
+      if (encoding == Encoding::RLE) { continue; }
+      stats.push_back({page_type, encoding, page_count});
+      break;
+    }
+    // No non-RLE encoding was listed: with V2 headers, assume the data itself is RLE encoded.
+    if (stats.empty() and is_v2 and (ck.encodings & encoding_to_mask(Encoding::RLE)) != 0) {
+      stats.push_back({page_type, Encoding::RLE, page_count});
+    }
+  } else if (is_v2) {
+    // V2: the dictionary page uses PLAIN and the data pages use RLE_DICTIONARY.
+    stats.push_back({PageType::DICTIONARY_PAGE, Encoding::PLAIN, 1});
+    if (page_count > 0) { stats.push_back({page_type, Encoding::RLE_DICTIONARY, page_count}); }
+  } else {
+    // V1: dictionary and data pages both use PLAIN_DICTIONARY.
+    stats.push_back({PageType::DICTIONARY_PAGE, Encoding::PLAIN_DICTIONARY, 1});
+    if (page_count > 0) { stats.push_back({page_type, Encoding::PLAIN_DICTIONARY, page_count}); }
+  }
+
+  // Leave the metadata field untouched when no entry was produced.
+  if (stats.empty()) { return; }
+  chunk_meta.encoding_stats = std::move(stats);
+}
+
 /**
  * @brief Compute size (in bytes) of the data stored in the given column.
  *
@@ -229,8 +279,9 @@ size_t column_size(column_view const& column, rmm::cuda_stream_view stream)
     return size_of(column.type()) * column.size();
   } else if (column.type().id() == type_id::STRING) {
     auto const scol = strings_column_view(column);
-    return cudf::detail::get_value<size_type>(scol.offsets(), column.size(), stream) -
-           cudf::detail::get_value<size_type>(scol.offsets(), 0, stream);
+    return cudf::strings::detail::get_offset_value(
+             scol.offsets(), column.size() + column.offset(), stream) -
+           cudf::strings::detail::get_offset_value(scol.offsets(), column.offset(), stream);
   } else if (column.type().id() == type_id::STRUCT) {
     auto const scol = structs_column_view(column);
     size_t ret      = 0;
@@ -2144,6 +2195,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
         max_write_size = std::max(max_write_size, ck.compressed_size);
 
         update_chunk_encodings(column_chunk_meta.encodings, ck.encodings);
+        update_chunk_encoding_stats(column_chunk_meta, ck, write_v2_headers);
 
         if (ck.ck_stat_size != 0) {
           std::vector<uint8_t> const stats_blob = cudf::detail::make_std_vector_sync(
diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp
index 5dc2291abdc..db84778edc6 100644
--- a/cpp/src/io/utilities/column_buffer.cpp
+++ b/cpp/src/io/utilities/column_buffer.cpp
@@ -69,16 +69,6 @@ void cudf::io::detail::inline_column_buffer::create_string_data(size_t num_bytes
   _string_data = rmm::device_buffer(num_bytes, stream, _mr);
 }
 
-std::unique_ptr<column> cudf::io::detail::inline_column_buffer::make_string_column_impl(
-  rmm::cuda_stream_view stream)
-{
-  // no need for copies, just transfer ownership of the data_buffers to the columns
-  auto offsets_col = std::make_unique<column>(
-    data_type{type_to_id<size_type>()}, size + 1, std::move(_data), rmm::device_buffer{}, 0);
-  return make_strings_column(
-    size, std::move(offsets_col), std::move(_string_data), null_count(), std::move(_null_mask));
-}
-
 namespace {
 
 /**
diff --git a/cpp/src/io/utilities/column_buffer_strings.cu b/cpp/src/io/utilities/column_buffer_strings.cu
new file mode 100644
index 00000000000..4bc303a34a5
--- /dev/null
+++ b/cpp/src/io/utilities/column_buffer_strings.cu
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "column_buffer.hpp"
+
+#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/utilities.hpp>
+#include <cudf/utilities/error.hpp>
+
+namespace cudf::io::detail {
+
+std::unique_ptr<column> cudf::io::detail::inline_column_buffer::make_string_column_impl(
+  rmm::cuda_stream_view stream)
+{
+  auto const offset64_limit = static_cast<size_t>(strings::detail::get_offset64_threshold());
+
+  // Common case: the character data does not exceed the 64-bit offset threshold, so ownership
+  // of the existing buffers is simply transferred to the new column without any copies.
+  if (_string_data.size() <= offset64_limit) {
+    auto offsets = std::make_unique<column>(
+      data_type{type_to_id<size_type>()}, size + 1, std::move(_data), rmm::device_buffer{}, 0);
+    return make_strings_column(
+      size, std::move(offsets), std::move(_string_data), null_count(), std::move(_null_mask));
+  }
+
+  // Over the threshold, _data contains sizes rather than offsets: build new 64-bit offsets.
+  if (not strings::detail::is_large_strings_enabled()) {
+    CUDF_FAIL("String column exceeds the column size limit", std::overflow_error);
+  }
+  auto wide_offsets = make_numeric_column(
+    data_type{type_id::INT64}, size + 1, mask_state::UNALLOCATED, stream, _mr);
+  auto const sizes_begin = static_cast<size_type*>(_data.data());
+  auto const sizes_end   = sizes_begin + size + 1;  // safe: _data is also sized for size + 1
+  auto const offsets_out = wide_offsets->mutable_view().template data<int64_t>();
+  cudf::detail::sizes_to_offsets(sizes_begin, sizes_end, offsets_out, stream);
+  return make_strings_column(
+    size, std::move(wide_offsets), std::move(_string_data), null_count(), std::move(_null_mask));
+}
+
+}  // namespace cudf::io::detail
diff --git a/cpp/src/io/utilities/row_selection.cpp b/cpp/src/io/utilities/row_selection.cpp
index f136cd11ff7..c0bbca39167 100644
--- a/cpp/src/io/utilities/row_selection.cpp
+++ b/cpp/src/io/utilities/row_selection.cpp
@@ -23,20 +23,17 @@
 
 namespace cudf::io::detail {
 
-std::pair<int64_t, size_type> skip_rows_num_rows_from_options(
-  int64_t skip_rows, std::optional<size_type> const& num_rows, int64_t num_source_rows)
+std::pair<int64_t, int64_t> skip_rows_num_rows_from_options(int64_t skip_rows,
+                                                            std::optional<int64_t> const& num_rows,
+                                                            int64_t num_source_rows)
 {
-  auto const rows_to_skip = std::min(skip_rows, num_source_rows);
-  if (not num_rows.has_value()) {
-    CUDF_EXPECTS(num_source_rows - rows_to_skip <= std::numeric_limits<size_type>::max(),
-                 "The requested number of rows exceeds the column size limit",
-                 std::overflow_error);
-    return {rows_to_skip, num_source_rows - rows_to_skip};
-  }
+  auto const rows_to_skip      = std::min(skip_rows, num_source_rows);  // at most the whole input
+  auto const num_rows_can_read = num_source_rows - rows_to_skip;  // rows left after skipping
+  // No explicit row count requested: read everything that remains after the skip.
+  if (not num_rows.has_value()) { return {rows_to_skip, num_rows_can_read}; }
+
   // Limit the number of rows to the end of the input
-  return {
-    rows_to_skip,
-    static_cast<size_type>(std::min<int64_t>(num_rows.value(), num_source_rows - rows_to_skip))};
+  return {rows_to_skip, std::min(num_rows.value(), num_rows_can_read)};
 }
 
 }  // namespace cudf::io::detail
diff --git a/cpp/src/io/utilities/row_selection.hpp b/cpp/src/io/utilities/row_selection.hpp
index 0b5d3aef8bd..7fdcc65d77b 100644
--- a/cpp/src/io/utilities/row_selection.hpp
+++ b/cpp/src/io/utilities/row_selection.hpp
@@ -34,7 +34,8 @@ namespace cudf::io::detail {
  *
  * @throw std::overflow_exception The requested number of rows exceeds the column size limit
  */
-std::pair<int64_t, size_type> skip_rows_num_rows_from_options(
-  int64_t skip_rows, std::optional<size_type> const& num_rows, int64_t num_source_rows);
+std::pair<int64_t, int64_t> skip_rows_num_rows_from_options(int64_t skip_rows,
+                                                            std::optional<int64_t> const& num_rows,
+                                                            int64_t num_source_rows);
 
 }  // namespace cudf::io::detail
diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu
index 095093d08e5..f02dee5f7f5 100644
--- a/cpp/src/join/conditional_join.cu
+++ b/cpp/src/join/conditional_join.cu
@@ -37,6 +37,99 @@
 namespace cudf {
 namespace detail {
 
+// Computes the left-table row indices produced by a conditional LEFT_SEMI_JOIN or
+// LEFT_ANTI_JOIN. When `output_size` is provided it is used as the result size; otherwise the
+// size is counted on the device first. Other join kinds fail if either table is empty.
+std::unique_ptr<rmm::device_uvector<size_type>> conditional_join_anti_semi(
+  table_view const& left,
+  table_view const& right,
+  ast::expression const& binary_predicate,
+  join_kind join_type,
+  std::optional<std::size_t> output_size,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr)
+{
+  if (right.num_rows() == 0) {
+    switch (join_type) {
+      case join_kind::LEFT_ANTI_JOIN:
+        // Nothing on the right can match, so every left row is kept: return all left indices.
+        return std::move(get_trivial_left_join_indices(left, stream, mr).first);
+      case join_kind::LEFT_SEMI_JOIN:
+        return std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr);
+      default: CUDF_FAIL("Invalid join kind."); break;
+    }
+  } else if (left.num_rows() == 0) {
+    switch (join_type) {
+      case join_kind::LEFT_ANTI_JOIN: [[fallthrough]];
+      case join_kind::LEFT_SEMI_JOIN:
+        return std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr);
+      default: CUDF_FAIL("Invalid join kind."); break;
+    }
+  }
+
+  auto const has_nulls = binary_predicate.may_evaluate_null(left, right, stream);
+
+  auto const parser =
+    ast::detail::expression_parser{binary_predicate, left, right, has_nulls, stream, mr};
+  CUDF_EXPECTS(parser.output_type().id() == type_id::BOOL8,
+               "The expression must produce a Boolean output.");
+
+  auto left_table  = table_device_view::create(left, stream);
+  auto right_table = table_device_view::create(right, stream);
+
+  detail::grid_1d const config(left.num_rows(), DEFAULT_JOIN_BLOCK_SIZE);
+  auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block;
+
+  // TODO: Remove the output_size parameter. It is not needed because the
+  // output size is bounded by the size of the left table.
+  std::size_t join_size;
+  if (output_size.has_value()) {
+    join_size = *output_size;
+  } else {
+    // Allocate storage for the counter used to get the size of the join output
+    rmm::device_scalar<std::size_t> size(0, stream, mr);
+    if (has_nulls) {
+      compute_conditional_join_output_size<DEFAULT_JOIN_BLOCK_SIZE, true>
+        <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
+          *left_table, *right_table, join_type, parser.device_expression_data, false, size.data());
+    } else {
+      compute_conditional_join_output_size<DEFAULT_JOIN_BLOCK_SIZE, false>
+        <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
+          *left_table, *right_table, join_type, parser.device_expression_data, false, size.data());
+    }
+    join_size = size.value(stream);
+  }
+
+  rmm::device_scalar<size_type> write_index(0, stream);
+
+  auto left_indices = std::make_unique<rmm::device_uvector<size_type>>(join_size, stream, mr);
+
+  auto const& join_output_l = left_indices->data();
+
+  if (has_nulls) {
+    conditional_join_anti_semi<DEFAULT_JOIN_BLOCK_SIZE, DEFAULT_JOIN_CACHE_SIZE, true>
+      <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
+        *left_table,
+        *right_table,
+        join_type,
+        join_output_l,
+        write_index.data(),
+        parser.device_expression_data,
+        join_size);
+  } else {
+    conditional_join_anti_semi<DEFAULT_JOIN_BLOCK_SIZE, DEFAULT_JOIN_CACHE_SIZE, false>
+      <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
+        *left_table,
+        *right_table,
+        join_type,
+        join_output_l,
+        write_index.data(),
+        parser.device_expression_data,
+        join_size);
+  }
+  return left_indices;
+}
+
 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
           std::unique_ptr<rmm::device_uvector<size_type>>>
 conditional_join(table_view const& left,
@@ -50,9 +143,7 @@ conditional_join(table_view const& left,
   // We can immediately filter out cases where the right table is empty. In
   // some cases, we return all the rows of the left table with a corresponding
   // null index for the right table; in others, we return an empty output.
-  auto right_num_rows{right.num_rows()};
-  auto left_num_rows{left.num_rows()};
-  if (right_num_rows == 0) {
+  if (right.num_rows() == 0) {
     switch (join_type) {
       // Left, left anti, and full all return all the row indices from left
       // with a corresponding NULL from the right.
@@ -67,7 +158,7 @@ conditional_join(table_view const& left,
                          std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr));
       default: CUDF_FAIL("Invalid join kind."); break;
     }
-  } else if (left_num_rows == 0) {
+  } else if (left.num_rows() == 0) {
     switch (join_type) {
       // Left, left anti, left semi, and inner joins all return empty sets.
       case join_kind::LEFT_JOIN:
@@ -101,8 +192,8 @@ conditional_join(table_view const& left,
 
   // For inner joins we support optimizing the join by launching one thread for
   // whichever table is larger rather than always using the left table.
-  auto swap_tables = (join_type == join_kind::INNER_JOIN) && (right_num_rows > left_num_rows);
-  detail::grid_1d const config(swap_tables ? right_num_rows : left_num_rows,
+  auto swap_tables = (join_type == join_kind::INNER_JOIN) && (right.num_rows() > left.num_rows());
+  detail::grid_1d const config(swap_tables ? right.num_rows() : left.num_rows(),
                                DEFAULT_JOIN_BLOCK_SIZE);
   auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block;
   join_kind const kernel_join_type =
@@ -187,7 +278,7 @@ conditional_join(table_view const& left,
   // by any row in the left table.
   if (join_type == join_kind::FULL_JOIN) {
     auto complement_indices = detail::get_left_join_indices_complement(
-      join_indices.second, left_num_rows, right_num_rows, stream, mr);
+      join_indices.second, left.num_rows(), right.num_rows(), stream, mr);
     join_indices = detail::concatenate_vector_pairs(join_indices, complement_indices, stream);
   }
   return join_indices;
@@ -210,21 +301,19 @@ std::size_t compute_conditional_join_output_size(table_view const& left,
   // We can immediately filter out cases where one table is empty. In
   // some cases, we return all the rows of the other table with a corresponding
   // null index for the empty table; in others, we return an empty output.
-  auto right_num_rows{right.num_rows()};
-  auto left_num_rows{left.num_rows()};
-  if (right_num_rows == 0) {
+  if (right.num_rows() == 0) {
     switch (join_type) {
       // Left, left anti, and full all return all the row indices from left
       // with a corresponding NULL from the right.
       case join_kind::LEFT_JOIN:
       case join_kind::LEFT_ANTI_JOIN:
-      case join_kind::FULL_JOIN: return left_num_rows;
+      case join_kind::FULL_JOIN: return left.num_rows();
       // Inner and left semi joins return empty output because no matches can exist.
       case join_kind::INNER_JOIN:
       case join_kind::LEFT_SEMI_JOIN: return 0;
       default: CUDF_FAIL("Invalid join kind."); break;
     }
-  } else if (left_num_rows == 0) {
+  } else if (left.num_rows() == 0) {
     switch (join_type) {
       // Left, left anti, left semi, and inner joins all return empty sets.
       case join_kind::LEFT_JOIN:
@@ -232,7 +321,7 @@ std::size_t compute_conditional_join_output_size(table_view const& left,
       case join_kind::INNER_JOIN:
       case join_kind::LEFT_SEMI_JOIN: return 0;
       // Full joins need to return the trivial complement.
-      case join_kind::FULL_JOIN: return right_num_rows;
+      case join_kind::FULL_JOIN: return right.num_rows();
       default: CUDF_FAIL("Invalid join kind."); break;
     }
   }
@@ -254,8 +343,8 @@ std::size_t compute_conditional_join_output_size(table_view const& left,
 
   // For inner joins we support optimizing the join by launching one thread for
   // whichever table is larger rather than always using the left table.
-  auto swap_tables = (join_type == join_kind::INNER_JOIN) && (right_num_rows > left_num_rows);
-  detail::grid_1d const config(swap_tables ? right_num_rows : left_num_rows,
+  auto swap_tables = (join_type == join_kind::INNER_JOIN) && (right.num_rows() > left.num_rows());
+  detail::grid_1d const config(swap_tables ? right.num_rows() : left.num_rows(),
                                DEFAULT_JOIN_BLOCK_SIZE);
   auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block;
 
@@ -349,14 +438,13 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_semi_join(
   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return std::move(detail::conditional_join(left,
-                                            right,
-                                            binary_predicate,
-                                            detail::join_kind::LEFT_SEMI_JOIN,
-                                            output_size,
-                                            cudf::get_default_stream(),
-                                            mr)
-                     .first);
+  return std::move(detail::conditional_join_anti_semi(left,
+                                                      right,
+                                                      binary_predicate,
+                                                      detail::join_kind::LEFT_SEMI_JOIN,
+                                                      output_size,
+                                                      cudf::get_default_stream(),
+                                                      mr));
 }
 
 std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_anti_join(
@@ -367,14 +455,13 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_anti_join(
   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return std::move(detail::conditional_join(left,
-                                            right,
-                                            binary_predicate,
-                                            detail::join_kind::LEFT_ANTI_JOIN,
-                                            output_size,
-                                            cudf::get_default_stream(),
-                                            mr)
-                     .first);
+  return std::move(detail::conditional_join_anti_semi(left,
+                                                      right,
+                                                      binary_predicate,
+                                                      detail::join_kind::LEFT_ANTI_JOIN,
+                                                      output_size,
+                                                      cudf::get_default_stream(),
+                                                      mr));
 }
 
 std::size_t conditional_inner_join_size(table_view const& left,
diff --git a/cpp/src/join/conditional_join_kernels.cuh b/cpp/src/join/conditional_join_kernels.cuh
index cc57fa7b03b..1e16c451f5a 100644
--- a/cpp/src/join/conditional_join_kernels.cuh
+++ b/cpp/src/join/conditional_join_kernels.cuh
@@ -67,8 +67,8 @@ CUDF_KERNEL void compute_conditional_join_output_size(
     &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates];
 
   std::size_t thread_counter{0};
-  auto const start_idx = cudf::detail::grid_1d::global_thread_id();
-  auto const stride    = cudf::detail::grid_1d::grid_stride();
+  auto const start_idx = cudf::detail::grid_1d::global_thread_id<block_size>();
+  auto const stride    = cudf::detail::grid_1d::grid_stride<block_size>();
 
   cudf::thread_index_type const left_num_rows  = left_table.num_rows();
   cudf::thread_index_type const right_num_rows = right_table.num_rows();
@@ -174,7 +174,7 @@ CUDF_KERNEL void conditional_join(table_device_view left_table,
 
   __syncwarp();
 
-  auto outer_row_index = cudf::detail::grid_1d::global_thread_id();
+  auto outer_row_index = cudf::detail::grid_1d::global_thread_id<block_size>();
 
   unsigned int const activemask = __ballot_sync(0xffff'ffffu, outer_row_index < outer_num_rows);
 
@@ -271,6 +271,100 @@ CUDF_KERNEL void conditional_join(table_device_view left_table,
   }
 }
 
+template <cudf::size_type block_size, cudf::size_type output_cache_size, bool has_nulls>
+CUDF_KERNEL void conditional_join_anti_semi(  // left row indices for conditional semi/anti join
+  table_device_view left_table,
+  table_device_view right_table,
+  join_kind join_type,
+  cudf::size_type* join_output_l,
+  cudf::size_type* current_idx,
+  cudf::ast::detail::expression_device_view device_expression_data,
+  cudf::size_type const max_size)
+{
+  constexpr int num_warps = block_size / detail::warp_size;
+  __shared__ cudf::size_type current_idx_shared[num_warps];  // rows held in each warp's cache
+  __shared__ cudf::size_type join_shared_l[num_warps][output_cache_size];
+  // Dynamic shared memory: per-thread scratch space for expression evaluation.
+  extern __shared__ char raw_intermediate_storage[];
+  cudf::ast::detail::IntermediateDataType<has_nulls>* intermediate_storage =
+    reinterpret_cast<cudf::ast::detail::IntermediateDataType<has_nulls>*>(raw_intermediate_storage);
+  auto thread_intermediate_storage =
+    &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates];
+
+  int const warp_id                            = threadIdx.x / detail::warp_size;
+  int const lane_id                            = threadIdx.x % detail::warp_size;
+  cudf::thread_index_type const outer_num_rows = left_table.num_rows();
+  cudf::thread_index_type const inner_num_rows = right_table.num_rows();
+  auto const stride                            = cudf::detail::grid_1d::grid_stride<block_size>();
+  auto const start_idx = cudf::detail::grid_1d::global_thread_id<block_size>();
+
+  if (0 == lane_id) { current_idx_shared[warp_id] = 0; }  // lane 0 zeroes its warp's counter
+
+  __syncwarp();
+  // Lanes whose first row index is in range; used as the mask for the warp-level syncs below.
+  unsigned int const activemask = __ballot_sync(0xffff'ffffu, start_idx < outer_num_rows);
+
+  auto evaluator = cudf::ast::detail::expression_evaluator<has_nulls>(
+    left_table, right_table, device_expression_data);
+  // Test each left (outer) row owned by this thread against every right (inner) row.
+  for (cudf::thread_index_type outer_row_index = start_idx; outer_row_index < outer_num_rows;
+       outer_row_index += stride) {
+    bool found_match = false;
+    for (thread_index_type inner_row_index(0); inner_row_index < inner_num_rows;
+         ++inner_row_index) {
+      auto output_dest = cudf::ast::detail::value_expression_result<bool, has_nulls>();
+
+      evaluator.evaluate(
+        output_dest, outer_row_index, inner_row_index, 0, thread_intermediate_storage);
+
+      if (output_dest.is_valid() && output_dest.value()) {
+        if (join_type == join_kind::LEFT_SEMI_JOIN && !found_match) {  // first match only
+          add_left_to_cache(outer_row_index, current_idx_shared, warp_id, join_shared_l[warp_id]);
+        }
+        found_match = true;
+      }
+
+      __syncwarp(activemask);
+      // Flush this warp's cache once at most warp_size free slots remain.
+      auto const do_flush   = current_idx_shared[warp_id] + detail::warp_size >= output_cache_size;
+      auto const flush_mask = __ballot_sync(activemask, do_flush);
+      if (do_flush) {
+        flush_output_cache<num_warps, output_cache_size>(flush_mask,
+                                                         max_size,
+                                                         warp_id,
+                                                         lane_id,
+                                                         current_idx,
+                                                         current_idx_shared,
+                                                         join_shared_l,
+                                                         join_output_l);
+        __syncwarp(flush_mask);
+        if (0 == lane_id) { current_idx_shared[warp_id] = 0; }
+      }
+      __syncwarp(activemask);
+    }
+    // Anti join: emit the left row only if no right row satisfied the predicate.
+    if ((join_type == join_kind::LEFT_ANTI_JOIN) && (!found_match)) {
+      add_left_to_cache(outer_row_index, current_idx_shared, warp_id, join_shared_l[warp_id]);
+    }
+
+    __syncwarp(activemask);
+    // Flush any rows still sitting in this warp's cache.
+    auto const do_flush   = current_idx_shared[warp_id] > 0;
+    auto const flush_mask = __ballot_sync(activemask, do_flush);
+    if (do_flush) {
+      flush_output_cache<num_warps, output_cache_size>(flush_mask,
+                                                       max_size,
+                                                       warp_id,
+                                                       lane_id,
+                                                       current_idx,
+                                                       current_idx_shared,
+                                                       join_shared_l,
+                                                       join_output_l);
+    }
+    if (found_match) break;  // NOTE(review): also exits the stride loop for this thread; confirm
+  }
+}
+
 }  // namespace detail
 
 }  // namespace cudf
diff --git a/cpp/src/join/distinct_hash_join.cu b/cpp/src/join/distinct_hash_join.cu
index a3652942973..ad401bdccba 100644
--- a/cpp/src/join/distinct_hash_join.cu
+++ b/cpp/src/join/distinct_hash_join.cu
@@ -46,8 +46,6 @@ namespace cudf {
 namespace detail {
 namespace {
 
-static auto constexpr DISTINCT_JOIN_BLOCK_SIZE = 256;
-
 template <cudf::has_nested HasNested>
 auto prepare_device_equal(
   std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> build,
@@ -82,7 +80,7 @@ class build_keys_fn {
 
 /**
  * @brief Device output transform functor to construct `size_type` with `cuco::pair<hash_value_type,
- * lhs_index_type>`
+ * lhs_index_type>` or `cuco::pair<hash_value_type, rhs_index_type>`
  */
 struct output_fn {
   __device__ constexpr cudf::size_type operator()(
@@ -90,167 +88,12 @@ struct output_fn {
   {
     return static_cast<cudf::size_type>(x.second);
   }
-};
-
-template <typename Tile>
-__device__ void flush_buffer(Tile const& tile,
-                             cudf::size_type tile_count,
-                             cuco::pair<cudf::size_type, cudf::size_type>* buffer,
-                             cudf::size_type* counter,
-                             cudf::size_type* build_indices,
-                             cudf::size_type* probe_indices)
-{
-  cudf::size_type offset;
-  auto const lane_id = tile.thread_rank();
-  if (0 == lane_id) { offset = atomicAdd(counter, tile_count); }
-  offset = tile.shfl(offset, 0);
-
-  for (cudf::size_type i = lane_id; i < tile_count; i += tile.size()) {
-    auto const& [build_idx, probe_idx] = buffer[i];
-    *(build_indices + offset + i)      = build_idx;
-    *(probe_indices + offset + i)      = probe_idx;
-  }
-}
-
-__device__ void flush_buffer(cooperative_groups::thread_block const& block,
-                             cudf::size_type buffer_size,
-                             cuco::pair<cudf::size_type, cudf::size_type>* buffer,
-                             cudf::size_type* counter,
-                             cudf::size_type* build_indices,
-                             cudf::size_type* probe_indices)
-{
-  auto i = block.thread_rank();
-  __shared__ cudf::size_type offset;
-
-  if (i == 0) { offset = atomicAdd(counter, buffer_size); }
-  block.sync();
-
-  while (i < buffer_size) {
-    auto const& [build_idx, probe_idx] = buffer[i];
-    *(build_indices + offset + i)      = build_idx;
-    *(probe_indices + offset + i)      = probe_idx;
-
-    i += block.size();
-  }
-}
-
-// TODO: custom kernel to be replaced by cuco::static_set::retrieve
-template <typename Iter, typename HashTable>
-CUDF_KERNEL void distinct_join_probe_kernel(Iter iter,
-                                            cudf::size_type n,
-                                            HashTable hash_table,
-                                            cudf::size_type* counter,
-                                            cudf::size_type* build_indices,
-                                            cudf::size_type* probe_indices)
-{
-  namespace cg = cooperative_groups;
-
-  auto constexpr tile_size   = HashTable::cg_size;
-  auto constexpr window_size = HashTable::window_size;
-
-  auto idx          = cudf::detail::grid_1d::global_thread_id() / tile_size;
-  auto const stride = cudf::detail::grid_1d::grid_stride() / tile_size;
-  auto const block  = cg::this_thread_block();
-
-  // CG-based probing algorithm
-  if constexpr (tile_size != 1) {
-    auto const tile = cg::tiled_partition<tile_size>(block);
-
-    auto constexpr flushing_tile_size = cudf::detail::warp_size / window_size;
-    // random choice to tune
-    auto constexpr flushing_buffer_size = 2 * flushing_tile_size;
-    auto constexpr num_flushing_tiles   = DISTINCT_JOIN_BLOCK_SIZE / flushing_tile_size;
-    auto constexpr max_matches          = flushing_tile_size / tile_size;
-
-    auto const flushing_tile    = cg::tiled_partition<flushing_tile_size>(block);
-    auto const flushing_tile_id = block.thread_rank() / flushing_tile_size;
-
-    __shared__ cuco::pair<cudf::size_type, cudf::size_type>
-      flushing_tile_buffer[num_flushing_tiles][flushing_tile_size];
-    // per flushing-tile counter to track number of filled elements
-    __shared__ cudf::size_type flushing_counter[num_flushing_tiles];
-
-    if (flushing_tile.thread_rank() == 0) { flushing_counter[flushing_tile_id] = 0; }
-    flushing_tile.sync();  // sync still needed since cg.any doesn't imply a memory barrier
-
-    while (flushing_tile.any(idx < n)) {
-      bool active_flag = idx < n;
-      auto const active_flushing_tile =
-        cg::binary_partition<flushing_tile_size>(flushing_tile, active_flag);
-      if (active_flag) {
-        auto const found = hash_table.find(tile, *(iter + idx));
-        if (tile.thread_rank() == 0 and found != hash_table.end()) {
-          auto const offset = atomicAdd_block(&flushing_counter[flushing_tile_id], 1);
-          flushing_tile_buffer[flushing_tile_id][offset] = cuco::pair{
-            static_cast<cudf::size_type>(found->second), static_cast<cudf::size_type>(idx)};
-        }
-      }
-
-      flushing_tile.sync();
-      if (flushing_counter[flushing_tile_id] + max_matches > flushing_buffer_size) {
-        flush_buffer(flushing_tile,
-                     flushing_counter[flushing_tile_id],
-                     flushing_tile_buffer[flushing_tile_id],
-                     counter,
-                     build_indices,
-                     probe_indices);
-        flushing_tile.sync();
-        if (flushing_tile.thread_rank() == 0) { flushing_counter[flushing_tile_id] = 0; }
-        flushing_tile.sync();
-      }
-
-      idx += stride;
-    }  // while
-
-    if (flushing_counter[flushing_tile_id] > 0) {
-      flush_buffer(flushing_tile,
-                   flushing_counter[flushing_tile_id],
-                   flushing_tile_buffer[flushing_tile_id],
-                   counter,
-                   build_indices,
-                   probe_indices);
-    }
-  }
-  // Scalar probing for CG size 1
-  else {
-    using block_scan = cub::BlockScan<cudf::size_type, DISTINCT_JOIN_BLOCK_SIZE>;
-    __shared__ typename block_scan::TempStorage block_scan_temp_storage;
-
-    auto constexpr buffer_capacity = 2 * DISTINCT_JOIN_BLOCK_SIZE;
-    __shared__ cuco::pair<cudf::size_type, cudf::size_type> buffer[buffer_capacity];
-    cudf::size_type buffer_size = 0;
-
-    while (idx - block.thread_rank() < n) {  // the whole thread block falls into the same iteration
-      auto const found     = idx < n ? hash_table.find(*(iter + idx)) : hash_table.end();
-      auto const has_match = found != hash_table.end();
-
-      // Use a whole-block scan to calculate the output location
-      cudf::size_type offset;
-      cudf::size_type block_count;
-      block_scan(block_scan_temp_storage)
-        .ExclusiveSum(static_cast<cudf::size_type>(has_match), offset, block_count);
-
-      if (buffer_size + block_count > buffer_capacity) {
-        flush_buffer(block, buffer_size, buffer, counter, build_indices, probe_indices);
-        block.sync();
-        buffer_size = 0;
-      }
-
-      if (has_match) {
-        buffer[buffer_size + offset] = cuco::pair{static_cast<cudf::size_type>(found->second),
-                                                  static_cast<cudf::size_type>(idx)};
-      }
-      buffer_size += block_count;
-      block.sync();
-
-      idx += stride;
-    }  // while
-
-    if (buffer_size > 0) {
-      flush_buffer(block, buffer_size, buffer, counter, build_indices, probe_indices);
-    }
+  __device__ constexpr cudf::size_type operator()(
+    cuco::pair<hash_value_type, rhs_index_type> const& x) const
+  {
+    return static_cast<cudf::size_type>(x.second);
   }
-}
+};
 }  // namespace
 
 template <cudf::has_nested HasNested>
@@ -332,19 +175,16 @@ distinct_hash_join<HasNested>::inner_join(rmm::cuda_stream_view stream,
   auto const d_probe_hasher = probe_row_hasher.device_hasher(nullate::DYNAMIC{this->_has_nulls});
   auto const iter           = cudf::detail::make_counting_transform_iterator(
     0, build_keys_fn<decltype(d_probe_hasher), rhs_index_type>{d_probe_hasher});
-  auto counter = rmm::device_scalar<cudf::size_type>{stream};
-  counter.set_value_to_zero_async(stream);
-
-  cudf::detail::grid_1d grid{probe_table_num_rows, DISTINCT_JOIN_BLOCK_SIZE};
-  distinct_join_probe_kernel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
-    iter,
-    probe_table_num_rows,
-    this->_hash_table.ref(cuco::find),
-    counter.data(),
-    build_indices->data(),
-    probe_indices->data());
-
-  auto const actual_size = counter.value(stream);
+
+  auto const build_indices_begin =
+    thrust::make_transform_output_iterator(build_indices->begin(), output_fn{});
+  auto const probe_indices_begin =
+    thrust::make_transform_output_iterator(probe_indices->begin(), output_fn{});
+
+  auto const [probe_indices_end, _] = this->_hash_table.retrieve(
+    iter, iter + probe_table_num_rows, probe_indices_begin, build_indices_begin, stream.value());
+
+  auto const actual_size = std::distance(probe_indices_begin, probe_indices_end);
   build_indices->resize(actual_size, stream);
   probe_indices->resize(actual_size, stream);
 
diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu
index fbe16378e8c..b0184ff6a86 100644
--- a/cpp/src/join/hash_join.cu
+++ b/cpp/src/join/hash_join.cu
@@ -21,6 +21,8 @@
 #include <cudf/detail/structs/utilities.hpp>
 #include <cudf/hashing/detail/helper_functions.cuh>
 #include <cudf/join.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
@@ -569,12 +571,9 @@ hash_join<Hasher>::compute_hash_join(cudf::table_view const& probe,
                      std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr));
   }
 
-  CUDF_EXPECTS(std::equal(std::cbegin(_build),
-                          std::cend(_build),
-                          std::cbegin(probe),
-                          std::cend(probe),
-                          [](auto const& b, auto const& p) { return b.type() == p.type(); }),
-               "Mismatch in joining column data types");
+  CUDF_EXPECTS(cudf::have_same_types(_build, probe),
+               "Mismatch in joining column data types",
+               cudf::data_type_error);
 
   return probe_join_indices(probe, join, output_size, stream, mr);
 }
diff --git a/cpp/src/join/join_common_utils.cuh b/cpp/src/join/join_common_utils.cuh
index 9758919c5b4..31f267d5cfb 100644
--- a/cpp/src/join/join_common_utils.cuh
+++ b/cpp/src/join/join_common_utils.cuh
@@ -281,12 +281,21 @@ __inline__ __device__ void add_pair_to_cache(size_type const first,
                                              size_type* joined_shared_r)
 {
   size_type my_current_idx{atomicAdd(current_idx_shared + warp_id, size_type(1))};
-
   // its guaranteed to fit into the shared cache
   joined_shared_l[my_current_idx] = first;
   joined_shared_r[my_current_idx] = second;
 }
 
+__inline__ __device__ void add_left_to_cache(size_type const first,
+                                             size_type* current_idx_shared,
+                                             int const warp_id,
+                                             size_type* joined_shared_l)
+{
+  // Bump the counter at current_idx_shared[warp_id]; its previous value is the slot to write.
+  auto const write_slot = atomicAdd(&current_idx_shared[warp_id], size_type{1});
+  joined_shared_l[write_slot] = first;
+}
+
 template <int num_warps, cudf::size_type output_cache_size>
 __device__ void flush_output_cache(unsigned int const activemask,
                                    cudf::size_type const max_size,
@@ -300,7 +309,7 @@ __device__ void flush_output_cache(unsigned int const activemask,
                                    size_type* join_output_r)
 {
   // count how many active threads participating here which could be less than warp_size
-  int num_threads               = __popc(activemask);
+  int const num_threads         = __popc(activemask);
   cudf::size_type output_offset = 0;
 
   if (0 == lane_id) { output_offset = atomicAdd(current_idx, current_idx_shared[warp_id]); }
@@ -322,6 +331,32 @@ __device__ void flush_output_cache(unsigned int const activemask,
   }
 }
 
+template <int num_warps, cudf::size_type output_cache_size>
+__device__ void flush_output_cache(unsigned int const activemask,
+                                   cudf::size_type const max_size,
+                                   int const warp_id,
+                                   int const lane_id,
+                                   cudf::size_type* current_idx,
+                                   cudf::size_type current_idx_shared[num_warps],
+                                   size_type join_shared_l[num_warps][output_cache_size],
+                                   size_type* join_output_l)
+{
+  int const num_threads         = __popc(activemask);
+  cudf::size_type output_offset = 0;
+  // lane 0 adds this warp's cached count to *current_idx; the prior value is its output offset
+  if (0 == lane_id) { output_offset = atomicAdd(current_idx, current_idx_shared[warp_id]); }
+  // every lane in activemask reads the offset held by lane 0
+  output_offset = cub::ShuffleIndex<detail::warp_size>(output_offset, 0, activemask);
+  // lanes copy cached entries num_threads apart; entries at or beyond max_size are not written
+  for (int shared_out_idx = lane_id; shared_out_idx < current_idx_shared[warp_id];
+       shared_out_idx += num_threads) {
+    cudf::size_type thread_offset = output_offset + shared_out_idx;
+    if (thread_offset < max_size) {
+      join_output_l[thread_offset] = join_shared_l[warp_id][shared_out_idx];
+    }
+  }
+}
+
 }  // namespace detail
 
 }  // namespace cudf
diff --git a/cpp/src/labeling/label_bins.cu b/cpp/src/labeling/label_bins.cu
index 1bfa7f39190..7ee1d540831 100644
--- a/cpp/src/labeling/label_bins.cu
+++ b/cpp/src/labeling/label_bins.cu
@@ -26,6 +26,7 @@
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/span.hpp>
 #include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_checks.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -208,8 +209,10 @@ std::unique_ptr<column> label_bins(column_view const& input,
                                    rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE()
-  CUDF_EXPECTS((input.type() == left_edges.type()) && (input.type() == right_edges.type()),
-               "The input and edge columns must have the same types.");
+  CUDF_EXPECTS(
+    cudf::have_same_types(input, left_edges) && cudf::have_same_types(input, right_edges),
+    "The input and edge columns must have the same types.",
+    cudf::data_type_error);
   CUDF_EXPECTS(left_edges.size() == right_edges.size(),
                "The left and right edge columns must be of the same length.");
   CUDF_EXPECTS(!left_edges.has_nulls() && !right_edges.has_nulls(),
diff --git a/cpp/src/lists/combine/concatenate_rows.cu b/cpp/src/lists/combine/concatenate_rows.cu
index 38d299763a1..bc1b48b11cd 100644
--- a/cpp/src/lists/combine/concatenate_rows.cu
+++ b/cpp/src/lists/combine/concatenate_rows.cu
@@ -22,6 +22,7 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/lists/combine.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
 #include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -204,12 +205,11 @@ std::unique_ptr<column> concatenate_rows(table_view const& input,
     std::all_of(input.begin(),
                 input.end(),
                 [](column_view const& col) { return col.type().id() == cudf::type_id::LIST; }),
-    "All columns of the input table must be of lists column type.");
-  CUDF_EXPECTS(
-    std::all_of(std::next(input.begin()),
-                input.end(),
-                [a = *input.begin()](column_view const& b) { return column_types_equal(a, b); }),
-    "The types of entries in the input columns must be the same.");
+    "All columns of the input table must be of list column type.",
+    cudf::data_type_error);
+  CUDF_EXPECTS(cudf::all_have_same_types(input.begin(), input.end()),
+               "The types of entries in the input columns must be the same.",
+               cudf::data_type_error);
 
   auto const num_rows = input.num_rows();
   auto const num_cols = input.num_columns();
diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu
index 4737b077deb..f03d394d6d7 100644
--- a/cpp/src/lists/contains.cu
+++ b/cpp/src/lists/contains.cu
@@ -27,6 +27,7 @@
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/table/row_operators.cuh>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/type_checks.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/exec_policy.hpp>
@@ -194,7 +195,7 @@ std::unique_ptr<column> dispatch_index_of(lists_column_view const& lists,
   // comparisons.
   auto const child = lists.child();
 
-  CUDF_EXPECTS(child.type() == search_keys.type(),
+  CUDF_EXPECTS(cudf::have_same_types(child, search_keys),
                "Type/Scale of search key does not match list column element type.",
                cudf::data_type_error);
   CUDF_EXPECTS(search_keys.type().id() != type_id::EMPTY, "Type cannot be empty.");
diff --git a/cpp/src/lists/interleave_columns.cu b/cpp/src/lists/interleave_columns.cu
index 88eccf13f72..be8fad62412 100644
--- a/cpp/src/lists/interleave_columns.cu
+++ b/cpp/src/lists/interleave_columns.cu
@@ -22,7 +22,7 @@
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/valid_if.cuh>
 #include <cudf/lists/lists_column_view.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/utilities/type_dispatcher.hpp>
 
@@ -128,12 +128,20 @@ std::unique_ptr<column> concatenate_and_gather_lists(host_span<column_view const
   return std::move(result->release()[0]);
 }
 
+// Primary template: selected for any T without a specialization below; every call fails.
+template <typename T, typename Enable = void>
+struct interleave_list_entries_impl {
+  template <typename... Args>
+  std::unique_ptr<column> operator()(Args&&...)
+  {
+    CUDF_FAIL("Called `interleave_list_entries_fn()` on non-supported types.");
+  }
+};
+
 /**
- * @brief Compute string sizes, string validities, and interleave string lists functor.
+ * @brief Interleave array of string_index_pair objects for a list of strings
  *
- * This functor is executed twice. In the first pass, the sizes and validities of the output strings
- * will be computed. In the second pass, this will interleave the lists of strings of the given
- * table containing those lists.
+ * Each thread processes the strings of one list row from one input column.
  */
 struct compute_string_sizes_and_interleave_lists_fn {
   table_device_view const table_dv;
@@ -141,19 +149,10 @@ struct compute_string_sizes_and_interleave_lists_fn {
   // Store list offsets of the output lists column.
   size_type const* const dst_list_offsets;
 
-  // Flag to specify whether to compute string validities.
-  bool const has_null_mask;
-
-  // Store offsets of the strings.
-  size_type* d_offsets{nullptr};
-
-  // If d_chars == nullptr: only compute sizes and validities of the output strings.
-  // If d_chars != nullptr: only interleave lists of strings.
-  char* d_chars{nullptr};
-
-  // We need to set `1` or `0` for the validities of the strings in the child column.
-  int8_t* d_validities{nullptr};
+  using string_index_pair = cudf::strings::detail::string_index_pair;
+  string_index_pair* indices;  // output
 
+  // thread per list row per column
   __device__ void operator()(size_type const idx)
   {
     auto const num_cols = table_dv.num_columns();
@@ -161,7 +160,7 @@ struct compute_string_sizes_and_interleave_lists_fn {
     auto const list_id  = idx / num_cols;
 
     auto const& lists_col = table_dv.column(col_id);
-    if (has_null_mask and lists_col.is_null(list_id)) { return; }
+    if (lists_col.is_null(list_id)) { return; }
 
     auto const list_offsets =
       lists_col.child(lists_column_view::offsets_column_index).template data<size_type>() +
@@ -181,65 +180,40 @@ struct compute_string_sizes_and_interleave_lists_fn {
     // read_idx and write_idx are indices of string elements.
     size_type write_idx = dst_list_offsets[idx];
 
-    if (not d_chars) {  // just compute sizes and validities of strings within a list
-      for (auto read_idx = start_str_idx; read_idx < end_str_idx; ++read_idx, ++write_idx) {
-        if (has_null_mask) {
-          d_validities[write_idx] = static_cast<int8_t>(str_col.is_valid(read_idx));
-        }
-        d_offsets[write_idx] = str_offsets[read_idx + 1] - str_offsets[read_idx];
-      }
-    } else {  // just copy the entire memory region containing all strings in the list
-      // start_byte and end_byte are indices of character of the string elements.
-      auto const start_byte = str_offsets[start_str_idx];
-      auto const end_byte   = str_offsets[end_str_idx];
-      if (start_byte < end_byte) {
-        auto const input_ptr  = str_col.template head<char>() + start_byte;
-        auto const output_ptr = d_chars + d_offsets[write_idx];
-        thrust::copy(thrust::seq, input_ptr, input_ptr + end_byte - start_byte, output_ptr);
+    for (auto read_idx = start_str_idx; read_idx < end_str_idx; ++read_idx, ++write_idx) {
+      auto const offset        = str_offsets[read_idx];
+      auto const size          = str_offsets[read_idx + 1] - offset;
+      string_index_pair result = {nullptr, size};
+      if (str_col.is_valid(read_idx)) {
+        result.first = size > 0 ? str_col.template head<char>() + offset : "";
       }
+      indices[write_idx] = result;
     }
   }
 };
 
-// Error case when no other overload or specialization is available
-template <typename T, typename Enable = void>
-struct interleave_list_entries_impl {
-  template <typename... Args>
-  std::unique_ptr<column> operator()(Args&&...)
-  {
-    CUDF_FAIL("Called `interleave_list_entries_fn()` on non-supported types.");
-  }
-};
-
 template <typename T>
 struct interleave_list_entries_impl<T, std::enable_if_t<std::is_same_v<T, cudf::string_view>>> {
   std::unique_ptr<column> operator()(table_view const& input,
                                      column_view const& output_list_offsets,
                                      size_type num_output_lists,
                                      size_type num_output_entries,
-                                     bool data_has_null_mask,
+                                     bool,
                                      rmm::cuda_stream_view stream,
                                      rmm::device_async_resource_ref mr) const noexcept
   {
-    auto const table_dv_ptr = table_device_view::create(input, stream);
-    auto comp_fn            = compute_string_sizes_and_interleave_lists_fn{
-      *table_dv_ptr, output_list_offsets.template begin<size_type>(), data_has_null_mask};
-
-    auto validities =
-      rmm::device_uvector<int8_t>(data_has_null_mask ? num_output_entries : 0, stream);
-    comp_fn.d_validities = validities.data();
-
-    auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
-      comp_fn, num_output_lists, num_output_entries, stream, mr);
-
-    auto [null_mask, null_count] =
-      cudf::detail::valid_if(validities.begin(), validities.end(), thrust::identity{}, stream, mr);
-
-    return make_strings_column(num_output_entries,
-                               std::move(offsets_column),
-                               chars.release(),
-                               null_count,
-                               std::move(null_mask));
+    auto const table_dv_ptr   = table_device_view::create(input, stream);
+    auto const d_list_offsets = output_list_offsets.template begin<size_type>();
+
+    rmm::device_uvector<cudf::strings::detail::string_index_pair> indices(num_output_entries,
+                                                                          stream);
+    auto comp_fn =
+      compute_string_sizes_and_interleave_lists_fn{*table_dv_ptr, d_list_offsets, indices.data()};
+    thrust::for_each_n(rmm::exec_policy_nosync(stream),
+                       thrust::counting_iterator<size_type>(0),
+                       num_output_lists,
+                       comp_fn);
+    return cudf::strings::detail::make_strings_column(indices.begin(), indices.end(), stream, mr);
   }
 };
 
diff --git a/cpp/src/lists/sequences.cu b/cpp/src/lists/sequences.cu
index cb14ae7619b..7d57d8ddb60 100644
--- a/cpp/src/lists/sequences.cu
+++ b/cpp/src/lists/sequences.cu
@@ -23,6 +23,8 @@
 #include <cudf/lists/filling.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
@@ -139,15 +141,18 @@ std::unique_ptr<column> sequences(column_view const& starts,
                "starts and sizes input columns must not have nulls.");
   CUDF_EXPECTS(starts.size() == sizes.size(),
                "starts and sizes input columns must have the same number of rows.");
-  CUDF_EXPECTS(cudf::is_index_type(sizes.type()), "Input sizes column must be of integer types.");
+  CUDF_EXPECTS(cudf::is_index_type(sizes.type()),
+               "Input sizes column must be of integer types.",
+               cudf::data_type_error);
 
   if (steps) {
     auto const& steps_cv = steps.value();
     CUDF_EXPECTS(!steps_cv.has_nulls(), "steps input column must not have nulls.");
     CUDF_EXPECTS(starts.size() == steps_cv.size(),
                  "starts and steps input columns must have the same number of rows.");
-    CUDF_EXPECTS(starts.type() == steps_cv.type(),
-                 "starts and steps input columns must have the same type.");
+    CUDF_EXPECTS(cudf::have_same_types(starts, steps_cv),
+                 "starts and steps input columns must have the same type.",
+                 cudf::data_type_error);
   }
 
   auto const n_lists = starts.size();
diff --git a/cpp/src/lists/set_operations.cu b/cpp/src/lists/set_operations.cu
index f3352a3a52d..1d18b8c677c 100644
--- a/cpp/src/lists/set_operations.cu
+++ b/cpp/src/lists/set_operations.cu
@@ -52,7 +52,7 @@ namespace {
 void check_compatibility(lists_column_view const& lhs, lists_column_view const& rhs)
 {
   CUDF_EXPECTS(lhs.size() == rhs.size(), "The input lists column must have the same size.");
-  CUDF_EXPECTS(column_types_equal(lhs.child(), rhs.child()),
+  CUDF_EXPECTS(have_same_types(lhs.child(), rhs.child()),
                "The input lists columns must have children having the same type structure");
 }
 
diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu
index 5a3be259ed9..630cf328579 100644
--- a/cpp/src/merge/merge.cu
+++ b/cpp/src/merge/merge.cu
@@ -34,6 +34,7 @@
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_checks.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp
index d764ea7559f..cde0274339a 100644
--- a/cpp/src/reductions/reductions.cpp
+++ b/cpp/src/reductions/reductions.cpp
@@ -28,6 +28,8 @@
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/structs/structs_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
@@ -154,8 +156,9 @@ std::unique_ptr<scalar> reduce(column_view const& col,
                                rmm::cuda_stream_view stream,
                                rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(!init.has_value() || col.type() == init.value().get().type(),
-               "column and initial value must be the same type");
+  CUDF_EXPECTS(!init.has_value() || cudf::have_same_types(col, init.value().get()),
+               "column and initial value must be the same type",
+               cudf::data_type_error);
   if (init.has_value() && !(agg.kind == aggregation::SUM || agg.kind == aggregation::PRODUCT ||
                             agg.kind == aggregation::MIN || agg.kind == aggregation::MAX ||
                             agg.kind == aggregation::ANY || agg.kind == aggregation::ALL)) {
diff --git a/cpp/src/reductions/segmented/reductions.cpp b/cpp/src/reductions/segmented/reductions.cpp
index dee16b3e503..1ae344dcace 100644
--- a/cpp/src/reductions/segmented/reductions.cpp
+++ b/cpp/src/reductions/segmented/reductions.cpp
@@ -22,6 +22,7 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
@@ -112,8 +113,9 @@ std::unique_ptr<column> segmented_reduce(column_view const& segmented_values,
                                          rmm::cuda_stream_view stream,
                                          rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(!init.has_value() || segmented_values.type() == init.value().get().type(),
-               "column and initial value must be the same type");
+  CUDF_EXPECTS(!init.has_value() || cudf::have_same_types(segmented_values, init.value().get()),
+               "column and initial value must be the same type",
+               cudf::data_type_error);
   if (init.has_value() && !(agg.kind == aggregation::SUM || agg.kind == aggregation::PRODUCT ||
                             agg.kind == aggregation::MIN || agg.kind == aggregation::MAX ||
                             agg.kind == aggregation::ANY || agg.kind == aggregation::ALL)) {
diff --git a/cpp/src/replace/clamp.cu b/cpp/src/replace/clamp.cu
index fe5a9cfbd71..cb3caf9d068 100644
--- a/cpp/src/replace/clamp.cu
+++ b/cpp/src/replace/clamp.cu
@@ -28,11 +28,13 @@
 #include <cudf/replace.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/scalar/scalar_device_view.cuh>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
@@ -52,26 +54,22 @@ namespace {
 
 template <typename OptionalScalarIterator, typename ReplaceScalarIterator>
 struct clamp_strings_fn {
+  using string_index_pair = cudf::strings::detail::string_index_pair;
   column_device_view const d_strings;
   OptionalScalarIterator lo_itr;
   ReplaceScalarIterator lo_replace_itr;
   OptionalScalarIterator hi_itr;
   ReplaceScalarIterator hi_replace_itr;
-  size_type* d_offsets{};
-  char* d_chars{};
 
-  __device__ void operator()(size_type idx) const
+  __device__ string_index_pair operator()(size_type idx) const
   {
-    if (d_strings.is_null(idx)) {
-      if (!d_chars) { d_offsets[idx] = 0; }
-      return;
-    }
+    if (d_strings.is_null(idx)) { return string_index_pair{nullptr, 0}; }
+
     auto const element      = d_strings.element<string_view>(idx);
     auto const d_lo         = (*lo_itr).value_or(element);
     auto const d_hi         = (*hi_itr).value_or(element);
     auto const d_lo_replace = *(*lo_replace_itr);
     auto const d_hi_replace = *(*hi_replace_itr);
-    auto d_output           = d_chars ? d_chars + d_offsets[idx] : nullptr;
 
     auto d_str = [d_lo, d_lo_replace, d_hi, d_hi_replace, element] {
       if (element < d_lo) { return d_lo_replace; }
@@ -79,11 +77,9 @@ struct clamp_strings_fn {
       return element;
     }();
 
-    if (d_output) {
-      cudf::strings::detail::copy_string(d_output, d_str);
-    } else {
-      d_offsets[idx] = d_str.size_bytes();
-    }
+    // ensures an empty string is not converted to a null row
+    return !d_str.empty() ? string_index_pair{d_str.data(), d_str.size_bytes()}
+                          : string_index_pair{"", 0};
   }
 };
 
@@ -101,14 +97,14 @@ std::unique_ptr<cudf::column> clamp_string_column(strings_column_view const& inp
 
   auto fn = clamp_strings_fn<OptionalScalarIterator, ReplaceScalarIterator>{
     d_input, lo_itr, lo_replace_itr, hi_itr, hi_replace_itr};
-  auto [offsets_column, chars] =
-    cudf::strings::detail::make_strings_children(fn, input.size(), stream, mr);
-
-  return make_strings_column(input.size(),
-                             std::move(offsets_column),
-                             chars.release(),
-                             input.null_count(),
-                             std::move(cudf::detail::copy_bitmask(input.parent(), stream, mr)));
+  rmm::device_uvector<cudf::strings::detail::string_index_pair> indices(input.size(), stream);
+  thrust::transform(rmm::exec_policy_nosync(stream),
+                    thrust::counting_iterator<size_type>(0),
+                    thrust::counting_iterator<size_type>(input.size()),
+                    indices.begin(),
+                    fn);
+
+  return cudf::strings::detail::make_strings_column(indices.begin(), indices.end(), stream, mr);
 }
 
 template <typename T, typename OptionalScalarIterator, typename ReplaceScalarIterator>
@@ -198,7 +194,9 @@ struct dispatch_clamp {
                                      rmm::cuda_stream_view stream,
                                      rmm::device_async_resource_ref mr)
   {
-    CUDF_EXPECTS(lo.type() == input.type(), "mismatching types of scalar and input");
+    CUDF_EXPECTS(cudf::have_same_types(input, lo),
+                 "mismatching types of scalar and input",
+                 cudf::data_type_error);
 
     auto lo_itr         = make_optional_iterator<T>(lo, nullate::YES{});
     auto hi_itr         = make_optional_iterator<T>(hi, nullate::YES{});
@@ -322,9 +320,14 @@ std::unique_ptr<column> clamp(column_view const& input,
                               rmm::cuda_stream_view stream,
                               rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(lo.type() == hi.type(), "mismatching types of limit scalars");
-  CUDF_EXPECTS(lo_replace.type() == hi_replace.type(), "mismatching types of replace scalars");
-  CUDF_EXPECTS(lo.type() == lo_replace.type(), "mismatching types of limit and replace scalars");
+  CUDF_EXPECTS(
+    cudf::have_same_types(lo, hi), "mismatching types of limit scalars", cudf::data_type_error);
+  CUDF_EXPECTS(cudf::have_same_types(lo_replace, hi_replace),
+               "mismatching types of replace scalars",
+               cudf::data_type_error);
+  CUDF_EXPECTS(cudf::have_same_types(lo, lo_replace),
+               "mismatching types of limit and replace scalars",
+               cudf::data_type_error);
 
   if ((not lo.is_valid(stream) and not hi.is_valid(stream)) or (input.is_empty())) {
     // There will be no change
diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu
index fe3d20e372e..13e130588c1 100644
--- a/cpp/src/replace/nulls.cu
+++ b/cpp/src/replace/nulls.cu
@@ -38,6 +38,7 @@
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_checks.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -216,7 +217,8 @@ struct replace_nulls_scalar_kernel_forwarder {
                                            rmm::cuda_stream_view stream,
                                            rmm::device_async_resource_ref mr)
   {
-    CUDF_EXPECTS(input.type() == replacement.type(), "Data type mismatch");
+    CUDF_EXPECTS(
+      cudf::have_same_types(input, replacement), "Data type mismatch", cudf::data_type_error);
     std::unique_ptr<cudf::column> output = cudf::detail::allocate_like(
       input, input.size(), cudf::mask_allocation_policy::NEVER, stream, mr);
     auto output_view = output->mutable_view();
@@ -252,9 +254,10 @@ std::unique_ptr<cudf::column> replace_nulls_scalar_kernel_forwarder::operator()<
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(input.type() == replacement.type(), "Data type mismatch");
+  CUDF_EXPECTS(
+    cudf::have_same_types(input, replacement), "Data type mismatch", cudf::data_type_error);
   cudf::strings_column_view input_s(input);
-  cudf::string_scalar const& repl = static_cast<cudf::string_scalar const&>(replacement);
+  auto const& repl = static_cast<cudf::string_scalar const&>(replacement);
   return cudf::strings::detail::replace_nulls(input_s, repl, stream, mr);
 }
 
@@ -318,7 +321,8 @@ std::unique_ptr<cudf::column> replace_nulls(cudf::column_view const& input,
                                             rmm::cuda_stream_view stream,
                                             rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(input.type() == replacement.type(), "Data type mismatch");
+  CUDF_EXPECTS(
+    cudf::have_same_types(input, replacement), "Data type mismatch", cudf::data_type_error);
   CUDF_EXPECTS(replacement.size() == input.size(), "Column size mismatch");
 
   if (input.is_empty()) { return cudf::empty_like(input); }
diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu
index 7bc0bd7e0be..c2cd03cd761 100644
--- a/cpp/src/replace/replace.cu
+++ b/cpp/src/replace/replace.cu
@@ -48,6 +48,7 @@
 #include <cudf/strings/detail/replace.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_checks.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -303,9 +304,10 @@ std::unique_ptr<cudf::column> find_and_replace_all(cudf::column_view const& inpu
   CUDF_EXPECTS(values_to_replace.size() == replacement_values.size(),
                "values_to_replace and replacement_values size mismatch.");
 
-  CUDF_EXPECTS(
-    input_col.type() == values_to_replace.type() && input_col.type() == replacement_values.type(),
-    "Columns type mismatch");
+  CUDF_EXPECTS(cudf::have_same_types(input_col, values_to_replace) &&
+                 cudf::have_same_types(input_col, replacement_values),
+               "Columns type mismatch",
+               cudf::data_type_error);
   CUDF_EXPECTS(not values_to_replace.has_nulls(), "values_to_replace must not have nulls");
 
   if (input_col.is_empty() or values_to_replace.is_empty() or replacement_values.is_empty()) {
diff --git a/cpp/src/reshape/interleave_columns.cu b/cpp/src/reshape/interleave_columns.cu
index 3d1421120fd..580db0e24c5 100644
--- a/cpp/src/reshape/interleave_columns.cu
+++ b/cpp/src/reshape/interleave_columns.cu
@@ -22,6 +22,7 @@
 #include <cudf/detail/valid_if.cuh>
 #include <cudf/lists/detail/interleave_columns.hpp>
 #include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/structs/structs_column_view.hpp>
 #include <cudf/table/table_device_view.cuh>
@@ -140,85 +141,53 @@ struct interleave_columns_impl<T, std::enable_if_t<std::is_same_v<T, cudf::struc
   }
 };
 
+struct interleave_strings_fn {  // maps an output index to the (pointer, size) of its source string
+  using string_index_pair = cudf::strings::detail::string_index_pair;
+  table_device_view d_table;
+
+  __device__ string_index_pair operator()(size_type idx)
+  {
+    auto const num_columns    = d_table.num_columns();
+    auto const source_col_idx = idx % num_columns;  // consecutive outputs cycle through columns
+    auto const source_row_idx = idx / num_columns;  // row advances once per num_columns outputs
+    auto const col            = d_table.column(source_col_idx);
+    if (col.is_null(source_row_idx)) { return string_index_pair{nullptr, 0}; }
+    auto const d_str = col.element<string_view>(source_row_idx);
+    // null rows are returned as {nullptr, 0} above, so an empty string needs a non-null pointer
+    return !d_str.empty() ? string_index_pair{d_str.data(), d_str.size_bytes()}
+                          : string_index_pair{"", 0};
+  }
+};
+
 template <typename T>
 struct interleave_columns_impl<T, std::enable_if_t<std::is_same_v<T, cudf::string_view>>> {
   std::unique_ptr<cudf::column> operator()(table_view const& strings_columns,
-                                           bool create_mask,
+                                           bool,
                                            rmm::cuda_stream_view stream,
                                            rmm::device_async_resource_ref mr)
   {
     auto num_columns = strings_columns.num_columns();
-    if (num_columns == 1)  // Single strings column returns a copy
+    if (num_columns == 1) {  // Single strings column returns a copy
       return std::make_unique<column>(*(strings_columns.begin()), stream, mr);
+    }
 
     auto strings_count = strings_columns.num_rows();
-    if (strings_count == 0)  // All columns have 0 rows
+    if (strings_count == 0) {  // All columns have 0 rows
       return make_empty_column(type_id::STRING);
+    }
 
     // Create device views from the strings columns.
-    auto table       = table_device_view::create(strings_columns, stream);
-    auto d_table     = *table;
+    auto d_table     = table_device_view::create(strings_columns, stream);
     auto num_strings = num_columns * strings_count;
 
-    std::pair<rmm::device_buffer, size_type> valid_mask{};
-    if (create_mask) {
-      // Create resulting null mask
-      valid_mask = cudf::detail::valid_if(
-        thrust::make_counting_iterator<size_type>(0),
-        thrust::make_counting_iterator<size_type>(num_strings),
-        [num_columns, d_table] __device__(size_type idx) {
-          auto source_row_idx = idx % num_columns;
-          auto source_col_idx = idx / num_columns;
-          return !d_table.column(source_row_idx).is_null(source_col_idx);
-        },
-        stream,
-        mr);
-    }
-
-    auto const null_count = valid_mask.second;
-
-    // Build offsets column by computing sizes of each string in the output
-    auto offsets_transformer =
-      cuda::proclaim_return_type<size_type>([num_columns, d_table] __device__(size_type idx) {
-        // First compute the column and the row this item belongs to
-        auto source_row_idx = idx % num_columns;
-        auto source_col_idx = idx / num_columns;
-        return d_table.column(source_row_idx).is_valid(source_col_idx)
-                 ? d_table.column(source_row_idx).element<string_view>(source_col_idx).size_bytes()
-                 : 0;
-      });
-    auto offsets_transformer_itr = thrust::make_transform_iterator(
-      thrust::make_counting_iterator<size_type>(0), offsets_transformer);
-    auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column(
-      offsets_transformer_itr, offsets_transformer_itr + num_strings, stream, mr);
-    auto d_results_offsets =
-      cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view());
-
-    // Create the chars column
-    rmm::device_uvector<char> chars(bytes, stream, mr);
-    auto d_results_chars = chars.data();
-    thrust::for_each_n(
-      rmm::exec_policy(stream),
-      thrust::make_counting_iterator<size_type>(0),
-      num_strings,
-      [num_columns, d_table, d_results_offsets, d_results_chars] __device__(size_type idx) {
-        auto source_row_idx = idx % num_columns;
-        auto source_col_idx = idx / num_columns;
-
-        // Do not write to buffer if the column value for this row is null
-        if (d_table.column(source_row_idx).is_null(source_col_idx)) return;
-
-        size_type offset = d_results_offsets[idx];
-        char* d_buffer   = d_results_chars + offset;
-        strings::detail::copy_string(
-          d_buffer, d_table.column(source_row_idx).element<string_view>(source_col_idx));
-      });
+    rmm::device_uvector<cudf::strings::detail::string_index_pair> indices(num_strings, stream);
+    thrust::transform(rmm::exec_policy_nosync(stream),
+                      thrust::make_counting_iterator<size_type>(0),
+                      thrust::make_counting_iterator<size_type>(num_strings),
+                      indices.begin(),
+                      interleave_strings_fn{*d_table});
 
-    return make_strings_column(num_strings,
-                               std::move(offsets_column),
-                               chars.release(),
-                               null_count,
-                               std::move(valid_mask.first));
+    return cudf::strings::detail::make_strings_column(indices.begin(), indices.end(), stream, mr);
   }
 };
 
diff --git a/cpp/src/rolling/detail/lead_lag_nested.cuh b/cpp/src/rolling/detail/lead_lag_nested.cuh
index 269868910c7..cfedcac8ae4 100644
--- a/cpp/src/rolling/detail/lead_lag_nested.cuh
+++ b/cpp/src/rolling/detail/lead_lag_nested.cuh
@@ -23,7 +23,9 @@
 #include <cudf/copying.hpp>
 #include <cudf/detail/gather.hpp>
 #include <cudf/detail/scatter.hpp>
+#include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/exec_policy.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
@@ -99,8 +101,9 @@ std::unique_ptr<column> compute_lead_lag_for_nested(aggregation::Kind op,
 {
   CUDF_EXPECTS(op == aggregation::LEAD || op == aggregation::LAG,
                "Unexpected aggregation type in compute_lead_lag_for_nested");
-  CUDF_EXPECTS(default_outputs.type().id() == input.type().id(),
-               "Defaults column type must match input column.");  // Because LEAD/LAG.
+  CUDF_EXPECTS(cudf::have_same_types(input, default_outputs),
+               "Defaults column type must match input column.",
+               cudf::data_type_error);  // Because LEAD/LAG.
 
   CUDF_EXPECTS(default_outputs.is_empty() || (input.size() == default_outputs.size()),
                "Number of defaults must match input column.");
diff --git a/cpp/src/search/contains_scalar.cu b/cpp/src/search/contains_scalar.cu
index 0b344ec347b..e88acf68e28 100644
--- a/cpp/src/search/contains_scalar.cu
+++ b/cpp/src/search/contains_scalar.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,6 +24,8 @@
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
@@ -62,7 +64,9 @@ struct contains_scalar_dispatch {
                                                            scalar const& needle,
                                                            rmm::cuda_stream_view stream) const
   {
-    CUDF_EXPECTS(haystack.type() == needle.type(), "Scalar and column types must match");
+    CUDF_EXPECTS(cudf::have_same_types(haystack, needle),
+                 "Scalar and column types must match",
+                 cudf::data_type_error);
     // Don't need to check for needle validity. If it is invalid, it should be handled by the caller
     // before dispatching to this function.
 
@@ -87,7 +91,9 @@ struct contains_scalar_dispatch {
                                                           scalar const& needle,
                                                           rmm::cuda_stream_view stream) const
   {
-    CUDF_EXPECTS(haystack.type() == needle.type(), "Scalar and column types must match");
+    CUDF_EXPECTS(cudf::have_same_types(haystack, needle),
+                 "Scalar and column types must match",
+                 cudf::data_type_error);
     // Don't need to check for needle validity. If it is invalid, it should be handled by the caller
     // before dispatching to this function.
     // In addition, haystack and needle structure compatibility will be checked later on by
diff --git a/cpp/src/search/contains_table.cu b/cpp/src/search/contains_table.cu
index 13417fdab63..466f9093194 100644
--- a/cpp/src/search/contains_table.cu
+++ b/cpp/src/search/contains_table.cu
@@ -22,6 +22,7 @@
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
diff --git a/cpp/src/strings/capitalize.cu b/cpp/src/strings/capitalize.cu
index 2bb85bf2c5c..031fff4086a 100644
--- a/cpp/src/strings/capitalize.cu
+++ b/cpp/src/strings/capitalize.cu
@@ -20,7 +20,7 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/capitalize.hpp>
 #include <cudf/strings/detail/char_tables.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/utf8.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -64,8 +64,9 @@ struct base_fn {
   character_cases_table_type const* d_case_table;
   special_case_mapping const* d_special_case_mapping;
   column_device_view const d_column;
-  size_type* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   base_fn(column_device_view const& d_column)
     : d_flags(get_character_flags_table()),
@@ -108,7 +109,7 @@ struct base_fn {
   __device__ void operator()(size_type idx)
   {
     if (d_column.is_null(idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
 
@@ -137,7 +138,7 @@ struct base_fn {
       // capitalize the next char if this one is a delimiter
       capitalize = derived.capitalize_next(chr, flag);
     }
-    if (!d_chars) d_offsets[idx] = bytes;
+    if (!d_chars) { d_sizes[idx] = bytes; }
   }
 };
 
@@ -231,7 +232,7 @@ std::unique_ptr<column> capitalizer(CapitalFn cfn,
                                     rmm::device_async_resource_ref mr)
 {
   auto [offsets_column, chars] =
-    cudf::strings::detail::make_strings_children(cfn, input.size(), stream, mr);
+    cudf::strings::detail::experimental::make_strings_children(cfn, input.size(), stream, mr);
 
   return make_strings_column(input.size(),
                              std::move(offsets_column),
diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu
index 82b590f81b3..5d5e6ba9a3e 100644
--- a/cpp/src/strings/case.cu
+++ b/cpp/src/strings/case.cu
@@ -23,7 +23,7 @@
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/strings/case.hpp>
 #include <cudf/strings/detail/char_tables.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/utf8.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -117,8 +117,9 @@ struct convert_char_fn {
  */
 struct base_upper_lower_fn {
   convert_char_fn converter;
-  size_type* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   base_upper_lower_fn(convert_char_fn converter) : converter(converter) {}
 
@@ -137,7 +138,7 @@ struct base_upper_lower_fn {
         bytes += size;
       }
     }
-    if (!d_buffer) { d_offsets[idx] = bytes; }
+    if (!d_buffer) { d_sizes[idx] = bytes; }
   }
 };
 
@@ -152,7 +153,7 @@ struct upper_lower_fn : public base_upper_lower_fn {
   __device__ void operator()(size_type idx) const
   {
     if (d_strings.is_null(idx)) {
-      if (!d_chars) { d_offsets[idx] = 0; }
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
     auto const d_str = d_strings.element<string_view>(idx);
@@ -295,8 +296,8 @@ std::unique_ptr<column> convert_case(strings_column_view const& input,
 
   // For smaller strings, use the regular string-parallel algorithm
   if ((chars_size / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) {
-    auto [offsets, chars] =
-      cudf::strings::detail::make_strings_children(converter, input.size(), stream, mr);
+    auto [offsets, chars] = cudf::strings::detail::experimental::make_strings_children(
+      converter, input.size(), stream, mr);
     return make_strings_column(input.size(),
                                std::move(offsets),
                                chars.release(),
@@ -364,8 +365,8 @@ std::unique_ptr<column> convert_case(strings_column_view const& input,
   // run case conversion over the new sub-strings
   auto const tmp_size = static_cast<size_type>(tmp_offsets.size()) - 1;
   upper_lower_ls_fn sub_conv{ccfn, input_chars, tmp_offsets.data()};
-  auto chars =
-    std::get<1>(cudf::strings::detail::make_strings_children(sub_conv, tmp_size, stream, mr));
+  auto chars = std::get<1>(
+    cudf::strings::detail::experimental::make_strings_children(sub_conv, tmp_size, stream, mr));
 
   return make_strings_column(input.size(),
                              std::move(offsets),
diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu
index 28068cf7e78..7716cf0cc29 100644
--- a/cpp/src/strings/char_types/char_types.cu
+++ b/cpp/src/strings/char_types/char_types.cu
@@ -21,7 +21,7 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/char_types/char_types.hpp>
 #include <cudf/strings/detail/char_tables.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/utf8.hpp>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
@@ -130,8 +130,9 @@ struct filter_chars_fn {
   string_character_types const types_to_remove;
   string_character_types const types_to_keep;
   string_view const d_replacement;  ///< optional replacement for removed characters
-  int32_t* d_offsets{};             ///< size of the output string stored here during first pass
-  char* d_chars{};                  ///< this is null only during the first pass
+  size_type* d_sizes{};
+  char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   /**
    * @brief Returns true if the given character should be replaced.
@@ -150,7 +151,7 @@ struct filter_chars_fn {
   __device__ void operator()(size_type idx)
   {
     if (d_column.is_null(idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
     auto const d_str  = d_column.element<string_view>(idx);
@@ -165,7 +166,7 @@ struct filter_chars_fn {
       nbytes += d_newchar.size_bytes() - char_size;
       if (out_ptr) out_ptr = cudf::strings::detail::copy_string(out_ptr, d_newchar);
     }
-    if (!out_ptr) d_offsets[idx] = nbytes;
+    if (!out_ptr) { d_sizes[idx] = nbytes; }
   }
 };
 
@@ -202,7 +203,7 @@ std::unique_ptr<column> filter_characters_of_type(strings_column_view const& str
 
   // this utility calls filterer to build the offsets and chars columns
   auto [offsets_column, chars] =
-    cudf::strings::detail::make_strings_children(filterer, strings_count, stream, mr);
+    cudf::strings::detail::experimental::make_strings_children(filterer, strings_count, stream, mr);
 
   // return new strings column
   return make_strings_column(strings_count,
diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu
index d1d9afbb85f..2e30e01df21 100644
--- a/cpp/src/strings/combine/join.cu
+++ b/cpp/src/strings/combine/join.cu
@@ -22,6 +22,7 @@
 #include <cudf/scalar/scalar_device_view.cuh>
 #include <cudf/strings/combine.hpp>
 #include <cudf/strings/detail/combine.hpp>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
@@ -84,8 +85,9 @@ struct join_base_fn {
  * This functor is suitable for make_strings_children
  */
 struct join_fn : public join_base_fn {
-  size_type* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   join_fn(column_device_view const d_strings,
           string_view d_separator,
@@ -106,7 +108,7 @@ struct join_fn : public join_base_fn {
     } else {
       bytes += d_str.size_bytes() + d_sep.size_bytes();
     }
-    if (!d_chars) { d_offsets[idx] = bytes; }
+    if (!d_chars) { d_sizes[idx] = bytes; }
   }
 };
 
@@ -148,7 +150,7 @@ std::unique_ptr<column> join_strings(strings_column_view const& input,
     if ((input.size() == input.null_count()) ||
         ((input.chars_size(stream) / (input.size() - input.null_count())) <=
          AVG_CHAR_BYTES_THRESHOLD)) {
-      return std::get<1>(make_strings_children(
+      return std::get<1>(experimental::make_strings_children(
                            join_fn{*d_strings, d_separator, d_narep}, input.size(), stream, mr))
         .release();
     }
@@ -160,16 +162,16 @@ std::unique_ptr<column> join_strings(strings_column_view const& input,
     return std::move(*chars_data);
   }();
 
+  // API returns a single output row, which cannot exceed the row limit (max of size_type).
+  CUDF_EXPECTS(chars.size() < static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
+               "The output exceeds the row size limit",
+               std::overflow_error);
+
   // build the offsets: single string output has offsets [0,chars-size]
   auto offsets_column = [&] {
-    if (chars.size() < static_cast<std::size_t>(get_offset64_threshold())) {
-      auto offsets32 = cudf::detail::make_device_uvector_async(
-        std::vector<int32_t>({0, static_cast<int32_t>(chars.size())}), stream, mr);
-      return std::make_unique<column>(std::move(offsets32), rmm::device_buffer{}, 0);
-    }
-    auto offsets64 = cudf::detail::make_device_uvector_async(
-      std::vector<int64_t>({0L, static_cast<int64_t>(chars.size())}), stream, mr);
-    return std::make_unique<column>(std::move(offsets64), rmm::device_buffer{}, 0);
+    auto offsets = cudf::detail::make_device_uvector_async(
+      std::vector<size_type>({0, static_cast<size_type>(chars.size())}), stream, mr);
+    return std::make_unique<column>(std::move(offsets), rmm::device_buffer{}, 0);
   }();
 
   // build the null mask: only one output row so it is either all-valid or all-null
diff --git a/cpp/src/strings/combine/join_list_elements.cu b/cpp/src/strings/combine/join_list_elements.cu
index a54ea5263fe..b0073452741 100644
--- a/cpp/src/strings/combine/join_list_elements.cu
+++ b/cpp/src/strings/combine/join_list_elements.cu
@@ -22,7 +22,7 @@
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/scalar/scalar_device_view.cuh>
 #include <cudf/strings/combine.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -60,11 +60,12 @@ struct compute_size_and_concatenate_fn {
   separator_on_nulls const separate_nulls;
   output_if_empty_list const empty_list_policy;
 
-  size_type* d_offsets{nullptr};
+  size_type* d_sizes{nullptr};
 
   // If d_chars == nullptr: only compute sizes and validities of the output strings.
   // If d_chars != nullptr: only concatenate strings.
   char* d_chars{nullptr};
+  cudf::detail::input_offsetalator d_offsets;
 
   [[nodiscard]] __device__ bool output_is_null(size_type const idx,
                                                size_type const start_idx,
@@ -84,7 +85,7 @@ struct compute_size_and_concatenate_fn {
     auto const end_idx   = list_offsets[idx + 1];
 
     if (!d_chars && output_is_null(idx, start_idx, end_idx)) {
-      d_offsets[idx] = 0;
+      d_sizes[idx] = 0;
       return;
     }
 
@@ -120,7 +121,7 @@ struct compute_size_and_concatenate_fn {
 
     // If there are all null elements, the output should be the same as having an empty list input:
     // a null or an empty string
-    if (!d_chars) { d_offsets[idx] = has_valid_element ? size_bytes : 0; }
+    if (!d_chars) { d_sizes[idx] = has_valid_element ? size_bytes : 0; }
   }
 };
 
@@ -208,7 +209,7 @@ std::unique_ptr<column> join_list_elements(lists_column_view const& lists_string
                                                     separate_nulls,
                                                     empty_list_policy};
 
-  auto [offsets_column, chars] = make_strings_children(comp_fn, num_rows, stream, mr);
+  auto [offsets_column, chars] = experimental::make_strings_children(comp_fn, num_rows, stream, mr);
   auto [null_mask, null_count] =
     cudf::detail::valid_if(thrust::counting_iterator<size_type>(0),
                            thrust::counting_iterator<size_type>(num_rows),
@@ -283,7 +284,7 @@ std::unique_ptr<column> join_list_elements(lists_column_view const& lists_string
                                                     separate_nulls,
                                                     empty_list_policy};
 
-  auto [offsets_column, chars] = make_strings_children(comp_fn, num_rows, stream, mr);
+  auto [offsets_column, chars] = experimental::make_strings_children(comp_fn, num_rows, stream, mr);
   auto [null_mask, null_count] =
     cudf::detail::valid_if(thrust::counting_iterator<size_type>(0),
                            thrust::counting_iterator<size_type>(num_rows),
diff --git a/cpp/src/strings/convert/convert_booleans.cu b/cpp/src/strings/convert/convert_booleans.cu
index bf73800ad06..6b64006fa24 100644
--- a/cpp/src/strings/convert/convert_booleans.cu
+++ b/cpp/src/strings/convert/convert_booleans.cu
@@ -16,23 +16,19 @@
 
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
-#include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/convert/convert_booleans.hpp>
 #include <cudf/strings/detail/converters.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
-#include <cudf/utilities/traits.hpp>
-#include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 #include <rmm/resource_ref.hpp>
 
-#include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform.h>
 
@@ -99,13 +95,14 @@ struct from_booleans_fn {
   column_device_view const d_column;
   string_view d_true;
   string_view d_false;
-  size_type* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   __device__ void operator()(size_type idx) const
   {
     if (d_column.is_null(idx)) {
-      if (d_chars == nullptr) { d_offsets[idx] = 0; }
+      if (d_chars == nullptr) { d_sizes[idx] = 0; }
       return;
     }
 
@@ -113,7 +110,7 @@ struct from_booleans_fn {
       auto const result = d_column.element<bool>(idx) ? d_true : d_false;
       memcpy(d_chars + d_offsets[idx], result.data(), result.size_bytes());
     } else {
-      d_offsets[idx] = d_column.element<bool>(idx) ? d_true.size_bytes() : d_false.size_bytes();
+      d_sizes[idx] = d_column.element<bool>(idx) ? d_true.size_bytes() : d_false.size_bytes();
     }
   };
 };
@@ -143,8 +140,8 @@ std::unique_ptr<column> from_booleans(column_view const& booleans,
   // copy null mask
   rmm::device_buffer null_mask = cudf::detail::copy_bitmask(booleans, stream, mr);
 
-  auto [offsets, chars] =
-    make_strings_children(from_booleans_fn{d_column, d_true, d_false}, strings_count, stream, mr);
+  auto [offsets, chars] = experimental::make_strings_children(
+    from_booleans_fn{d_column, d_true, d_false}, strings_count, stream, mr);
 
   return make_strings_column(strings_count,
                              std::move(offsets),
diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu
index d6449fbb6c8..ddf68eae951 100644
--- a/cpp/src/strings/convert/convert_datetime.cu
+++ b/cpp/src/strings/convert/convert_datetime.cu
@@ -22,7 +22,7 @@
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/strings/convert/convert_datetime.hpp>
 #include <cudf/strings/detail/converters.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -37,7 +37,6 @@
 #include <rmm/resource_ref.hpp>
 
 #include <thrust/execution_policy.h>
-#include <thrust/for_each.h>
 #include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/logical.h>
@@ -756,8 +755,9 @@ struct datetime_formatter_fn {
   column_device_view const d_timestamps;
   column_device_view const d_format_names;
   device_span<format_item const> const d_format_items;
-  size_type* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   /**
    * @brief Specialized modulo expression that handles negative values.
@@ -1087,14 +1087,14 @@ struct datetime_formatter_fn {
   __device__ void operator()(size_type idx) const
   {
     if (d_timestamps.is_null(idx)) {
-      if (!d_chars) { d_offsets[idx] = 0; }
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
     auto const tstamp = d_timestamps.element<T>(idx);
     if (d_chars) {
       timestamp_to_string(tstamp, d_chars + d_offsets[idx]);
     } else {
-      d_offsets[idx] = compute_output_size(tstamp);
+      d_sizes[idx] = compute_output_size(tstamp);
     }
   }
 };
@@ -1109,7 +1109,7 @@ struct dispatch_from_timestamps_fn {
                               rmm::cuda_stream_view stream,
                               rmm::device_async_resource_ref mr) const
   {
-    return make_strings_children(
+    return experimental::make_strings_children(
       datetime_formatter_fn<T>{d_timestamps, d_format_names, d_format_items},
       d_timestamps.size(),
       stream,
diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu
index 77c750848cf..faf9a83f016 100644
--- a/cpp/src/strings/convert/convert_durations.cu
+++ b/cpp/src/strings/convert/convert_durations.cu
@@ -17,7 +17,7 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/detail/convert/int_to_string.cuh>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
@@ -26,10 +26,8 @@
 #include <rmm/resource_ref.hpp>
 
 #include <thrust/execution_policy.h>
-#include <thrust/for_each.h>
 #include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/transform_iterator.h>
 #include <thrust/transform.h>
 #include <thrust/transform_reduce.h>
 
@@ -192,8 +190,9 @@ struct from_durations_fn {
   column_device_view d_durations;
   format_item const* d_format_items;
   size_type items_count;
-  size_type* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   __device__ int8_t format_length(char format_char, duration_component const* const timeparts) const
   {
@@ -378,14 +377,14 @@ struct from_durations_fn {
   __device__ void operator()(size_type idx)
   {
     if (d_durations.is_null(idx)) {
-      if (d_chars == nullptr) { d_offsets[idx] = 0; }
+      if (d_chars == nullptr) { d_sizes[idx] = 0; }
       return;
     }
 
     if (d_chars != nullptr) {
       set_chars(idx);
     } else {
-      d_offsets[idx] = string_size(d_durations.template element<T>(idx));
+      d_sizes[idx] = string_size(d_durations.template element<T>(idx));
     }
   }
 };
@@ -415,11 +414,11 @@ struct dispatch_from_durations_fn {
     // copy null mask
     rmm::device_buffer null_mask = cudf::detail::copy_bitmask(durations, stream, mr);
 
-    auto [offsets, chars] =
-      make_strings_children(from_durations_fn<T>{d_column, d_format_items, compiler.items_count()},
-                            strings_count,
-                            stream,
-                            mr);
+    auto [offsets, chars] = experimental::make_strings_children(
+      from_durations_fn<T>{d_column, d_format_items, compiler.items_count()},
+      strings_count,
+      stream,
+      mr);
 
     return make_strings_column(strings_count,
                                std::move(offsets),
diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu
index 446baa8dea9..34f81b8b407 100644
--- a/cpp/src/strings/convert/convert_fixed_point.cu
+++ b/cpp/src/strings/convert/convert_fixed_point.cu
@@ -23,7 +23,7 @@
 #include <cudf/strings/detail/convert/fixed_point.cuh>
 #include <cudf/strings/detail/convert/fixed_point_to_string.cuh>
 #include <cudf/strings/detail/converters.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -37,10 +37,7 @@
 #include <cuda/std/limits>
 #include <cuda/std/type_traits>
 #include <thrust/execution_policy.h>
-#include <thrust/for_each.h>
-#include <thrust/generate.h>
 #include <thrust/iterator/counting_iterator.h>
-#include <thrust/optional.h>
 #include <thrust/transform.h>
 
 namespace cudf {
@@ -198,8 +195,9 @@ namespace {
 template <typename DecimalType>
 struct from_fixed_point_fn {
   column_device_view d_decimals;
-  size_type* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   /**
    * @brief Converts a decimal element into a string.
@@ -219,13 +217,13 @@ struct from_fixed_point_fn {
   __device__ void operator()(size_type idx)
   {
     if (d_decimals.is_null(idx)) {
-      if (d_chars == nullptr) { d_offsets[idx] = 0; }
+      if (d_chars == nullptr) { d_sizes[idx] = 0; }
       return;
     }
     if (d_chars != nullptr) {
       fixed_point_element_to_string(idx);
     } else {
-      d_offsets[idx] =
+      d_sizes[idx] =
         fixed_point_string_size(d_decimals.element<DecimalType>(idx), d_decimals.type().scale());
     }
   }
@@ -244,8 +242,8 @@ struct dispatch_from_fixed_point_fn {
 
     auto const d_column = column_device_view::create(input, stream);
 
-    auto [offsets, chars] =
-      make_strings_children(from_fixed_point_fn<DecimalType>{*d_column}, input.size(), stream, mr);
+    auto [offsets, chars] = experimental::make_strings_children(
+      from_fixed_point_fn<DecimalType>{*d_column}, input.size(), stream, mr);
 
     return make_strings_column(input.size(),
                                std::move(offsets),
diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu
index c6061f7d8e6..0ed80b976fd 100644
--- a/cpp/src/strings/convert/convert_floats.cu
+++ b/cpp/src/strings/convert/convert_floats.cu
@@ -21,7 +21,7 @@
 #include <cudf/strings/convert/convert_floats.hpp>
 #include <cudf/strings/detail/convert/string_to_float.cuh>
 #include <cudf/strings/detail/converters.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -32,9 +32,7 @@
 #include <rmm/exec_policy.hpp>
 #include <rmm/resource_ref.hpp>
 
-#include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/transform_iterator.h>
 #include <thrust/transform.h>
 
 #include <cmath>
@@ -356,8 +354,9 @@ struct ftos_converter {
 template <typename FloatType>
 struct from_floats_fn {
   column_device_view d_floats;
-  size_type* d_offsets;
+  size_type* d_sizes;
   char* d_chars;
+  cudf::detail::input_offsetalator d_offsets;
 
   __device__ size_type compute_output_size(FloatType value)
   {
@@ -375,13 +374,13 @@ struct from_floats_fn {
   __device__ void operator()(size_type idx)
   {
     if (d_floats.is_null(idx)) {
-      if (d_chars == nullptr) { d_offsets[idx] = 0; }
+      if (d_chars == nullptr) { d_sizes[idx] = 0; }
       return;
     }
     if (d_chars != nullptr) {
       float_to_string(idx);
     } else {
-      d_offsets[idx] = compute_output_size(d_floats.element<FloatType>(idx));
+      d_sizes[idx] = compute_output_size(d_floats.element<FloatType>(idx));
     }
   }
 };
@@ -404,8 +403,8 @@ struct dispatch_from_floats_fn {
     // copy the null mask
     rmm::device_buffer null_mask = cudf::detail::copy_bitmask(floats, stream, mr);
 
-    auto [offsets, chars] =
-      make_strings_children(from_floats_fn<FloatType>{d_column}, strings_count, stream, mr);
+    auto [offsets, chars] = experimental::make_strings_children(
+      from_floats_fn<FloatType>{d_column}, strings_count, stream, mr);
 
     return make_strings_column(strings_count,
                                std::move(offsets),
diff --git a/cpp/src/strings/convert/convert_hex.cu b/cpp/src/strings/convert/convert_hex.cu
index 95af378fc3f..1f9fc3858f8 100644
--- a/cpp/src/strings/convert/convert_hex.cu
+++ b/cpp/src/strings/convert/convert_hex.cu
@@ -19,7 +19,7 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/convert/convert_integers.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -123,8 +123,9 @@ struct dispatch_hex_to_integers_fn {
 template <typename IntegerType>
 struct integer_to_hex_fn {
   column_device_view const d_column;
-  size_type* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   __device__ void byte_to_hex(uint8_t byte, char* hex)
   {
@@ -141,7 +142,7 @@ struct integer_to_hex_fn {
   __device__ void operator()(size_type idx)
   {
     if (d_column.is_null(idx)) {
-      if (!d_chars) { d_offsets[idx] = 0; }
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
 
@@ -167,7 +168,7 @@ struct integer_to_hex_fn {
         --byte_index;
       }
     } else {
-      d_offsets[idx] = static_cast<size_type>(bytes) * 2;  // 2 hex characters per byte
+      d_sizes[idx] = static_cast<size_type>(bytes) * 2;  // 2 hex characters per byte
     }
   }
 };
@@ -181,7 +182,7 @@ struct dispatch_integers_to_hex_fn {
   {
     auto const d_column = column_device_view::create(input, stream);
 
-    auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
+    auto [offsets_column, chars] = experimental::make_strings_children(
       integer_to_hex_fn<IntegerType>{*d_column}, input.size(), stream, mr);
 
     return make_strings_column(input.size(),
diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu
index f3e639817a6..918369ead4d 100644
--- a/cpp/src/strings/convert/convert_integers.cu
+++ b/cpp/src/strings/convert/convert_integers.cu
@@ -23,7 +23,7 @@
 #include <cudf/strings/detail/convert/int_to_string.cuh>
 #include <cudf/strings/detail/convert/string_to_int.cuh>
 #include <cudf/strings/detail/converters.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -34,9 +34,7 @@
 #include <rmm/resource_ref.hpp>
 
 #include <thrust/execution_policy.h>
-#include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/transform_iterator.h>
 #include <thrust/logical.h>
 #include <thrust/pair.h>
 #include <thrust/transform.h>
@@ -314,8 +312,9 @@ namespace {
 template <typename IntegerType>
 struct from_integers_fn {
   column_device_view d_integers;
-  size_type* d_offsets;
+  size_type* d_sizes;
   char* d_chars;
+  cudf::detail::input_offsetalator d_offsets;
 
   /**
    * @brief Converts an integer element into a string.
@@ -334,13 +333,13 @@ struct from_integers_fn {
   __device__ void operator()(size_type idx)
   {
     if (d_integers.is_null(idx)) {
-      if (d_chars == nullptr) { d_offsets[idx] = 0; }
+      if (d_chars == nullptr) { d_sizes[idx] = 0; }
       return;
     }
     if (d_chars != nullptr) {
       integer_element_to_string(idx);
     } else {
-      d_offsets[idx] = count_digits(d_integers.element<IntegerType>(idx));
+      d_sizes[idx] = count_digits(d_integers.element<IntegerType>(idx));
     }
   }
 };
@@ -363,8 +362,8 @@ struct dispatch_from_integers_fn {
     // copy the null mask
     rmm::device_buffer null_mask = cudf::detail::copy_bitmask(integers, stream, mr);
 
-    auto [offsets, chars] =
-      make_strings_children(from_integers_fn<IntegerType>{d_column}, strings_count, stream, mr);
+    auto [offsets, chars] = experimental::make_strings_children(
+      from_integers_fn<IntegerType>{d_column}, strings_count, stream, mr);
 
     return make_strings_column(strings_count,
                                std::move(offsets),
diff --git a/cpp/src/strings/convert/convert_ipv4.cu b/cpp/src/strings/convert/convert_ipv4.cu
index 3d259f0ab82..33f6c553001 100644
--- a/cpp/src/strings/convert/convert_ipv4.cu
+++ b/cpp/src/strings/convert/convert_ipv4.cu
@@ -20,7 +20,7 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/convert/convert_ipv4.hpp>
 #include <cudf/strings/detail/convert/int_to_string.cuh>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -124,13 +124,14 @@ namespace {
  */
 struct integers_to_ipv4_fn {
   column_device_view const d_column;
-  size_type* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   __device__ void operator()(size_type idx)
   {
     if (d_column.is_null(idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
 
@@ -151,7 +152,7 @@ struct integers_to_ipv4_fn {
       shift_bits -= 8;
     }
 
-    if (!d_chars) { d_offsets[idx] = bytes; }
+    if (!d_chars) { d_sizes[idx] = bytes; }
   }
 };
 
@@ -167,7 +168,7 @@ std::unique_ptr<column> integers_to_ipv4(column_view const& integers,
   CUDF_EXPECTS(integers.type().id() == type_id::INT64, "Input column must be type_id::INT64 type");
 
   auto d_column                = column_device_view::create(integers, stream);
-  auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
+  auto [offsets_column, chars] = experimental::make_strings_children(
     integers_to_ipv4_fn{*d_column}, integers.size(), stream, mr);
 
   return make_strings_column(integers.size(),
diff --git a/cpp/src/strings/convert/convert_lists.cu b/cpp/src/strings/convert/convert_lists.cu
index ed898bd6f72..198e6c11ef3 100644
--- a/cpp/src/strings/convert/convert_lists.cu
+++ b/cpp/src/strings/convert/convert_lists.cu
@@ -17,7 +17,7 @@
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/convert/convert_lists.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/utilities/default_stream.hpp>
@@ -66,8 +66,9 @@ struct format_lists_fn {
   string_view const d_na_rep;
   stack_item* d_stack;
   size_type const max_depth;
-  size_type* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   __device__ column_device_view get_nested_child(size_type idx)
   {
@@ -184,7 +185,7 @@ struct format_lists_fn {
       }
     }
 
-    if (!d_chars) d_offsets[idx] = bytes;
+    if (!d_chars) { d_sizes[idx] = bytes; }
   }
 };
 
@@ -217,7 +218,7 @@ std::unique_ptr<column> format_list_column(lists_column_view const& input,
   auto const d_separators = column_device_view::create(separators.parent(), stream);
   auto const d_na_rep     = na_rep.value(stream);
 
-  auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
+  auto [offsets_column, chars] = experimental::make_strings_children(
     format_lists_fn{*d_input, *d_separators, d_na_rep, stack_buffer.data(), depth},
     input.size(),
     stream,
diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu
index 644ffbb4bd1..d9920be045f 100644
--- a/cpp/src/strings/convert/convert_urls.cu
+++ b/cpp/src/strings/convert/convert_urls.cu
@@ -22,7 +22,7 @@
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/strings/convert/convert_urls.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -50,8 +50,9 @@ namespace {
 //
 struct url_encoder_fn {
   column_device_view const d_strings;
-  size_type* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   // utility to create 2-byte hex characters from single binary byte
   __device__ void byte_to_hex(uint8_t byte, char* hex)
@@ -80,7 +81,7 @@ struct url_encoder_fn {
   __device__ void operator()(size_type idx)
   {
     if (d_strings.is_null(idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
 
@@ -117,7 +118,7 @@ struct url_encoder_fn {
         }
       }
     }
-    if (!d_chars) d_offsets[idx] = nbytes;
+    if (!d_chars) { d_sizes[idx] = nbytes; }
   }
 };
 
@@ -132,8 +133,8 @@ std::unique_ptr<column> url_encode(strings_column_view const& input,
 
   auto d_column = column_device_view::create(input.parent(), stream);
 
-  auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
-    url_encoder_fn{*d_column}, input.size(), stream, mr);
+  auto [offsets_column, chars] =
+    experimental::make_strings_children(url_encoder_fn{*d_column}, input.size(), stream, mr);
 
   return make_strings_column(input.size(),
                              std::move(offsets_column),
@@ -201,10 +202,11 @@ CUDF_KERNEL void url_decode_char_counter(column_device_view const in_strings,
   __shared__ char temporary_buffer[num_warps_per_threadblock][char_block_size + halo_size];
   __shared__ typename cub::WarpReduce<int8_t>::TempStorage cub_storage[num_warps_per_threadblock];
 
-  auto const global_thread_id = cudf::detail::grid_1d::global_thread_id();
-  auto const global_warp_id   = static_cast<size_type>(global_thread_id / cudf::detail::warp_size);
-  auto const local_warp_id    = static_cast<size_type>(threadIdx.x / cudf::detail::warp_size);
-  auto const warp_lane        = static_cast<size_type>(threadIdx.x % cudf::detail::warp_size);
+  auto const global_thread_id =
+    cudf::detail::grid_1d::global_thread_id<num_warps_per_threadblock * cudf::detail::warp_size>();
+  auto const global_warp_id = static_cast<size_type>(global_thread_id / cudf::detail::warp_size);
+  auto const local_warp_id  = static_cast<size_type>(threadIdx.x / cudf::detail::warp_size);
+  auto const warp_lane      = static_cast<size_type>(threadIdx.x % cudf::detail::warp_size);
   auto const nwarps     = static_cast<size_type>(gridDim.x * blockDim.x / cudf::detail::warp_size);
   char* in_chars_shared = temporary_buffer[local_warp_id];
 
@@ -286,10 +288,11 @@ CUDF_KERNEL void url_decode_char_replacer(column_device_view const in_strings,
   __shared__ typename cub::WarpScan<int8_t>::TempStorage cub_storage[num_warps_per_threadblock];
   __shared__ size_type out_idx[num_warps_per_threadblock];
 
-  auto const global_thread_id = cudf::detail::grid_1d::global_thread_id();
-  auto const global_warp_id   = static_cast<size_type>(global_thread_id / cudf::detail::warp_size);
-  auto const local_warp_id    = static_cast<size_type>(threadIdx.x / cudf::detail::warp_size);
-  auto const warp_lane        = static_cast<size_type>(threadIdx.x % cudf::detail::warp_size);
+  auto const global_thread_id =
+    cudf::detail::grid_1d::global_thread_id<num_warps_per_threadblock * cudf::detail::warp_size>();
+  auto const global_warp_id = static_cast<size_type>(global_thread_id / cudf::detail::warp_size);
+  auto const local_warp_id  = static_cast<size_type>(threadIdx.x / cudf::detail::warp_size);
+  auto const warp_lane      = static_cast<size_type>(threadIdx.x % cudf::detail::warp_size);
   auto const nwarps     = static_cast<size_type>(gridDim.x * blockDim.x / cudf::detail::warp_size);
   char* in_chars_shared = temporary_buffer[local_warp_id];
 
diff --git a/cpp/src/strings/copying/shift.cu b/cpp/src/strings/copying/shift.cu
index 562ee6a7088..5bba4855390 100644
--- a/cpp/src/strings/copying/shift.cu
+++ b/cpp/src/strings/copying/shift.cu
@@ -19,8 +19,8 @@
 #include <cudf/detail/copy.hpp>
 #include <cudf/detail/get_value.cuh>
 #include <cudf/detail/iterator.cuh>
-#include <cudf/detail/sizes_to_offsets_iterator.cuh>
 #include <cudf/strings/detail/copying.hpp>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utilities.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -104,8 +104,8 @@ std::unique_ptr<column> shift(strings_column_view const& input,
   auto const d_input = column_device_view::create(input.parent(), stream);
   auto sizes_itr     = cudf::detail::make_counting_transform_iterator(
     0, output_sizes_fn{*d_input, d_fill_str, offset});
-  auto [offsets_column, total_bytes] =
-    cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + input.size(), stream, mr);
+  auto [offsets_column, total_bytes] = cudf::strings::detail::make_offsets_child_column(
+    sizes_itr, sizes_itr + input.size(), stream, mr);
   auto offsets_view = offsets_column->view();
 
   // compute the shift-offset for the output characters child column
diff --git a/cpp/src/strings/filter_chars.cu b/cpp/src/strings/filter_chars.cu
index 32717dac78d..4705ae519cd 100644
--- a/cpp/src/strings/filter_chars.cu
+++ b/cpp/src/strings/filter_chars.cu
@@ -22,7 +22,7 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -57,8 +57,9 @@ struct filter_fn {
   rmm::device_uvector<char_range>::iterator table_begin;
   rmm::device_uvector<char_range>::iterator table_end;
   string_view const d_replacement;
-  int32_t* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   /**
    * @brief Return true if this character should be removed.
@@ -87,7 +88,7 @@ struct filter_fn {
   __device__ void operator()(size_type idx)
   {
     if (d_strings.is_null(idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
     auto const d_str = d_strings.element<string_view>(idx);
@@ -104,7 +105,7 @@ struct filter_fn {
       else
         nbytes += d_newchar.size_bytes() - char_size;
     }
-    if (!out_ptr) d_offsets[idx] = nbytes;
+    if (!out_ptr) { d_sizes[idx] = nbytes; }
   }
 };
 
@@ -141,7 +142,7 @@ std::unique_ptr<column> filter_characters(
   // this utility calls the strip_fn to build the offsets and chars columns
   filter_fn ffn{*d_strings, keep_characters, table.begin(), table.end(), d_replacement};
   auto [offsets_column, chars] =
-    cudf::strings::detail::make_strings_children(ffn, strings.size(), stream, mr);
+    cudf::strings::detail::experimental::make_strings_children(ffn, strings.size(), stream, mr);
 
   return make_strings_column(strings_count,
                              std::move(offsets_column),
diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu
index d8a3055772e..3cfbf79a8f3 100644
--- a/cpp/src/strings/padding.cu
+++ b/cpp/src/strings/padding.cu
@@ -19,7 +19,7 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/detail/pad_impl.cuh>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/padding.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -47,8 +47,9 @@ struct base_fn {
   column_device_view const d_column;
   size_type const width;
   size_type const fill_char_size;
-  size_type* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   base_fn(column_device_view const& d_column, size_type width, size_type fill_char_size)
     : d_column(d_column), width(width), fill_char_size(fill_char_size)
@@ -58,7 +59,7 @@ struct base_fn {
   __device__ void operator()(size_type idx) const
   {
     if (d_column.is_null(idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
 
@@ -67,7 +68,7 @@ struct base_fn {
     if (d_chars) {
       derived.pad(d_str, d_chars + d_offsets[idx]);
     } else {
-      d_offsets[idx] = compute_padded_size(d_str, width, fill_char_size);
+      d_sizes[idx] = compute_padded_size(d_str, width, fill_char_size);
     }
   };
 };
@@ -116,13 +117,13 @@ std::unique_ptr<column> pad(strings_column_view const& input,
   auto [offsets_column, chars] = [&] {
     if (side == side_type::LEFT) {
       auto fn = pad_fn<side_type::LEFT>{*d_strings, width, fill_char_size, d_fill_char};
-      return make_strings_children(fn, input.size(), stream, mr);
+      return experimental::make_strings_children(fn, input.size(), stream, mr);
     } else if (side == side_type::RIGHT) {
       auto fn = pad_fn<side_type::RIGHT>{*d_strings, width, fill_char_size, d_fill_char};
-      return make_strings_children(fn, input.size(), stream, mr);
+      return experimental::make_strings_children(fn, input.size(), stream, mr);
     }
     auto fn = pad_fn<side_type::BOTH>{*d_strings, width, fill_char_size, d_fill_char};
-    return make_strings_children(fn, input.size(), stream, mr);
+    return experimental::make_strings_children(fn, input.size(), stream, mr);
   }();
 
   return make_strings_column(input.size(),
@@ -153,7 +154,7 @@ std::unique_ptr<column> zfill(strings_column_view const& input,
 
   auto d_strings = column_device_view::create(input.parent(), stream);
   auto [offsets_column, chars] =
-    make_strings_children(zfill_fn{*d_strings, width}, input.size(), stream, mr);
+    experimental::make_strings_children(zfill_fn{*d_strings, width}, input.size(), stream, mr);
 
   return make_strings_column(input.size(),
                              std::move(offsets_column),
diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl
index ce12dc17aa4..10e06505094 100644
--- a/cpp/src/strings/regex/regex.inl
+++ b/cpp/src/strings/regex/regex.inl
@@ -217,6 +217,15 @@ __device__ __forceinline__ reprog_device reprog_device::load(reprog_device const
                                             : reinterpret_cast<reprog_device*>(buffer)[0];
 }
 
+__device__ __forceinline__ static string_view::const_iterator find_char(
+  cudf::char_utf8 chr, string_view const d_str, string_view::const_iterator itr)
+{  // linear scan: advance itr until *itr == chr or itr reaches the end of d_str
+  while (itr.byte_offset() < d_str.size_bytes() && *itr != chr) {
+    ++itr;
+  }
+  return itr;  // chr was not found if itr.byte_offset() >= d_str.size_bytes()
+}
+
 /**
  * @brief Evaluate a specific string against regex pattern compiled to this instance.
  *
@@ -253,16 +262,16 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const
         case BOL:
           if (pos == 0) break;
           if (jnk.startchar != '^') { return thrust::nullopt; }
-          --pos;
+          --itr;
           startchar = static_cast<char_utf8>('\n');
         case CHAR: {
-          auto const fidx = dstr.find(startchar, pos);
-          if (fidx == string_view::npos) { return thrust::nullopt; }
-          pos = fidx + (jnk.starttype == BOL);
+          auto const find_itr = find_char(startchar, dstr, itr);
+          if (find_itr.byte_offset() >= dstr.size_bytes()) { return thrust::nullopt; }
+          itr = find_itr + (jnk.starttype == BOL);
+          pos = itr.position();
           break;
         }
       }
-      itr += (pos - itr.position());  // faster to increment position
     }
 
     if (((eos < 0) || (pos < eos)) && match == 0) {
diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu
index 2eb03bd10a4..9abcca7a5e6 100644
--- a/cpp/src/strings/replace/multi.cu
+++ b/cpp/src/strings/replace/multi.cu
@@ -23,7 +23,7 @@
 #include <cudf/detail/utilities/algorithm.cuh>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/strings/detail/replace.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/replace.hpp>
@@ -404,13 +404,14 @@ struct replace_multi_fn {
   column_device_view const d_strings;
   column_device_view const d_targets;
   column_device_view const d_repls;
-  int32_t* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   __device__ void operator()(size_type idx)
   {
     if (d_strings.is_null(idx)) {
-      if (!d_chars) { d_offsets[idx] = 0; }
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
     auto const d_str   = d_strings.element<string_view>(idx);
@@ -443,9 +444,11 @@ struct replace_multi_fn {
       ++spos;
     }
     if (out_ptr)  // copy remainder
+    {
       memcpy(out_ptr, in_ptr + lpos, d_str.size_bytes() - lpos);
-    else
-      d_offsets[idx] = bytes;
+    } else {
+      d_sizes[idx] = bytes;
+    }
   }
 };
 
@@ -459,7 +462,7 @@ std::unique_ptr<column> replace_string_parallel(strings_column_view const& input
   auto d_targets      = column_device_view::create(targets.parent(), stream);
   auto d_replacements = column_device_view::create(repls.parent(), stream);
 
-  auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
+  auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
     replace_multi_fn{*d_strings, *d_targets, *d_replacements}, input.size(), stream, mr);
 
   return make_strings_column(input.size(),
diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu
index 857bc7fb41c..df8526fa942 100644
--- a/cpp/src/strings/replace/replace.cu
+++ b/cpp/src/strings/replace/replace.cu
@@ -22,7 +22,7 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/algorithm.cuh>
 #include <cudf/strings/detail/replace.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/replace.hpp>
@@ -345,13 +345,14 @@ struct replace_fn {
   string_view d_target;
   string_view d_replacement;
   cudf::size_type maxrepl;
-  cudf::size_type* d_offsets{};
+  cudf::size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   __device__ void operator()(size_type idx)
   {
     if (d_strings.is_null(idx)) {
-      if (!d_chars) { d_offsets[idx] = 0; }
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
     auto const d_str   = d_strings.element<string_view>(idx);
@@ -384,7 +385,7 @@ struct replace_fn {
     if (out_ptr) {  // copy remainder
       memcpy(out_ptr, in_ptr + lpos, d_str.size_bytes() - lpos);
     } else {
-      d_offsets[idx] = bytes;
+      d_sizes[idx] = bytes;
     }
   }
 };
@@ -398,7 +399,7 @@ std::unique_ptr<column> replace_string_parallel(strings_column_view const& input
 {
   auto d_strings = column_device_view::create(input.parent(), stream);
 
-  auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
+  auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
     replace_fn{*d_strings, d_target, d_replacement, maxrepl}, input.size(), stream, mr);
 
   return make_strings_column(input.size(),
diff --git a/cpp/src/strings/replace/replace_slice.cu b/cpp/src/strings/replace/replace_slice.cu
index 90540b39189..54e84dfe504 100644
--- a/cpp/src/strings/replace/replace_slice.cu
+++ b/cpp/src/strings/replace/replace_slice.cu
@@ -19,7 +19,7 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/detail/replace.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/replace.hpp>
 #include <cudf/strings/string_view.cuh>
@@ -45,13 +45,14 @@ struct replace_slice_fn {
   string_view const d_repl;
   size_type const start;
   size_type const stop;
-  size_type* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   __device__ void operator()(size_type idx)
   {
     if (d_strings.is_null(idx)) {
-      if (!d_chars) { d_offsets[idx] = 0; }
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
     auto const d_str   = d_strings.element<string_view>(idx);
@@ -69,7 +70,7 @@ struct replace_slice_fn {
                                    in_ptr + end,
                                    d_str.size_bytes() - end);
     } else {
-      d_offsets[idx] = d_str.size_bytes() + d_repl.size_bytes() - (end - begin);
+      d_sizes[idx] = d_str.size_bytes() + d_repl.size_bytes() - (end - begin);
     }
   }
 };
@@ -94,7 +95,7 @@ std::unique_ptr<column> replace_slice(strings_column_view const& input,
   auto d_strings = column_device_view::create(input.parent(), stream);
 
   // this utility calls the given functor to build the offsets and chars columns
-  auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
+  auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
     replace_slice_fn{*d_strings, d_repl, start, stop}, input.size(), stream, mr);
 
   return make_strings_column(input.size(),
diff --git a/cpp/src/strings/slice.cu b/cpp/src/strings/slice.cu
index d080065b330..972a4ffd58e 100644
--- a/cpp/src/strings/slice.cu
+++ b/cpp/src/strings/slice.cu
@@ -21,11 +21,13 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/scalar/scalar_device_view.cuh>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/slice.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
@@ -79,19 +81,20 @@ struct substring_fn {
   numeric_scalar_device_view<size_type> const d_start;
   numeric_scalar_device_view<size_type> const d_stop;
   numeric_scalar_device_view<size_type> const d_step;
-  int32_t* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   __device__ void operator()(size_type idx)
   {
     if (d_column.is_null(idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
     auto const d_str  = d_column.template element<string_view>(idx);
     auto const length = d_str.length();
     if (length == 0) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
     size_type const step = d_step.is_valid() ? d_step.value() : 1;
@@ -131,7 +134,7 @@ struct substring_fn {
       }
       itr += step;
     }
-    if (!d_chars) d_offsets[idx] = bytes;
+    if (!d_chars) { d_sizes[idx] = bytes; }
   }
 };
 
@@ -205,7 +208,7 @@ std::unique_ptr<column> slice_strings(strings_column_view const& strings,
   auto const d_stop  = get_scalar_device_view(const_cast<numeric_scalar<size_type>&>(stop));
   auto const d_step  = get_scalar_device_view(const_cast<numeric_scalar<size_type>&>(step));
 
-  auto [offsets, chars] = make_strings_children(
+  auto [offsets, chars] = experimental::make_strings_children(
     substring_fn{*d_column, d_start, d_stop, d_step}, strings.size(), stream, mr);
 
   return make_strings_column(strings.size(),
@@ -227,13 +230,17 @@ std::unique_ptr<column> slice_strings(strings_column_view const& strings,
                "Parameter starts must have the same number of rows as strings.");
   CUDF_EXPECTS(stops_column.size() == strings_count,
                "Parameter stops must have the same number of rows as strings.");
-  CUDF_EXPECTS(starts_column.type() == stops_column.type(),
-               "Parameters starts and stops must be of the same type.");
+  CUDF_EXPECTS(cudf::have_same_types(starts_column, stops_column),
+               "Parameters starts and stops must be of the same type.",
+               cudf::data_type_error);
   CUDF_EXPECTS(starts_column.null_count() == 0, "Parameter starts must not contain nulls.");
   CUDF_EXPECTS(stops_column.null_count() == 0, "Parameter stops must not contain nulls.");
   CUDF_EXPECTS(starts_column.type().id() != data_type{type_id::BOOL8}.id(),
-               "Positions values must not be bool type.");
-  CUDF_EXPECTS(is_fixed_width(starts_column.type()), "Positions values must be fixed width type.");
+               "Positions values must not be bool type.",
+               cudf::data_type_error);
+  CUDF_EXPECTS(is_fixed_width(starts_column.type()),
+               "Positions values must be fixed width type.",
+               cudf::data_type_error);
 
   auto strings_column = column_device_view::create(strings.parent(), stream);
   auto starts_iter    = cudf::detail::indexalator_factory::make_input_iterator(starts_column);
diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu
index fcf55429e09..75bc46d30c4 100644
--- a/cpp/src/strings/translate.cu
+++ b/cpp/src/strings/translate.cu
@@ -20,7 +20,7 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/strings/translate.hpp>
@@ -52,13 +52,14 @@ struct translate_fn {
   column_device_view const d_strings;
   rmm::device_uvector<translate_table>::iterator table_begin;
   rmm::device_uvector<translate_table>::iterator table_end;
-  int32_t* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   __device__ void operator()(size_type idx)
   {
     if (d_strings.is_null(idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
     string_view const d_str = d_strings.element<string_view>(idx);
@@ -80,7 +81,7 @@ struct translate_fn {
       }
       if (chr && out_ptr) out_ptr += from_char_utf8(chr, out_ptr);
     }
-    if (!d_chars) d_offsets[idx] = bytes;
+    if (!d_chars) { d_sizes[idx] = bytes; }
   }
 };
 
@@ -111,7 +112,7 @@ std::unique_ptr<column> translate(strings_column_view const& strings,
 
   auto d_strings = column_device_view::create(strings.parent(), stream);
 
-  auto [offsets_column, chars] = make_strings_children(
+  auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
     translate_fn{*d_strings, table.begin(), table.end()}, strings.size(), stream, mr);
 
   return make_strings_column(strings.size(),
diff --git a/cpp/src/table/table_view.cpp b/cpp/src/table/table_view.cpp
index bcbf2d44139..13832b0d9dc 100644
--- a/cpp/src/table/table_view.cpp
+++ b/cpp/src/table/table_view.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -145,30 +145,21 @@ bool has_nested_nullable_columns(table_view const& input)
   });
 }
 
-bool have_same_types(table_view const& lhs, table_view const& rhs)
+namespace detail {
+
+template <typename TableView>
+bool is_relationally_comparable(TableView const& lhs, TableView const& rhs)
 {
   return std::equal(lhs.begin(),
                     lhs.end(),
                     rhs.begin(),
                     rhs.end(),
                     [](column_view const& lcol, column_view const& rcol) {
-                      return cudf::column_types_equal(lcol, rcol);
+                      return cudf::is_relationally_comparable(lcol.type()) and
+                             cudf::have_same_types(lcol, rcol);
                     });
 }
 
-namespace detail {
-
-template <typename TableView>
-bool is_relationally_comparable(TableView const& lhs, TableView const& rhs)
-{
-  return std::all_of(thrust::counting_iterator<size_type>(0),
-                     thrust::counting_iterator<size_type>(lhs.num_columns()),
-                     [lhs, rhs](auto const i) {
-                       return lhs.column(i).type() == rhs.column(i).type() and
-                              cudf::is_relationally_comparable(lhs.column(i).type());
-                     });
-}
-
 // Explicit template instantiation for a table of immutable views
 template bool is_relationally_comparable<table_view>(table_view const& lhs, table_view const& rhs);
 
diff --git a/cpp/src/text/detokenize.cu b/cpp/src/text/detokenize.cu
index 63fe3113697..2efeeee0ee9 100644
--- a/cpp/src/text/detokenize.cu
+++ b/cpp/src/text/detokenize.cu
@@ -20,7 +20,7 @@
 #include <cudf/detail/indexalator.cuh>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/sorting.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -48,12 +48,13 @@ namespace {
  * the same row. The `d_separator` is appended between each token.
  */
 struct detokenizer_fn {
-  cudf::column_device_view const d_strings;  // these are the tokens
-  cudf::size_type const* d_row_map;          // indices sorted by output row
-  cudf::size_type const* d_token_offsets;    // to each input token array
-  cudf::string_view const d_separator;       // append after each token
-  cudf::size_type* d_offsets{};              // offsets to output buffer d_chars
-  char* d_chars{};                           // output buffer for characters
+  cudf::column_device_view const d_strings;    // these are the tokens
+  cudf::size_type const* d_row_map;            // indices sorted by output row
+  cudf::size_type const* d_token_offsets;      // to each input token array
+  cudf::string_view const d_separator;         // append after each token
+  cudf::size_type* d_sizes{};                  // output sizes
+  char* d_chars{};                             // output buffer for characters
+  cudf::detail::input_offsetalator d_offsets;  // for addressing output row data in d_chars
 
   __device__ void operator()(cudf::size_type idx)
   {
@@ -75,7 +76,7 @@ struct detokenizer_fn {
         nbytes += d_separator.size_bytes();
       }
     }
-    if (!d_chars) { d_offsets[idx] = (nbytes > 0) ? (nbytes - d_separator.size_bytes()) : 0; }
+    if (!d_chars) { d_sizes[idx] = (nbytes > 0) ? (nbytes - d_separator.size_bytes()) : 0; }
   }
 };
 
@@ -157,7 +158,7 @@ std::unique_ptr<cudf::column> detokenize(cudf::strings_column_view const& string
 
   cudf::string_view const d_separator(separator.data(), separator.size());
 
-  auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
+  auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
     detokenizer_fn{*strings_column, d_row_map, tokens_offsets.data(), d_separator},
     output_count,
     stream,
diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu
index d9fcd7dfd05..fdd165a54bc 100644
--- a/cpp/src/text/generate_ngrams.cu
+++ b/cpp/src/text/generate_ngrams.cu
@@ -22,7 +22,7 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/sizes_to_offsets_iterator.cuh>
 #include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -57,8 +57,9 @@ struct ngram_generator_fn {
   cudf::column_device_view const d_strings;
   cudf::size_type ngrams;
   cudf::string_view const d_separator;
-  cudf::size_type* d_offsets{};
+  cudf::size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   /**
    * @brief Build ngram for each string.
@@ -81,7 +82,7 @@ struct ngram_generator_fn {
       bytes += d_separator.size_bytes();
       if (out_ptr) out_ptr = cudf::strings::detail::copy_string(out_ptr, d_separator);
     }
-    if (!d_chars) d_offsets[idx] = bytes;
+    if (!d_chars) { d_sizes[idx] = bytes; }
   }
 };
 
@@ -141,7 +142,7 @@ std::unique_ptr<cudf::column> generate_ngrams(cudf::strings_column_view const& s
   // compute the number of strings of ngrams
   auto const ngrams_count = strings_count - ngrams + 1;
 
-  auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
+  auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
     ngram_generator_fn{d_strings, ngrams, d_separator}, ngrams_count, stream, mr);
 
   // make the output strings column from the offsets and chars column
@@ -175,8 +176,9 @@ struct character_ngram_generator_fn {
   cudf::column_device_view const d_strings;
   cudf::size_type ngrams;
   cudf::size_type const* d_ngram_offsets{};
-  cudf::size_type* d_offsets{};
+  cudf::size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   __device__ void operator()(cudf::size_type idx)
   {
@@ -186,8 +188,8 @@ struct character_ngram_generator_fn {
     auto itr                = d_str.begin();
     auto const ngram_offset = d_ngram_offsets[idx];
     auto const ngram_count  = d_ngram_offsets[idx + 1] - ngram_offset;
-    auto d_sizes            = d_offsets + ngram_offset;
-    auto out_ptr            = d_chars ? d_chars + *d_sizes : nullptr;
+    auto d_output_sizes     = d_sizes + ngram_offset;
+    auto out_ptr            = d_chars ? d_chars + d_offsets[ngram_offset] : nullptr;
     for (cudf::size_type n = 0; n < ngram_count; ++n, ++itr) {
       auto const begin = itr.byte_offset();
       auto const end   = (itr + ngrams).byte_offset();
@@ -195,7 +197,7 @@ struct character_ngram_generator_fn {
         out_ptr =
           cudf::strings::detail::copy_and_increment(out_ptr, d_str.data() + begin, (end - begin));
       } else {
-        *d_sizes++ = end - begin;
+        *d_output_sizes++ = end - begin;
       }
     }
   }
@@ -233,7 +235,7 @@ std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_vie
                "Insufficient number of characters in each string to generate ngrams");
 
   character_ngram_generator_fn generator{*d_strings, ngrams, d_offsets};
-  auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
+  auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
     generator, strings_count, total_ngrams, stream, mr);
 
   auto output = cudf::make_strings_column(
diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu
index e5e72d3a33e..2f97eb1ce74 100644
--- a/cpp/src/text/normalize.cu
+++ b/cpp/src/text/normalize.cu
@@ -26,7 +26,7 @@
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
@@ -59,13 +59,14 @@ namespace {
  */
 struct normalize_spaces_fn {
   cudf::column_device_view const d_strings;  // strings to normalize
-  cudf::size_type* d_offsets{};              // offsets into d_chars
+  cudf::size_type* d_sizes{};                // size of each output row
   char* d_chars{};                           // output buffer for characters
+  cudf::detail::input_offsetalator d_offsets;
 
   __device__ void operator()(cudf::size_type idx)
   {
     if (d_strings.is_null(idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
     cudf::string_view const single_space(" ", 1);
@@ -93,7 +94,7 @@ struct normalize_spaces_fn {
       nbytes += token.size_bytes() + 1;  // token size plus a single space
     }
     // remove trailing space
-    if (!d_chars) d_offsets[idx] = (nbytes > 0) ? nbytes - 1 : 0;
+    if (!d_chars) { d_sizes[idx] = (nbytes > 0) ? nbytes - 1 : 0; }
   }
 };
 
@@ -109,8 +110,9 @@ struct codepoint_to_utf8_fn {
   cudf::column_device_view const d_strings;  // input strings
   uint32_t const* cp_data;                   // full code-point array
   int64_t const* d_cp_offsets{};             // offsets to each string's code-point array
-  cudf::size_type* d_offsets{};              // offsets for the output strings
+  cudf::size_type* d_sizes{};                // size of output string
   char* d_chars{};                           // buffer for the output strings column
+  cudf::detail::input_offsetalator d_offsets;
 
   /**
    * @brief Return the number of bytes for the output string given its code-point array.
@@ -133,14 +135,14 @@ struct codepoint_to_utf8_fn {
   __device__ void operator()(cudf::size_type idx)
   {
     if (d_strings.is_null(idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
     auto const offset = d_cp_offsets[idx];
     auto const count  = d_cp_offsets[idx + 1] - offset;  // number of code-points
     auto str_cps      = cp_data + offset;                // code-points for this string
     if (!d_chars) {
-      d_offsets[idx] = compute_output_size(str_cps, count);
+      d_sizes[idx] = compute_output_size(str_cps, count);
       return;
     }
     // convert each code-point to 1-4 UTF-8 encoded bytes
@@ -183,7 +185,7 @@ std::unique_ptr<cudf::column> normalize_spaces(cudf::strings_column_view const&
   auto d_strings = cudf::column_device_view::create(strings.parent(), stream);
 
   // build offsets and children using the normalize_space_fn
-  auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
+  auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
     normalize_spaces_fn{*d_strings}, strings.size(), stream, mr);
 
   return cudf::make_strings_column(strings.size(),
@@ -225,7 +227,7 @@ std::unique_ptr<cudf::column> normalize_characters(cudf::strings_column_view con
   auto d_strings = cudf::column_device_view::create(strings.parent(), stream);
 
   // build offsets and children using the codepoint_to_utf8_fn
-  auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
+  auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
     codepoint_to_utf8_fn{*d_strings, cp_chars, cp_offsets}, strings.size(), stream, mr);
 
   return cudf::make_strings_column(strings.size(),
diff --git a/cpp/src/text/replace.cu b/cpp/src/text/replace.cu
index f61fa544e73..f95b53a3ac8 100644
--- a/cpp/src/text/replace.cu
+++ b/cpp/src/text/replace.cu
@@ -21,7 +21,7 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -48,8 +48,9 @@ using replace_result = thrust::pair<bool, cudf::string_view>;
 struct base_token_replacer_fn {
   cudf::column_device_view const d_strings;  ///< strings to tokenize
   cudf::string_view const d_delimiter;       ///< delimiter characters for tokenizing
-  cudf::size_type* d_offsets{};              ///< for locating output string in d_chars
+  cudf::size_type* d_sizes{};                ///< for output string size
   char* d_chars{};                           ///< output buffer
+  cudf::detail::input_offsetalator d_offsets;
 
   /**
    * @brief Tokenizes each string and calls the provided `replacer` function
@@ -63,7 +64,7 @@ struct base_token_replacer_fn {
   __device__ void process_string(cudf::size_type idx, ReplaceFn replacer)
   {
     if (d_strings.is_null(idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
 
@@ -95,10 +96,11 @@ struct base_token_replacer_fn {
     }
 
     // copy the remainder of the string's bytes to the output buffer
-    if (out_ptr)
+    if (out_ptr) {
       memcpy(out_ptr, in_ptr + last_pos, d_str.size_bytes() - last_pos);
-    else
-      d_offsets[idx] = nbytes;
+    } else {
+      d_sizes[idx] = nbytes;
+    }
   }
 };
 
@@ -230,7 +232,7 @@ std::unique_ptr<cudf::column> replace_tokens(cudf::strings_column_view const& st
 
   // this utility calls replacer to build the offsets and chars columns
   auto [offsets_column, chars] =
-    cudf::strings::detail::make_strings_children(replacer, strings_count, stream, mr);
+    cudf::strings::detail::experimental::make_strings_children(replacer, strings_count, stream, mr);
 
   // return new strings column
   return cudf::make_strings_column(strings_count,
@@ -263,7 +265,7 @@ std::unique_ptr<cudf::column> filter_tokens(cudf::strings_column_view const& str
 
   // this utility calls filterer to build the offsets and chars columns
   auto [offsets_column, chars] =
-    cudf::strings::detail::make_strings_children(filterer, strings_count, stream, mr);
+    cudf::strings::detail::experimental::make_strings_children(filterer, strings_count, stream, mr);
 
   // return new strings column
   return cudf::make_strings_column(strings_count,
diff --git a/cpp/src/transform/one_hot_encode.cu b/cpp/src/transform/one_hot_encode.cu
index 570060b3870..723c306da1d 100644
--- a/cpp/src/transform/one_hot_encode.cu
+++ b/cpp/src/transform/one_hot_encode.cu
@@ -24,6 +24,7 @@
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
@@ -61,7 +62,9 @@ std::pair<std::unique_ptr<column>, table_view> one_hot_encode(column_view const&
                                                               rmm::cuda_stream_view stream,
                                                               rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(input.type() == categories.type(), "Mismatch type between input and categories.");
+  CUDF_EXPECTS(cudf::have_same_types(input, categories),
+               "Mismatch type between input and categories.",
+               cudf::data_type_error);
 
   if (categories.is_empty()) { return {make_empty_column(type_id::BOOL8), table_view{}}; }
 
diff --git a/cpp/src/utilities/type_checks.cpp b/cpp/src/utilities/type_checks.cpp
index d6f5c65593a..dac981fb532 100644
--- a/cpp/src/utilities/type_checks.cpp
+++ b/cpp/src/utilities/type_checks.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,6 +16,8 @@
 
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/lists/lists_column_view.hpp>
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_checks.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
@@ -28,15 +30,16 @@ namespace {
 
 struct columns_equal_fn {
   template <typename T>
-  bool operator()(column_view const&, column_view const&)
+  bool operator()(column_view const& lhs, column_view const& rhs)
   {
-    return true;
+    return lhs.type() == rhs.type();
   }
 };
 
 template <>
 bool columns_equal_fn::operator()<dictionary32>(column_view const& lhs, column_view const& rhs)
 {
+  if (not cudf::is_dictionary(rhs.type())) { return false; }
   auto const kidx = dictionary_column_view::keys_column_index;
   return lhs.num_children() > 0 and rhs.num_children() > 0
            ? lhs.child(kidx).type() == rhs.child(kidx).type()
@@ -46,33 +49,132 @@ bool columns_equal_fn::operator()<dictionary32>(column_view const& lhs, column_v
 template <>
 bool columns_equal_fn::operator()<list_view>(column_view const& lhs, column_view const& rhs)
 {
+  if (rhs.type().id() != type_id::LIST) { return false; }
   auto const& ci = lists_column_view::child_column_index;
-  return column_types_equal(lhs.child(ci), rhs.child(ci));
+  return have_same_types(lhs.child(ci), rhs.child(ci));
 }
 
 template <>
 bool columns_equal_fn::operator()<struct_view>(column_view const& lhs, column_view const& rhs)
 {
-  return lhs.num_children() == rhs.num_children() and
-         std::all_of(thrust::make_counting_iterator(0),
-                     thrust::make_counting_iterator(lhs.num_children()),
-                     [&](auto i) { return column_types_equal(lhs.child(i), rhs.child(i)); });
+  if (rhs.type().id() != type_id::STRUCT) { return false; }
+  return std::equal(lhs.child_begin(),
+                    lhs.child_end(),
+                    rhs.child_begin(),
+                    rhs.child_end(),
+                    [](auto const& lhs, auto const& rhs) { return have_same_types(lhs, rhs); });
+}
+
+struct column_scalar_equal_fn {
+  template <typename T>
+  bool operator()(column_view const& col, scalar const& slr)
+  {
+    return col.type() == slr.type();
+  }
+};
+
+template <>
+bool column_scalar_equal_fn::operator()<dictionary32>(column_view const& col, scalar const& slr)
+{
+  // It is not possible to have a scalar dictionary, so compare the dictionary
+  // column keys type to the scalar type.
+  auto col_keys = cudf::dictionary_column_view(col).keys();
+  return have_same_types(col_keys, slr);
+}
+
+template <>
+bool column_scalar_equal_fn::operator()<list_view>(column_view const& col, scalar const& slr)
+{
+  if (slr.type().id() != type_id::LIST) { return false; }
+  auto const& ci      = lists_column_view::child_column_index;
+  auto const list_slr = static_cast<list_scalar const*>(&slr);
+  return have_same_types(col.child(ci), list_slr->view());
+}
+
+template <>
+bool column_scalar_equal_fn::operator()<struct_view>(column_view const& col, scalar const& slr)
+{
+  if (slr.type().id() != type_id::STRUCT) { return false; }
+  auto const struct_slr = static_cast<struct_scalar const*>(&slr);
+  auto const slr_tbl    = struct_slr->view();
+  return std::equal(col.child_begin(),
+                    col.child_end(),
+                    slr_tbl.begin(),
+                    slr_tbl.end(),
+                    [](auto const& lhs, auto const& rhs) { return have_same_types(lhs, rhs); });
+}
+
+struct scalars_equal_fn {
+  template <typename T>
+  bool operator()(scalar const& lhs, scalar const& rhs)
+  {
+    return lhs.type() == rhs.type();
+  }
+};
+
+template <>
+bool scalars_equal_fn::operator()<list_view>(scalar const& lhs, scalar const& rhs)
+{
+  if (rhs.type().id() != type_id::LIST) { return false; }
+  auto const list_lhs = static_cast<list_scalar const*>(&lhs);
+  auto const list_rhs = static_cast<list_scalar const*>(&rhs);
+  return have_same_types(list_lhs->view(), list_rhs->view());
+}
+
+template <>
+bool scalars_equal_fn::operator()<struct_view>(scalar const& lhs, scalar const& rhs)
+{
+  if (rhs.type().id() != type_id::STRUCT) { return false; }
+  auto const tbl_lhs = static_cast<struct_scalar const*>(&lhs)->view();
+  auto const tbl_rhs = static_cast<struct_scalar const*>(&rhs)->view();
+  return have_same_types(tbl_lhs, tbl_rhs);
 }
 
 };  // namespace
 
 // Implementation note: avoid using double dispatch for this function
 // as it increases code paths to NxN for N types.
-bool column_types_equal(column_view const& lhs, column_view const& rhs)
+bool have_same_types(column_view const& lhs, column_view const& rhs)
 {
-  if (lhs.type() != rhs.type()) { return false; }
   return type_dispatcher(lhs.type(), columns_equal_fn{}, lhs, rhs);
 }
 
+bool column_types_equal(column_view const& lhs, column_view const& rhs)
+{
+  return have_same_types(lhs, rhs);
+}
+
+bool have_same_types(column_view const& lhs, scalar const& rhs)
+{
+  return type_dispatcher(lhs.type(), column_scalar_equal_fn{}, lhs, rhs);
+}
+
+bool have_same_types(scalar const& lhs, column_view const& rhs)
+{
+  return have_same_types(rhs, lhs);
+}
+
+bool have_same_types(scalar const& lhs, scalar const& rhs)
+{
+  return type_dispatcher(lhs.type(), scalars_equal_fn{}, lhs, rhs);
+}
+
+bool have_same_types(table_view const& lhs, table_view const& rhs)
+{
+  return std::equal(
+    lhs.begin(),
+    lhs.end(),
+    rhs.begin(),
+    rhs.end(),
+    [](column_view const& lcol, column_view const& rcol) { return have_same_types(lcol, rcol); });
+}
+
 bool column_types_equivalent(column_view const& lhs, column_view const& rhs)
 {
-  if (lhs.type().id() != rhs.type().id()) { return false; }
-  return type_dispatcher(lhs.type(), columns_equal_fn{}, lhs, rhs);
+  // Check if the columns have fixed point types. This is the only case where
+  // type equality and equivalence differ.
+  if (cudf::is_fixed_point(lhs.type())) { return lhs.type().id() == rhs.type().id(); }
+  return have_same_types(lhs, rhs);
 }
 
 }  // namespace cudf
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 6c56d82007a..bbb919aa2d1 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -294,7 +294,7 @@ ConfigureTest(
   PERCENT 30
 )
 ConfigureTest(
-  ORC_TEST io/orc_test.cpp
+  ORC_TEST io/orc_chunked_reader_test.cu io/orc_test.cpp
   GPUS 1
   PERCENT 30
 )
@@ -572,7 +572,7 @@ ConfigureTest(
 # * large strings test ----------------------------------------------------------------------------
 ConfigureTest(
   LARGE_STRINGS_TEST large_strings/large_strings_fixture.cpp large_strings/merge_tests.cpp
-  large_strings/concatenate_tests.cpp
+  large_strings/concatenate_tests.cpp large_strings/parquet_tests.cpp
   GPUS 1
   PERCENT 100
 )
diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp
index c2d1e1d9f4f..a9bf22682cf 100644
--- a/cpp/tests/copying/concatenate_tests.cpp
+++ b/cpp/tests/copying/concatenate_tests.cpp
@@ -31,6 +31,7 @@
 #include <cudf/filling.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
 
 #include <thrust/iterator/constant_iterator.h>
 
@@ -1226,7 +1227,7 @@ TEST_F(ListsColumnTest, ConcatenateMismatchedHierarchies)
     cudf::test::lists_column_wrapper<int> b{{{LCW{}}}};
     cudf::test::lists_column_wrapper<int> c{{LCW{}}};
 
-    EXPECT_THROW(cudf::concatenate(std::vector<column_view>({a, b, c})), cudf::logic_error);
+    EXPECT_THROW(cudf::concatenate(std::vector<column_view>({a, b, c})), cudf::data_type_error);
   }
 
   {
@@ -1235,7 +1236,7 @@ TEST_F(ListsColumnTest, ConcatenateMismatchedHierarchies)
     cudf::test::lists_column_wrapper<int> b{{{LCW{}}}};
     cudf::test::lists_column_wrapper<int> c{{LCW{}}};
 
-    EXPECT_THROW(cudf::concatenate(std::vector<column_view>({a, b, c})), cudf::logic_error);
+    EXPECT_THROW(cudf::concatenate(std::vector<column_view>({a, b, c})), cudf::data_type_error);
   }
 
   {
@@ -1243,14 +1244,14 @@ TEST_F(ListsColumnTest, ConcatenateMismatchedHierarchies)
     cudf::test::lists_column_wrapper<int> b{1, 2, 3};
     cudf::test::lists_column_wrapper<int> c{{3, 4, 5}};
 
-    EXPECT_THROW(cudf::concatenate(std::vector<column_view>({a, b, c})), cudf::logic_error);
+    EXPECT_THROW(cudf::concatenate(std::vector<column_view>({a, b, c})), cudf::data_type_error);
   }
 
   {
     cudf::test::lists_column_wrapper<int> a{{{1, 2, 3}}};
     cudf::test::lists_column_wrapper<int> b{{4, 5}};
 
-    EXPECT_THROW(cudf::concatenate(std::vector<column_view>({a, b})), cudf::logic_error);
+    EXPECT_THROW(cudf::concatenate(std::vector<column_view>({a, b})), cudf::data_type_error);
   }
 }
 
@@ -1605,7 +1606,7 @@ TEST_F(FixedPointTest, FixedPointScaleMismatch)
   auto const b = fp_wrapper(vec.begin() + 300, vec.begin() + 700, scale_type{-2});
   auto const c = fp_wrapper(vec.begin() + 700, vec.end(), /*****/ scale_type{-3});
 
-  EXPECT_THROW(cudf::concatenate(std::vector<cudf::column_view>{a, b, c}), cudf::logic_error);
+  EXPECT_THROW(cudf::concatenate(std::vector<cudf::column_view>{a, b, c}), cudf::data_type_error);
 }
 
 struct DictionaryConcatTest : public cudf::test::BaseFixture {};
@@ -1650,7 +1651,7 @@ TEST_F(DictionaryConcatTest, ErrorsTest)
   cudf::test::fixed_width_column_wrapper<int32_t> integers({10, 30, 20});
   auto dictionary2 = cudf::dictionary::encode(integers);
   std::vector<cudf::column_view> views({dictionary1->view(), dictionary2->view()});
-  EXPECT_THROW(cudf::concatenate(views), cudf::logic_error);
+  EXPECT_THROW(cudf::concatenate(views), cudf::data_type_error);
   std::vector<cudf::column_view> empty;
   EXPECT_THROW(cudf::concatenate(empty), cudf::logic_error);
 }
diff --git a/cpp/tests/copying/copy_range_tests.cpp b/cpp/tests/copying/copy_range_tests.cpp
index bcc0ac29b3e..223946ddcee 100644
--- a/cpp/tests/copying/copy_range_tests.cpp
+++ b/cpp/tests/copying/copy_range_tests.cpp
@@ -465,7 +465,7 @@ TEST_F(CopyRangeErrorTestFixture, DTypeMismatch)
   auto dict_target = cudf::dictionary::encode(target);
   auto dict_source = cudf::dictionary::encode(source);
   EXPECT_THROW(cudf::copy_range(dict_source->view(), dict_target->view(), 0, 100, 0),
-               cudf::logic_error);
+               cudf::data_type_error);
 }
 
 template <typename T>
diff --git a/cpp/tests/copying/copy_tests.cpp b/cpp/tests/copying/copy_tests.cpp
index 138e1935363..f31d8d6f79a 100644
--- a/cpp/tests/copying/copy_tests.cpp
+++ b/cpp/tests/copying/copy_tests.cpp
@@ -712,7 +712,7 @@ TEST_F(DictionaryCopyIfElseTest, TypeMismatch)
   cudf::test::dictionary_column_wrapper<double> input2({1.0, 1.0, 1.0, 1.0});
   cudf::test::fixed_width_column_wrapper<bool> mask({1, 0, 0, 1});
 
-  EXPECT_THROW(cudf::copy_if_else(input1, input2, mask), cudf::logic_error);
+  EXPECT_THROW(cudf::copy_if_else(input1, input2, mask), cudf::data_type_error);
 
   cudf::string_scalar input3{"1"};
   EXPECT_THROW(cudf::copy_if_else(input1, input3, mask), cudf::data_type_error);
diff --git a/cpp/tests/copying/get_value_tests.cpp b/cpp/tests/copying/get_value_tests.cpp
index 2be3c26af1d..99b86c86997 100644
--- a/cpp/tests/copying/get_value_tests.cpp
+++ b/cpp/tests/copying/get_value_tests.cpp
@@ -542,11 +542,6 @@ struct ListGetStructValueTest : public cudf::test::BaseFixture {
     return SCW{{field1, field2, field3}, mask};
   }
 
-  /**
-   * @brief Create a 0-length structs column
-   */
-  SCW zero_length_struct() { return SCW{}; }
-
   /**
    * @brief Concatenate structs columns, allow specifying inputs in `initializer_list`
    */
@@ -653,7 +648,7 @@ TYPED_TEST(ListGetStructValueTest, NonNestedGetNonNullEmpty)
   cudf::size_type index = 2;
   // For well-formed list column, an empty list still holds the complete structure of
   // a 0-length structs column
-  auto expected_data = this->zero_length_struct();
+  auto expected_data = this->make_test_structs_column({}, {}, {}, no_nulls());
 
   auto s       = cudf::get_element(list_column->view(), index);
   auto typed_s = static_cast<cudf::list_scalar const*>(s.get());
@@ -757,8 +752,8 @@ TYPED_TEST(ListGetStructValueTest, NestedGetNonNullEmpty)
   auto list_column_nested =
     this->make_test_lists_column(3, {0, 1, 1, 2}, std::move(list_column), {1, 1, 1});
 
-  auto expected_data =
-    this->make_test_lists_column(0, {0}, this->zero_length_struct().release(), {});
+  auto expected_data = this->make_test_lists_column(
+    0, {0}, this->make_test_structs_column({}, {}, {}, no_nulls()).release(), {});
 
   cudf::size_type index = 1;
   auto s                = cudf::get_element(list_column_nested->view(), index);
diff --git a/cpp/tests/dictionary/add_keys_test.cpp b/cpp/tests/dictionary/add_keys_test.cpp
index 1314375f383..46bf5468922 100644
--- a/cpp/tests/dictionary/add_keys_test.cpp
+++ b/cpp/tests/dictionary/add_keys_test.cpp
@@ -22,6 +22,7 @@
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/dictionary/encode.hpp>
 #include <cudf/dictionary/update_keys.hpp>
+#include <cudf/utilities/error.hpp>
 
 #include <vector>
 
@@ -83,7 +84,7 @@ TEST_F(DictionaryAddKeysTest, Errors)
   auto dictionary = cudf::dictionary::encode(input);
 
   cudf::test::fixed_width_column_wrapper<float> new_keys{1.0, 2.0, 3.0};
-  EXPECT_THROW(cudf::dictionary::add_keys(dictionary->view(), new_keys), cudf::logic_error);
+  EXPECT_THROW(cudf::dictionary::add_keys(dictionary->view(), new_keys), cudf::data_type_error);
   cudf::test::fixed_width_column_wrapper<int64_t> null_keys{{1, 2, 3}, {1, 0, 1}};
   EXPECT_THROW(cudf::dictionary::add_keys(dictionary->view(), null_keys), cudf::logic_error);
 }
diff --git a/cpp/tests/dictionary/remove_keys_test.cpp b/cpp/tests/dictionary/remove_keys_test.cpp
index 13fe3efd0f4..9950a39d630 100644
--- a/cpp/tests/dictionary/remove_keys_test.cpp
+++ b/cpp/tests/dictionary/remove_keys_test.cpp
@@ -22,6 +22,7 @@
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/dictionary/encode.hpp>
 #include <cudf/dictionary/update_keys.hpp>
+#include <cudf/utilities/error.hpp>
 
 #include <thrust/iterator/transform_iterator.h>
 
@@ -119,7 +120,7 @@ TEST_F(DictionaryRemoveKeysTest, Errors)
   auto const dictionary = cudf::dictionary::encode(input);
 
   cudf::test::fixed_width_column_wrapper<float> del_keys{1.0, 2.0, 3.0};
-  EXPECT_THROW(cudf::dictionary::remove_keys(dictionary->view(), del_keys), cudf::logic_error);
+  EXPECT_THROW(cudf::dictionary::remove_keys(dictionary->view(), del_keys), cudf::data_type_error);
   cudf::test::fixed_width_column_wrapper<int64_t> null_keys{{1, 2, 3}, {1, 0, 1}};
   EXPECT_THROW(cudf::dictionary::remove_keys(dictionary->view(), null_keys), cudf::logic_error);
 }
diff --git a/cpp/tests/dictionary/scatter_test.cpp b/cpp/tests/dictionary/scatter_test.cpp
index 2a2841827d0..2f77f4ee621 100644
--- a/cpp/tests/dictionary/scatter_test.cpp
+++ b/cpp/tests/dictionary/scatter_test.cpp
@@ -141,5 +141,5 @@ TEST_F(DictionaryScatterTest, Error)
   EXPECT_THROW(
     cudf::scatter(
       cudf::table_view{{source->view()}}, scatter_map, cudf::table_view{{target->view()}}),
-    cudf::logic_error);
+    cudf::data_type_error);
 }
diff --git a/cpp/tests/dictionary/search_test.cpp b/cpp/tests/dictionary/search_test.cpp
index 600d00ac186..b49b4ce5aa0 100644
--- a/cpp/tests/dictionary/search_test.cpp
+++ b/cpp/tests/dictionary/search_test.cpp
@@ -77,9 +77,9 @@ TEST_F(DictionarySearchTest, Errors)
 {
   cudf::test::dictionary_column_wrapper<int64_t> dictionary({1, 2, 3});
   cudf::numeric_scalar<double> key(7);
-  EXPECT_THROW(cudf::dictionary::get_index(dictionary, key), cudf::logic_error);
+  EXPECT_THROW(cudf::dictionary::get_index(dictionary, key), cudf::data_type_error);
   EXPECT_THROW(
     cudf::dictionary::detail::get_insert_index(
       dictionary, key, cudf::get_default_stream(), rmm::mr::get_current_device_resource()),
-    cudf::logic_error);
+    cudf::data_type_error);
 }
diff --git a/cpp/tests/dictionary/set_keys_test.cpp b/cpp/tests/dictionary/set_keys_test.cpp
index d0c37493cf8..5c9ec3567fe 100644
--- a/cpp/tests/dictionary/set_keys_test.cpp
+++ b/cpp/tests/dictionary/set_keys_test.cpp
@@ -21,6 +21,7 @@
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/dictionary/encode.hpp>
 #include <cudf/dictionary/update_keys.hpp>
+#include <cudf/utilities/error.hpp>
 
 #include <thrust/iterator/transform_iterator.h>
 
@@ -82,7 +83,7 @@ TEST_F(DictionarySetKeysTest, Errors)
   auto dictionary = cudf::dictionary::encode(input);
 
   cudf::test::fixed_width_column_wrapper<float> new_keys{1.0, 2.0, 3.0};
-  EXPECT_THROW(cudf::dictionary::set_keys(dictionary->view(), new_keys), cudf::logic_error);
+  EXPECT_THROW(cudf::dictionary::set_keys(dictionary->view(), new_keys), cudf::data_type_error);
   cudf::test::fixed_width_column_wrapper<int64_t> null_keys{{1, 2, 3}, {1, 0, 1}};
   EXPECT_THROW(cudf::dictionary::set_keys(dictionary->view(), null_keys), cudf::logic_error);
 }
diff --git a/cpp/tests/filling/fill_tests.cpp b/cpp/tests/filling/fill_tests.cpp
index 95a27defa4e..26badefe698 100644
--- a/cpp/tests/filling/fill_tests.cpp
+++ b/cpp/tests/filling/fill_tests.cpp
@@ -359,8 +359,8 @@ TEST_F(FillErrorTestFixture, DTypeMismatch)
 
   auto destination_view = cudf::mutable_column_view{destination};
 
-  EXPECT_THROW(cudf::fill_in_place(destination_view, 0, 10, *p_val), cudf::logic_error);
-  EXPECT_THROW(auto p_ret = cudf::fill(destination, 0, 10, *p_val), cudf::logic_error);
+  EXPECT_THROW(cudf::fill_in_place(destination_view, 0, 10, *p_val), cudf::data_type_error);
+  EXPECT_THROW(auto p_ret = cudf::fill(destination, 0, 10, *p_val), cudf::data_type_error);
 }
 
 template <typename T>
diff --git a/cpp/tests/filling/sequence_tests.cpp b/cpp/tests/filling/sequence_tests.cpp
index cf619aace5a..5651a26f192 100644
--- a/cpp/tests/filling/sequence_tests.cpp
+++ b/cpp/tests/filling/sequence_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -102,15 +102,15 @@ TEST_F(SequenceTestFixture, MismatchedInputs)
 {
   cudf::numeric_scalar<int> init(0);
   cudf::numeric_scalar<float> step(-5);
-  EXPECT_THROW(cudf::sequence(10, init, step), cudf::logic_error);
+  EXPECT_THROW(cudf::sequence(10, init, step), cudf::data_type_error);
 
   cudf::numeric_scalar<int> init2(0);
   cudf::numeric_scalar<int8_t> step2(-5);
-  EXPECT_THROW(cudf::sequence(10, init2, step2), cudf::logic_error);
+  EXPECT_THROW(cudf::sequence(10, init2, step2), cudf::data_type_error);
 
   cudf::numeric_scalar<float> init3(0);
   cudf::numeric_scalar<double> step3(-5);
-  EXPECT_THROW(cudf::sequence(10, init3, step3), cudf::logic_error);
+  EXPECT_THROW(cudf::sequence(10, init3, step3), cudf::data_type_error);
 }
 
 TYPED_TEST(SequenceTypedTestFixture, DefaultStep)
diff --git a/cpp/tests/groupby/shift_tests.cpp b/cpp/tests/groupby/shift_tests.cpp
index d2ecb667eca..1a6abf2e734 100644
--- a/cpp/tests/groupby/shift_tests.cpp
+++ b/cpp/tests/groupby/shift_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -507,7 +507,7 @@ TEST_F(groupby_shift_fixed_point_type_test, MismatchScaleType)
 
   EXPECT_THROW(test_groupby_shift_multi(
                  key, cudf::table_view{{v1}}, offset, {*slr1}, cudf::table_view{{stub}}),
-               cudf::logic_error);
+               cudf::data_type_error);
 }
 
 TEST_F(groupby_shift_fixed_point_type_test, MismatchRepType)
@@ -525,5 +525,5 @@ TEST_F(groupby_shift_fixed_point_type_test, MismatchRepType)
 
   EXPECT_THROW(test_groupby_shift_multi(
                  key, cudf::table_view{{v1}}, offset, {*slr1}, cudf::table_view{{stub}}),
-               cudf::logic_error);
+               cudf::data_type_error);
 }
diff --git a/cpp/tests/interop/dlpack_test.cpp b/cpp/tests/interop/dlpack_test.cpp
index 895887ee348..ecc8558243d 100644
--- a/cpp/tests/interop/dlpack_test.cpp
+++ b/cpp/tests/interop/dlpack_test.cpp
@@ -20,6 +20,7 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/interop.hpp>
+#include <cudf/utilities/error.hpp>
 
 #include <thrust/host_vector.h>
 
@@ -98,7 +99,7 @@ TEST_F(DLPackUntypedTests, MultipleTypesToDlpack)
   cudf::test::fixed_width_column_wrapper<int16_t> col1({1, 2, 3, 4});
   cudf::test::fixed_width_column_wrapper<int32_t> col2({1, 2, 3, 4});
   cudf::table_view input({col1, col2});
-  EXPECT_THROW(cudf::to_dlpack(input), cudf::logic_error);
+  EXPECT_THROW(cudf::to_dlpack(input), cudf::data_type_error);
 }
 
 TEST_F(DLPackUntypedTests, InvalidNullsToDlpack)
diff --git a/cpp/tests/interop/from_arrow_device_test.cpp b/cpp/tests/interop/from_arrow_device_test.cpp
index 95cbe8057d1..66bd4dd1bfb 100644
--- a/cpp/tests/interop/from_arrow_device_test.cpp
+++ b/cpp/tests/interop/from_arrow_device_test.cpp
@@ -100,22 +100,26 @@ TEST_F(FromArrowDeviceTest, DateTimeTable)
 
   nanoarrow::UniqueSchema input_schema;
   ArrowSchemaInit(input_schema.get());
-  ArrowSchemaSetTypeStruct(input_schema.get(), 1);
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1));
   ArrowSchemaInit(input_schema->children[0]);
-  ArrowSchemaSetTypeDateTime(
-    input_schema->children[0], NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MILLI, nullptr);
-  ArrowSchemaSetName(input_schema->children[0], "a");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDateTime(
+    input_schema->children[0], NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MILLI, nullptr));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a"));
 
   nanoarrow::UniqueArray input_array;
-  ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr);
+  NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr));
   input_array->length                  = 6;
   input_array->null_count              = 0;
   input_array->children[0]->length     = 6;
   input_array->children[0]->null_count = 0;
-  ArrowBufferSetAllocator(ArrowArrayBuffer(input_array->children[0], 1), noop_alloc);
+  NANOARROW_THROW_NOT_OK(
+    ArrowBufferSetAllocator(ArrowArrayBuffer(input_array->children[0], 1), noop_alloc));
   ArrowArrayBuffer(input_array->children[0], 1)->data =
     const_cast<uint8_t*>(cudf::column_view(col).data<uint8_t>());
-  ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, nullptr);
+  ArrowArrayBuffer(input_array->children[0], 1)->size_bytes =
+    sizeof(int64_t) * cudf::column_view(col).size();
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, nullptr));
 
   ArrowDeviceArray input_device_array;
   input_device_array.device_id   = rmm::get_current_cuda_device().value();
@@ -155,23 +159,27 @@ TYPED_TEST(FromArrowDeviceTestDurationsTest, DurationTable)
 
   nanoarrow::UniqueSchema input_schema;
   ArrowSchemaInit(input_schema.get());
-  ArrowSchemaSetTypeStruct(input_schema.get(), 1);
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1));
 
   ArrowSchemaInit(input_schema->children[0]);
-  ArrowSchemaSetTypeDateTime(
-    input_schema->children[0], NANOARROW_TYPE_DURATION, time_unit, nullptr);
-  ArrowSchemaSetName(input_schema->children[0], "a");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDateTime(
+    input_schema->children[0], NANOARROW_TYPE_DURATION, time_unit, nullptr));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a"));
 
-  auto data_ptr = expected_table_view.column(0).data<uint8_t>();
+  auto data_ptr  = expected_table_view.column(0).data<uint8_t>();
+  auto data_size = expected_table_view.column(0).size();
   nanoarrow::UniqueArray input_array;
-  ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr);
+  NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr));
   input_array->length                  = expected_table_view.num_rows();
   input_array->null_count              = 0;
   input_array->children[0]->length     = expected_table_view.num_rows();
   input_array->children[0]->null_count = 0;
-  ArrowBufferSetAllocator(ArrowArrayBuffer(input_array->children[0], 1), noop_alloc);
-  ArrowArrayBuffer(input_array->children[0], 1)->data = const_cast<uint8_t*>(data_ptr);
-  ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, nullptr);
+  NANOARROW_THROW_NOT_OK(
+    ArrowBufferSetAllocator(ArrowArrayBuffer(input_array->children[0], 1), noop_alloc));
+  ArrowArrayBuffer(input_array->children[0], 1)->data       = const_cast<uint8_t*>(data_ptr);
+  ArrowArrayBuffer(input_array->children[0], 1)->size_bytes = sizeof(T) * data_size;
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, nullptr));
 
   ArrowDeviceArray input_device_array;
   input_device_array.device_id   = rmm::get_current_cuda_device().value();
@@ -199,19 +207,21 @@ TEST_F(FromArrowDeviceTest, NestedList)
 
   nanoarrow::UniqueSchema input_schema;
   ArrowSchemaInit(input_schema.get());
-  ArrowSchemaSetTypeStruct(input_schema.get(), 1);
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1));
 
-  ArrowSchemaInitFromType(input_schema->children[0], NANOARROW_TYPE_LIST);
-  ArrowSchemaSetName(input_schema->children[0], "a");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[0], NANOARROW_TYPE_LIST));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a"));
   input_schema->children[0]->flags = ARROW_FLAG_NULLABLE;
 
-  ArrowSchemaInitFromType(input_schema->children[0]->children[0], NANOARROW_TYPE_LIST);
-  ArrowSchemaSetName(input_schema->children[0]->children[0], "element");
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(input_schema->children[0]->children[0], NANOARROW_TYPE_LIST));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0]->children[0], "element"));
   input_schema->children[0]->children[0]->flags = 0;
 
-  ArrowSchemaInitFromType(input_schema->children[0]->children[0]->children[0],
-                          NANOARROW_TYPE_INT64);
-  ArrowSchemaSetName(input_schema->children[0]->children[0]->children[0], "element");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(
+    input_schema->children[0]->children[0]->children[0], NANOARROW_TYPE_INT64));
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaSetName(input_schema->children[0]->children[0]->children[0], "element"));
   input_schema->children[0]->children[0]->children[0]->flags = ARROW_FLAG_NULLABLE;
 
   nanoarrow::UniqueArray input_array;
@@ -223,7 +233,8 @@ TEST_F(FromArrowDeviceTest, NestedList)
   cudf::lists_column_view nested_view{lview.child()};
   populate_list_from_col(top_list->children[0], nested_view);
   populate_from_col<int64_t>(top_list->children[0]->children[0], nested_view.child());
-  ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr);
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr));
 
   ArrowDeviceArray input_device_array;
   input_device_array.device_id   = rmm::get_current_cuda_device().value();
@@ -287,47 +298,52 @@ TEST_F(FromArrowDeviceTest, StructColumn)
 
   nanoarrow::UniqueSchema input_schema;
   ArrowSchemaInit(input_schema.get());
-  ArrowSchemaSetTypeStruct(input_schema.get(), 1);
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1));
 
   ArrowSchemaInit(input_schema->children[0]);
-  ArrowSchemaSetTypeStruct(input_schema->children[0], 5);
-  ArrowSchemaSetName(input_schema->children[0], "a");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema->children[0], 5));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a"));
   input_schema->children[0]->flags = 0;
 
   auto child = input_schema->children[0];
-  ArrowSchemaInitFromType(child->children[0], NANOARROW_TYPE_STRING);
-  ArrowSchemaSetName(child->children[0], "string");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[0], NANOARROW_TYPE_STRING));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[0], "string"));
   child->children[0]->flags = 0;
 
-  ArrowSchemaInitFromType(child->children[1], NANOARROW_TYPE_INT32);
-  ArrowSchemaSetName(child->children[1], "integral");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[1], NANOARROW_TYPE_INT32));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[1], "integral"));
   child->children[1]->flags = 0;
 
-  ArrowSchemaInitFromType(child->children[2], NANOARROW_TYPE_BOOL);
-  ArrowSchemaSetName(child->children[2], "bool");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[2], NANOARROW_TYPE_BOOL));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[2], "bool"));
   child->children[2]->flags = 0;
 
-  ArrowSchemaInitFromType(child->children[3], NANOARROW_TYPE_LIST);
-  ArrowSchemaSetName(child->children[3], "nested_list");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[3], NANOARROW_TYPE_LIST));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[3], "nested_list"));
   child->children[3]->flags = 0;
-  ArrowSchemaInitFromType(child->children[3]->children[0], NANOARROW_TYPE_LIST);
-  ArrowSchemaSetName(child->children[3]->children[0], "element");
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(child->children[3]->children[0], NANOARROW_TYPE_LIST));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[3]->children[0], "element"));
   child->children[3]->children[0]->flags = 0;
-  ArrowSchemaInitFromType(child->children[3]->children[0]->children[0], NANOARROW_TYPE_INT64);
-  ArrowSchemaSetName(child->children[3]->children[0]->children[0], "element");
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(child->children[3]->children[0]->children[0], NANOARROW_TYPE_INT64));
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaSetName(child->children[3]->children[0]->children[0], "element"));
   child->children[3]->children[0]->children[0]->flags = 0;
 
   ArrowSchemaInit(child->children[4]);
-  ArrowSchemaSetTypeStruct(child->children[4], 2);
-  ArrowSchemaSetName(child->children[4], "struct");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(child->children[4], 2));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4], "struct"));
 
-  ArrowSchemaInitFromType(child->children[4]->children[0], NANOARROW_TYPE_STRING);
-  ArrowSchemaSetName(child->children[4]->children[0], "string2");
-  ArrowSchemaInitFromType(child->children[4]->children[1], NANOARROW_TYPE_INT32);
-  ArrowSchemaSetName(child->children[4]->children[1], "integral2");
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(child->children[4]->children[0], NANOARROW_TYPE_STRING));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4]->children[0], "string2"));
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(child->children[4]->children[1], NANOARROW_TYPE_INT32));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4]->children[1], "integral2"));
 
   nanoarrow::UniqueArray input_array;
-  ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr);
+  NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr));
 
   input_array->length = expected_table_view.num_rows();
 
@@ -336,7 +352,7 @@ TEST_F(FromArrowDeviceTest, StructColumn)
   array_a->length     = view_a.size();
   array_a->null_count = view_a.null_count();
 
-  ArrowBufferSetAllocator(ArrowArrayBuffer(array_a, 0), noop_alloc);
+  NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(array_a, 0), noop_alloc));
   ArrowArrayValidityBitmap(array_a)->buffer.data =
     const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(view_a.null_mask()));
 
@@ -354,14 +370,15 @@ TEST_F(FromArrowDeviceTest, StructColumn)
   array_struct->length     = view_struct.size();
   array_struct->null_count = view_struct.null_count();
 
-  ArrowBufferSetAllocator(ArrowArrayBuffer(array_struct, 0), noop_alloc);
+  NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(array_struct, 0), noop_alloc));
   ArrowArrayValidityBitmap(array_struct)->buffer.data =
     const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(view_struct.null_mask()));
 
   populate_from_col<cudf::string_view>(array_struct->children[0], view_struct.child(0));
   populate_from_col<int32_t>(array_struct->children[1], view_struct.child(1));
 
-  ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr);
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr));
 
   ArrowDeviceArray input_device_array;
   input_device_array.device_id   = rmm::get_current_cuda_device().value();
@@ -406,25 +423,28 @@ TEST_F(FromArrowDeviceTest, DictionaryIndicesType)
 
   nanoarrow::UniqueSchema input_schema;
   ArrowSchemaInit(input_schema.get());
-  ArrowSchemaSetTypeStruct(input_schema.get(), 3);
-
-  ArrowSchemaInitFromType(input_schema->children[0], NANOARROW_TYPE_INT8);
-  ArrowSchemaSetName(input_schema->children[0], "a");
-  ArrowSchemaAllocateDictionary(input_schema->children[0]);
-  ArrowSchemaInitFromType(input_schema->children[0]->dictionary, NANOARROW_TYPE_INT64);
-
-  ArrowSchemaInitFromType(input_schema->children[1], NANOARROW_TYPE_INT16);
-  ArrowSchemaSetName(input_schema->children[1], "b");
-  ArrowSchemaAllocateDictionary(input_schema->children[1]);
-  ArrowSchemaInitFromType(input_schema->children[1]->dictionary, NANOARROW_TYPE_INT64);
-
-  ArrowSchemaInitFromType(input_schema->children[2], NANOARROW_TYPE_INT64);
-  ArrowSchemaSetName(input_schema->children[2], "c");
-  ArrowSchemaAllocateDictionary(input_schema->children[2]);
-  ArrowSchemaInitFromType(input_schema->children[2]->dictionary, NANOARROW_TYPE_INT64);
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 3));
+
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[0], NANOARROW_TYPE_INT8));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a"));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(input_schema->children[0]));
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(input_schema->children[0]->dictionary, NANOARROW_TYPE_INT64));
+
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[1], NANOARROW_TYPE_INT16));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[1], "b"));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(input_schema->children[1]));
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(input_schema->children[1]->dictionary, NANOARROW_TYPE_INT64));
+
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[2], NANOARROW_TYPE_INT64));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[2], "c"));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(input_schema->children[2]));
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(input_schema->children[2]->dictionary, NANOARROW_TYPE_INT64));
 
   nanoarrow::UniqueArray input_array;
-  ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr);
+  NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr));
   input_array->length     = expected_table.num_rows();
   input_array->null_count = 0;
 
@@ -446,7 +466,8 @@ TEST_F(FromArrowDeviceTest, DictionaryIndicesType)
   populate_from_col<int64_t>(input_array->children[2]->dictionary,
                              cudf::dictionary_column_view{expected_table_view.column(2)}.keys());
 
-  ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr);
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr));
 
   ArrowDeviceArray input_device_array;
   input_device_array.device_id   = rmm::get_current_cuda_device().value();
@@ -562,20 +583,22 @@ TEST_F(FromArrowDeviceTest, FixedPoint128Table)
 
     nanoarrow::UniqueSchema input_schema;
     ArrowSchemaInit(input_schema.get());
-    ArrowSchemaSetTypeStruct(input_schema.get(), 1);
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1));
     ArrowSchemaInit(input_schema->children[0]);
-    ArrowSchemaSetTypeDecimal(input_schema->children[0],
-                              NANOARROW_TYPE_DECIMAL128,
-                              cudf::detail::max_precision<__int128_t>(),
-                              -scale);
-    ArrowSchemaSetName(input_schema->children[0], "a");
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(input_schema->children[0],
+                                                     NANOARROW_TYPE_DECIMAL128,
+                                                     cudf::detail::max_precision<__int128_t>(),
+                                                     -scale));
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a"));
 
     nanoarrow::UniqueArray input_array;
-    ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr);
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr));
     input_array->length = expected.num_rows();
 
     populate_from_col<__int128_t>(input_array->children[0], expected.column(0));
-    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr);
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr));
 
     ArrowDeviceArray input_device_array;
     input_device_array.device_id   = rmm::get_current_cuda_device().value();
@@ -607,20 +630,22 @@ TEST_F(FromArrowDeviceTest, FixedPoint128TableLarge)
 
     nanoarrow::UniqueSchema input_schema;
     ArrowSchemaInit(input_schema.get());
-    ArrowSchemaSetTypeStruct(input_schema.get(), 1);
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1));
     ArrowSchemaInit(input_schema->children[0]);
-    ArrowSchemaSetTypeDecimal(input_schema->children[0],
-                              NANOARROW_TYPE_DECIMAL128,
-                              cudf::detail::max_precision<__int128_t>(),
-                              -scale);
-    ArrowSchemaSetName(input_schema->children[0], "a");
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(input_schema->children[0],
+                                                     NANOARROW_TYPE_DECIMAL128,
+                                                     cudf::detail::max_precision<__int128_t>(),
+                                                     -scale));
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a"));
 
     nanoarrow::UniqueArray input_array;
-    ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr);
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr));
     input_array->length = expected.num_rows();
 
     populate_from_col<__int128_t>(input_array->children[0], expected.column(0));
-    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr);
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr));
 
     ArrowDeviceArray input_device_array;
     input_device_array.device_id   = rmm::get_current_cuda_device().value();
@@ -652,20 +677,22 @@ TEST_F(FromArrowDeviceTest, FixedPoint128TableNulls)
 
     nanoarrow::UniqueSchema input_schema;
     ArrowSchemaInit(input_schema.get());
-    ArrowSchemaSetTypeStruct(input_schema.get(), 1);
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1));
     ArrowSchemaInit(input_schema->children[0]);
-    ArrowSchemaSetTypeDecimal(input_schema->children[0],
-                              NANOARROW_TYPE_DECIMAL128,
-                              cudf::detail::max_precision<__int128_t>(),
-                              -scale);
-    ArrowSchemaSetName(input_schema->children[0], "a");
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(input_schema->children[0],
+                                                     NANOARROW_TYPE_DECIMAL128,
+                                                     cudf::detail::max_precision<__int128_t>(),
+                                                     -scale));
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a"));
 
     nanoarrow::UniqueArray input_array;
-    ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr);
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr));
     input_array->length = expected.num_rows();
 
     populate_from_col<__int128_t>(input_array->children[0], expected.column(0));
-    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr);
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr));
 
     ArrowDeviceArray input_device_array;
     input_device_array.device_id   = rmm::get_current_cuda_device().value();
@@ -699,20 +726,22 @@ TEST_F(FromArrowDeviceTest, FixedPoint128TableNullsLarge)
 
     nanoarrow::UniqueSchema input_schema;
     ArrowSchemaInit(input_schema.get());
-    ArrowSchemaSetTypeStruct(input_schema.get(), 1);
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1));
     ArrowSchemaInit(input_schema->children[0]);
-    ArrowSchemaSetTypeDecimal(input_schema->children[0],
-                              NANOARROW_TYPE_DECIMAL128,
-                              cudf::detail::max_precision<__int128_t>(),
-                              -scale);
-    ArrowSchemaSetName(input_schema->children[0], "a");
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(input_schema->children[0],
+                                                     NANOARROW_TYPE_DECIMAL128,
+                                                     cudf::detail::max_precision<__int128_t>(),
+                                                     -scale));
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a"));
 
     nanoarrow::UniqueArray input_array;
-    ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr);
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr));
     input_array->length = expected.num_rows();
 
     populate_from_col<__int128_t>(input_array->children[0], expected.column(0));
-    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr);
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr));
 
     ArrowDeviceArray input_device_array;
     input_device_array.device_id   = rmm::get_current_cuda_device().value();
diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp
index b795bafed97..fb5d1060f6f 100644
--- a/cpp/tests/interop/nanoarrow_utils.hpp
+++ b/cpp/tests/interop/nanoarrow_utils.hpp
@@ -122,13 +122,13 @@ void populate_dict_from_col(ArrowArray* arr, cudf::dictionary_column_view dview)
 {
   arr->length     = dview.size();
   arr->null_count = dview.null_count();
-  ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc);
+  NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc));
   ArrowArrayValidityBitmap(arr)->buffer.size_bytes =
     cudf::bitmask_allocation_size_bytes(dview.size());
   ArrowArrayValidityBitmap(arr)->buffer.data =
     const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(dview.null_mask()));
 
-  ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc);
+  NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc));
   ArrowArrayBuffer(arr, 1)->size_bytes = sizeof(IND_TYPE) * dview.indices().size();
   ArrowArrayBuffer(arr, 1)->data       = const_cast<uint8_t*>(dview.indices().data<uint8_t>());
 
diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp
index fb346dad538..626aeb53cdd 100644
--- a/cpp/tests/interop/to_arrow_device_test.cpp
+++ b/cpp/tests/interop/to_arrow_device_test.cpp
@@ -217,7 +217,8 @@ get_nanoarrow_tables(cudf::size_type length)
   populate_from_col<cudf::string_view>(arrow->children[5]->children[1], struct_view.child(1));
   arrow->children[5]->length     = struct_view.size();
   arrow->children[5]->null_count = struct_view.null_count();
-  ArrowBufferSetAllocator(ArrowArrayBuffer(arrow->children[5], 0), noop_alloc);
+  NANOARROW_THROW_NOT_OK(
+    ArrowBufferSetAllocator(ArrowArrayBuffer(arrow->children[5], 0), noop_alloc));
   ArrowArrayValidityBitmap(arrow->children[5])->buffer.size_bytes =
     cudf::bitmask_allocation_size_bytes(struct_view.size());
   ArrowArrayValidityBitmap(arrow->children[5])->buffer.data =
@@ -241,13 +242,13 @@ void populate_list_from_col(ArrowArray* arr, cudf::lists_column_view view)
   arr->length     = view.size();
   arr->null_count = view.null_count();
 
-  ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc);
+  NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc));
   ArrowArrayValidityBitmap(arr)->buffer.size_bytes =
     cudf::bitmask_allocation_size_bytes(view.size());
   ArrowArrayValidityBitmap(arr)->buffer.data =
     const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(view.null_mask()));
 
-  ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc);
+  NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc));
   ArrowArrayBuffer(arr, 1)->size_bytes = sizeof(int32_t) * view.offsets().size();
   ArrowArrayBuffer(arr, 1)->data       = const_cast<uint8_t*>(view.offsets().data<uint8_t>());
 }
diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json_quote_normalization_test.cpp
index 593c8136e6a..5260b435482 100644
--- a/cpp/tests/io/json_quote_normalization_test.cpp
+++ b/cpp/tests/io/json_quote_normalization_test.cpp
@@ -20,6 +20,8 @@
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/testing_main.hpp>
 
+#include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/io/datasource.hpp>
 #include <cudf/io/detail/json.hpp>
 #include <cudf/io/json.hpp>
 #include <cudf/io/types.hpp>
@@ -39,23 +41,22 @@ void run_test(const std::string& host_input, const std::string& expected_host_ou
   std::shared_ptr<rmm::mr::device_memory_resource> rsc =
     std::make_shared<rmm::mr::cuda_memory_resource>();
 
-  rmm::device_uvector<char> device_input(
-    host_input.size(), cudf::test::get_default_stream(), rsc.get());
-  CUDF_CUDA_TRY(cudaMemcpyAsync(device_input.data(),
-                                host_input.data(),
-                                host_input.size(),
-                                cudaMemcpyHostToDevice,
-                                cudf::test::get_default_stream().value()));
+  auto stream_view  = cudf::test::get_default_stream();
+  auto device_input = cudf::detail::make_device_uvector_async(
+    host_input, stream_view, rmm::mr::get_current_device_resource());
+
   // Preprocessing FST
-  auto device_fst_output = cudf::io::json::detail::normalize_single_quotes(
-    std::move(device_input), cudf::test::get_default_stream(), rsc.get());
+  cudf::io::datasource::owning_buffer<rmm::device_uvector<char>> device_data(
+    std::move(device_input));
+  cudf::io::json::detail::normalize_single_quotes(device_data, stream_view, rsc.get());
 
-  std::string preprocessed_host_output(device_fst_output.size(), 0);
+  std::string preprocessed_host_output(device_data.size(), 0);
   CUDF_CUDA_TRY(cudaMemcpyAsync(preprocessed_host_output.data(),
-                                device_fst_output.data(),
+                                device_data.data(),
                                 preprocessed_host_output.size(),
                                 cudaMemcpyDeviceToHost,
-                                cudf::test::get_default_stream().value()));
+                                stream_view.value()));
+  stream_view.synchronize();  // the copy above is asynchronous; wait before comparing on host
   CUDF_TEST_EXPECT_VECTOR_EQUAL(
     preprocessed_host_output, expected_host_output, preprocessed_host_output.size());
 }
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index ee1207f04a2..b25822f6613 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -681,6 +681,111 @@ TEST_F(JsonReaderTest, JsonLinesByteRange)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), int64_wrapper{{3000, 4000, 5000}});
 }
 
+TEST_F(JsonReaderTest, JsonLinesMultipleFilesByteRange_AcrossFiles)
+{
+  const std::string file1 = temp_env->get_temp_dir() + "JsonLinesMultipleFilesByteRangeTest1.json";
+  std::ofstream outfile1(file1, std::ofstream::out);
+  outfile1 << "[1000]\n[2000]\n[3000]\n[4000]\n[5000]\n[6000]\n[7000]\n[8000]\n[9000]";
+  outfile1.close();
+  // The same 9-row file is listed twice as a source, and a byte range is applied on top of it.
+  cudf::io::json_reader_options in_options =
+    cudf::io::json_reader_options::builder(cudf::io::source_info{{file1, file1}})
+      .lines(true)
+      .byte_range_offset(11)  // byte 11 falls inside the "[2000]" row of the first copy
+      .byte_range_size(70);
+
+  cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+
+  EXPECT_EQ(result.tbl->num_columns(), 1);
+  EXPECT_EQ(result.tbl->num_rows(), 10);  // 7 rows (3000..9000) + 3 rows (1000..3000)
+
+  EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64);
+  EXPECT_EQ(result.metadata.schema_info[0].name, "0");
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(
+    result.tbl->get_column(0),
+    int64_wrapper{{3000, 4000, 5000, 6000, 7000, 8000, 9000, 1000, 2000, 3000}});
+}
+
+TEST_F(JsonReaderTest, JsonLinesMultipleFilesByteRange_ExcessRangeSize)
+{
+  const std::string file1 = temp_env->get_temp_dir() + "JsonLinesMultipleFilesByteRangeTest1.json";
+  std::ofstream outfile1(file1, std::ofstream::out);
+  outfile1 << "[1000]\n[2000]\n[3000]\n[4000]\n[5000]\n[6000]\n[7000]\n[8000]\n[9000]";
+  outfile1.close();
+  // The same 9-row file is listed twice; the byte range extends well past the end of the data.
+  cudf::io::json_reader_options in_options =
+    cudf::io::json_reader_options::builder(cudf::io::source_info{{file1, file1}})
+      .lines(true)
+      .byte_range_offset(11)  // byte 11 falls inside the "[2000]" row of the first copy
+      .byte_range_size(1000);  // much larger than the two copies combined
+
+  cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+
+  EXPECT_EQ(result.tbl->num_columns(), 1);
+  EXPECT_EQ(result.tbl->num_rows(), 16);  // 7 rows (3000..9000) + all 9 rows of the second copy
+
+  EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64);
+  EXPECT_EQ(result.metadata.schema_info[0].name, "0");
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0),
+                                 int64_wrapper{{3000,
+                                                4000,
+                                                5000,
+                                                6000,
+                                                7000,
+                                                8000,
+                                                9000,
+                                                1000,
+                                                2000,
+                                                3000,
+                                                4000,
+                                                5000,
+                                                6000,
+                                                7000,
+                                                8000,
+                                                9000}});
+}
+
+TEST_F(JsonReaderTest, JsonLinesMultipleFilesByteRange_LoadAllFiles)
+{
+  const std::string file1 = temp_env->get_temp_dir() + "JsonLinesMultipleFilesByteRangeTest1.json";
+  std::ofstream outfile1(file1, std::ofstream::out);
+  outfile1 << "[1000]\n[2000]\n[3000]\n[4000]\n[5000]\n[6000]\n[7000]\n[8000]\n[9000]";
+  outfile1.close();
+  // No byte range is set here: both listed copies of the file are expected in full.
+  cudf::io::json_reader_options in_options =
+    cudf::io::json_reader_options::builder(cudf::io::source_info{{file1, file1}}).lines(true);
+
+  cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+
+  EXPECT_EQ(result.tbl->num_columns(), 1);
+  EXPECT_EQ(result.tbl->num_rows(), 18);  // 9 rows per copy
+
+  EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64);
+  EXPECT_EQ(result.metadata.schema_info[0].name, "0");
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0),
+                                 int64_wrapper{{1000,
+                                                2000,
+                                                3000,
+                                                4000,
+                                                5000,
+                                                6000,
+                                                7000,
+                                                8000,
+                                                9000,
+                                                1000,
+                                                2000,
+                                                3000,
+                                                4000,
+                                                5000,
+                                                6000,
+                                                7000,
+                                                8000,
+                                                9000}});
+}
+
 TEST_P(JsonReaderRecordTest, JsonLinesObjects)
 {
   const std::string fname = temp_env->get_temp_dir() + "JsonLinesObjectsTest.json";
@@ -2128,9 +2233,6 @@ TEST_F(JsonReaderTest, MixedTypes)
         .lines(true);
 
     cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
-    static int num_case                  = 0;
-    num_case++;
-    std::cout << "case:" << num_case << "\n";
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), expected);
   };
   // value + string (not mixed type case)
@@ -2332,4 +2434,206 @@ TEST_F(JsonReaderTest, MapTypes)
           {type_id::LIST, type_id::STRING, type_id::STRING});
 }
 
+// Checks column pruning driven by the requested dtypes (`prune_columns(true)`):
+// - all columns requested, or only one of them;
+// - requested columns that are present in the input, or only partly present;
+// - nested and flat dtypes, for both JSON Lines and regular (non-JSON-Lines) input.
+TEST_F(JsonReaderTest, JsonNestedDtypeFilter)
+{
+  std::string json_stringl = R"(
+    {"a": 1, "b": {"0": "abc", "1": [-1.]}, "c": true}
+    {"a": 1, "b": {"0": "abc"          }, "c": false}
+    {"a": 1, "b": {}}
+    {"a": 1,                              "c": null}
+    )";
+  std::string json_string  = R"([
+    {"a": 1, "b": {"0": "abc", "1": [-1.]}, "c": true},
+    {"a": 1, "b": {"0": "abc"          }, "c": false},
+    {"a": 1, "b": {}},
+    {"a": 1,                              "c": null}
+    ])";
+  for (auto& [json_string, lines] : {std::pair{json_stringl, true}, {json_string, false}}) {
+    cudf::io::json_reader_options in_options =
+      cudf::io::json_reader_options::builder(
+        cudf::io::source_info{json_string.data(), json_string.size()})
+        .prune_columns(true)
+        .lines(lines);
+
+    // include all columns
+    //// dtypes given as a nested schema_element map
+    {
+      std::map<std::string, cudf::io::schema_element> dtype_schema{
+        {"b",
+         {data_type{cudf::type_id::STRUCT},
+          {{"0", {data_type{cudf::type_id::STRING}}},
+           {"1", {data_type{cudf::type_id::LIST}, {{"element", {dtype<float>()}}}}}}}},
+        {"a", {dtype<int32_t>()}},
+        {"c", {dtype<bool>()}},
+      };
+      in_options.set_dtypes(dtype_schema);
+      cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+      // Make sure we have columns "a", "b" and "c"
+      ASSERT_EQ(result.tbl->num_columns(), 3);
+      ASSERT_EQ(result.metadata.schema_info.size(), 3);
+      EXPECT_EQ(result.metadata.schema_info[0].name, "a");
+      EXPECT_EQ(result.metadata.schema_info[1].name, "b");
+      EXPECT_EQ(result.metadata.schema_info[2].name, "c");
+      // "b" children checks
+      ASSERT_EQ(result.metadata.schema_info[1].children.size(), 2);
+      EXPECT_EQ(result.metadata.schema_info[1].children[0].name, "0");
+      EXPECT_EQ(result.metadata.schema_info[1].children[1].name, "1");
+      ASSERT_EQ(result.metadata.schema_info[1].children[1].children.size(), 2);
+      EXPECT_EQ(result.metadata.schema_info[1].children[1].children[0].name, "offsets");
+      EXPECT_EQ(result.metadata.schema_info[1].children[1].children[1].name, "element");
+      // types
+      EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT32);
+      EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::STRUCT);
+      EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::BOOL8);
+      EXPECT_EQ(result.tbl->get_column(1).child(0).type().id(), cudf::type_id::STRING);
+      EXPECT_EQ(result.tbl->get_column(1).child(1).type().id(), cudf::type_id::LIST);
+      EXPECT_EQ(result.tbl->get_column(1).child(1).child(0).type().id(), cudf::type_id::INT32);
+      EXPECT_EQ(result.tbl->get_column(1).child(1).child(1).type().id(), cudf::type_id::FLOAT32);
+    }
+    //// dtypes given as a positional vector<data_type>
+    {
+      std::vector<data_type> types{
+        {dtype<int32_t>()}, data_type{cudf::type_id::STRUCT}, {dtype<bool>()}};
+      in_options.set_dtypes(types);
+      cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+      // Make sure we have columns "a", "b" and "c"
+      ASSERT_EQ(result.tbl->num_columns(), 3);
+      ASSERT_EQ(result.metadata.schema_info.size(), 3);
+      EXPECT_EQ(result.metadata.schema_info[0].name, "a");
+      EXPECT_EQ(result.metadata.schema_info[1].name, "b");
+      EXPECT_EQ(result.metadata.schema_info[2].name, "c");
+    }
+    //// dtypes given as a column-name -> data_type map
+    {
+      std::map<std::string, data_type> dtype_map{
+        {"b",
+         {
+           data_type{cudf::type_id::STRUCT},
+         }},
+        {"a", {dtype<int32_t>()}},
+        {"c", {dtype<bool>()}},
+      };
+      in_options.set_dtypes(dtype_map);
+      cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+      // Make sure we have columns "a", "b" and "c"
+      ASSERT_EQ(result.tbl->num_columns(), 3);
+      ASSERT_EQ(result.metadata.schema_info.size(), 3);
+      EXPECT_EQ(result.metadata.schema_info[0].name, "a");
+      EXPECT_EQ(result.metadata.schema_info[1].name, "b");
+      EXPECT_EQ(result.metadata.schema_info[2].name, "c");
+    }
+
+    // include only one column
+    //// dtypes given as a schema_element map
+    {
+      std::map<std::string, cudf::io::schema_element> dtype_schema{
+        {"a", {dtype<int32_t>()}},
+      };
+      in_options.set_dtypes(dtype_schema);
+      cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+      // Make sure we have column "a"
+      ASSERT_EQ(result.tbl->num_columns(), 1);
+      ASSERT_EQ(result.metadata.schema_info.size(), 1);
+      EXPECT_EQ(result.metadata.schema_info[0].name, "a");
+    }
+    //// dtypes given as a positional vector<data_type>
+    {
+      std::vector<data_type> types{{dtype<int32_t>()}};
+      in_options.set_dtypes(types);
+      cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+      // Make sure we have column "a"
+      ASSERT_EQ(result.tbl->num_columns(), 1);
+      ASSERT_EQ(result.metadata.schema_info.size(), 1);
+      EXPECT_EQ(result.metadata.schema_info[0].name, "a");
+    }
+    //// dtypes given as a column-name -> data_type map
+    {
+      std::map<std::string, data_type> dtype_map{
+        {"a", {dtype<int32_t>()}},
+      };
+      in_options.set_dtypes(dtype_map);
+      cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+      // Make sure we have column "a"
+      ASSERT_EQ(result.tbl->num_columns(), 1);
+      ASSERT_EQ(result.metadata.schema_info.size(), 1);
+      EXPECT_EQ(result.metadata.schema_info[0].name, "a");
+    }
+
+    // include only one column (nested)
+    {
+      std::map<std::string, cudf::io::schema_element> dtype_schema{
+        {"b",
+         {data_type{cudf::type_id::STRUCT},
+          {{"1", {data_type{cudf::type_id::LIST}, {{"element", {dtype<float>()}}}}}}}},
+      };
+      in_options.set_dtypes(dtype_schema);
+      cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+      // Make sure we have column "b":"1":[float]
+      ASSERT_EQ(result.tbl->num_columns(), 1);
+      ASSERT_EQ(result.metadata.schema_info.size(), 1);
+      EXPECT_EQ(result.metadata.schema_info[0].name, "b");
+      ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1);
+      EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "1");
+      ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2);
+      EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets");
+      EXPECT_EQ(result.metadata.schema_info[0].children[0].children[1].name, "element");
+      EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT);
+      EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::LIST);
+      EXPECT_EQ(result.tbl->get_column(0).child(0).child(0).type().id(), cudf::type_id::INT32);
+      EXPECT_EQ(result.tbl->get_column(0).child(0).child(1).type().id(), cudf::type_id::FLOAT32);
+    }
+    // multiple - all present
+    {
+      std::map<std::string, cudf::io::schema_element> dtype_schema{
+        {"a", {dtype<int32_t>()}},
+        {"c", {dtype<bool>()}},
+      };
+      in_options.set_dtypes(dtype_schema);
+      cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+      // Make sure we have columns "a", and "c"
+      ASSERT_EQ(result.tbl->num_columns(), 2);
+      ASSERT_EQ(result.metadata.schema_info.size(), 2);
+      EXPECT_EQ(result.metadata.schema_info[0].name, "a");
+      EXPECT_EQ(result.metadata.schema_info[1].name, "c");
+    }
+    // multiple - not all present
+    {
+      std::map<std::string, cudf::io::schema_element> dtype_schema{
+        {"a", {dtype<int32_t>()}},
+        {"d", {dtype<bool>()}},
+      };
+      in_options.set_dtypes(dtype_schema);
+      cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+      // Make sure we have column "a"
+      ASSERT_EQ(result.tbl->num_columns(), 1);
+      ASSERT_EQ(result.metadata.schema_info.size(), 1);
+      EXPECT_EQ(result.metadata.schema_info[0].name, "a");
+    }
+    // multiple - not all present nested
+    {
+      std::map<std::string, cudf::io::schema_element> dtype_schema{
+
+        {"b",
+         {data_type{cudf::type_id::STRUCT},
+          {
+            {"2", {data_type{cudf::type_id::STRING}}},
+          }}},
+        {"c", {dtype<bool>()}},
+      };
+      in_options.set_dtypes(dtype_schema);
+      cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+      // Make sure we have columns "b" (empty struct) and "c"
+      ASSERT_EQ(result.tbl->num_columns(), 2);
+      ASSERT_EQ(result.metadata.schema_info.size(), 2);
+      EXPECT_EQ(result.metadata.schema_info[0].name, "b");
+      ASSERT_EQ(result.metadata.schema_info[0].children.size(), 0);
+      EXPECT_EQ(result.metadata.schema_info[1].name, "c");
+    }
+  }
+}
+
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/io/json_whitespace_normalization_test.cu b/cpp/tests/io/json_whitespace_normalization_test.cu
index 336d360063f..8ed5fa81b12 100644
--- a/cpp/tests/io/json_whitespace_normalization_test.cu
+++ b/cpp/tests/io/json_whitespace_normalization_test.cu
@@ -19,6 +19,7 @@
 #include <cudf_test/testing_main.hpp>
 
 #include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/io/datasource.hpp>
 #include <cudf/io/detail/json.hpp>
 #include <cudf/io/json.hpp>
 #include <cudf/types.hpp>
@@ -34,17 +35,26 @@ struct JsonWSNormalizationTest : public cudf::test::BaseFixture {};
 
 void run_test(std::string const& host_input, std::string const& expected_host_output)
 {
-  auto stream_view  = cudf::get_default_stream();
+  // Prepare cuda stream for data transfers & kernels
+  auto stream_view = cudf::test::get_default_stream();
+
   auto device_input = cudf::detail::make_device_uvector_async(
     host_input, stream_view, rmm::mr::get_current_device_resource());
 
   // Preprocessing FST
-  auto device_fst_output = cudf::io::json::detail::normalize_whitespace(
-    std::move(device_input), stream_view, rmm::mr::get_current_device_resource());
-
-  auto const preprocessed_host_output =
-    cudf::detail::make_std_vector_sync(device_fst_output, stream_view);
-
+  cudf::io::datasource::owning_buffer<rmm::device_uvector<char>> device_data(
+    std::move(device_input));
+  cudf::io::json::detail::normalize_whitespace(
+    device_data, stream_view, rmm::mr::get_current_device_resource());
+
+  std::string preprocessed_host_output(device_data.size(), 0);
+  CUDF_CUDA_TRY(cudaMemcpyAsync(preprocessed_host_output.data(),
+                                device_data.data(),
+                                preprocessed_host_output.size(),
+                                cudaMemcpyDeviceToHost,
+                                stream_view.value()));
+
+  stream_view.synchronize();
   ASSERT_EQ(preprocessed_host_output.size(), expected_host_output.size());
   CUDF_TEST_EXPECT_VECTOR_EQUAL(
     preprocessed_host_output, expected_host_output, preprocessed_host_output.size());
diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp
index 2e2d5cae34c..112ee8fb57b 100644
--- a/cpp/tests/io/nested_json_test.cpp
+++ b/cpp/tests/io/nested_json_test.cpp
@@ -620,15 +620,12 @@ TEST_F(JsonTest, TokenStream2)
   }
 }
 
-struct JsonParserTest : public cudf::test::BaseFixture, public testing::WithParamInterface<bool> {};
-INSTANTIATE_TEST_SUITE_P(IsFullGPU, JsonParserTest, testing::Bool());
+struct JsonParserTest : public cudf::test::BaseFixture {};
 
-TEST_P(JsonParserTest, ExtractColumn)
+TEST_F(JsonParserTest, ExtractColumn)
 {
   using cuio_json::SymbolT;
-  bool const is_full_gpu = GetParam();
-  auto json_parser       = is_full_gpu ? cuio_json::detail::device_parse_nested_json
-                                       : cuio_json::detail::host_parse_nested_json;
+  auto json_parser = cuio_json::detail::device_parse_nested_json;
 
   // Prepare cuda stream for data transfers & kernels
   auto const stream = cudf::get_default_stream();
@@ -867,14 +864,12 @@ TEST_F(JsonTest, PostProcessTokenStream)
   }
 }
 
-TEST_P(JsonParserTest, UTF_JSON)
+TEST_F(JsonParserTest, UTF_JSON)
 {
   // Prepare cuda stream for data transfers & kernels
-  auto const stream      = cudf::get_default_stream();
-  auto mr                = rmm::mr::get_current_device_resource();
-  bool const is_full_gpu = GetParam();
-  auto json_parser       = is_full_gpu ? cuio_json::detail::device_parse_nested_json
-                                       : cuio_json::detail::host_parse_nested_json;
+  auto const stream = cudf::get_default_stream();
+  auto mr           = rmm::mr::get_current_device_resource();
+  auto json_parser  = cuio_json::detail::device_parse_nested_json;
 
   // Default parsing options
   cudf::io::json_reader_options default_options{};
@@ -924,12 +919,10 @@ TEST_P(JsonParserTest, UTF_JSON)
   CUDF_EXPECT_NO_THROW(json_parser(d_utf_pass, default_options, stream, mr));
 }
 
-TEST_P(JsonParserTest, ExtractColumnWithQuotes)
+TEST_F(JsonParserTest, ExtractColumnWithQuotes)
 {
   using cuio_json::SymbolT;
-  bool const is_full_gpu = GetParam();
-  auto json_parser       = is_full_gpu ? cuio_json::detail::device_parse_nested_json
-                                       : cuio_json::detail::host_parse_nested_json;
+  auto json_parser = cuio_json::detail::device_parse_nested_json;
 
   // Prepare cuda stream for data transfers & kernels
   auto const stream = cudf::get_default_stream();
@@ -959,12 +952,10 @@ TEST_P(JsonParserTest, ExtractColumnWithQuotes)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col2, parsed_col2);
 }
 
-TEST_P(JsonParserTest, ExpectFailMixStructAndList)
+TEST_F(JsonParserTest, ExpectFailMixStructAndList)
 {
   using cuio_json::SymbolT;
-  bool const is_full_gpu = GetParam();
-  auto json_parser       = is_full_gpu ? cuio_json::detail::device_parse_nested_json
-                                       : cuio_json::detail::host_parse_nested_json;
+  auto json_parser = cuio_json::detail::device_parse_nested_json;
 
   // Prepare cuda stream for data transfers & kernels
   auto const stream = cudf::get_default_stream();
@@ -1002,12 +993,10 @@ TEST_P(JsonParserTest, ExpectFailMixStructAndList)
   }
 }
 
-TEST_P(JsonParserTest, EmptyString)
+TEST_F(JsonParserTest, EmptyString)
 {
   using cuio_json::SymbolT;
-  bool const is_full_gpu = GetParam();
-  auto json_parser       = is_full_gpu ? cuio_json::detail::device_parse_nested_json
-                                       : cuio_json::detail::host_parse_nested_json;
+  auto json_parser = cuio_json::detail::device_parse_nested_json;
 
   // Prepare cuda stream for data transfers & kernels
   auto const stream = cudf::get_default_stream();
diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu
new file mode 100644
index 00000000000..1c1b53ea17f
--- /dev/null
+++ b/cpp/tests/io/orc_chunked_reader_test.cu
@@ -0,0 +1,1477 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/cudf_gtest.hpp>
+#include <cudf_test/io_metadata_utilities.hpp>
+#include <cudf_test/iterator_utilities.hpp>
+#include <cudf_test/table_utilities.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/concatenate.hpp>
+#include <cudf/copying.hpp>
+#include <cudf/detail/iterator.cuh>
+#include <cudf/detail/structs/utilities.hpp>
+#include <cudf/fixed_point/fixed_point.hpp>
+#include <cudf/io/data_sink.hpp>
+#include <cudf/io/datasource.hpp>
+#include <cudf/io/orc.hpp>
+#include <cudf/io/orc_metadata.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/table/table_view.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/iterator/counting_iterator.h>
+
+namespace {
+enum class output_limit : std::size_t {};  // strong type for the output limit, in bytes
+enum class input_limit : std::size_t {};  // strong type for the input limit, in bytes
+enum class output_row_granularity : cudf::size_type {};  // strong type, in rows
+
+// Global environment for temporary files
+auto const temp_env = reinterpret_cast<cudf::test::TempDirTestEnvironment*>(
+  ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment));
+
+using int32s_col       = cudf::test::fixed_width_column_wrapper<int32_t>;
+using int64s_col       = cudf::test::fixed_width_column_wrapper<int64_t>;
+using doubles_col      = cudf::test::fixed_width_column_wrapper<double>;
+using strings_col      = cudf::test::strings_column_wrapper;
+using structs_col      = cudf::test::structs_column_wrapper;
+using int32s_lists_col = cudf::test::lists_column_wrapper<int32_t>;
+
+auto write_file(std::vector<std::unique_ptr<cudf::column>>& input_columns,
+                std::string const& filename,
+                bool nullable                    = false,
+                std::size_t stripe_size_bytes    = cudf::io::default_stripe_size_bytes,
+                cudf::size_type stripe_size_rows = cudf::io::default_stripe_size_rows)
+{
+  if (nullable) {
+    // Deterministic (not random) validity: 3 valid rows, then 1 null; eases data size computation.
+    auto const valid_iter = cudf::detail::make_counting_transform_iterator(
+      0, [](cudf::size_type i) { return i % 4 != 3; });
+    cudf::size_type offset{0};
+    for (auto& col : input_columns) {
+      auto const [null_mask, null_count] =
+        cudf::test::detail::make_null_mask(valid_iter + offset, valid_iter + col->size() + offset);
+      col = cudf::structs::detail::superimpose_nulls(
+        static_cast<cudf::bitmask_type const*>(null_mask.data()),
+        null_count,
+        std::move(col),
+        cudf::get_default_stream(),
+        rmm::mr::get_current_device_resource());
+
+      // Shift nulls of the next column by one position, to avoid having all nulls
+      // in the same table rows.
+      ++offset;
+    }
+  }
+  // Move the columns into a table and write it to a temp ORC file; returns {table, file path}.
+  auto input_table = std::make_unique<cudf::table>(std::move(input_columns));
+  auto filepath =
+    temp_env->get_temp_filepath(nullable ? filename + "_nullable.orc" : filename + ".orc");
+
+  auto const write_opts =
+    cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, *input_table)
+      .stripe_size_bytes(stripe_size_bytes)
+      .stripe_size_rows(stripe_size_rows)
+      .build();
+  cudf::io::write_orc(write_opts);
+
+  return std::pair{std::move(input_table), std::move(filepath)};
+}
+
+// Reads `filepath` chunk by chunk; returns {concatenation of all chunks, number of chunks}.
+// NOTE: `output_row_granularity` defaults to 10'000 rows, so for an input file with more than
+// 10k rows an output chunk will never have fewer than 10k rows.
+auto chunked_read(std::string const& filepath,
+                  output_limit output_limit_bytes,
+                  input_limit input_limit_bytes             = input_limit{0},
+                  output_row_granularity output_granularity = output_row_granularity{10'000})
+{
+  auto const read_opts =
+    cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).build();
+  auto reader = cudf::io::chunked_orc_reader(static_cast<std::size_t>(output_limit_bytes),
+                                             static_cast<std::size_t>(input_limit_bytes),
+                                             static_cast<cudf::size_type>(output_granularity),
+                                             read_opts);
+
+  auto num_chunks = 0;
+  auto out_tables = std::vector<std::unique_ptr<cudf::table>>{};
+
+  // TODO: remove this capture once the memory-statistics object is gone from the reader.
+  // It is meant to avoid a use-after-free of the memory resource created by that object.
+  auto mr = rmm::mr::get_current_device_resource();
+
+  do {
+    auto chunk = reader.read_chunk();
+    // If the input file is empty, the first call to `read_chunk` will return an empty table.
+    // Thus, we only check for non-empty output table from the second call.
+    if (num_chunks > 0) {
+      CUDF_EXPECTS(chunk.tbl->num_rows() != 0, "Number of rows in the new chunk is zero.");
+    }
+    ++num_chunks;
+    out_tables.emplace_back(std::move(chunk.tbl));
+  } while (reader.has_next());
+
+  if (num_chunks > 1) {
+    CUDF_EXPECTS(out_tables.front()->num_rows() != 0, "Number of rows in the new chunk is zero.");
+  }
+
+  auto out_tviews = std::vector<cudf::table_view>{};
+  for (auto const& tbl : out_tables) {
+    out_tviews.emplace_back(tbl->view());
+  }
+
+  // Intended form once the `mr` capture above is removed:
+  //   return std::pair(cudf::concatenate(out_tviews), num_chunks);
+  // TODO: remove the explicit stream/mr arguments below.
+  return std::pair(cudf::concatenate(out_tviews, cudf::get_default_stream(), mr), num_chunks);
+}
+
+auto chunked_read(std::string const& filepath,
+                  output_limit output_limit_bytes,
+                  output_row_granularity output_granularity)
+{
+  return chunked_read(filepath, output_limit_bytes, input_limit{0UL}, output_granularity);
+}
+
+}  // namespace
+
+struct OrcChunkedReaderTest : public cudf::test::BaseFixture {};  // chunked ORC reader tests
+
+TEST_F(OrcChunkedReaderTest, TestChunkedReadNoData)
+{
+  std::vector<std::unique_ptr<cudf::column>> input_columns;
+  input_columns.emplace_back(int32s_col{}.release());
+  input_columns.emplace_back(int64s_col{}.release());
+  // Round-trip a zero-row, two-column table through an ORC file and the chunked reader.
+  auto const [expected, filepath] = write_file(input_columns, "chunked_read_empty");
+  auto const [result, num_chunks] = chunked_read(filepath, output_limit{1'000});
+  EXPECT_EQ(num_chunks, 1);  // the single read of an empty file yields one empty table
+  EXPECT_EQ(result->num_rows(), 0);
+  EXPECT_EQ(result->num_columns(), 2);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+}
+
+TEST_F(OrcChunkedReaderTest, TestChunkedReadInvalidParameter)
+{
+  std::vector<std::unique_ptr<cudf::column>> input_columns;
+  input_columns.emplace_back(int32s_col{}.release());
+  input_columns.emplace_back(int64s_col{}.release());
+  // A negative output row granularity is expected to be rejected with cudf::logic_error.
+  auto const [expected, filepath] = write_file(input_columns, "chunked_read_invalid");
+  EXPECT_THROW(
+    chunked_read(filepath, output_limit{1'000}, output_row_granularity{-1} /*invalid value*/),
+    cudf::logic_error);
+}
+
+TEST_F(OrcChunkedReaderTest, TestChunkedReadSimpleData)
+{
+  auto constexpr num_rows = 40'000;
+  // Writes an (int32, int64) table of `num_rows` counting values with the given stripe row count.
+  auto const generate_input = [num_rows](bool nullable, std::size_t stripe_rows) {
+    std::vector<std::unique_ptr<cudf::column>> input_columns;
+    auto const value_iter = thrust::make_counting_iterator(0);
+    input_columns.emplace_back(int32s_col(value_iter, value_iter + num_rows).release());
+    input_columns.emplace_back(int64s_col(value_iter, value_iter + num_rows).release());
+
+    return write_file(input_columns,
+                      "chunked_read_simple",
+                      nullable,
+                      cudf::io::default_stripe_size_bytes,
+                      stripe_rows);
+  };
+  // Non-nullable input: 1'000-row stripes first, then default-sized stripes.
+  {
+    auto const [expected, filepath] = generate_input(false, 1'000);
+    auto const [result, num_chunks] = chunked_read(filepath, output_limit{245'000});
+    EXPECT_EQ(num_chunks, 2);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+  {
+    auto const [expected, filepath] = generate_input(false, cudf::io::default_stripe_size_rows);
+    auto const [result, num_chunks] = chunked_read(filepath, output_limit{245'000});
+    EXPECT_EQ(num_chunks, 2);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+  // Nullable input: 1'000-row stripes first, then default-sized stripes.
+  {
+    auto const [expected, filepath] = generate_input(true, 1'000);
+    auto const [result, num_chunks] = chunked_read(filepath, output_limit{245'000});
+    EXPECT_EQ(num_chunks, 2);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+  {
+    auto const [expected, filepath] = generate_input(true, cudf::io::default_stripe_size_rows);
+    auto const [result, num_chunks] = chunked_read(filepath, output_limit{245'000});
+    EXPECT_EQ(num_chunks, 2);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+}
+
+TEST_F(OrcChunkedReaderTest, TestChunkedReadBoundaryCases)
+{
+  // Exercises output limits that sit right at the edges of the chunk-split calculation.
+
+  auto constexpr num_rows = 40'000;
+
+  auto const [expected, filepath] = [num_rows]() {
+    std::vector<std::unique_ptr<cudf::column>> cols;
+    auto const counter = thrust::make_counting_iterator(0);
+    cols.emplace_back(int32s_col(counter, counter + num_rows).release());
+    return write_file(cols, "chunked_read_simple_boundary");
+  }();
+
+  // Zero limit: the whole file is expected back as a single chunk.
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath, output_limit{0UL});
+    EXPECT_EQ(n_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *actual);
+  }
+
+  // Tiny limit: 1 byte.
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath, output_limit{1UL});
+    // 4 chunks, because the default `output_row_granularity` is 10k rows.
+    EXPECT_EQ(n_chunks, 4);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *actual);
+  }
+
+  // Tiny limit of 1 byte combined with a small `output_row_granularity`.
+  {
+    auto const [actual, n_chunks] =
+      chunked_read(filepath, output_limit{1UL}, output_row_granularity{1'000});
+    EXPECT_EQ(n_chunks, 40);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *actual);
+  }
+
+  // Tiny limit of 1 byte combined with a large `output_row_granularity`.
+  {
+    auto const [actual, n_chunks] =
+      chunked_read(filepath, output_limit{1UL}, output_row_granularity{30'000});
+    EXPECT_EQ(n_chunks, 2);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *actual);
+  }
+  // Very large limit.
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath, output_limit{2L << 40});
+    EXPECT_EQ(n_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *actual);
+  }
+  // Limit slightly below the size of one granularity segment
+  // (output_row_granularity = 10k rows = 40'000 bytes).
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath, output_limit{39'000UL});
+    EXPECT_EQ(n_chunks, 4);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *actual);
+  }
+
+  // Limit exactly the size of one granularity segment
+  // (output_row_granularity = 10k rows = 40'000 bytes).
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath, output_limit{40'000UL});
+    EXPECT_EQ(n_chunks, 4);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *actual);
+  }
+
+  // Limit slightly above the size of one granularity segment
+  // (output_row_granularity = 10k rows = 40'000 bytes).
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath, output_limit{41'000UL});
+    EXPECT_EQ(n_chunks, 4);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *actual);
+  }
+
+  // Limit slightly below the size of two granularity segments.
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath, output_limit{79'000UL});
+    EXPECT_EQ(n_chunks, 4);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *actual);
+  }
+
+  // Limit exactly one byte short of the size of two granularity segments.
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath, output_limit{79'999UL});
+    EXPECT_EQ(n_chunks, 4);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *actual);
+  }
+
+  // Limit exactly the size of two granularity segments.
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath, output_limit{80'000UL});
+    EXPECT_EQ(n_chunks, 2);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *actual);
+  }
+
+  // Limit slightly above the size of two granularity segments.
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath, output_limit{81'000UL});
+    EXPECT_EQ(n_chunks, 2);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *actual);
+  }
+
+  // Limit exactly one byte short of the size of the whole input.
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath, output_limit{159'999UL});
+    EXPECT_EQ(n_chunks, 2);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *actual);
+  }
+
+  // Limit exactly the size of the whole input.
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath, output_limit{160'000UL});
+    EXPECT_EQ(n_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *actual);
+  }
+}
+
+TEST_F(OrcChunkedReaderTest, TestChunkedReadWithString)
+{
+  auto constexpr num_rows           = 60'000;
+  auto constexpr output_granularity = output_row_granularity{20'000};
+  // Input: one int32 column plus one string column whose strings get longer every 20000 rows.
+  auto const make_input = [num_rows](bool nullable) {
+    std::vector<std::unique_ptr<cudf::column>> cols;
+    auto const counter = thrust::make_counting_iterator(0);
+
+    // int32 column                       Granularity Segment  total bytes   cumulative bytes
+    // 20000 rows of 4 bytes each               = A0           80000         80000
+    // 20000 rows of 4 bytes each               = A1           80000         160000
+    // 20000 rows of 4 bytes each               = A2           80000         240000
+    cols.emplace_back(int32s_col(counter, counter + num_rows).release());
+
+    // string column                      Granularity Segment  total bytes   cumulative bytes
+    // 20000 rows of 1 char each    (20000  + 80004) = B0      100004        100004
+    // 20000 rows of 4 chars each   (80000  + 80004) = B1      160004        260008
+    // 20000 rows of 16 chars each  (320000 + 80004) = B2      400004        660012
+    auto const samples  = std::vector<std::string>{"a", "bbbb", "cccccccccccccccc"};
+    auto const str_iter = cudf::detail::make_counting_transform_iterator(0, [&](int32_t i) {
+      if (i < 20000) { return samples[0]; }
+      if (i < 40000) { return samples[1]; }
+      return samples[2];
+    });
+    cols.emplace_back(strings_col(str_iter, str_iter + num_rows).release());
+
+    // Cumulative sizes of both columns after each segment:
+    // A0 + B0 :  180004
+    // A1 + B1 :  420008
+    // A2 + B2 :  900012
+    //                                    skip_rows / num_rows
+    // byte_limit==500000  should give 2 chunks: {0, 40000}, {40000, 20000}
+    // byte_limit==1000000 should give 1 chunk:  {0, 60000}
+    return write_file(cols, "chunked_read_with_strings", nullable);
+  };
+
+  auto const [expected_no_null, filepath_no_null]       = make_input(false);
+  auto const [expected_with_nulls, filepath_with_nulls] = make_input(true);
+
+  // Zero limit: the whole file is expected back as a single chunk.
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_no_null, output_limit{0UL});
+    EXPECT_EQ(n_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *actual);
+  }
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_with_nulls, output_limit{0UL});
+    EXPECT_EQ(n_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *actual);
+  }
+
+  // Tiny limit of 1 byte: one chunk per 20000-row granularity segment.
+  {
+    auto const [actual, n_chunks] =
+      chunked_read(filepath_no_null, output_limit{1UL}, output_granularity);
+    EXPECT_EQ(n_chunks, 3);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *actual);
+  }
+  {
+    auto const [actual, n_chunks] =
+      chunked_read(filepath_with_nulls, output_limit{1UL}, output_granularity);
+    EXPECT_EQ(n_chunks, 3);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *actual);
+  }
+
+  // Very large limit.
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_no_null, output_limit{2L << 40});
+    EXPECT_EQ(n_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *actual);
+  }
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_with_nulls, output_limit{2L << 40});
+    EXPECT_EQ(n_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *actual);
+  }
+
+  // Limits taken from the size table in `make_input`:
+
+  {
+    auto const [actual, n_chunks] =
+      chunked_read(filepath_no_null, output_limit{500'000UL}, output_granularity);
+    EXPECT_EQ(n_chunks, 2);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *actual);
+  }
+  {
+    auto const [actual, n_chunks] =
+      chunked_read(filepath_with_nulls, output_limit{500'000UL}, output_granularity);
+    EXPECT_EQ(n_chunks, 2);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *actual);
+  }
+
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_no_null, output_limit{1'000'000UL});
+    EXPECT_EQ(n_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *actual);
+  }
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_with_nulls, output_limit{1'000'000UL});
+    EXPECT_EQ(n_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *actual);
+  }
+}
+
+TEST_F(OrcChunkedReaderTest, TestChunkedReadWithStructs)
+{
+  auto constexpr num_rows           = 100'000;
+  auto constexpr output_granularity = output_row_granularity{20'000};
+  // Input: an int32 column and a struct column with two int32 children and one string child.
+  auto const make_input = [num_rows](bool nullable) {
+    std::vector<std::unique_ptr<cudf::column>> cols;
+    auto const counter = thrust::make_counting_iterator(0);
+    cols.emplace_back(int32s_col(counter, counter + num_rows).release());
+    cols.emplace_back([=] {
+      auto ints_lo = int32s_col(counter, counter + num_rows);
+      auto ints_hi = int32s_col(counter + num_rows, counter + num_rows * 2);
+
+      auto const str_iter = cudf::detail::make_counting_transform_iterator(
+        0, [&](int32_t i) { return std::to_string(i); });
+      auto strs = strings_col{str_iter, str_iter + num_rows};
+
+      return structs_col{{ints_lo, ints_hi, strs}}.release();
+    }());
+
+    return write_file(cols, "chunked_read_with_structs", nullable);
+  };
+
+  auto const [expected_no_null, filepath_no_null]       = make_input(false);
+  auto const [expected_with_nulls, filepath_with_nulls] = make_input(true);
+
+  // Zero limit: the whole file is expected back as a single chunk.
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_no_null, output_limit{0UL});
+    EXPECT_EQ(n_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *actual);
+  }
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_with_nulls, output_limit{0UL});
+    EXPECT_EQ(n_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *actual);
+  }
+
+  // Tiny limit of 1 byte: one chunk per 20'000-row granularity segment.
+  {
+    auto const [actual, n_chunks] =
+      chunked_read(filepath_no_null, output_limit{1UL}, output_granularity);
+    EXPECT_EQ(n_chunks, 5);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *actual);
+  }
+  {
+    auto const [actual, n_chunks] =
+      chunked_read(filepath_with_nulls, output_limit{1UL}, output_granularity);
+    EXPECT_EQ(n_chunks, 5);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *actual);
+  }
+
+  // Very large limit.
+  {
+    auto const [actual, n_chunks] =
+      chunked_read(filepath_no_null, output_limit{2L << 40}, output_granularity);
+    EXPECT_EQ(n_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *actual);
+  }
+  {
+    auto const [actual, n_chunks] =
+      chunked_read(filepath_with_nulls, output_limit{2L << 40}, output_granularity);
+    EXPECT_EQ(n_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *actual);
+  }
+
+  // A 500'000-byte limit still gives one chunk per granularity segment.
+
+  {
+    auto const [actual, n_chunks] =
+      chunked_read(filepath_no_null, output_limit{500'000UL}, output_granularity);
+    EXPECT_EQ(n_chunks, 5);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *actual);
+  }
+  {
+    auto const [actual, n_chunks] =
+      chunked_read(filepath_with_nulls, output_limit{500'000UL}, output_granularity);
+    EXPECT_EQ(n_chunks, 5);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *actual);
+  }
+}
+
+TEST_F(OrcChunkedReaderTest, TestChunkedReadWithListsNoNulls)
+{
+  auto constexpr num_rows           = 100'000;
+  auto constexpr output_granularity = output_row_granularity{20'000};
+
+  auto const [expected, filepath] = [num_rows]() {
+    std::vector<std::unique_ptr<cudf::column>> cols;
+    // One granularity segment of 20000 rows is made of:
+    //
+    // 20001 offsets :   80004  bytes
+    // 30000 ints    :   120000 bytes
+    // total         :   200004 bytes
+    //
+    // The chunked reader sizes rows with `segmented_row_bit_count`, which reports 200000,
+    // so the limits below treat one segment as exactly 200000 bytes.
+    auto const patterns = int32s_lists_col{
+      int32s_lists_col{}, int32s_lists_col{0}, int32s_lists_col{1, 2}, int32s_lists_col{3, 4, 5}};
+
+    auto const index_iter =
+      cudf::detail::make_counting_transform_iterator(0, [&](int32_t i) { return i % 4; });
+    auto const gather_map = int32s_col(index_iter, index_iter + num_rows);
+    cols.emplace_back(
+      std::move(cudf::gather(cudf::table_view{{patterns}}, gather_map)->release().front()));
+
+    return write_file(cols, "chunked_read_with_lists_no_null");
+  }();
+
+  // Zero limit: the whole file is expected back as a single chunk.
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath, output_limit{0UL});
+    EXPECT_EQ(n_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *actual);
+  }
+
+  // Tiny limit: 1 byte.
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath, output_limit{1UL}, output_granularity);
+    EXPECT_EQ(n_chunks, 5);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *actual);
+  }
+
+  // Very large limit.
+  {
+    auto const [actual, n_chunks] =
+      chunked_read(filepath, output_limit{2L << 40}, output_granularity);
+    EXPECT_EQ(n_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *actual);
+  }
+
+  // Limit slightly below one row segment: every read still returns at least one segment.
+  {
+    auto const [actual, n_chunks] =
+      chunked_read(filepath, output_limit{199'999UL}, output_granularity);
+    EXPECT_EQ(n_chunks, 5);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *actual);
+  }
+
+  // Limit exactly the size of one row segment.
+  {
+    auto const [actual, n_chunks] =
+      chunked_read(filepath, output_limit{200'000UL}, output_granularity);
+    EXPECT_EQ(n_chunks, 5);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *actual);
+  }
+
+  // Limit equal to the size of two segments: 3 chunks in total.
+  {
+    auto const [actual, n_chunks] =
+      chunked_read(filepath, output_limit{400'000UL}, output_granularity);
+    EXPECT_EQ(n_chunks, 3);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *actual);
+  }
+
+  // Limit one byte short of two segments: each chunk holds just one segment.
+  {
+    auto const [actual, n_chunks] =
+      chunked_read(filepath, output_limit{399'999UL}, output_granularity);
+    EXPECT_EQ(n_chunks, 5);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *actual);
+  }
+}
+
+TEST_F(OrcChunkedReaderTest, TestChunkedReadWithListsHavingNulls)
+{
+  auto constexpr num_rows           = 100'000;
+  auto constexpr output_granularity = output_row_granularity{20'000};
+
+  auto const [expected, filepath] = [num_rows]() {
+    std::vector<std::unique_ptr<cudf::column>> cols;
+    // One granularity segment of 20000 rows is made of:
+    //
+    // 625 validity words :   2500 bytes   (a null every 4 rows: null at indices [3, 7, 11, ...])
+    // 20001 offsets      :   80004  bytes
+    // 15000 ints         :   60000 bytes
+    // total              :   142504 bytes
+    //
+    // The chunked reader sizes rows with `segmented_row_bit_count`, which reports 142500,
+    // so the limits below treat one segment as exactly 142500 bytes.
+    auto const patterns =
+      int32s_lists_col{// NOTE(review): nulls are presumably applied by `write_file` - confirm
+                       int32s_lists_col{},
+                       int32s_lists_col{0},
+                       int32s_lists_col{1, 2},
+                       int32s_lists_col{3, 4, 5, 6, 7, 8, 9} /* this list will be nullified out */};
+    auto const index_iter =
+      cudf::detail::make_counting_transform_iterator(0, [&](int32_t i) { return i % 4; });
+    auto const gather_map = int32s_col(index_iter, index_iter + num_rows);
+    cols.emplace_back(
+      std::move(cudf::gather(cudf::table_view{{patterns}}, gather_map)->release().front()));
+
+    return write_file(cols, "chunked_read_with_lists_nulls", true /*nullable*/);
+  }();
+
+  // Zero limit: the whole file is expected back as a single chunk.
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath, output_limit{0UL});
+    EXPECT_EQ(n_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *actual);
+  }
+
+  // Tiny limit: 1 byte.
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath, output_limit{1UL}, output_granularity);
+    EXPECT_EQ(n_chunks, 5);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *actual);
+  }
+
+  // Very large limit.
+  {
+    auto const [actual, n_chunks] =
+      chunked_read(filepath, output_limit{2L << 40}, output_granularity);
+    EXPECT_EQ(n_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *actual);
+  }
+
+  // Limit slightly below one row segment: every read still returns at least one segment.
+  {
+    auto const [actual, n_chunks] =
+      chunked_read(filepath, output_limit{142'499UL}, output_granularity);
+    EXPECT_EQ(n_chunks, 5);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *actual);
+  }
+
+  // Limit exactly the size of one row segment.
+  {
+    auto const [actual, n_chunks] =
+      chunked_read(filepath, output_limit{142'500UL}, output_granularity);
+    EXPECT_EQ(n_chunks, 5);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *actual);
+  }
+
+  // Limit equal to the size of two segments: 3 chunks in total.
+  {
+    auto const [actual, n_chunks] =
+      chunked_read(filepath, output_limit{285'000UL}, output_granularity);
+    EXPECT_EQ(n_chunks, 3);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *actual);
+  }
+
+  // Limit one byte short of two segments: each chunk holds just one segment.
+  {
+    auto const [actual, n_chunks] =
+      chunked_read(filepath, output_limit{284'999UL}, output_granularity);
+    EXPECT_EQ(n_chunks, 5);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *actual);
+  }
+}
+
+TEST_F(OrcChunkedReaderTest, TestChunkedReadWithStructsOfLists)
+{
+  auto constexpr num_rows = 100'000;
+
+  // Each segment (10k rows by default) takes between 537k and 560k bytes without nulls,
+  // and between 456k and 473k bytes with nulls.
+  auto const make_input = [num_rows](bool nullable) {
+    std::vector<std::unique_ptr<cudf::column>> cols;
+    auto const counter = thrust::make_counting_iterator(0);
+    cols.emplace_back(int32s_col(counter, counter + num_rows).release());
+    cols.emplace_back([=] {
+      std::vector<std::unique_ptr<cudf::column>> members;
+      members.emplace_back(int32s_col(counter, counter + num_rows).release());
+      members.emplace_back(
+        int32s_col(counter + num_rows, counter + num_rows * 2).release());
+
+      auto const str_iter = cudf::detail::make_counting_transform_iterator(0, [&](int32_t i) {
+        return std::to_string(i) + "++++++++++++++++++++" + std::to_string(i);
+      });
+      members.emplace_back(strings_col{str_iter, str_iter + num_rows}.release());
+
+      auto const patterns = int32s_lists_col{
+        int32s_lists_col{}, int32s_lists_col{0}, int32s_lists_col{0, 1}, int32s_lists_col{0, 1, 2}};
+      auto const index_iter =
+        cudf::detail::make_counting_transform_iterator(0, [&](int32_t i) { return i % 4; });
+      auto const gather_map = int32s_col(index_iter, index_iter + num_rows);
+      members.emplace_back(
+        std::move(cudf::gather(cudf::table_view{{patterns}}, gather_map)->release().front()));
+
+      return structs_col(std::move(members)).release();
+    }());
+
+    return write_file(cols, "chunked_read_with_structs_of_lists", nullable);
+  };
+
+  auto const [expected_no_null, filepath_no_null]       = make_input(false);
+  auto const [expected_with_nulls, filepath_with_nulls] = make_input(true);
+
+  // Zero limit: the whole file is expected back as a single chunk.
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_no_null, output_limit{0UL});
+    EXPECT_EQ(n_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *actual);
+  }
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_with_nulls, output_limit{0UL});
+    EXPECT_EQ(n_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *actual);
+  }
+
+  // Tiny limit: 1 byte.
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_no_null, output_limit{1UL});
+    EXPECT_EQ(n_chunks, 10);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *actual);
+  }
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_with_nulls, output_limit{1UL});
+    EXPECT_EQ(n_chunks, 10);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *actual);
+  }
+
+  // Very large limit.
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_no_null, output_limit{2L << 40});
+    EXPECT_EQ(n_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *actual);
+  }
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_with_nulls, output_limit{2L << 40});
+    EXPECT_EQ(n_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *actual);
+  }
+
+  // Mid-sized limits, first on the file without nulls and then on the file with nulls:
+
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_no_null, output_limit{1'000'000UL});
+    EXPECT_EQ(n_chunks, 10);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *actual);
+  }
+
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_no_null, output_limit{1'500'000UL});
+    EXPECT_EQ(n_chunks, 5);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *actual);
+  }
+
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_no_null, output_limit{2'000'000UL});
+    EXPECT_EQ(n_chunks, 4);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *actual);
+  }
+
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_no_null, output_limit{5'000'000UL});
+    EXPECT_EQ(n_chunks, 2);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *actual);
+  }
+
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_with_nulls, output_limit{1'000'000UL});
+    EXPECT_EQ(n_chunks, 5);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *actual);
+  }
+
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_with_nulls, output_limit{1'500'000UL});
+    EXPECT_EQ(n_chunks, 4);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *actual);
+  }
+
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_with_nulls, output_limit{2'000'000UL});
+    EXPECT_EQ(n_chunks, 3);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *actual);
+  }
+
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_with_nulls, output_limit{5'000'000UL});
+    EXPECT_EQ(n_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *actual);
+  }
+}
+
+TEST_F(OrcChunkedReaderTest, TestChunkedReadWithListsOfStructs)
+{
+  auto constexpr num_rows = 100'000;
+
+  // Each segment (10k rows by default) takes between 450k and 530k bytes without nulls,
+  // and between 330k and 380k bytes with nulls.
+  auto const make_input = [num_rows](bool nullable) {
+    std::vector<std::unique_ptr<cudf::column>> cols;
+    auto const counter = thrust::make_counting_iterator(0);
+    cols.emplace_back(int32s_col(counter, counter + num_rows).release());
+
+    auto list_offsets = std::vector<cudf::size_type>{};
+    list_offsets.reserve(num_rows * 2);
+    cudf::size_type total_structs = 0;
+    for (int row = 0; row < num_rows; ++row) {
+      list_offsets.push_back(total_structs);
+      // The list at `row` holds `row % 4` structs.
+      total_structs += row % 4;
+    }
+    list_offsets.push_back(total_structs);
+
+    auto const make_structs_col = [=] {
+      auto ints_lo = int32s_col(counter, counter + total_structs);
+      auto ints_hi = int32s_col(counter + total_structs, counter + total_structs * 2);
+
+      auto const str_iter = cudf::detail::make_counting_transform_iterator(
+        0, [&](int32_t i) { return std::to_string(i) + std::to_string(i) + std::to_string(i); });
+      auto strs = strings_col{str_iter, str_iter + total_structs};
+
+      return structs_col{{ints_lo, ints_hi, strs}}.release();
+    };
+
+    cols.emplace_back(
+      cudf::make_lists_column(static_cast<cudf::size_type>(list_offsets.size() - 1),
+                              int32s_col(list_offsets.begin(), list_offsets.end()).release(),
+                              make_structs_col(),
+                              0,
+                              rmm::device_buffer{}));
+
+    return write_file(cols, "chunked_read_with_lists_of_structs", nullable);
+  };
+
+  auto const [expected_no_null, filepath_no_null]       = make_input(false);
+  auto const [expected_with_nulls, filepath_with_nulls] = make_input(true);
+
+  // Zero limit: the whole file is expected back as a single chunk.
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_no_null, output_limit{0UL});
+    EXPECT_EQ(n_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *actual);
+  }
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_with_nulls, output_limit{0UL});
+    EXPECT_EQ(n_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *actual);
+  }
+
+  // Tiny limit: 1 byte.
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_no_null, output_limit{1UL});
+    EXPECT_EQ(n_chunks, 10);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *actual);
+  }
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_with_nulls, output_limit{1UL});
+    EXPECT_EQ(n_chunks, 10);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *actual);
+  }
+
+  // Very large limit.
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_no_null, output_limit{2L << 40});
+    EXPECT_EQ(n_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *actual);
+  }
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_with_nulls, output_limit{2L << 40});
+    EXPECT_EQ(n_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *actual);
+  }
+
+  // Mid-sized limits, first on the file without nulls and then on the file with nulls.
+
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_no_null, output_limit{1'000'000UL});
+    EXPECT_EQ(n_chunks, 7);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *actual);
+  }
+
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_no_null, output_limit{1'500'000UL});
+    EXPECT_EQ(n_chunks, 4);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *actual);
+  }
+
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_no_null, output_limit{2'000'000UL});
+    EXPECT_EQ(n_chunks, 3);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *actual);
+  }
+
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_no_null, output_limit{5'000'000UL});
+    EXPECT_EQ(n_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *actual);
+  }
+
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_with_nulls, output_limit{1'000'000UL});
+    EXPECT_EQ(n_chunks, 5);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *actual);
+  }
+
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_with_nulls, output_limit{1'500'000UL});
+    EXPECT_EQ(n_chunks, 3);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *actual);
+  }
+
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_with_nulls, output_limit{2'000'000UL});
+    EXPECT_EQ(n_chunks, 2);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *actual);
+  }
+
+  {
+    auto const [actual, n_chunks] = chunked_read(filepath_with_nulls, output_limit{5'000'000UL});
+    EXPECT_EQ(n_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *actual);
+  }
+}
+
+TEST_F(OrcChunkedReaderTest, TestChunkedReadNullCount)
+{
+  auto constexpr num_rows = 100'000;
+  // One int32 column of ones in which every row with index % 4 == 3 is null.
+  auto const ones = cudf::detail::make_counting_transform_iterator(0, [](auto) { return 1; });
+  auto const is_valid =
+    cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 4 != 3; });
+  std::vector<std::unique_ptr<cudf::column>> columns;
+  columns.push_back(int32s_col{ones, ones + num_rows, is_valid}.release());
+  auto const expected = std::make_unique<cudf::table>(std::move(columns));
+  // The writer is asked for stripes of `rows_per_stripe` (num_rows / 5) rows.
+  auto const filepath        = temp_env->get_temp_filepath("chunked_reader_null_count.orc");
+  auto const rows_per_stripe = num_rows / 5;
+  auto const write_opts =
+    cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, *expected)
+      .stripe_size_rows(rows_per_stripe)
+      .build();
+  cudf::io::write_orc(write_opts);
+  // The byte limit equals the size of `rows_per_stripe` int values.
+  auto const byte_limit = rows_per_stripe * sizeof(int);
+  auto const read_opts =
+    cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).build();
+  auto reader =
+    cudf::io::chunked_orc_reader(byte_limit, 0UL /*read_limit*/, rows_per_stripe, read_opts);
+  // Every fourth row is null, so each chunk must report `rows_per_stripe / 4` nulls.
+  do {
+    auto const chunk = reader.read_chunk();
+    EXPECT_EQ(chunk.tbl->get_column(0).null_count(), rows_per_stripe / 4UL);
+  } while (reader.has_next());
+}
+
+namespace {
+
+std::size_t constexpr input_limit_expected_file_count = 3;
+// Returns `base_filename` with the suffixes "_a.orc", "_b.orc" and "_c.orc" appended.
+std::vector<std::string> input_limit_get_test_names(std::string const& base_filename)
+{
+  return {base_filename + "_a.orc", base_filename + "_b.orc", base_filename + "_c.orc"};
+}
+// Writes `input` to `filepath` as ORC using the given stripe row count and compression.
+void input_limit_test_write_one(std::string const& filepath,
+                                cudf::table_view const& input,
+                                cudf::size_type stripe_size_rows,
+                                cudf::io::compression_type compression)
+{
+  // The writer options are built in place and passed straight to `write_orc`.
+  cudf::io::write_orc(cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, input)
+                        .compression(compression)
+                        .stripe_size_rows(stripe_size_rows)
+                        .build());
+}
+// Writes `input` once per file in `test_files`: uncompressed, then ZSTD, then SNAPPY.
+void input_limit_test_write(
+  std::vector<std::string> const& test_files,
+  cudf::table_view const& input,
+  cudf::size_type stripe_size_rows = 20'000 /*write relatively small stripes by default*/)
+{
+  CUDF_EXPECTS(test_files.size() == input_limit_expected_file_count,
+               "Unexpected count of test filenames.");
+
+  // ZSTD yields a very small decompression size, which can be much smaller than SNAPPY.
+  // However, the ORC reader typically over-estimates the decompression size of data
+  // compressed by ZSTD to be very large, possibly much larger than that of SNAPPY.
+  // That is because ZSTD may use a lot of scratch space at decode time
+  // (2.5x the total decompressed buffer size).
+  // As such, we may see smaller output chunks for the input data compressed by ZSTD.
+  using cudf::io::compression_type;
+  compression_type const codecs[] = {
+    compression_type::NONE, compression_type::ZSTD, compression_type::SNAPPY};
+  for (std::size_t i = 0; i < input_limit_expected_file_count; ++i) {
+    input_limit_test_write_one(test_files[i], input, stripe_size_rows, codecs[i]);
+  }
+}
+// Reads every file with both limits and checks its chunk count and contents against `input`.
+void input_limit_test_read(int test_location,
+                           std::vector<std::string> const& test_files,
+                           cudf::table_view const& input,
+                           output_limit output_limit_bytes,
+                           input_limit input_limit_bytes,
+                           int const* expected_chunk_counts)
+{
+  CUDF_EXPECTS(test_files.size() == input_limit_expected_file_count,
+               "Unexpected count of test filenames.");
+
+  for (size_t file_idx = 0; file_idx < test_files.size(); ++file_idx) {
+    SCOPED_TRACE("Original line of failure: " + std::to_string(test_location) +
+                 ", file idx: " + std::to_string(file_idx));
+    auto const [actual, n_chunks] =
+      chunked_read(test_files[file_idx], output_limit_bytes, input_limit_bytes);
+    EXPECT_EQ(expected_chunk_counts[file_idx], n_chunks);
+    // TODO: presumably meant to become an exact equality check - confirm.
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*actual, input);
+  }
+}
+
+}  // namespace
+// Fixture grouping the chunked ORC reader tests below that pass an input (data read) limit.
+struct OrcChunkedReaderInputLimitTest : public cudf::test::BaseFixture {};
+
+TEST_F(OrcChunkedReaderInputLimitTest, SingleFixedWidthColumn)
+{
+  auto constexpr num_rows = 1'000'000;
+  auto const value_iter   = thrust::make_constant_iterator(15);
+  auto const values       = doubles_col(value_iter, value_iter + num_rows);
+  auto const input        = cudf::table_view{{values}};
+
+  auto const base_path  = temp_env->get_temp_filepath(std::string{"single_col_fixed_width"});
+  auto const test_files = input_limit_get_test_names(base_path);
+  input_limit_test_write(test_files, input);
+
+  {  // 1-byte input limit: 50 chunks per file, i.e. 1M rows / 20k rows per default test stripe.
+    int constexpr counts[] = {50, 50, 50};
+    input_limit_test_read(
+      __LINE__, test_files, input, output_limit{0UL}, input_limit{1UL}, counts);
+  }
+
+  {  // 2 MiB input limit: fewer chunks; counts are for the NONE, ZSTD and SNAPPY files.
+    int constexpr counts[] = {17, 13, 10};
+    input_limit_test_read(
+      __LINE__, test_files, input, output_limit{0UL}, input_limit{2 * 1024 * 1024UL}, counts);
+  }
+}
+
+TEST_F(OrcChunkedReaderInputLimitTest, MixedColumns)
+{
+  auto constexpr num_rows = 1'000'000;
+
+  // Increasing sequences 0, 1, 2, ... for both the int32 and the double column.
+  auto const int_iter = thrust::make_counting_iterator<int>(0);
+  auto const ints     = int32s_col(int_iter, int_iter + num_rows);
+
+  auto const float_iter = thrust::make_counting_iterator<double>(0);
+  auto const floats     = doubles_col(float_iter, float_iter + num_rows);
+
+  // String column: "abc" for the first 250k rows, "de" for the next 500k, "fghi" for the rest.
+  auto const words     = std::vector<std::string>{"abc", "de", "fghi"};
+  auto const word_iter = cudf::detail::make_counting_transform_iterator(
+    0, [&words](int32_t i) { return words[i < 250000 ? 0 : (i < 750000 ? 1 : 2)]; });
+  auto const strs = strings_col(word_iter, word_iter + num_rows);
+
+  auto const base_path  = temp_env->get_temp_filepath(std::string{"mixed_columns"});
+  auto const test_files = input_limit_get_test_names(base_path);
+  auto const input      = cudf::table_view{{ints, floats, strs}};
+  input_limit_test_write(test_files, input);
+
+  {  // 1-byte input limit: 50 chunks per file, i.e. 1M rows / 20k rows per default test stripe.
+    int constexpr counts[] = {50, 50, 50};
+    input_limit_test_read(
+      __LINE__, test_files, input, output_limit{0UL}, input_limit{1UL}, counts);
+  }
+
+  {  // 2 MiB input limit: 17 chunks for the NONE and SNAPPY files, still 50 for the ZSTD file.
+    int constexpr counts[] = {17, 50, 17};
+    input_limit_test_read(
+      __LINE__, test_files, input, output_limit{0UL}, input_limit{2 * 1024 * 1024UL}, counts);
+  }
+}
+
+namespace {
+// Device functors used to fill the large test columns below.
+struct offset_gen {  // row index -> offset, for rows holding `group_size` elements each
+  int const group_size;
+  __device__ int operator()(int i) const { return i * group_size; }
+};
+
+template <typename T>
+struct value_gen {  // element index -> `i % 1024`, converted to `T`
+  __device__ T operator()(int i) const { return i % 1024; }
+};
+
+struct char_values {
+  __device__ int8_t operator()(int i) const
+  {
+    // Generate repeating 3-runs of 2 values each: "aabbccaabbcc...".
+    int const run = (i / 2) % 3;
+    if (run == 0) { return 'a'; }
+    return run == 1 ? 'b' : 'c';
+  }
+};
+}  // namespace
+
+TEST_F(OrcChunkedReaderInputLimitTest, ListType)
+{
+  int constexpr num_rows  = 50'000'000;
+  int constexpr list_size = 4;
+
+  auto const stream = cudf::get_default_stream();
+  auto const iter   = thrust::make_counting_iterator(0);
+  // Offsets are 0, 4, 8, ...: each of the `num_rows` lists has exactly `list_size` elements.
+  auto offset_col = cudf::make_fixed_width_column(
+    cudf::data_type{cudf::type_id::INT32}, num_rows + 1, cudf::mask_state::UNALLOCATED);
+  thrust::transform(rmm::exec_policy(stream),
+                    iter,
+                    iter + num_rows + 1,
+                    offset_col->mutable_view().begin<int>(),
+                    offset_gen{list_size});
+  // Child element `i` holds the value `i % 1024` (see `value_gen`).
+  int constexpr num_ints = num_rows * list_size;
+  auto value_col         = cudf::make_fixed_width_column(
+    cudf::data_type{cudf::type_id::INT32}, num_ints, cudf::mask_state::UNALLOCATED);
+  thrust::transform(rmm::exec_policy(stream),
+                    iter,
+                    iter + num_ints,
+                    value_col->mutable_view().begin<int>(),
+                    value_gen<int>{});
+
+  auto const lists_col =
+    cudf::make_lists_column(num_rows, std::move(offset_col), std::move(value_col), 0, {}, stream);
+
+  auto const filename   = std::string{"list_type"};
+  auto const test_files = input_limit_get_test_names(temp_env->get_temp_filepath(filename));
+  auto const input      = cudf::table_view{{*lists_col}};
+
+  // Although we set `stripe_size_rows` to be very large, the writer only writes
+  // 250k rows (top level) per stripe due to having a nested type.
+  // Thus, we have 200 stripes in total.
+  input_limit_test_write(test_files, input, cudf::io::default_stripe_size_rows);
+  // No output limit, 5 MiB input limit; counts are for the NONE, ZSTD and SNAPPY files.
+  {
+    int constexpr expected[] = {3, 40, 3};
+    input_limit_test_read(
+      __LINE__, test_files, input, output_limit{0UL}, input_limit{5 * 1024 * 1024UL}, expected);
+  }
+  // Same input limit plus a 128 MiB output limit.
+  {
+    int constexpr expected[] = {8, 40, 9};
+    input_limit_test_read(__LINE__,
+                          test_files,
+                          input,
+                          output_limit{128 * 1024 * 1024UL},
+                          input_limit{5 * 1024 * 1024UL},
+                          expected);
+  }
+}
+
+TEST_F(OrcChunkedReaderInputLimitTest, MixedColumnsHavingList)
+{
+  int constexpr num_rows  = 50'000'000;
+  int constexpr list_size = 4;
+  int constexpr str_size  = 3;
+
+  auto const stream = cudf::get_default_stream();
+  auto const iter   = thrust::make_counting_iterator(0);
+
+  // list<int>: every row has `list_size` elements; child element `i` holds `i % 1024`
+  auto offset_col = cudf::make_fixed_width_column(
+    cudf::data_type{cudf::type_id::INT32}, num_rows + 1, cudf::mask_state::UNALLOCATED);
+  thrust::transform(rmm::exec_policy(stream),
+                    iter,
+                    iter + num_rows + 1,
+                    offset_col->mutable_view().begin<int>(),
+                    offset_gen{list_size});
+
+  int constexpr num_ints = num_rows * list_size;
+  auto value_col         = cudf::make_fixed_width_column(
+    cudf::data_type{cudf::type_id::INT32}, num_ints, cudf::mask_state::UNALLOCATED);
+  thrust::transform(rmm::exec_policy(stream),
+                    iter,
+                    iter + num_ints,
+                    value_col->mutable_view().begin<int>(),
+                    value_gen<int>{});
+
+  auto const lists_col =
+    cudf::make_lists_column(num_rows, std::move(offset_col), std::move(value_col), 0, {}, stream);
+
+  // strings: every row has `str_size` characters taken from the pattern "aabbccaabbcc..."
+  int constexpr num_chars = num_rows * str_size;
+  auto str_offset_col     = cudf::make_fixed_width_column(
+    cudf::data_type{cudf::type_id::INT32}, num_rows + 1, cudf::mask_state::UNALLOCATED);
+  thrust::transform(rmm::exec_policy(stream),
+                    iter,
+                    iter + num_rows + 1,
+                    str_offset_col->mutable_view().begin<int>(),
+                    offset_gen{str_size});
+  rmm::device_buffer str_chars(num_chars, stream);
+  thrust::transform(rmm::exec_policy(stream),
+                    iter,
+                    iter + num_chars,
+                    static_cast<int8_t*>(str_chars.data()),
+                    char_values{});
+  auto const str_col =
+    cudf::make_strings_column(num_rows, std::move(str_offset_col), std::move(str_chars), 0, {});
+
+  // doubles: row `i` holds `i % 1024` (see `value_gen`)
+  auto const double_col = cudf::make_fixed_width_column(
+    cudf::data_type{cudf::type_id::FLOAT64}, num_rows, cudf::mask_state::UNALLOCATED);
+  thrust::transform(rmm::exec_policy(stream),
+                    iter,
+                    iter + num_rows,
+                    double_col->mutable_view().begin<double>(),
+                    value_gen<double>{});
+
+  auto const filename   = std::string{"mixed_cols_having_list"};
+  auto const test_files = input_limit_get_test_names(temp_env->get_temp_filepath(filename));
+  auto const input      = cudf::table_view{{*lists_col, *str_col, *double_col}};
+
+  // Although we set `stripe_size_rows` to be very large, the writer only writes
+  // 250k rows (top level) per stripe due to having a nested type.
+  // Thus, we have 200 stripes in total.
+  input_limit_test_write(test_files, input, cudf::io::default_stripe_size_rows);
+  // No output limit, 128 MiB input limit; counts are for the NONE, ZSTD and SNAPPY files.
+  {
+    int constexpr expected[] = {13, 8, 6};
+    input_limit_test_read(
+      __LINE__, test_files, input, output_limit{0UL}, input_limit{128 * 1024 * 1024UL}, expected);
+  }
+  // Same input limit plus a 128 MiB output limit.
+  {
+    int constexpr expected[] = {13, 15, 17};
+    input_limit_test_read(__LINE__,
+                          test_files,
+                          input,
+                          output_limit{128 * 1024 * 1024UL},
+                          input_limit{128 * 1024 * 1024UL},
+                          expected);
+  }
+}
+
+TEST_F(OrcChunkedReaderInputLimitTest, ReadWithRowSelection)
+{
+  // Chunked read of a row range (skip_rows + num_rows) that is not aligned to stripe boundaries.
+  // `num_rows` should not be divisible by `stripe_size_rows`, to test the correctness of row
+  // selections.
+  int64_t constexpr num_rows       = 100'517'687l;
+  int constexpr rows_per_stripe    = 100'000;
+  int constexpr output_granularity = 50'000;  // minimum number of rows for an output chunk
+  static_assert(num_rows % rows_per_stripe != 0,
+                "`num_rows` should not be divisible by `stripe_size_rows`.");
+
+  auto const it    = thrust::make_counting_iterator(0);
+  auto const col   = int32s_col(it, it + num_rows);
+  auto const input = cudf::table_view{{col}};
+
+  auto const filepath = temp_env->get_temp_filepath("chunk_read_with_row_selection.orc");
+  auto const write_opts =
+    cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, input)
+      .stripe_size_rows(rows_per_stripe)
+      .build();
+  cudf::io::write_orc(write_opts);
+
+  // Verify metadata.
+  auto const metadata = cudf::io::read_orc_metadata(cudf::io::source_info{filepath});
+  EXPECT_EQ(metadata.num_rows(), num_rows);
+  EXPECT_EQ(metadata.num_stripes(), num_rows / rows_per_stripe + 1);
+
+  // Read some random number of rows that is not a multiple of the stripe size.
+  int constexpr random_val       = 123456;
+  int constexpr num_rows_to_read = rows_per_stripe * 5 + random_val;
+
+  // Just shift the read data region back by a random offset.
+  auto const num_rows_to_skip = num_rows - num_rows_to_read - random_val;
+
+  auto const sequence_start = num_rows_to_skip % num_rows;
+  auto const skipped_col = int32s_col(it + sequence_start, it + sequence_start + num_rows_to_read);
+  auto const expected    = cudf::table_view{{skipped_col}};
+
+  auto const read_opts = cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath})
+                           .use_index(false)
+                           .skip_rows(num_rows_to_skip)
+                           .num_rows(num_rows_to_read)
+                           .build();
+
+  auto reader = cudf::io::chunked_orc_reader(
+    60'000UL * sizeof(int) /*output limit, equal to 60k rows, less than rows in 1 stripe*/,
+    rows_per_stripe * sizeof(int) /*input limit, around size of 1 stripe's decoded data*/,
+    output_granularity /*output granularity, or minimum number of rows for the output chunk*/,
+    read_opts);
+
+  auto num_chunks  = 0;
+  auto read_tables = std::vector<std::unique_ptr<cudf::table>>{};
+  auto tviews      = std::vector<cudf::table_view>{};
+
+  do {
+    auto chunk = reader.read_chunk();
+    // Each chunk should have exactly `output_granularity` (50k) rows, or the final remainder.
+    EXPECT_TRUE(chunk.tbl->num_rows() == output_granularity ||
+                chunk.tbl->num_rows() == num_rows_to_read % output_granularity);
+
+    tviews.emplace_back(chunk.tbl->view());
+    read_tables.emplace_back(std::move(chunk.tbl));
+    ++num_chunks;
+  } while (reader.has_next());
+
+  auto const read_result = cudf::concatenate(tviews);
+  EXPECT_EQ(num_chunks, 13);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected, read_result->view());
+}
+
+TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow)
+{
+  // Writes `num_reps` copies of a `num_rows`-row table into one in-memory ORC file, so that the
+  // file holds more rows than `cudf::size_type` can represent, then reads it back in chunks.
+  using data_type = int16_t;
+  using data_col  = cudf::test::fixed_width_column_wrapper<data_type, int64_t>;
+
+  int64_t constexpr num_rows    = 500'000'000l;
+  int constexpr rows_per_stripe = 1'000'000;
+  int constexpr num_reps        = 10;
+  int64_t constexpr total_rows  = num_rows * num_reps;
+  static_assert(total_rows > std::numeric_limits<cudf::size_type>::max());
+
+  auto const it  = cudf::detail::make_counting_transform_iterator(0l, [num_rows](int64_t i) {
+    return (i % num_rows) % static_cast<int64_t>(std::numeric_limits<data_type>::max() / 2);
+  });
+  auto const col = data_col(it, it + num_rows);
+  auto const chunk_table = cudf::table_view{{col}};
+
+  std::vector<char> data_buffer;
+  {
+    auto const write_opts =
+      cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{&data_buffer})
+        .stripe_size_rows(rows_per_stripe)
+        .build();
+
+    auto writer = cudf::io::orc_chunked_writer(write_opts);
+    for (int i = 0; i < num_reps; ++i) {
+      writer.write(chunk_table);
+    }
+  }
+
+  // Verify metadata.
+  auto const metadata =
+    cudf::io::read_orc_metadata(cudf::io::source_info{data_buffer.data(), data_buffer.size()});
+  EXPECT_EQ(metadata.num_rows(), total_rows);
+  EXPECT_EQ(metadata.num_stripes(), total_rows / rows_per_stripe);
+
+  // Read with row selections and memory limit.
+  {
+    int64_t constexpr num_rows_to_read = 5'000'000l;
+    int64_t const num_rows_to_skip =
+      static_cast<int64_t>(metadata.num_rows()) - num_rows_to_read -
+      123456l /*just shift the read data region back by a random offset*/;
+
+    // Check validity of the last 5 million rows.
+    auto const sequence_start = num_rows_to_skip % num_rows;
+    auto const skipped_col = data_col(it + sequence_start, it + sequence_start + num_rows_to_read);
+    auto const expected    = cudf::table_view{{skipped_col}};
+
+    auto const read_opts = cudf::io::orc_reader_options::builder(
+                             cudf::io::source_info{data_buffer.data(), data_buffer.size()})
+                             .use_index(false)
+                             .skip_rows(num_rows_to_skip)
+                             .num_rows(num_rows_to_read)
+                             .build();
+    auto reader = cudf::io::chunked_orc_reader(
+      600'000UL * sizeof(data_type) /* output limit, equal to 600k rows */,
+      rows_per_stripe * sizeof(data_type) /* input limit, around size of 1 stripe's decoded data */,
+      rows_per_stripe / 2 /* output granularity, or minimum number of rows for the output chunk */,
+      read_opts);
+
+    auto num_chunks  = 0;
+    auto read_tables = std::vector<std::unique_ptr<cudf::table>>{};
+    auto tviews      = std::vector<cudf::table_view>{};
+
+    do {
+      auto chunk = reader.read_chunk();
+      ++num_chunks;
+      tviews.emplace_back(chunk.tbl->view());
+      read_tables.emplace_back(std::move(chunk.tbl));
+    } while (reader.has_next());
+
+    auto const read_result = cudf::concatenate(tviews);
+    EXPECT_EQ(num_chunks, 11);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, read_result->view());
+  }
+
+  // The test below requires a huge amount of memory, thus it is disabled by default.
+#ifdef LOCAL_TEST
+  // Read with only output limit -- there is no limit on the memory usage.
+  // However, the reader should be able to detect and load only enough stripes each time
+  // to avoid decoding a table having number of rows that exceeds the column size limit.
+  {
+    auto const read_opts = cudf::io::orc_reader_options::builder(
+                             cudf::io::source_info{data_buffer.data(), data_buffer.size()})
+                             .use_index(false)
+                             .build();
+    auto reader = cudf::io::chunked_orc_reader(
+      static_cast<std::size_t>(rows_per_stripe * 5.7) *
+        sizeof(data_type) /* output limit, equal to 5.7M rows */,
+      0UL /* no input limit */,
+      rows_per_stripe / 2 /* output granularity, or minimum number of rows for the output chunk */,
+      read_opts);
+
+    int num_chunks          = 0;
+    int64_t num_read_rows   = 0;
+    int64_t test_rows_start = 0;
+    auto test_chunk         = std::unique_ptr<cudf::table>{};
+
+    do {
+      auto chunk            = reader.read_chunk();
+      auto const chunk_rows = chunk.tbl->num_rows();
+      // Just randomly select one output chunk to verify.
+      if (num_chunks == 123) {
+        test_rows_start = num_read_rows;
+        test_chunk      = std::move(chunk.tbl);
+      }
+      ++num_chunks;
+      num_read_rows += chunk_rows;
+    } while (reader.has_next());
+
+    EXPECT_EQ(num_read_rows, total_rows);
+
+    // Typically, we got a chunk having 5M rows.
+    // However, since the reader internally splits file stripes that are not multiple of 5 stripes,
+    // we may have some extra chunks that have less than 5M rows.
+    EXPECT_EQ(num_chunks, 1002);
+
+    // Verify the selected chunk (fail cleanly, instead of crashing, if it was never produced).
+    ASSERT_TRUE(test_chunk != nullptr) << "The reader produced too few chunks to select one.";
+    using namespace cudf::test::iterators;
+    auto const skipped_col =
+      data_col(it + test_rows_start, it + test_rows_start + test_chunk->num_rows(), no_nulls());
+    auto const expected = cudf::table_view{{skipped_col}};
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, test_chunk->view());
+  }
+#endif  // LOCAL_TEST
+}
diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp
index a16b3d63177..fd8484bc70f 100644
--- a/cpp/tests/io/parquet_writer_test.cpp
+++ b/cpp/tests/io/parquet_writer_test.cpp
@@ -567,9 +567,7 @@ TEST_F(ParquetWriterTest, EmptyList)
   auto result = cudf::io::read_parquet(
     cudf::io::parquet_reader_options_builder(cudf::io::source_info(filepath)));
 
-  using lcw     = cudf::test::lists_column_wrapper<int64_t>;
-  auto expected = lcw{lcw{}, lcw{}, lcw{}};
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(0), expected);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(0), L0->view());
 }
 
 TEST_F(ParquetWriterTest, DeepEmptyList)
@@ -903,6 +901,12 @@ TEST_F(ParquetWriterTest, CheckColumnIndexTruncation)
       ASSERT_TRUE(stats.min_value.has_value());
       ASSERT_TRUE(stats.max_value.has_value());
 
+      // check that min and max for the column chunk are exact (i.e. not truncated)
+      ASSERT_TRUE(stats.is_max_value_exact.has_value());
+      EXPECT_TRUE(stats.is_max_value_exact.value());
+      ASSERT_TRUE(stats.is_min_value_exact.has_value());
+      EXPECT_TRUE(stats.is_min_value_exact.value());
+
       // check trunc(page.min) <= stats.min && trun(page.max) >= stats.max
       auto const ptype = fmd.schema[c + 1].type;
       auto const ctype = fmd.schema[c + 1].converted_type;
@@ -1674,7 +1678,18 @@ TEST_F(ParquetWriterTest, UserRequestedEncodings)
   // no nulls and no repetition, so the only encoding used should be for the data.
   // since we're writing v1, both dict and data pages should use PLAIN_DICTIONARY.
   auto const expect_enc = [&fmd](int idx, cudf::io::parquet::detail::Encoding enc) {
-    EXPECT_EQ(fmd.row_groups[0].columns[idx].meta_data.encodings[0], enc);
+    auto const& col_meta = fmd.row_groups[0].columns[idx].meta_data;
+    EXPECT_EQ(col_meta.encodings[0], enc);
+
+    // also check encoding stats are written properly
+    ASSERT_TRUE(col_meta.encoding_stats.has_value());
+    auto const& enc_stats = col_meta.encoding_stats.value();
+    for (auto const& ec : enc_stats) {
+      if (ec.page_type == cudf::io::parquet::detail::PageType::DATA_PAGE) {
+        EXPECT_EQ(ec.encoding, enc);
+        EXPECT_EQ(ec.count, 1);
+      }
+    }
   };
 
   // requested plain
diff --git a/cpp/tests/io/row_selection_test.cpp b/cpp/tests/io/row_selection_test.cpp
index 0c259c81a23..ebadd870091 100644
--- a/cpp/tests/io/row_selection_test.cpp
+++ b/cpp/tests/io/row_selection_test.cpp
@@ -122,17 +122,4 @@ TEST_F(FromOptsTest, LimitOptionsToFileRows)
   }
 }
 
-TEST_F(FromOptsTest, OverFlowDetection)
-{
-  auto const too_large_for_32bit = std::numeric_limits<int64_t>::max();
-
-  // Too many rows to read until the end of the file
-  EXPECT_THROW(skip_rows_num_rows_from_options(0, std::nullopt, too_large_for_32bit),
-               std::overflow_error);
-
-  // Should work fine with num_rows
-  EXPECT_NO_THROW(
-    skip_rows_num_rows_from_options(1000, too_large_for_32bit - 100, too_large_for_32bit));
-}
-
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/labeling/label_bins_tests.cpp b/cpp/tests/labeling/label_bins_tests.cpp
index 2ac6ad5dd0d..1a9e74df9be 100644
--- a/cpp/tests/labeling/label_bins_tests.cpp
+++ b/cpp/tests/labeling/label_bins_tests.cpp
@@ -25,6 +25,7 @@
 #include <cudf/copying.hpp>
 #include <cudf/labeling/label_bins.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/error.hpp>
 
 #include <algorithm>
 #include <limits>
@@ -64,7 +65,7 @@ TEST(BinColumnErrorTests, TestInvalidLeft)
 
   EXPECT_THROW(
     cudf::label_bins(input, left_edges, cudf::inclusive::YES, right_edges, cudf::inclusive::NO),
-    cudf::logic_error);
+    cudf::data_type_error);
 };
 
 // Right edges type check.
@@ -76,7 +77,7 @@ TEST(BinColumnErrorTests, TestInvalidRight)
 
   EXPECT_THROW(
     cudf::label_bins(input, left_edges, cudf::inclusive::YES, right_edges, cudf::inclusive::NO),
-    cudf::logic_error);
+    cudf::data_type_error);
 };
 
 // Input type check.
@@ -88,7 +89,7 @@ TEST(BinColumnErrorTests, TestInvalidInput)
 
   EXPECT_THROW(
     cudf::label_bins(input, left_edges, cudf::inclusive::YES, right_edges, cudf::inclusive::NO),
-    cudf::logic_error);
+    cudf::data_type_error);
 };
 
 // Number of left and right edges must match.
diff --git a/cpp/tests/large_strings/parquet_tests.cpp b/cpp/tests/large_strings/parquet_tests.cpp
new file mode 100644
index 00000000000..007c08ce0fb
--- /dev/null
+++ b/cpp/tests/large_strings/parquet_tests.cpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "large_strings_fixture.hpp"
+
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/table_utilities.hpp>
+
+#include <cudf/io/parquet.hpp>
+#include <cudf/io/types.hpp>
+#include <cudf/table/table_view.hpp>
+
+namespace {
+// Temp-directory environment registered with GoogleTest; tests get their file paths from it.
+cudf::test::TempDirTestEnvironment* const g_temp_env =
+  static_cast<cudf::test::TempDirTestEnvironment*>(
+    ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment));
+
+}  // namespace
+// Parquet tests built on the large-strings fixture, which supplies `long_column()`.
+struct ParquetStringsTest : public cudf::test::StringsLargeTest {};
+
+TEST_F(ParquetStringsTest, ReadLargeStrings)
+{
+  // need to create a string column larger than `threshold`
+  auto const col0        = this->long_column();
+  auto const column_size = cudf::strings_column_view(col0).chars_size(cudf::get_default_stream());
+  auto const threshold   = column_size - 1;
+  auto const expected    = cudf::table_view{{col0, col0, col0}};
+
+  auto expected_metadata = cudf::io::table_input_metadata{expected};
+  expected_metadata.column_metadata[1].set_encoding(
+    cudf::io::column_encoding::DELTA_LENGTH_BYTE_ARRAY);
+  expected_metadata.column_metadata[2].set_encoding(cudf::io::column_encoding::DELTA_BYTE_ARRAY);
+
+  // set smaller threshold to reduce file size and execution time
+  setenv("LIBCUDF_LARGE_STRINGS_THRESHOLD", std::to_string(threshold).c_str(), 1);
+  struct threshold_guard {  // go back to normal threshold on every exit path, even on a throw
+    ~threshold_guard() { unsetenv("LIBCUDF_LARGE_STRINGS_THRESHOLD"); }
+  } const restore_threshold{};
+
+  auto const filepath = g_temp_env->get_temp_filepath("ReadLargeStrings.parquet");
+  cudf::io::parquet_writer_options out_opts =
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected)
+      .compression(cudf::io::compression_type::ZSTD)
+      .stats_level(cudf::io::STATISTICS_NONE)
+      .metadata(expected_metadata);
+  cudf::io::write_parquet(out_opts);
+
+  cudf::io::parquet_reader_options default_in_opts =
+    cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath});
+  auto const result      = cudf::io::read_parquet(default_in_opts);
+  auto const result_view = result.tbl->view();
+  for (auto cv : result_view) {
+    auto const offsets = cudf::strings_column_view(cv).offsets();
+    EXPECT_EQ(offsets.type(), cudf::data_type{cudf::type_id::INT64});
+  }
+  CUDF_TEST_EXPECT_TABLES_EQUAL(result_view, expected);
+}
diff --git a/cpp/tests/lists/combine/concatenate_rows_tests.cpp b/cpp/tests/lists/combine/concatenate_rows_tests.cpp
index 008003a08a1..bf088eb855a 100644
--- a/cpp/tests/lists/combine/concatenate_rows_tests.cpp
+++ b/cpp/tests/lists/combine/concatenate_rows_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/lists/combine.hpp>
+#include <cudf/utilities/error.hpp>
 
 using namespace cudf::test::iterators;
 
@@ -53,7 +54,7 @@ TEST_F(ListConcatenateRowsTest, InvalidInput)
     auto const col1 = IntListsCol{}.release();
     auto const col2 = StrListsCol{}.release();
     EXPECT_THROW(cudf::lists::concatenate_rows(TView{{col1->view(), col2->view()}}),
-                 cudf::logic_error);
+                 cudf::data_type_error);
   }
 }
 
diff --git a/cpp/tests/lists/sequences_tests.cpp b/cpp/tests/lists/sequences_tests.cpp
index e97600a76d3..74545903eb3 100644
--- a/cpp/tests/lists/sequences_tests.cpp
+++ b/cpp/tests/lists/sequences_tests.cpp
@@ -22,6 +22,7 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/lists/filling.hpp>
+#include <cudf/utilities/error.hpp>
 
 using namespace cudf::test::iterators;
 
@@ -200,8 +201,8 @@ TEST_F(NumericSequencesTest, InvalidSizesInput)
   auto const steps  = IntsCol{};
   auto const sizes  = FWDCol<float>{};
 
-  EXPECT_THROW(cudf::lists::sequences(starts, sizes), cudf::logic_error);
-  EXPECT_THROW(cudf::lists::sequences(starts, steps, sizes), cudf::logic_error);
+  EXPECT_THROW(cudf::lists::sequences(starts, sizes), cudf::data_type_error);
+  EXPECT_THROW(cudf::lists::sequences(starts, steps, sizes), cudf::data_type_error);
 }
 
 TEST_F(NumericSequencesTest, MismatchedColumnSizesInput)
@@ -220,7 +221,7 @@ TEST_F(NumericSequencesTest, MismatchedColumnTypesInput)
   auto const steps  = FWDCol<float>{1, 2, 3};
   auto const sizes  = IntsCol{1, 2, 3};
 
-  EXPECT_THROW(cudf::lists::sequences(starts, steps, sizes), cudf::logic_error);
+  EXPECT_THROW(cudf::lists::sequences(starts, steps, sizes), cudf::data_type_error);
 }
 
 TEST_F(NumericSequencesTest, InputHasNulls)
diff --git a/cpp/tests/replace/clamp_test.cpp b/cpp/tests/replace/clamp_test.cpp
index bb33de1f1e7..239c9ce6ddd 100644
--- a/cpp/tests/replace/clamp_test.cpp
+++ b/cpp/tests/replace/clamp_test.cpp
@@ -25,6 +25,7 @@
 #include <cudf/dictionary/encode.hpp>
 #include <cudf/replace.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
+#include <cudf/utilities/error.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 
@@ -41,7 +42,7 @@ TEST_F(ClampErrorTest, MisMatchingScalarTypes)
 
   cudf::test::fixed_width_column_wrapper<int32_t> input({1, 2, 3, 4, 5, 6});
 
-  EXPECT_THROW(cudf::clamp(input, *lo, *hi), cudf::logic_error);
+  EXPECT_THROW(cudf::clamp(input, *lo, *hi), cudf::data_type_error);
 }
 
 TEST_F(ClampErrorTest, MisMatchingInputAndScalarTypes)
@@ -53,7 +54,7 @@ TEST_F(ClampErrorTest, MisMatchingInputAndScalarTypes)
 
   cudf::test::fixed_width_column_wrapper<int64_t> input({1, 2, 3, 4, 5, 6});
 
-  EXPECT_THROW(cudf::clamp(input, *lo, *hi), cudf::logic_error);
+  EXPECT_THROW(cudf::clamp(input, *lo, *hi), cudf::data_type_error);
 }
 
 TEST_F(ClampErrorTest, MisMatchingReplaceScalarTypes)
@@ -69,7 +70,7 @@ TEST_F(ClampErrorTest, MisMatchingReplaceScalarTypes)
 
   cudf::test::fixed_width_column_wrapper<int64_t> input({1, 2, 3, 4, 5, 6});
 
-  EXPECT_THROW(cudf::clamp(input, *lo, *lo_replace, *hi, *hi_replace), cudf::logic_error);
+  EXPECT_THROW(cudf::clamp(input, *lo, *lo_replace, *hi, *hi_replace), cudf::data_type_error);
 }
 
 TEST_F(ClampErrorTest, InValidCase1)
@@ -640,7 +641,7 @@ TYPED_TEST(FixedPointTest, MismatchedScalarScales)
   auto const hi    = cudf::make_fixed_point_scalar<decimalXX>(8, scale);
   auto const input = fp_wrapper{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, scale};
 
-  EXPECT_THROW(cudf::clamp(input, *lo, *hi), cudf::logic_error);
+  EXPECT_THROW(cudf::clamp(input, *lo, *hi), cudf::data_type_error);
 }
 
 TYPED_TEST(FixedPointTest, MismatchedColumnScalarScale)
@@ -655,7 +656,7 @@ TYPED_TEST(FixedPointTest, MismatchedColumnScalarScale)
   auto const hi    = cudf::make_fixed_point_scalar<decimalXX>(8, scale);
   auto const input = fp_wrapper{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, scale_type{-4}};
 
-  EXPECT_THROW(cudf::clamp(input, *lo, *hi), cudf::logic_error);
+  EXPECT_THROW(cudf::clamp(input, *lo, *hi), cudf::data_type_error);
 }
 
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/replace/replace_nulls_tests.cpp b/cpp/tests/replace/replace_nulls_tests.cpp
index 6c23dd6bdc8..9603ea44a76 100644
--- a/cpp/tests/replace/replace_nulls_tests.cpp
+++ b/cpp/tests/replace/replace_nulls_tests.cpp
@@ -58,7 +58,7 @@ TEST_F(ReplaceErrorTest, TypeMismatch)
   cudf::test::fixed_width_column_wrapper<float> values_to_replace_column{
     {10, 11, 12, 13, 14, 15, 16, 17}};
 
-  EXPECT_THROW(cudf::replace_nulls(input_column, values_to_replace_column), cudf::logic_error);
+  EXPECT_THROW(cudf::replace_nulls(input_column, values_to_replace_column), cudf::data_type_error);
 }
 
 // Error: column type mismatch
@@ -68,7 +68,7 @@ TEST_F(ReplaceErrorTest, TypeMismatchScalar)
                                                                {0, 0, 1, 1, 1, 1, 1, 1}};
   cudf::numeric_scalar<float> replacement(1);
 
-  EXPECT_THROW(cudf::replace_nulls(input_column, replacement), cudf::logic_error);
+  EXPECT_THROW(cudf::replace_nulls(input_column, replacement), cudf::data_type_error);
 }
 
 struct ReplaceNullsStringsTest : public cudf::test::BaseFixture {};
@@ -659,14 +659,14 @@ TEST_F(ReplaceDictionaryTest, ReplaceNullsError)
   cudf::test::fixed_width_column_wrapper<int64_t> replacement_w({1, 2, 3, 4});
   auto replacement = cudf::dictionary::encode(replacement_w);
 
-  EXPECT_THROW(cudf::replace_nulls(input->view(), replacement->view()), cudf::logic_error);
-  EXPECT_THROW(cudf::replace_nulls(input->view(), cudf::string_scalar("x")), cudf::logic_error);
+  EXPECT_THROW(cudf::replace_nulls(input->view(), replacement->view()), cudf::data_type_error);
+  EXPECT_THROW(cudf::replace_nulls(input->view(), cudf::string_scalar("x")), cudf::data_type_error);
 
   cudf::test::fixed_width_column_wrapper<int64_t> input_one_w({1}, {0});
   auto input_one  = cudf::dictionary::encode(input_one_w);
   auto dict_input = cudf::dictionary_column_view(input_one->view());
   auto dict_repl  = cudf::dictionary_column_view(replacement->view());
-  EXPECT_THROW(cudf::replace_nulls(input->view(), replacement->view()), cudf::logic_error);
+  EXPECT_THROW(cudf::replace_nulls(input->view(), replacement->view()), cudf::data_type_error);
 }
 
 TEST_F(ReplaceDictionaryTest, ReplaceNullsEmpty)
diff --git a/cpp/tests/replace/replace_tests.cpp b/cpp/tests/replace/replace_tests.cpp
index 613034efc12..1858cd7782e 100644
--- a/cpp/tests/replace/replace_tests.cpp
+++ b/cpp/tests/replace/replace_tests.cpp
@@ -30,6 +30,7 @@
 #include <cudf/null_mask.hpp>
 #include <cudf/replace.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/error.hpp>
 
 #include <thrust/host_vector.h>
 #include <thrust/iterator/transform_iterator.h>
@@ -63,7 +64,7 @@ TEST_F(ReplaceErrorTest, TypeMismatch)
 
   EXPECT_THROW(
     cudf::find_and_replace_all(input_column, values_to_replace_column, replacement_values_column),
-    cudf::logic_error);
+    cudf::data_type_error);
 }
 
 // Error: nulls in old-values
diff --git a/cpp/tests/transform/one_hot_encode_tests.cpp b/cpp/tests/transform/one_hot_encode_tests.cpp
index 1015370fe4b..8384cb3480b 100644
--- a/cpp/tests/transform/one_hot_encode_tests.cpp
+++ b/cpp/tests/transform/one_hot_encode_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 
 #include <cudf/table/table_view.hpp>
 #include <cudf/transform.hpp>
+#include <cudf/utilities/error.hpp>
 
 #include <limits>
 
@@ -198,7 +199,7 @@ TEST_F(OneHotEncodingTest, MismatchTypes)
   auto input    = cudf::test::strings_column_wrapper{"xx", "yy", "xx"};
   auto category = cudf::test::fixed_width_column_wrapper<int64_t>{1};
 
-  EXPECT_THROW(cudf::one_hot_encode(input, category), cudf::logic_error);
+  EXPECT_THROW(cudf::one_hot_encode(input, category), cudf::data_type_error);
 }
 
 TEST_F(OneHotEncodingTest, List)
diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu
index 047b096a283..7cc2777972e 100644
--- a/cpp/tests/utilities/column_utilities.cu
+++ b/cpp/tests/utilities/column_utilities.cu
@@ -31,6 +31,7 @@
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/type_checks.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/exec_policy.hpp>
@@ -238,11 +239,6 @@ std::unique_ptr<column> generate_child_row_indices(lists_column_view const& c,
 
 template <bool check_exact_equality>
 struct column_property_comparator {
-  bool types_equivalent(cudf::data_type const& lhs, cudf::data_type const& rhs)
-  {
-    return is_fixed_point(lhs) ? lhs.id() == rhs.id() : lhs == rhs;
-  }
-
   bool compare_common(cudf::column_view const& lhs,
                       cudf::column_view const& rhs,
                       cudf::column_view const& lhs_row_indices,
@@ -252,9 +248,9 @@ struct column_property_comparator {
     bool result = true;
 
     if (check_exact_equality) {
-      PROP_EXPECT_EQ(lhs.type(), rhs.type());
+      PROP_EXPECT_EQ(cudf::have_same_types(lhs, rhs), true);
     } else {
-      PROP_EXPECT_EQ(types_equivalent(lhs.type(), rhs.type()), true);
+      PROP_EXPECT_EQ(cudf::column_types_equivalent(lhs, rhs), true);
     }
 
     auto const lhs_size = check_exact_equality ? lhs.size() : lhs_row_indices.size();
diff --git a/cpp/tests/utilities_tests/type_check_tests.cpp b/cpp/tests/utilities_tests/type_check_tests.cpp
index 9c23798fce6..fecb896f95a 100644
--- a/cpp/tests/utilities_tests/type_check_tests.cpp
+++ b/cpp/tests/utilities_tests/type_check_tests.cpp
@@ -19,13 +19,11 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/fixed_point/fixed_point.hpp>
+#include <cudf/table/table_view.hpp>
 #include <cudf/utilities/type_checks.hpp>
 #include <cudf/wrappers/durations.hpp>
 #include <cudf/wrappers/timestamps.hpp>
 
-namespace cudf {
-namespace test {
-
 template <typename T>
 struct ColumnTypeCheckTestTyped : public cudf::test::BaseFixture {};
 
@@ -35,56 +33,56 @@ TYPED_TEST_SUITE(ColumnTypeCheckTestTyped, cudf::test::FixedWidthTypes);
 
 TYPED_TEST(ColumnTypeCheckTestTyped, SameFixedWidth)
 {
-  fixed_width_column_wrapper<TypeParam> lhs{1, 1}, rhs{2};
-  EXPECT_TRUE(column_types_equal(lhs, rhs));
+  cudf::test::fixed_width_column_wrapper<TypeParam> lhs{1, 1}, rhs{2};
+  EXPECT_TRUE(cudf::have_same_types(lhs, rhs));
 }
 
 TEST_F(ColumnTypeCheckTest, SameString)
 {
-  strings_column_wrapper lhs{{'a', 'a'}}, rhs{{'b'}};
-  EXPECT_TRUE(column_types_equal(lhs, rhs));
+  cudf::test::strings_column_wrapper lhs{{'a', 'a'}}, rhs{{'b'}};
+  EXPECT_TRUE(cudf::have_same_types(lhs, rhs));
 
-  strings_column_wrapper lhs2{}, rhs2{{'b'}};
-  EXPECT_TRUE(column_types_equal(lhs2, rhs2));
+  cudf::test::strings_column_wrapper lhs2{}, rhs2{{'b'}};
+  EXPECT_TRUE(cudf::have_same_types(lhs2, rhs2));
 
-  strings_column_wrapper lhs3{}, rhs3{};
-  EXPECT_TRUE(column_types_equal(lhs3, rhs3));
+  cudf::test::strings_column_wrapper lhs3{}, rhs3{};
+  EXPECT_TRUE(cudf::have_same_types(lhs3, rhs3));
 }
 
 TEST_F(ColumnTypeCheckTest, SameList)
 {
-  using LCW = lists_column_wrapper<int32_t>;
+  using LCW = cudf::test::lists_column_wrapper<int32_t>;
 
   LCW lhs{}, rhs{};
-  EXPECT_TRUE(column_types_equal(lhs, rhs));
+  EXPECT_TRUE(cudf::have_same_types(lhs, rhs));
 
   LCW lhs2{{1, 2, 3}}, rhs2{{4, 5}};
-  EXPECT_TRUE(column_types_equal(lhs2, rhs2));
+  EXPECT_TRUE(cudf::have_same_types(lhs2, rhs2));
 
   LCW lhs3{{LCW{1}, LCW{2, 3}}}, rhs3{{LCW{4, 5}}};
-  EXPECT_TRUE(column_types_equal(lhs3, rhs3));
+  EXPECT_TRUE(cudf::have_same_types(lhs3, rhs3));
 
   LCW lhs4{{LCW{1}, LCW{}, LCW{2, 3}}}, rhs4{{LCW{4, 5}, LCW{}}};
-  EXPECT_TRUE(column_types_equal(lhs4, rhs4));
+  EXPECT_TRUE(cudf::have_same_types(lhs4, rhs4));
 }
 
 TYPED_TEST(ColumnTypeCheckTestTyped, SameDictionary)
 {
-  using DCW = dictionary_column_wrapper<TypeParam>;
+  using DCW = cudf::test::dictionary_column_wrapper<TypeParam>;
   DCW lhs{1, 1, 2, 3}, rhs{5, 5};
-  EXPECT_TRUE(column_types_equal(lhs, rhs));
+  EXPECT_TRUE(cudf::have_same_types(lhs, rhs));
 
   DCW lhs2{}, rhs2{};
-  EXPECT_TRUE(column_types_equal(lhs2, rhs2));
+  EXPECT_TRUE(cudf::have_same_types(lhs2, rhs2));
 }
 
 TEST_F(ColumnTypeCheckTest, SameStruct)
 {
-  using SCW      = structs_column_wrapper;
-  using FCW      = fixed_width_column_wrapper<int32_t>;
-  using StringCW = strings_column_wrapper;
-  using LCW      = lists_column_wrapper<int32_t>;
-  using DCW      = dictionary_column_wrapper<int32_t>;
+  using SCW      = cudf::test::structs_column_wrapper;
+  using FCW      = cudf::test::fixed_width_column_wrapper<int32_t>;
+  using StringCW = cudf::test::strings_column_wrapper;
+  using LCW      = cudf::test::lists_column_wrapper<int32_t>;
+  using DCW      = cudf::test::dictionary_column_wrapper<int32_t>;
 
   FCW lf1{1, 2, 3}, rf1{0, 1};
   StringCW lf2{"a", "bb", ""}, rf2{"cc", "d"};
@@ -92,127 +90,158 @@ TEST_F(ColumnTypeCheckTest, SameStruct)
   DCW lf4{5, 5, 5}, rf4{9, 9};
 
   SCW lhs{lf1, lf2, lf3, lf4}, rhs{rf1, rf2, rf3, rf4};
-  EXPECT_TRUE(column_types_equal(lhs, rhs));
+  EXPECT_TRUE(cudf::have_same_types(lhs, rhs));
 }
 
 TEST_F(ColumnTypeCheckTest, DifferentBasics)
 {
-  fixed_width_column_wrapper<int32_t> lhs1{1, 1};
-  strings_column_wrapper rhs1{"a", "bb"};
+  cudf::test::fixed_width_column_wrapper<int32_t> lhs1{1, 1};
+  cudf::test::strings_column_wrapper rhs1{"a", "bb"};
 
-  EXPECT_FALSE(column_types_equal(lhs1, rhs1));
+  EXPECT_FALSE(cudf::have_same_types(lhs1, rhs1));
 
-  lists_column_wrapper<string_view> lhs2{{"hello"}, {"world", "!"}};
-  strings_column_wrapper rhs2{"", "kk"};
+  cudf::test::lists_column_wrapper<cudf::string_view> lhs2{{"hello"}, {"world", "!"}};
+  cudf::test::strings_column_wrapper rhs2{"", "kk"};
 
-  EXPECT_FALSE(column_types_equal(lhs2, rhs2));
+  EXPECT_FALSE(cudf::have_same_types(lhs2, rhs2));
 
-  fixed_width_column_wrapper<int32_t> lhs3{1, 1};
-  dictionary_column_wrapper<int32_t> rhs3{2, 2};
+  cudf::test::fixed_width_column_wrapper<int32_t> lhs3{1, 1};
+  cudf::test::dictionary_column_wrapper<int32_t> rhs3{2, 2};
 
-  EXPECT_FALSE(column_types_equal(lhs3, rhs3));
+  EXPECT_FALSE(cudf::have_same_types(lhs3, rhs3));
 
-  lists_column_wrapper<int32_t> lhs4{{8, 8, 8}, {10, 10}};
-  structs_column_wrapper rhs4{rhs2, rhs3};
+  cudf::test::lists_column_wrapper<int32_t> lhs4{{8, 8, 8}, {10, 10}};
+  cudf::test::structs_column_wrapper rhs4{rhs2, rhs3};
 
-  EXPECT_FALSE(column_types_equal(lhs4, rhs4));
+  EXPECT_FALSE(cudf::have_same_types(lhs4, rhs4));
 }
 
 TEST_F(ColumnTypeCheckTest, DifferentFixedWidth)
 {
-  fixed_width_column_wrapper<int32_t> lhs1{1, 1};
-  fixed_width_column_wrapper<int64_t> rhs1{2};
+  cudf::test::fixed_width_column_wrapper<int32_t> lhs1{1, 1};
+  cudf::test::fixed_width_column_wrapper<int64_t> rhs1{2};
 
-  EXPECT_FALSE(column_types_equal(lhs1, rhs1));
+  EXPECT_FALSE(cudf::have_same_types(lhs1, rhs1));
 
-  fixed_width_column_wrapper<float> lhs2{1, 1};
-  fixed_width_column_wrapper<double> rhs2{2};
+  cudf::test::fixed_width_column_wrapper<float> lhs2{1, 1};
+  cudf::test::fixed_width_column_wrapper<double> rhs2{2};
 
-  EXPECT_FALSE(column_types_equal(lhs2, rhs2));
+  EXPECT_FALSE(cudf::have_same_types(lhs2, rhs2));
 
-  fixed_width_column_wrapper<timestamp_ms> lhs3{1, 1};
-  fixed_width_column_wrapper<timestamp_us> rhs3{2};
+  cudf::test::fixed_width_column_wrapper<cudf::timestamp_ms> lhs3{1, 1};
+  cudf::test::fixed_width_column_wrapper<cudf::timestamp_us> rhs3{2};
 
-  EXPECT_FALSE(column_types_equal(lhs3, rhs3));
+  EXPECT_FALSE(cudf::have_same_types(lhs3, rhs3));
 
-  fixed_width_column_wrapper<duration_D> lhs4{};
-  fixed_width_column_wrapper<duration_us> rhs4{42};
+  cudf::test::fixed_width_column_wrapper<cudf::duration_D> lhs4{};
+  cudf::test::fixed_width_column_wrapper<cudf::duration_us> rhs4{42};
 
-  EXPECT_FALSE(column_types_equal(lhs4, rhs4));
+  EXPECT_FALSE(cudf::have_same_types(lhs4, rhs4));
 
   // Same rep, different scale
-  fixed_point_column_wrapper<int32_t> lhs5({10000}, numeric::scale_type{-3});
-  fixed_point_column_wrapper<int32_t> rhs5({10000}, numeric::scale_type{0});
+  cudf::test::fixed_point_column_wrapper<int32_t> lhs5({10000}, numeric::scale_type{-3});
+  cudf::test::fixed_point_column_wrapper<int32_t> rhs5({10000}, numeric::scale_type{0});
 
-  EXPECT_FALSE(column_types_equal(lhs5, rhs5));
-  EXPECT_TRUE(column_types_equivalent(lhs5, rhs5));
+  EXPECT_FALSE(cudf::have_same_types(lhs5, rhs5));
+  EXPECT_TRUE(cudf::column_types_equivalent(lhs5, rhs5));
 
   // Different rep, same scale
-  fixed_point_column_wrapper<int32_t> lhs6({10000}, numeric::scale_type{-1});
-  fixed_point_column_wrapper<int64_t> rhs6({4200}, numeric::scale_type{-1});
+  cudf::test::fixed_point_column_wrapper<int32_t> lhs6({10000}, numeric::scale_type{-1});
+  cudf::test::fixed_point_column_wrapper<int64_t> rhs6({4200}, numeric::scale_type{-1});
 
-  EXPECT_FALSE(column_types_equal(lhs6, rhs6));
+  EXPECT_FALSE(cudf::have_same_types(lhs6, rhs6));
 }
 
 TEST_F(ColumnTypeCheckTest, DifferentDictionary)
 {
-  dictionary_column_wrapper<int32_t, uint32_t> lhs1{1, 1, 1, 2, 2, 3};
-  dictionary_column_wrapper<int64_t, uint32_t> rhs1{0, 0, 42, 42};
+  cudf::test::dictionary_column_wrapper<int32_t, uint32_t> lhs1{1, 1, 1, 2, 2, 3};
+  cudf::test::dictionary_column_wrapper<int64_t, uint32_t> rhs1{0, 0, 42, 42};
 
-  EXPECT_FALSE(column_types_equal(lhs1, rhs1));
+  EXPECT_FALSE(cudf::have_same_types(lhs1, rhs1));
 
-  dictionary_column_wrapper<double, uint32_t> lhs2{3.14, 3.14, 5.00};
-  dictionary_column_wrapper<int64_t, uint32_t> rhs2{0, 0, 42, 42};
+  cudf::test::dictionary_column_wrapper<double, uint32_t> lhs2{3.14, 3.14, 5.00};
+  cudf::test::dictionary_column_wrapper<int64_t, uint32_t> rhs2{0, 0, 42, 42};
 
-  EXPECT_FALSE(column_types_equal(lhs2, rhs2));
+  EXPECT_FALSE(cudf::have_same_types(lhs2, rhs2));
 
-  dictionary_column_wrapper<int32_t, uint32_t> lhs3{1, 1, 1, 2, 2, 3};
-  dictionary_column_wrapper<duration_s, uint32_t> rhs3{8, 8};
+  cudf::test::dictionary_column_wrapper<int32_t, uint32_t> lhs3{1, 1, 1, 2, 2, 3};
+  cudf::test::dictionary_column_wrapper<cudf::duration_s, uint32_t> rhs3{8, 8};
 
-  EXPECT_FALSE(column_types_equal(lhs3, rhs3));
+  EXPECT_FALSE(cudf::have_same_types(lhs3, rhs3));
 
-  dictionary_column_wrapper<int32_t, uint32_t> lhs4{1, 1, 2, 3}, rhs4{};
-  EXPECT_FALSE(column_types_equal(lhs4, rhs4));
+  cudf::test::dictionary_column_wrapper<int32_t, uint32_t> lhs4{1, 1, 2, 3}, rhs4{};
+  EXPECT_FALSE(cudf::have_same_types(lhs4, rhs4));
 }
 
 TEST_F(ColumnTypeCheckTest, DifferentLists)
 {
-  using LCW_i = lists_column_wrapper<int32_t>;
-  using LCW_f = lists_column_wrapper<float>;
+  using LCW_i = cudf::test::lists_column_wrapper<int32_t>;
+  using LCW_f = cudf::test::lists_column_wrapper<float>;
 
   // Different nested level
   LCW_i lhs1{LCW_i{1, 1, 2, 3}, LCW_i{}, LCW_i{42, 42}};
   LCW_i rhs1{LCW_i{LCW_i{8, 8, 8}, LCW_i{9, 9}}, LCW_i{LCW_i{42, 42}}};
 
-  EXPECT_FALSE(column_types_equal(lhs1, rhs1));
+  EXPECT_FALSE(cudf::have_same_types(lhs1, rhs1));
 
   // Different base column type
   LCW_i lhs2{LCW_i{1, 1, 2, 3}, LCW_i{}, LCW_i{42, 42}};
   LCW_f rhs2{LCW_f{9.0, 9.1}, LCW_f{3.14}, LCW_f{}};
 
-  EXPECT_FALSE(column_types_equal(lhs2, rhs2));
+  EXPECT_FALSE(cudf::have_same_types(lhs2, rhs2));
 }
 
 TEST_F(ColumnTypeCheckTest, DifferentStructs)
 {
-  fixed_width_column_wrapper<int32_t> lf1{1, 1, 1};
-  fixed_width_column_wrapper<int64_t> rf1{2, 2};
+  cudf::test::fixed_width_column_wrapper<int32_t> lf1{1, 1, 1};
+  cudf::test::fixed_width_column_wrapper<int64_t> rf1{2, 2};
+
+  cudf::test::structs_column_wrapper lhs1{lf1};
+  cudf::test::structs_column_wrapper rhs1{rf1};
 
-  structs_column_wrapper lhs1{lf1};
-  structs_column_wrapper rhs1{rf1};
+  EXPECT_FALSE(cudf::have_same_types(lhs1, rhs1));
 
-  EXPECT_FALSE(column_types_equal(lhs1, rhs1));
+  cudf::test::fixed_width_column_wrapper<int32_t> lf2{1, 1, 1};
+  cudf::test::fixed_width_column_wrapper<int32_t> rf2{2, 2};
 
-  fixed_width_column_wrapper<int32_t> lf2{1, 1, 1};
-  fixed_width_column_wrapper<int32_t> rf2{2, 2};
+  cudf::test::strings_column_wrapper lf3{"a", "b", "c"};
 
-  strings_column_wrapper lf3{"a", "b", "c"};
+  cudf::test::structs_column_wrapper lhs2{lf2, lf3};
+  cudf::test::structs_column_wrapper rhs2{rf2};
 
-  structs_column_wrapper lhs2{lf2, lf3};
-  structs_column_wrapper rhs2{rf2};
+  EXPECT_FALSE(cudf::have_same_types(lhs2, rhs2));
+}
 
-  EXPECT_FALSE(column_types_equal(lhs2, rhs2));
+TYPED_TEST(ColumnTypeCheckTestTyped, AllTypesEqual)
+{
+  {
+    // An empty table
+    cudf::table_view tbl{};
+    EXPECT_TRUE(cudf::all_have_same_types(tbl.begin(), tbl.end()));
+  }
+
+  {
+    // A table with one column
+    cudf::test::fixed_width_column_wrapper<TypeParam> col1{1, 2, 3};
+    cudf::table_view tbl{{col1}};
+    EXPECT_TRUE(cudf::all_have_same_types(tbl.begin(), tbl.end()));
+  }
+
+  {
+    // A table with all the same types
+    cudf::test::fixed_width_column_wrapper<TypeParam> col1{1, 2, 3};
+    cudf::test::fixed_width_column_wrapper<TypeParam> col2{4, 5, 6};
+    cudf::test::fixed_width_column_wrapper<TypeParam> col3{7, 8, 9};
+    cudf::table_view tbl{{col1, col2, col3}};
+    EXPECT_TRUE(cudf::all_have_same_types(tbl.begin(), tbl.end()));
+  }
 }
 
-}  // namespace test
-}  // namespace cudf
+TEST_F(ColumnTypeCheckTest, AllTypesNotEqual)
+{
+  // A table with different types
+  cudf::test::fixed_width_column_wrapper<int> col1{1, 2, 3};
+  cudf::test::fixed_width_column_wrapper<float> col2{3.14, 1.57, 2.71};
+  cudf::table_view tbl{{col1, col2}};
+  EXPECT_FALSE(cudf::all_have_same_types(tbl.begin(), tbl.end()));
+}
diff --git a/docs/dask_cudf/source/api.rst b/docs/dask_cudf/source/api.rst
index db32f4bbcb3..ab10f4af4fa 100644
--- a/docs/dask_cudf/source/api.rst
+++ b/docs/dask_cudf/source/api.rst
@@ -13,12 +13,11 @@ Creating and storing DataFrames
 of DataFrames from a variety of storage formats. For on-disk data that
 are not supported directly in Dask-cuDF, we recommend using Dask's
 data reading facilities, followed by calling
-:func:`.from_dask_dataframe` to obtain a Dask-cuDF object.
+:meth:`*.to_backend("cudf")` to obtain a Dask-cuDF object.
 
 .. automodule:: dask_cudf
    :members:
       from_cudf,
-      from_dask_dataframe,
       from_delayed,
       read_csv,
       read_json,
diff --git a/java/ci/Dockerfile.centos7 b/java/ci/Dockerfile.centos7
deleted file mode 100644
index b2c620848de..00000000000
--- a/java/ci/Dockerfile.centos7
+++ /dev/null
@@ -1,56 +0,0 @@
-#
-# Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-###
-# Build the image for cudf development environment.
-#
-# Arguments: CUDA_VERSION=11.X.Y
-#
-###
-ARG CUDA_VERSION=11.8.0
-FROM nvidia/cuda:$CUDA_VERSION-devel-centos7
-
-### Install basic requirements
-ARG DEVTOOLSET_VERSION=11
-RUN yum install -y centos-release-scl
-RUN yum install -y devtoolset-${DEVTOOLSET_VERSION} epel-release
-RUN yum install -y git zlib-devel maven tar wget patch ninja-build
-
-## pre-create the CMAKE_INSTALL_PREFIX folder, set writable by any user for Jenkins
-RUN mkdir /usr/local/rapids && mkdir /rapids && chmod 777 /usr/local/rapids && chmod 777 /rapids
-
-ARG CMAKE_VERSION=3.26.4
-RUN cd /usr/local/ && wget --quiet https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz && \
-   tar zxf cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz && \
-   rm cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz
-
-ENV PATH /usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:$PATH
-
-ARG CCACHE_VERSION=4.6
-RUN cd /tmp && wget --quiet https://github.com/ccache/ccache/releases/download/v${CCACHE_VERSION}/ccache-${CCACHE_VERSION}.tar.gz && \
-   tar zxf ccache-${CCACHE_VERSION}.tar.gz && \
-   rm ccache-${CCACHE_VERSION}.tar.gz && \
-   cd ccache-${CCACHE_VERSION} && \
-   mkdir build && \
-   cd build && \
-   scl enable devtoolset-${DEVTOOLSET_VERSION} \
-      "cmake .. \
-         -DCMAKE_BUILD_TYPE=Release \
-         -DZSTD_FROM_INTERNET=ON \
-         -DREDIS_STORAGE_BACKEND=OFF && \
-      cmake --build . --parallel ${PARALLEL_LEVEL} --target install" && \
-   cd ../.. && \
-   rm -rf ccache-${CCACHE_VERSION}
diff --git a/java/ci/Dockerfile.rocky b/java/ci/Dockerfile.rocky
new file mode 100644
index 00000000000..6b87f3ed34e
--- /dev/null
+++ b/java/ci/Dockerfile.rocky
@@ -0,0 +1,62 @@
+#
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+###
+# Build the image for cudf development environment.
+#
+# Arguments: CUDA_VERSION=[11.X.Y, 12.X.Y], OS_RELEASE=[8, 9], TARGETPLATFORM=[linux/amd64, linux/arm64]
+#
+###
+ARG CUDA_VERSION=11.8.0
+ARG OS_RELEASE=8
+ARG TARGETPLATFORM=linux/amd64
+# multi-platform build with: docker buildx build --platform linux/arm64,linux/amd64 <ARGS> on either amd64 or arm64 host
+# check available official arm-based docker images at https://hub.docker.com/r/nvidia/cuda/tags (OS/ARCH)
+FROM --platform=$TARGETPLATFORM nvidia/cuda:$CUDA_VERSION-devel-rockylinux$OS_RELEASE
+ARG TOOLSET_VERSION=11
+### Install basic requirements
+RUN dnf --enablerepo=powertools install -y  scl-utils gcc-toolset-${TOOLSET_VERSION} git zlib-devel maven tar wget patch ninja-build
+## pre-create the CMAKE_INSTALL_PREFIX folder, set writable by any user for Jenkins
+RUN mkdir /usr/local/rapids /rapids && chmod 777 /usr/local/rapids /rapids
+
+# 3.22.3+: CUDA architecture 'native' support + flexible CMAKE_<LANG>_*_LAUNCHER for ccache
+ARG CMAKE_VERSION=3.26.4
+# defaults to x86_64 for x86 builds; set to aarch64 for arm builds
+ARG CMAKE_ARCH=x86_64
+RUN cd /usr/local && wget --quiet https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}.tar.gz && \
+   tar zxf cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}.tar.gz && \
+   rm cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}.tar.gz
+ENV PATH /usr/local/cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}/bin:$PATH
+
+# ccache for interactive builds
+ARG CCACHE_VERSION=4.6
+RUN cd /tmp && wget --quiet https://github.com/ccache/ccache/releases/download/v${CCACHE_VERSION}/ccache-${CCACHE_VERSION}.tar.gz && \
+   tar zxf ccache-${CCACHE_VERSION}.tar.gz && \
+   rm ccache-${CCACHE_VERSION}.tar.gz && \
+   cd ccache-${CCACHE_VERSION} && \
+   mkdir build && \
+   cd build && \
+   scl enable gcc-toolset-${TOOLSET_VERSION} \
+      "cmake .. \
+         -DCMAKE_BUILD_TYPE=Release \
+         -DZSTD_FROM_INTERNET=ON \
+         -DREDIS_STORAGE_BACKEND=OFF && \
+      cmake --build . --parallel 4 --target install" && \
+   cd ../.. && \
+   rm -rf ccache-${CCACHE_VERSION}
+
+# disable cuda container constraints to allow running w/ older drivers on data-center GPUs
+ENV NVIDIA_DISABLE_REQUIRE="true"
diff --git a/java/ci/README.md b/java/ci/README.md
index da24c5923ea..18ad3cc4d0d 100644
--- a/java/ci/README.md
+++ b/java/ci/README.md
@@ -11,14 +11,14 @@
 
 In the root path of cuDF repo, run below command to build the docker image.
 ```bash
-docker build -f java/ci/Dockerfile.centos7 --build-arg CUDA_VERSION=11.8.0 -t cudf-build:11.8.0-devel-centos7 .
+docker build -f java/ci/Dockerfile.rocky --build-arg CUDA_VERSION=11.8.0 -t cudf-build:11.8.0-devel-rocky8 .
 ```
 
 The following CUDA versions are supported w/ CUDA Enhanced Compatibility:
 * CUDA 11.0+
 
 Change the --build-arg CUDA_VERSION to what you need.
-You can replace the tag "cudf-build:11.8.0-devel-centos7" with another name you like.
+You can replace the tag "cudf-build:11.8.0-devel-rocky8" with another name you like.
 
 ## Start the docker then build
 
@@ -26,7 +26,7 @@ You can replace the tag "cudf-build:11.8.0-devel-centos7" with another name you
 
 Run below command to start a docker container with GPU.
 ```bash
-nvidia-docker run -it cudf-build:11.8.0-devel-centos7 bash
+nvidia-docker run -it cudf-build:11.8.0-devel-rocky8 bash
 ```
 
 ### Download the cuDF source code
@@ -42,7 +42,7 @@ git clone --recursive https://github.com/rapidsai/cudf.git -b branch-24.06
 ```bash
 cd cudf
 export WORKSPACE=`pwd`
-scl enable devtoolset-11 "java/ci/build-in-docker.sh"
+scl enable gcc-toolset-11 "java/ci/build-in-docker.sh"
 ```
 
 ### The output
diff --git a/java/src/main/java/ai/rapids/cudf/ORCChunkedReader.java b/java/src/main/java/ai/rapids/cudf/ORCChunkedReader.java
new file mode 100644
index 00000000000..2f46c8d1825
--- /dev/null
+++ b/java/src/main/java/ai/rapids/cudf/ORCChunkedReader.java
@@ -0,0 +1,169 @@
+/*
+ *
+ *  Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ */
+
+package ai.rapids.cudf;
+
+/**
+ * Provide an interface for reading an ORC file in an iterative manner.
+ */
+public class ORCChunkedReader implements AutoCloseable {
+  static {
+    NativeDepsLoader.loadNativeDeps();
+  }
+
+  /**
+   * Construct the reader instance from read limits, output row granularity,
+   * and a file already loaded in a memory buffer.
+   *
+   * @param chunkReadLimit Limit on total number of bytes to be returned per read,
+   *                       or 0 if there is no limit.
+   * @param passReadLimit  Limit on the amount of memory used by the chunked reader,
+   *                       or 0 if there is no limit.
+   * @param opts           The options for ORC reading.
+   * @param buffer         Raw ORC file content.
+   * @param offset         The starting offset into buffer.
+   * @param len            The number of bytes to parse from the given buffer.
+   */
+  public ORCChunkedReader(long chunkReadLimit, long passReadLimit,
+      ORCOptions opts, HostMemoryBuffer buffer, long offset, long len) {
+    handle = createReader(chunkReadLimit, passReadLimit,
+        opts.getIncludeColumnNames(), buffer.getAddress() + offset, len,
+        opts.usingNumPyTypes(), opts.timeUnit().typeId.getNativeId(),
+        opts.getDecimal128Columns());
+    if (handle == 0) {
+      throw new IllegalStateException("Cannot create native chunked ORC reader object.");
+    }
+  }
+
+  /**
+   * Construct a chunked ORC reader instance, similar to
+   * {@link ORCChunkedReader#ORCChunkedReader(long, long, ORCOptions, HostMemoryBuffer, long, long)},
+   * with an additional parameter to control the granularity of the output table.
+   * When reading a chunk table, with respect to the given size limits, a subset of stripes may
+   * be loaded, decompressed and decoded into a large intermediate table. The reader will then
+   * subdivide that table into smaller tables for final output using
+   * {@code outputRowSizingGranularity} as the subdivision step. If the chunked reader is
+   * constructed without this parameter, the default value of 10k rows will be used.
+   *
+   * @param outputRowSizingGranularity The change step in number of rows in the output table.
+   * @see ORCChunkedReader#ORCChunkedReader(long, long, ORCOptions, HostMemoryBuffer, long, long)
+   */
+  public ORCChunkedReader(long chunkReadLimit, long passReadLimit, long outputRowSizingGranularity,
+      ORCOptions opts, HostMemoryBuffer buffer, long offset, long len) {
+    handle = createReaderWithOutputGranularity(chunkReadLimit, passReadLimit, outputRowSizingGranularity,
+        opts.getIncludeColumnNames(), buffer.getAddress() + offset, len,
+        opts.usingNumPyTypes(), opts.timeUnit().typeId.getNativeId(),
+        opts.getDecimal128Columns());
+    if (handle == 0) {
+      throw new IllegalStateException("Cannot create native chunked ORC reader object.");
+    }
+  }
+
+  /**
+   * Check if the given file has anything left to read.
+   *
+   * @return A boolean value indicating if there is more data to read from file.
+   */
+  public boolean hasNext() {
+    if (handle == 0) {
+      throw new IllegalStateException("Native chunked ORC reader object may have been closed.");
+    }
+
+    if (firstCall) {
+      // This function needs to return true at least once, so that an empty table
+      // (one with empty columns rather than no columns) can be returned by readChunk()
+      // if the input file has no rows.
+      firstCall = false;
+      return true;
+    }
+    return hasNext(handle);
+  }
+
+  /**
+   * Read a chunk of rows from the given ORC file such that the total size of the returned
+   * data does not exceed the given read limit. If the given file has no data, or all data has
+   * been read by previous calls to this function, a null Table will be returned.
+   *
+   * @return A table of new rows reading from the given file.
+   */
+  public Table readChunk() {
+    if (handle == 0) {
+      throw new IllegalStateException("Native chunked ORC reader object may have been closed.");
+    }
+
+    long[] columnPtrs = readChunk(handle);
+    return columnPtrs != null ? new Table(columnPtrs) : null;
+  }
+
+  @Override
+  public void close() {
+    if (handle != 0) {
+      close(handle);
+      handle = 0;
+    }
+  }
+
+
+  /**
+   * Auxiliary variable to help {@link #hasNext()} return true at least once.
+   */
+  private boolean firstCall = true;
+
+  /**
+   * Handle for memory address of the native ORC chunked reader class.
+   */
+  private long handle;
+
+  /**
+   * Create a native chunked ORC reader object on heap and return its memory address.
+   *
+   * @param chunkReadLimit    Limit on total number of bytes to be returned per read,
+   *                          or 0 if there is no limit.
+   * @param passReadLimit     Limit on the amount of memory used by the chunked reader,
+   *                          or 0 if there is no limit.
+   * @param filterColumnNames Name of the columns to read, or an empty array if we want to read all.
+   * @param bufferAddrs       The address of a buffer to read from, or 0 if we are not using that buffer.
+   * @param length            The length of the buffer to read from.
+   * @param usingNumPyTypes   Whether the parser should implicitly promote TIMESTAMP
+   *                          columns to TIMESTAMP_MILLISECONDS for compatibility with NumPy.
+   * @param timeUnit          Native type ID of the time unit to use for timestamp columns.
+   * @param decimal128Columns Names of the columns to read as Decimal128 rather than Decimal64.
+   */
+  private static native long createReader(long chunkReadLimit, long passReadLimit,
+      String[] filterColumnNames, long bufferAddrs, long length,
+      boolean usingNumPyTypes, int timeUnit, String[] decimal128Columns);
+
+  /**
+   * Create a native chunked ORC reader object, similar to
+   * {@link ORCChunkedReader#createReader(long, long, String[], long, long, boolean, int, String[])},
+   * with an additional parameter to control the granularity of the output table.
+   *
+   * @param outputRowSizingGranularity The change step in number of rows in the output table.
+   * @see ORCChunkedReader#createReader(long, long, String[], long, long, boolean, int, String[])
+   */
+  private static native long createReaderWithOutputGranularity(
+      long chunkReadLimit, long passReadLimit, long outputRowSizingGranularity,
+      String[] filterColumnNames, long bufferAddrs, long length,
+      boolean usingNumPyTypes, int timeUnit, String[] decimal128Columns);
+
+  private static native boolean hasNext(long handle);
+
+  private static native long[] readChunk(long handle);
+
+  private static native void close(long handle);
+}
diff --git a/java/src/main/java/ai/rapids/cudf/Schema.java b/java/src/main/java/ai/rapids/cudf/Schema.java
index c8571dd841c..43603386649 100644
--- a/java/src/main/java/ai/rapids/cudf/Schema.java
+++ b/java/src/main/java/ai/rapids/cudf/Schema.java
@@ -20,6 +20,7 @@
 
 import java.util.ArrayList;
 import java.util.List;
+import java.util.stream.Collectors;
 
 /**
  * The schema of data to be read in.
@@ -221,6 +222,13 @@ public DType[] getChildTypes() {
     return ret;
   }
 
+  public int getNumChildren() {
+    if (childSchemas == null) {
+      return 0;
+    }
+    return childSchemas.size();
+  }
+
   int[] getFlattenedNumChildren() {
     flattenIfNeeded();
     return flattenedCounts;
@@ -243,7 +251,25 @@ public boolean isStructOrHasStructDescendant() {
     return false;
   }
 
-  public static class Builder {
+  public HostColumnVector.DataType asHostDataType() {
+    if (topLevelType == DType.LIST) {
+      assert(childSchemas != null && childSchemas.size() == 1);
+      HostColumnVector.DataType element = childSchemas.get(0).asHostDataType();
+      return new HostColumnVector.ListType(true, element);
+    } else if (topLevelType == DType.STRUCT) {
+      if (childSchemas == null) {
+        return new HostColumnVector.StructType(true);
+      } else {
+        List<HostColumnVector.DataType> childTypes =
+                childSchemas.stream().map(Schema::asHostDataType).collect(Collectors.toList());
+        return new HostColumnVector.StructType(true, childTypes);
+      }
+    } else {
+      return new HostColumnVector.BasicType(true, topLevelType);
+    }
+  }
+
+    public static class Builder {
     private final DType topLevelType;
     private final List<String> names;
     private final List<Builder> types;
diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java
index 4038b3a40b8..4e737451ed6 100644
--- a/java/src/main/java/ai/rapids/cudf/Table.java
+++ b/java/src/main/java/ai/rapids/cudf/Table.java
@@ -1220,8 +1220,26 @@ private static Table gatherJSONColumns(Schema schema, TableWithMeta twm, int emp
               columns[i] = tbl.getColumn(index).incRefCount();
             }
           } else {
-            try (Scalar s = Scalar.fromNull(types[i])) {
-              columns[i] = ColumnVector.fromScalar(s, rowCount);
+            if (types[i] == DType.LIST) {
+              Schema listSchema = schema.getChild(i);
+              Schema elementSchema = listSchema.getChild(0);
+              try (Scalar s = Scalar.listFromNull(elementSchema.asHostDataType())) {
+                columns[i] = ColumnVector.fromScalar(s, rowCount);
+              }
+            } else if (types[i] == DType.STRUCT) {
+              Schema structSchema = schema.getChild(i);
+              int numStructChildren = structSchema.getNumChildren();
+              DataType[] structChildrenTypes = new DataType[numStructChildren];
+              for (int j = 0; j < numStructChildren; j++) {
+                structChildrenTypes[j] = structSchema.getChild(j).asHostDataType();
+              }
+              try (Scalar s = Scalar.structFromNull(structChildrenTypes)) {
+                columns[i] = ColumnVector.fromScalar(s, rowCount);
+              }
+            } else {
+              try (Scalar s = Scalar.fromNull(types[i])) {
+                columns[i] = ColumnVector.fromScalar(s, rowCount);
+              }
             }
           }
         }
diff --git a/java/src/main/native/src/ChunkedReaderJni.cpp b/java/src/main/native/src/ChunkedReaderJni.cpp
index 7681008f584..cf04a87262f 100644
--- a/java/src/main/native/src/ChunkedReaderJni.cpp
+++ b/java/src/main/native/src/ChunkedReaderJni.cpp
@@ -18,22 +18,22 @@
 #include "jni_utils.hpp"
 
 #include <cudf/column/column.hpp>
+#include <cudf/io/orc.hpp>
 #include <cudf/io/parquet.hpp>
 #include <cudf/table/table.hpp>
 
 #include <memory>
+#include <optional>
 #include <vector>
 
-// This function is defined in `TableJni.cpp`.
-jlongArray cudf::jni::convert_table_for_return(
-  JNIEnv* env,
-  std::unique_ptr<cudf::table>&& table_result,
-  std::vector<std::unique_ptr<cudf::column>>&& extra_columns);
-
 // This file is for the code related to chunked reader (Parquet, ORC, etc.).
 
 extern "C" {
 
+//
+// Chunked Parquet reader JNI
+//
+
 // This function should take all the parameters that `Table.readParquet` takes,
 // plus one more parameter `long chunkSizeByteLimit`.
 JNIEXPORT jlong JNICALL
@@ -54,19 +54,17 @@ Java_ai_rapids_cudf_ParquetChunkedReader_create(JNIEnv* env,
     JNI_NULL_CHECK(env, inp_file_path, "Input file or buffer must be supplied", 0);
     read_buffer = false;
   } else if (inp_file_path != nullptr) {
-    JNI_THROW_NEW(env,
-                  "java/lang/IllegalArgumentException",
-                  "Cannot pass in both a buffer and an inp_file_path",
-                  0);
+    JNI_THROW_NEW(
+      env, cudf::jni::ILLEGAL_ARG_CLASS, "Cannot pass in both a buffer and an inp_file_path", 0);
   } else if (buffer_length <= 0) {
-    JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "An empty buffer is not supported", 0);
+    JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "An empty buffer is not supported", 0);
   }
 
   try {
     cudf::jni::auto_set_device(env);
     cudf::jni::native_jstring filename(env, inp_file_path);
     if (!read_buffer && filename.is_empty()) {
-      JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "inp_file_path cannot be empty", 0);
+      JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "inp_file_path cannot be empty", 0);
     }
 
     cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names);
@@ -155,7 +153,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_readChunk(
                                                                                 jclass,
                                                                                 jlong handle)
 {
-  JNI_NULL_CHECK(env, handle, "handle is null", 0);
+  JNI_NULL_CHECK(env, handle, "handle is null", nullptr);
 
   try {
     cudf::jni::auto_set_device(env);
@@ -163,7 +161,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_readChunk(
     auto chunk            = reader_ptr->read_chunk();
     return chunk.tbl ? cudf::jni::convert_table_for_return(env, chunk.tbl) : nullptr;
   }
-  CATCH_STD(env, 0);
+  CATCH_STD(env, nullptr);
 }
 
 JNIEXPORT void JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_close(JNIEnv* env,
@@ -179,4 +177,151 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_close(JNIEnv* en
   CATCH_STD(env, );
 }
 
+//
+// Chunked ORC reader JNI
+//
+
+namespace {
+// Shared factory behind both ORC chunked reader JNI entry points. Builds the reader options
+// over the given buffer and returns the new reader as a handle (0 on the error paths).
+jlong create_chunked_orc_reader(JNIEnv* env,
+                                jlong chunk_read_limit,
+                                jlong pass_read_limit,
+                                std::optional<jlong> output_granularity,
+                                jobjectArray filter_col_names,
+                                jlong buffer,
+                                jlong buffer_length,
+                                jboolean using_numpy_Types,
+                                jint unit,
+                                jobjectArray dec128_col_names)
+{
+  JNI_NULL_CHECK(env, buffer, "buffer is null", 0);
+  if (buffer_length <= 0) {
+    JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "An empty buffer is not supported", 0);
+  }
+
+  try {
+    cudf::jni::auto_set_device(env);
+    cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names);
+    cudf::jni::native_jstringArray n_dec128_col_names(env, dec128_col_names);
+
+    auto const source = cudf::io::source_info(reinterpret_cast<char*>(buffer),
+                                              static_cast<std::size_t>(buffer_length));
+    auto opts_builder = cudf::io::orc_reader_options::builder(source);
+    if (n_filter_col_names.size() > 0) {
+      opts_builder = opts_builder.columns(n_filter_col_names.as_cpp_vector());
+    }
+    auto const read_opts = opts_builder.use_index(false)
+                             .use_np_dtypes(static_cast<bool>(using_numpy_Types))
+                             .timestamp_type(cudf::data_type(static_cast<cudf::type_id>(unit)))
+                             .decimal128_columns(n_dec128_col_names.as_cpp_vector())
+                             .build();
+
+    // Pick the constructor overload based on whether an output granularity was supplied.
+    auto const chunk_limit = static_cast<std::size_t>(chunk_read_limit);
+    auto const pass_limit  = static_cast<std::size_t>(pass_read_limit);
+    auto* const reader =
+      output_granularity.has_value()
+        ? new cudf::io::chunked_orc_reader(
+            chunk_limit, pass_limit, static_cast<std::size_t>(*output_granularity), read_opts)
+        : new cudf::io::chunked_orc_reader(chunk_limit, pass_limit, read_opts);
+    return reinterpret_cast<jlong>(reader);
+  }
+  CATCH_STD(env, 0);
+}
+}  // namespace
+
+// JNI entry point that should take all the parameters `Table.readORC` takes plus
+// `chunk_read_limit` and `pass_read_limit`; forwards them with no output granularity.
+JNIEXPORT jlong JNICALL
+Java_ai_rapids_cudf_ORCChunkedReader_createReader(JNIEnv* env,
+                                                  jclass,
+                                                  jlong chunk_read_limit,
+                                                  jlong pass_read_limit,
+                                                  jobjectArray filter_col_names,
+                                                  jlong buffer,
+                                                  jlong buffer_length,
+                                                  jboolean using_numpy_Types,
+                                                  jint unit,
+                                                  jobjectArray dec128_col_names)
+{
+  return create_chunked_orc_reader(env,
+                                   chunk_read_limit,
+                                   pass_read_limit,
+                                   std::nullopt,
+                                   filter_col_names,
+                                   buffer,
+                                   buffer_length,
+                                   using_numpy_Types,
+                                   unit,
+                                   dec128_col_names);
+}
+
+// JNI entry point that should take all the parameters `Table.readORC` takes plus
+// `chunk_read_limit`, `pass_read_limit` and `output_granularity`, all forwarded unchanged.
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ORCChunkedReader_createReaderWithOutputGranularity(
+  JNIEnv* env,
+  jclass,
+  jlong chunk_read_limit,
+  jlong pass_read_limit,
+  jlong output_granularity,
+  jobjectArray filter_col_names,
+  jlong buffer,
+  jlong buffer_length,
+  jboolean using_numpy_Types,
+  jint unit,
+  jobjectArray dec128_col_names)
+{
+  return create_chunked_orc_reader(env,
+                                   chunk_read_limit,
+                                   pass_read_limit,
+                                   output_granularity,
+                                   filter_col_names,
+                                   buffer,
+                                   buffer_length,
+                                   using_numpy_Types,
+                                   unit,
+                                   dec128_col_names);
+}
+
+// Reports whether the ORC chunked reader referenced by `handle` has more data to read.
+JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_ORCChunkedReader_hasNext(JNIEnv* env,
+                                                                        jclass,
+                                                                        jlong handle)
+{
+  JNI_NULL_CHECK(env, handle, "handle is null", false);
+  try {
+    cudf::jni::auto_set_device(env);
+    auto* const reader = reinterpret_cast<cudf::io::chunked_orc_reader*>(handle);
+    return static_cast<jboolean>(reader->has_next());
+  }
+  CATCH_STD(env, false);
+}
+
+// Reads the next chunk from the ORC chunked reader referenced by `handle`; null if no table.
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ORCChunkedReader_readChunk(JNIEnv* env,
+                                                                            jclass,
+                                                                            jlong handle)
+{
+  JNI_NULL_CHECK(env, handle, "handle is null", nullptr);
+  try {
+    cudf::jni::auto_set_device(env);
+    auto chunk = reinterpret_cast<cudf::io::chunked_orc_reader*>(handle)->read_chunk();
+    if (!chunk.tbl) { return nullptr; }
+    return cudf::jni::convert_table_for_return(env, chunk.tbl);
+  }
+  CATCH_STD(env, nullptr);
+}
+
+// Destroys the ORC chunked reader referenced by `handle`.
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_ORCChunkedReader_close(JNIEnv* env, jclass, jlong handle)
+{
+  JNI_NULL_CHECK(env, handle, "handle is null", );
+  try {
+    cudf::jni::auto_set_device(env);
+    delete reinterpret_cast<cudf::io::chunked_orc_reader*>(handle);
+  }
+  CATCH_STD(env, );
+}
+
 }  // extern "C"
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index 8560a9caad7..dc6eb55fc6a 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -81,6 +81,7 @@ public class TableTest extends CudfTestBase {
   private static final File TEST_PARQUET_FILE_CHUNKED_READ = TestUtils.getResourceAsFile("splittable.parquet");
   private static final File TEST_PARQUET_FILE_BINARY = TestUtils.getResourceAsFile("binary.parquet");
   private static final File TEST_ORC_FILE = TestUtils.getResourceAsFile("TestOrcFile.orc");
+  private static final File TEST_ORC_FILE_CHUNKED_READ = TestUtils.getResourceAsFile("splittable.orc");
   private static final File TEST_ORC_TIMESTAMP_DATE_FILE = TestUtils.getResourceAsFile("timestamp-date-test.orc");
   private static final File TEST_DECIMAL_PARQUET_FILE = TestUtils.getResourceAsFile("decimal.parquet");
   private static final File TEST_ALL_TYPES_PLAIN_AVRO_FILE = TestUtils.getResourceAsFile("alltypes_plain.avro");
@@ -1699,6 +1700,29 @@ void testReadORCTimeUnit() {
     }
   }
 
+  @Test
+  void testORCChunkedReader() throws IOException {
+    // Stage the whole ORC file in host memory; the chunked reader consumes a host buffer.
+    byte[] fileBytes = Files.readAllBytes(TEST_ORC_FILE_CHUNKED_READ.toPath());
+    long numBytes = fileBytes.length;
+
+    try (HostMemoryBuffer hostBuf = hostMemoryAllocator.allocate(numBytes)) {
+      hostBuf.setBytes(0, fileBytes, 0, numBytes);
+      try (ORCChunkedReader reader = new ORCChunkedReader(0, 2 * 1024 * 1024, 10000,
+          ORCOptions.DEFAULT, hostBuf, 0, numBytes)) {
+        int chunkCount = 0;
+        long rowCount = 0;
+        for (; reader.hasNext(); ++chunkCount) {
+          try (Table chunk = reader.readChunk()) {
+            rowCount += chunk.getRowCount();
+          }
+        }
+        assertEquals(10, chunkCount);
+        assertEquals(1000000, rowCount);
+      }
+    }
+  }
+
   @Test
   void testCrossJoin() {
     try (Table leftTable = new Table.TestBuilder()
diff --git a/java/src/test/resources/splittable.orc b/java/src/test/resources/splittable.orc
new file mode 100644
index 00000000000..1f5e094534f
Binary files /dev/null and b/java/src/test/resources/splittable.orc differ
diff --git a/python/cudf/cudf/_lib/cpp/io/json.pxd b/python/cudf/cudf/_lib/cpp/io/json.pxd
index b916c2b7ad9..1e1057beede 100644
--- a/python/cudf/cudf/_lib/cpp/io/json.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/json.pxd
@@ -28,6 +28,7 @@ cdef extern from "cudf/io/json.hpp" \
         size_type get_byte_range_size() except +
         bool is_enabled_lines() except +
         bool is_enabled_mixed_types_as_string() except +
+        bool is_enabled_prune_columns() except +
         bool is_enabled_dayfirst() except +
         bool is_enabled_experimental() except +
 
@@ -41,6 +42,7 @@ cdef extern from "cudf/io/json.hpp" \
         void set_byte_range_size(size_type size) except +
         void enable_lines(bool val) except +
         void enable_mixed_types_as_string(bool val) except +
+        void enable_prune_columns(bool val) except +
         void enable_dayfirst(bool val) except +
         void enable_experimental(bool val) except +
         void enable_keep_quotes(bool val) except +
@@ -79,6 +81,9 @@ cdef extern from "cudf/io/json.hpp" \
         json_reader_options_builder& mixed_types_as_string(
             bool val
         ) except +
+        json_reader_options_builder& prune_columns(
+            bool val
+        ) except +
         json_reader_options_builder& dayfirst(
             bool val
         ) except +
diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd
index d5ac8574fe4..d5bb1726a43 100644
--- a/python/cudf/cudf/_lib/cpp/io/orc.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd
@@ -1,6 +1,6 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from libc.stdint cimport uint8_t
+from libc.stdint cimport int64_t, uint8_t
 from libcpp cimport bool
 from libcpp.map cimport map
 from libcpp.memory cimport shared_ptr, unique_ptr
@@ -21,8 +21,8 @@ cdef extern from "cudf/io/orc.hpp" \
 
         cudf_io_types.source_info get_source() except +
         vector[vector[size_type]] get_stripes() except +
-        size_type get_skip_rows() except +
-        size_type get_num_rows() except +
+        int64_t get_skip_rows() except +
+        optional[int64_t] get_num_rows() except +
         bool is_enabled_use_index() except +
         bool is_enabled_use_np_dtypes() except +
         data_type get_timestamp_type() except +
@@ -31,8 +31,8 @@ cdef extern from "cudf/io/orc.hpp" \
 
         void set_columns(vector[string] col_names) except +
         void set_stripes(vector[vector[size_type]] strps) except +
-        void set_skip_rows(size_type rows) except +
-        void set_num_rows(size_type nrows) except +
+        void set_skip_rows(int64_t rows) except +
+        void set_num_rows(int64_t nrows) except +
         void enable_use_index(bool val) except +
         void enable_use_np_dtypes(bool val) except +
         void set_timestamp_type(data_type type) except +
@@ -49,8 +49,8 @@ cdef extern from "cudf/io/orc.hpp" \
         orc_reader_options_builder& columns(vector[string] col_names) except +
         orc_reader_options_builder& \
             stripes(vector[vector[size_type]] strps) except +
-        orc_reader_options_builder& skip_rows(size_type rows) except +
-        orc_reader_options_builder& num_rows(size_type nrows) except +
+        orc_reader_options_builder& skip_rows(int64_t rows) except +
+        orc_reader_options_builder& num_rows(int64_t nrows) except +
         orc_reader_options_builder& use_index(bool val) except +
         orc_reader_options_builder& use_np_dtypes(bool val) except +
         orc_reader_options_builder& timestamp_type(data_type type) except +
diff --git a/python/cudf/cudf/_lib/cpp/strings/find.pxd b/python/cudf/cudf/_lib/cpp/strings/find.pxd
index 953d5c30b2a..dfbdebb9651 100644
--- a/python/cudf/cudf/_lib/cpp/strings/find.pxd
+++ b/python/cudf/cudf/_lib/cpp/strings/find.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
@@ -41,6 +41,11 @@ cdef extern from "cudf/strings/find.hpp" namespace "cudf::strings" nogil:
         size_type start,
         size_type stop) except +
 
+    cdef unique_ptr[column] find(
+        column_view source_strings,
+        column_view target,
+        size_type start) except +
+
     cdef unique_ptr[column] rfind(
         column_view source_strings,
         string_scalar target,
diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx
index f2e03391f08..cef71ed24a5 100644
--- a/python/cudf/cudf/_lib/json.pyx
+++ b/python/cudf/cudf/_lib/json.pyx
@@ -49,7 +49,8 @@ cpdef read_json(object filepaths_or_buffers,
                 object byte_range,
                 bool legacy,
                 bool keep_quotes,
-                bool mixed_types_as_string):
+                bool mixed_types_as_string,
+                bool prune_columns):
     """
     Cython function to call into libcudf API, see `read_json`.
 
@@ -128,6 +129,7 @@ cpdef read_json(object filepaths_or_buffers,
 
     opts.enable_keep_quotes(keep_quotes)
     opts.enable_mixed_types_as_string(mixed_types_as_string)
+    opts.enable_prune_columns(prune_columns)
     # Read JSON
     cdef cudf_io_types.table_with_metadata c_result
 
diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx
index 836880a6f2c..918880648bf 100644
--- a/python/cudf/cudf/_lib/orc.pyx
+++ b/python/cudf/cudf/_lib/orc.pyx
@@ -472,11 +472,11 @@ cdef int64_t get_skiprows_arg(object arg) except*:
         raise TypeError("skiprows must be an int >= 0")
     return <int64_t> arg
 
-cdef size_type get_num_rows_arg(object arg) except*:
+cdef int64_t get_num_rows_arg(object arg) except*:
     arg = -1 if arg is None else arg
     if not isinstance(arg, int) or arg < -1:
         raise TypeError("num_rows must be an int >= -1")
-    return <size_type> arg
+    return <int64_t> arg
 
 
 cdef orc_reader_options make_orc_reader_options(
@@ -484,7 +484,7 @@ cdef orc_reader_options make_orc_reader_options(
     object column_names,
     object stripes,
     int64_t skip_rows,
-    size_type num_rows,
+    int64_t num_rows,
     type_id timestamp_type,
     bool use_index
 ) except*:
diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx
index 2565e92d5c9..b9e5e48226d 100644
--- a/python/cudf/cudf/_lib/pylibcudf/column.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/column.pyx
@@ -16,6 +16,10 @@ from .scalar cimport Scalar
 from .types cimport DataType, type_id
 from .utils cimport int_to_bitmask_ptr, int_to_void_ptr
 
+import functools
+
+import numpy as np
+
 
 cdef class Column:
     """A container of nullable device data as a column of elements.
@@ -223,6 +227,51 @@ cdef class Column:
             c_result = move(make_column_from_scalar(dereference(c_scalar), size))
         return Column.from_libcudf(move(c_result))
 
+    @staticmethod
+    def from_cuda_array_interface_obj(object obj):
+        """Create a Column from an object with a CUDA array interface.
+
+        Parameters
+        ----------
+        obj : object
+            The object with the CUDA array interface to create a column from.
+
+        Returns
+        -------
+        Column
+            A Column containing the data from the CUDA array interface.
+
+        Raises
+        ------
+        ValueError
+            If the interface has a mask, is not one-dimensional, is not
+            C-contiguous, or has an unsupported dtype.
+
+        Notes
+        -----
+        Data is not copied when creating the column. The caller is
+        responsible for ensuring the data is not mutated unexpectedly while the
+        column is in use.
+        """
+        data = gpumemoryview(obj)
+        iface = data.__cuda_array_interface__()
+        if iface.get('mask') is not None:
+            raise ValueError("mask not yet supported.")
+
+        shape = iface['shape']
+        if len(shape) != 1:
+            # The column length is shape[0]; other ranks would drop data.
+            raise ValueError("Data must be 1-dimensional")
+        typestr = iface['typestr'][1:]
+        # `strides` may be absent or None; both mean C-contiguous.
+        strides = iface.get('strides')
+        if strides is not None and not is_c_contiguous(
+            shape, strides, np.dtype(typestr).itemsize
+        ):
+            raise ValueError("Data must be C-contiguous")
+        data_type = _datatype_from_dtype_desc(typestr)
+        return Column(data_type, shape[0], data, None, 0, 0, [])
+
     cpdef DataType type(self):
         """The type of data in the column."""
         return self._data_type
@@ -296,3 +345,61 @@ cdef class ListColumnView:
     cpdef offsets(self):
         """The offsets column of the underlying list column."""
         return self._column.child(1)
+
+
+@functools.cache
+def _datatype_from_dtype_desc(desc):
+    """Map a typestr code (byte order already stripped) to a DataType.
+
+    Raises ValueError for codes with no entry; results are cached per code.
+    """
+    type_ids = {
+        'u1': type_id.UINT8, 'u2': type_id.UINT16,
+        'u4': type_id.UINT32, 'u8': type_id.UINT64,
+        'i1': type_id.INT8, 'i2': type_id.INT16,
+        'i4': type_id.INT32, 'i8': type_id.INT64,
+        'f4': type_id.FLOAT32, 'f8': type_id.FLOAT64,
+        'b1': type_id.BOOL8,
+        'M8[s]': type_id.TIMESTAMP_SECONDS,
+        'M8[ms]': type_id.TIMESTAMP_MILLISECONDS,
+        'M8[us]': type_id.TIMESTAMP_MICROSECONDS,
+        'M8[ns]': type_id.TIMESTAMP_NANOSECONDS,
+        'm8[s]': type_id.DURATION_SECONDS,
+        'm8[ms]': type_id.DURATION_MILLISECONDS,
+        'm8[us]': type_id.DURATION_MICROSECONDS,
+        'm8[ns]': type_id.DURATION_NANOSECONDS,
+    }
+    tid = type_ids.get(desc)
+    if tid is None:
+        raise ValueError(f"Unsupported dtype: {desc}")
+    return DataType(tid)
+
+
+def is_c_contiguous(
+    shape: Sequence[int], strides: Sequence[int], itemsize: int
+) -> bool:
+    """Determine if shape and strides are C-contiguous
+
+    Parameters
+    ----------
+    shape : Sequence[int]
+        Number of elements in each dimension.
+    strides : Sequence[int] or None
+        The stride of each dimension in bytes. ``None`` means C-contiguous,
+        as permitted by the CUDA array interface.
+    itemsize : int
+        Size of an element in bytes.
+
+    Returns
+    -------
+    bool
+        True if the layout is C-contiguous, False otherwise.
+    """
+    if strides is None or any(dim == 0 for dim in shape):
+        return True
+    cumulative_stride = itemsize
+    for dim, stride in zip(reversed(shape), reversed(strides)):
+        if dim > 1 and stride != cumulative_stride:
+            return False
+        cumulative_stride *= dim
+    return True
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
index 3a2a9e1e7eb..c42b57ece63 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
@@ -12,7 +12,7 @@
 # the License.
 # =============================================================================
 
-set(cython_sources case.pyx)
+set(cython_sources case.pyx find.pyx)
 set(linked_libraries cudf::cudf)
 rapids_cython_create_modules(
   CXX
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
index ff87549b5b5..33e2d56c087 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
@@ -1,3 +1,3 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . import case
+from . cimport case, find
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
index ff87549b5b5..9220f6bd045 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
@@ -1,3 +1,3 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . import case
+from . import case, find
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/find.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/find.pxd
new file mode 100644
index 00000000000..22e933106c7
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/find.pxd
@@ -0,0 +1,38 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.column cimport Column
+from cudf._lib.pylibcudf.scalar cimport Scalar
+
+ctypedef fused ColumnOrScalar:
+    Column
+    Scalar
+
+cpdef Column find(
+    Column input,
+    ColumnOrScalar target,
+    size_type start=*,
+    size_type stop=*
+)
+
+cpdef Column rfind(
+    Column input,
+    Scalar target,
+    size_type start=*,
+    size_type stop=*
+)
+
+cpdef Column contains(
+    Column input,
+    ColumnOrScalar target,
+)
+
+cpdef Column starts_with(
+    Column input,
+    ColumnOrScalar target,
+)
+
+cpdef Column ends_with(
+    Column input,
+    ColumnOrScalar target,
+)
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/find.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/find.pyx
new file mode 100644
index 00000000000..1d94132a8b3
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/find.pyx
@@ -0,0 +1,277 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+
+from cudf._lib.cpp.column.column cimport column
+from cudf._lib.cpp.strings cimport find as cpp_find
+from cudf._lib.pylibcudf.column cimport Column
+from cudf._lib.pylibcudf.scalar cimport Scalar
+
+from cython.operator import dereference
+
+from cudf._lib.cpp.scalar.scalar cimport string_scalar
+
+
+cpdef Column find(
+    Column input,
+    ColumnOrScalar target,
+    size_type start=0,
+    size_type stop=-1
+):
+    """Find where the target string first occurs within each input string.
+
+    ``target`` may be a
+    :py:class:`~cudf._lib.pylibcudf.column.Column` or a
+    :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`.
+
+    A scalar target is searched for in every string. A column target is
+    matched row by row: each input string is searched for the target string
+    in the same row.
+
+    For details, see :cpp:func:`find`.
+
+    Parameters
+    ----------
+    input : Column
+        The input strings
+    target : Union[Column, Scalar]
+        String to search for in each string
+    start : size_type
+        First character position to include in the search
+    stop : size_type
+        Last position (exclusive) to include in the search. Default of -1 will
+        search to the end of the string. Only passed to libcudf when
+        ``target`` is a Scalar.
+
+    Returns
+    -------
+    pylibcudf.Column
+        New integer column with character position values
+    """
+    cdef unique_ptr[column] c_result
+
+    if ColumnOrScalar is Column:
+        # The column-target overload is called without ``stop``.
+        with nogil:
+            c_result = move(
+                cpp_find.find(input.view(), target.view(), start)
+            )
+    elif ColumnOrScalar is Scalar:
+        # Reinterpret the wrapped libcudf scalar as a string_scalar.
+        with nogil:
+            c_result = move(
+                cpp_find.find(
+                    input.view(),
+                    dereference(<string_scalar*>(target.c_obj.get())),
+                    start,
+                    stop
+                )
+            )
+    else:
+        # Defensive: the fused type only admits Column or Scalar.
+        raise ValueError(f"Invalid target {target}")
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column rfind(
+    Column input,
+    Scalar target,
+    size_type start=0,
+    size_type stop=-1
+):
+    """
+    Find, for each input string, the character position of the target string
+    when searching from the end of the string.
+
+    For details, see :cpp:func:`rfind`.
+
+    Parameters
+    ----------
+    input : Column
+        The input strings
+    target : Scalar
+        String to search for in each string
+    start : size_type
+        First character position to include in the search
+    stop : size_type
+        Last position (exclusive) to include in the search. Default of -1 will
+        search to the end of the string.
+
+    Returns
+    -------
+    pylibcudf.Column
+        New integer column with character position values
+    """
+    cdef unique_ptr[column] c_result
+    # Reinterpret the wrapped libcudf scalar as a string_scalar for the call.
+    with nogil:
+        c_result = move(
+            cpp_find.rfind(
+                input.view(),
+                dereference(<string_scalar*>(target.c_obj.get())),
+                start, stop
+            )
+        )
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column contains(
+    Column input,
+    ColumnOrScalar target,
+):
+    """
+    Returns a boolean column reporting, for each input string, whether the
+    target string was found within it.
+
+    ``target`` may be a
+    :py:class:`~cudf._lib.pylibcudf.column.Column` or a
+    :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`.
+
+    A scalar target is searched for in every string. A column target is
+    matched row by row: each input string is searched for the target string
+    in the same row.
+
+    For details, see :cpp:func:`contains`.
+
+    Parameters
+    ----------
+    input : Column
+        The input strings
+    target : Union[Column, Scalar]
+        String to search for in each string
+
+    Returns
+    -------
+    pylibcudf.Column
+        New boolean column with True for each string that contains the target
+    """
+    cdef unique_ptr[column] c_result
+
+    if ColumnOrScalar is Column:
+        # Row-wise check against the matching entry of ``target``.
+        with nogil:
+            c_result = move(
+                cpp_find.contains(input.view(), target.view())
+            )
+    elif ColumnOrScalar is Scalar:
+        # Reinterpret the wrapped libcudf scalar as a string_scalar.
+        with nogil:
+            c_result = move(
+                cpp_find.contains(
+                    input.view(),
+                    dereference(<string_scalar*>(target.c_obj.get()))
+                )
+            )
+    else:
+        # Defensive: the fused type only admits Column or Scalar.
+        raise ValueError(f"Invalid target {target}")
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column starts_with(
+    Column input,
+    ColumnOrScalar target,
+):
+    """
+    Returns a boolean column with one value per input string. A value is true
+    when the target string was found at the beginning of the corresponding
+    input string.
+
+    ``target`` may be a
+    :py:class:`~cudf._lib.pylibcudf.column.Column` or a
+    :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`.
+
+    A scalar target is checked against every string. A column target is
+    matched row by row: each input string is checked against the target
+    string in the same row.
+
+    For details, see :cpp:func:`starts_with`.
+
+    Parameters
+    ----------
+    input : Column
+        The input strings
+    target : Union[Column, Scalar]
+        String to search for at the beginning of each string
+
+    Returns
+    -------
+    pylibcudf.Column
+        New boolean column with True for each string that starts with the target
+    """
+    cdef unique_ptr[column] c_result
+
+    if ColumnOrScalar is Column:
+        # Row-wise check against the matching entry of ``target``.
+        with nogil:
+            c_result = move(
+                cpp_find.starts_with(input.view(), target.view())
+            )
+    elif ColumnOrScalar is Scalar:
+        # Reinterpret the wrapped libcudf scalar as a string_scalar.
+        with nogil:
+            c_result = move(
+                cpp_find.starts_with(
+                    input.view(),
+                    dereference(<string_scalar*>(target.c_obj.get()))
+                )
+            )
+    else:
+        # Defensive: the fused type only admits Column or Scalar.
+        raise ValueError(f"Invalid target {target}")
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column ends_with(
+    Column input,
+    ColumnOrScalar target,
+):
+    """
+    Returns a boolean column reporting, for each input string, whether the
+    target string was found at the end of it.
+
+    ``target`` may be a
+    :py:class:`~cudf._lib.pylibcudf.column.Column` or a
+    :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`.
+
+    A scalar target is checked against every string. A column target is
+    matched row by row: each input string is checked against the target
+    string in the same row.
+
+    For details, see :cpp:func:`ends_with`.
+
+    Parameters
+    ----------
+    input : Column
+        The input strings
+    target : Union[Column, Scalar]
+        String to search for at the end of each string
+
+    Returns
+    -------
+    pylibcudf.Column
+        New boolean column with True for each string that ends with the target
+    """
+    cdef unique_ptr[column] c_result
+
+    if ColumnOrScalar is Column:
+        # Row-wise check against the matching entry of ``target``.
+        with nogil:
+            c_result = move(cpp_find.ends_with(input.view(), target.view()))
+    elif ColumnOrScalar is Scalar:
+        # Reinterpret the wrapped libcudf scalar as a string_scalar.
+        with nogil:
+            c_result = move(
+                cpp_find.ends_with(
+                    input.view(),
+                    dereference(<string_scalar*>(target.c_obj.get()))
+                )
+            )
+    else:
+        # Defensive: the fused type only admits Column or Scalar.
+        raise ValueError(f"Invalid target {target}")
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx
index 7ddf4ff4883..aee496e9f1c 100644
--- a/python/cudf/cudf/_lib/scalar.pyx
+++ b/python/cudf/cudf/_lib/scalar.pyx
@@ -354,7 +354,8 @@ def as_device_scalar(val, dtype=None):
 def _is_null_host_scalar(slr):
     if cudf.utils.utils.is_na_like(slr):
         return True
-    elif isinstance(slr, (np.datetime64, np.timedelta64)) and np.isnat(slr):
+    elif (isinstance(slr, (np.datetime64, np.timedelta64)) and np.isnat(slr)) or \
+            slr is pd.NaT:
         return True
     else:
         return False
diff --git a/python/cudf/cudf/_lib/strings/find.pyx b/python/cudf/cudf/_lib/strings/find.pyx
index f6dd3b80de9..341776b102c 100644
--- a/python/cudf/cudf/_lib/strings/find.pyx
+++ b/python/cudf/cudf/_lib/strings/find.pyx
@@ -1,23 +1,10 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
-
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
+import cudf._lib.pylibcudf as plc
 from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
-from cudf._lib.cpp.strings.find cimport (
-    contains as cpp_contains,
-    ends_with as cpp_ends_with,
-    find as cpp_find,
-    rfind as cpp_rfind,
-    starts_with as cpp_starts_with,
-)
 from cudf._lib.cpp.types cimport size_type
-from cudf._lib.scalar cimport DeviceScalar
 
 
 @acquire_spill_lock()
@@ -26,23 +13,13 @@ def contains(Column source_strings, object py_target):
     Returns a Column of boolean values with True for `source_strings`
     that contain the pattern given in `py_target`.
     """
-    cdef DeviceScalar target = py_target.device_value
-
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    cdef const string_scalar* scalar_str = <const string_scalar*>(
-        target.get_raw_ptr()
+    return Column.from_pylibcudf(
+        plc.strings.find.contains(
+            source_strings.to_pylibcudf(mode="read"),
+            py_target.device_value.c_value
+        )
     )
 
-    with nogil:
-        c_result = move(cpp_contains(
-            source_view,
-            scalar_str[0]
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
-
 
 @acquire_spill_lock()
 def contains_multiple(Column source_strings, Column target_strings):
@@ -50,17 +27,12 @@ def contains_multiple(Column source_strings, Column target_strings):
     Returns a Column of boolean values with True for `source_strings`
     that contain the corresponding string in `target_strings`.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef column_view target_view = target_strings.view()
-
-    with nogil:
-        c_result = move(cpp_contains(
-            source_view,
-            target_view
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(
+        plc.strings.find.contains(
+            source_strings.to_pylibcudf(mode="read"),
+            target_strings.to_pylibcudf(mode="read")
+        )
+    )
 
 
 @acquire_spill_lock()
@@ -70,23 +42,13 @@ def endswith(Column source_strings, object py_target):
     that contain strings that end with the pattern given in `py_target`.
     """
 
-    cdef DeviceScalar target = py_target.device_value
-
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    cdef const string_scalar* scalar_str = <const string_scalar*>(
-        target.get_raw_ptr()
+    return Column.from_pylibcudf(
+        plc.strings.find.ends_with(
+            source_strings.to_pylibcudf(mode="read"),
+            py_target.device_value.c_value
+        )
     )
 
-    with nogil:
-        c_result = move(cpp_ends_with(
-            source_view,
-            scalar_str[0]
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
-
 
 @acquire_spill_lock()
 def endswith_multiple(Column source_strings, Column target_strings):
@@ -95,17 +57,12 @@ def endswith_multiple(Column source_strings, Column target_strings):
     that contain strings that end with corresponding location
     in `target_strings`.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef column_view target_view = target_strings.view()
-
-    with nogil:
-        c_result = move(cpp_ends_with(
-            source_view,
-            target_view
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(
+        plc.strings.find.ends_with(
+            source_strings.to_pylibcudf(mode="read"),
+            target_strings.to_pylibcudf(mode="read")
+        )
+    )
 
 
 @acquire_spill_lock()
@@ -114,24 +71,13 @@ def startswith(Column source_strings, object py_target):
     Returns a Column of boolean values with True for `source_strings`
     that contain strings that start with the pattern given in `py_target`.
     """
-
-    cdef DeviceScalar target = py_target.device_value
-
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    cdef const string_scalar* scalar_str = <const string_scalar*>(
-        target.get_raw_ptr()
+    return Column.from_pylibcudf(
+        plc.strings.find.starts_with(
+            source_strings.to_pylibcudf(mode="read"),
+            py_target.device_value.c_value
+        )
     )
 
-    with nogil:
-        c_result = move(cpp_starts_with(
-            source_view,
-            scalar_str[0]
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
-
 
 @acquire_spill_lock()
 def startswith_multiple(Column source_strings, Column target_strings):
@@ -140,17 +86,12 @@ def startswith_multiple(Column source_strings, Column target_strings):
     that contain strings that begin with corresponding location
     in `target_strings`.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef column_view target_view = target_strings.view()
-
-    with nogil:
-        c_result = move(cpp_starts_with(
-            source_view,
-            target_view
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(
+        plc.strings.find.starts_with(
+            source_strings.to_pylibcudf(mode="read"),
+            target_strings.to_pylibcudf(mode="read")
+        )
+    )
 
 
 @acquire_spill_lock()
@@ -164,25 +105,14 @@ def find(Column source_strings,
     Scan portion of strings in `source_strings` can be
     controlled by setting `start` and `end` values.
     """
-
-    cdef DeviceScalar target = py_target.device_value
-
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    cdef const string_scalar* scalar_str = <const string_scalar*>(
-        target.get_raw_ptr()
-    )
-
-    with nogil:
-        c_result = move(cpp_find(
-            source_view,
-            scalar_str[0],
+    return Column.from_pylibcudf(
+        plc.strings.find.find(
+            source_strings.to_pylibcudf(mode="read"),
+            py_target.device_value.c_value,
             start,
             end
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+        )
+    )
 
 
 @acquire_spill_lock()
@@ -197,21 +127,11 @@ def rfind(Column source_strings,
     controlled by setting `start` and `end` values.
     """
 
-    cdef DeviceScalar target = py_target.device_value
-
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    cdef const string_scalar* scalar_str = <const string_scalar*>(
-        target.get_raw_ptr()
-    )
-
-    with nogil:
-        c_result = move(cpp_rfind(
-            source_view,
-            scalar_str[0],
+    return Column.from_pylibcudf(
+        plc.strings.find.rfind(
+            source_strings.to_pylibcudf(mode="read"),
+            py_target.device_value.c_value,
             start,
             end
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+        )
+    )
diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index 742198b1e61..6c116e740ff 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -276,6 +276,7 @@ def __getitem__(self, key):
         raise NotImplementedError()
 
     def __contains__(self, item):
+        hash(item)  # raise TypeError up front if item is unhashable
         return item in self._values
 
     def _copy_type_metadata(
@@ -518,7 +519,7 @@ def where(self, cond, other=None, inplace=False):
         """
         raise NotImplementedError
 
-    def factorize(self, sort=False, na_sentinel=None, use_na_sentinel=None):
+    def factorize(self, sort: bool = False, use_na_sentinel: bool = True):
         raise NotImplementedError
 
     def union(self, other, sort=None):
@@ -2062,7 +2063,13 @@ def dropna(self, how="any"):
             one null value. "all" drops only rows containing
             *all* null values.
         """
-
+        if how not in {"any", "all"}:
+            raise ValueError(f"{how=} must be 'any' or 'all'")
+        try:
+            if not self.hasnans:
+                return self.copy()
+        except NotImplementedError:
+            pass
         # This is to be consistent with IndexedFrame.dropna to handle nans
         # as nulls by default
         data_columns = [
@@ -2206,3 +2213,9 @@ def _split(self, splits):
 
 def _get_result_name(left_name, right_name):
     return left_name if _is_same_name(left_name, right_name) else None
+
+
+def _return_get_indexer_result(result):
+    if cudf.get_option("mode.pandas_compatible"):
+        return result.astype("int64")  # pandas-compatible mode: cast to int64
+    return result
diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py
index 4888cdd9ac9..f04cae719c2 100644
--- a/python/cudf/cudf/core/_internals/timezones.py
+++ b/python/cudf/cudf/core/_internals/timezones.py
@@ -3,23 +3,18 @@
 import os
 import zoneinfo
 from functools import lru_cache
-from typing import Tuple, cast
+from typing import Literal, Tuple
 
 import numpy as np
-import pandas as pd
 
-import cudf
-from cudf._lib.labeling import label_bins
-from cudf._lib.search import search_sorted
 from cudf._lib.timezone import make_timezone_transition_table
-from cudf.core.column.column import as_column, build_column
-from cudf.core.column.datetime import DatetimeColumn, DatetimeTZColumn
-from cudf.core.dataframe import DataFrame
-from cudf.utils.dtypes import _get_base_dtype
+from cudf.core.column.column import as_column
+from cudf.core.column.datetime import DatetimeColumn
+from cudf.core.column.timedelta import TimeDeltaColumn
 
 
 @lru_cache(maxsize=20)
-def get_tz_data(zone_name):
+def get_tz_data(zone_name: str) -> Tuple[DatetimeColumn, TimeDeltaColumn]:
     """
     Return timezone data (transition times and UTC offsets) for the
     given IANA time zone.
@@ -31,8 +26,8 @@ def get_tz_data(zone_name):
 
     Returns
     -------
-    DataFrame with two columns containing the transition times
-    ("transition_times") and corresponding UTC offsets ("offsets").
+    Tuple with two columns containing the transition times
+    and corresponding UTC offsets.
     """
     try:
         # like zoneinfo, we first look in TZPATH
@@ -43,19 +38,23 @@ def get_tz_data(zone_name):
     return tz_table
 
 
-def _find_and_read_tzfile_tzpath(zone_name):
+def _find_and_read_tzfile_tzpath(
+    zone_name: str,
+) -> Tuple[DatetimeColumn, TimeDeltaColumn]:
     for search_path in zoneinfo.TZPATH:
         if os.path.isfile(os.path.join(search_path, zone_name)):
-            return _read_tzfile_as_frame(search_path, zone_name)
+            return _read_tzfile_as_columns(search_path, zone_name)
     raise zoneinfo.ZoneInfoNotFoundError(zone_name)
 
 
-def _find_and_read_tzfile_tzdata(zone_name):
+def _find_and_read_tzfile_tzdata(
+    zone_name: str,
+) -> Tuple[DatetimeColumn, TimeDeltaColumn]:
     import importlib.resources
 
     package_base = "tzdata.zoneinfo"
     try:
-        return _read_tzfile_as_frame(
+        return _read_tzfile_as_columns(
             str(importlib.resources.files(package_base)), zone_name
         )
     # TODO: make it so that the call to libcudf raises a
@@ -77,7 +76,9 @@ def _find_and_read_tzfile_tzdata(zone_name):
         raise zoneinfo.ZoneInfoNotFoundError(zone_name)
 
 
-def _read_tzfile_as_frame(tzdir, zone_name):
+def _read_tzfile_as_columns(
+    tzdir, zone_name: str
+) -> Tuple[DatetimeColumn, TimeDeltaColumn]:
     transition_times_and_offsets = make_timezone_transition_table(
         tzdir, zone_name
     )
@@ -85,91 +86,13 @@ def _read_tzfile_as_frame(tzdir, zone_name):
     if not transition_times_and_offsets:
         # this happens for UTC-like zones
         min_date = np.int64(np.iinfo("int64").min + 1).astype("M8[s]")
-        transition_times_and_offsets = (
-            as_column([min_date]),
-            as_column([np.timedelta64(0, "s")]),
-        )
-
-    return DataFrame._from_data(
-        dict(
-            zip(["transition_times", "offsets"], transition_times_and_offsets)
-        )
-    )
-
+        return (as_column([min_date]), as_column([np.timedelta64(0, "s")]))
+    return tuple(transition_times_and_offsets)  # type: ignore[return-value]
 
-def _find_ambiguous_and_nonexistent(
-    data: DatetimeColumn, zone_name: str
-) -> Tuple:
-    """
-    Recognize ambiguous and nonexistent timestamps for the given timezone.
-
-    Returns a tuple of columns, both of "bool" dtype and of the same
-    size as `data`, that respectively indicate ambiguous and
-    nonexistent timestamps in `data` with the value `True`.
-
-    Ambiguous and/or nonexistent timestamps are only possible if any
-    transitions occur in the time zone database for the given timezone.
-    If no transitions occur, the tuple `(False, False)` is returned.
-    """
-    tz_data_for_zone = get_tz_data(zone_name)
-    transition_times = tz_data_for_zone["transition_times"]
-    offsets = tz_data_for_zone["offsets"].astype(
-        f"timedelta64[{data.time_unit}]"
-    )
 
-    if len(offsets) == 1:  # no transitions
-        return False, False
-
-    transition_times, offsets, old_offsets = (
-        transition_times[1:]._column,
-        offsets[1:]._column,
-        offsets[:-1]._column,
-    )
-
-    # Assume we have two clocks at the moment of transition:
-    # - Clock 1 is turned forward or backwards correctly
-    # - Clock 2 makes no changes
-    clock_1 = transition_times + offsets
-    clock_2 = transition_times + old_offsets
-
-    # At the start of an ambiguous time period, Clock 1 (which has
-    # been turned back) reads less than Clock 2:
-    cond = clock_1 < clock_2
-    ambiguous_begin = clock_1.apply_boolean_mask(cond)
-
-    # The end of an ambiguous time period is what Clock 2 reads at
-    # the moment of transition:
-    ambiguous_end = clock_2.apply_boolean_mask(cond)
-    ambiguous = label_bins(
-        data,
-        left_edges=ambiguous_begin,
-        left_inclusive=True,
-        right_edges=ambiguous_end,
-        right_inclusive=False,
-    ).notnull()
-
-    # At the start of a non-existent time period, Clock 2 reads less
-    # than Clock 1 (which has been turned forward):
-    cond = clock_1 > clock_2
-    nonexistent_begin = clock_2.apply_boolean_mask(cond)
-
-    # The end of the non-existent time period is what Clock 1 reads
-    # at the moment of transition:
-    nonexistent_end = clock_1.apply_boolean_mask(cond)
-    nonexistent = label_bins(
-        data,
-        left_edges=nonexistent_begin,
-        left_inclusive=True,
-        right_edges=nonexistent_end,
-        right_inclusive=False,
-    ).notnull()
-
-    return ambiguous, nonexistent
-
-
-def localize(
-    data: DatetimeColumn, zone_name: str, ambiguous, nonexistent
-) -> DatetimeTZColumn:
+def check_ambiguous_and_nonexistent(
+    ambiguous: Literal["NaT"], nonexistent: Literal["NaT"]
+) -> Tuple[Literal["NaT"], Literal["NaT"]]:
     if ambiguous != "NaT":
         raise NotImplementedError(
             "Only ambiguous='NaT' is currently supported"
@@ -178,80 +101,4 @@ def localize(
         raise NotImplementedError(
             "Only nonexistent='NaT' is currently supported"
         )
-    if isinstance(data, DatetimeTZColumn):
-        raise ValueError(
-            "Already localized. "
-            "Use `tz_convert` to convert between time zones."
-        )
-    dtype = pd.DatetimeTZDtype(data.time_unit, zone_name)
-    ambiguous, nonexistent = _find_ambiguous_and_nonexistent(data, zone_name)
-    localized = cast(
-        DatetimeColumn,
-        data._scatter_by_column(
-            data.isnull() | (ambiguous | nonexistent),
-            cudf.Scalar(cudf.NaT, dtype=data.dtype),
-        ),
-    )
-    gmt_data = local_to_utc(localized, zone_name)
-    return cast(
-        DatetimeTZColumn,
-        build_column(
-            data=gmt_data.base_data,
-            dtype=dtype,
-            mask=localized.base_mask,
-            size=gmt_data.size,
-            offset=gmt_data.offset,
-        ),
-    )
-
-
-def delocalize(data: DatetimeColumn) -> DatetimeColumn:
-    """
-    Convert a timezone-aware datetime column to a timezone-naive one.
-    If the column is already timezone-naive, return it as is.
-    """
-    if isinstance(data, DatetimeTZColumn):
-        return data._local_time
-    # already timezone-naive:
-    return data
-
-
-def convert(data: DatetimeTZColumn, zone_name: str) -> DatetimeTZColumn:
-    if not isinstance(data, DatetimeTZColumn):
-        raise TypeError(
-            "Cannot convert from timezone-naive timestamps to "
-            "timezone-aware timestamps. For that, "
-            "use `tz_localize`."
-        )
-    if zone_name == str(data.dtype.tz):
-        return data.copy()
-    utc_time = data._utc_time
-    out = cast(
-        DatetimeTZColumn,
-        build_column(
-            data=utc_time.base_data,
-            dtype=pd.DatetimeTZDtype(data.time_unit, zone_name),
-            mask=utc_time.base_mask,
-            size=utc_time.size,
-            offset=utc_time.offset,
-        ),
-    )
-    return out
-
-
-def utc_to_local(data: DatetimeColumn, zone_name: str) -> DatetimeColumn:
-    tz_data_for_zone = get_tz_data(zone_name)
-    transition_times, offsets = tz_data_for_zone._columns
-    transition_times = transition_times.astype(_get_base_dtype(data.dtype))
-    indices = search_sorted([transition_times], [data], "right") - 1
-    offsets_from_utc = offsets.take(indices, nullify=True)
-    return data + offsets_from_utc
-
-
-def local_to_utc(data: DatetimeColumn, zone_name: str) -> DatetimeColumn:
-    tz_data_for_zone = get_tz_data(zone_name)
-    transition_times, offsets = tz_data_for_zone._columns
-    transition_times_local = (transition_times + offsets).astype(data.dtype)
-    indices = search_sorted([transition_times_local], [data], "right") - 1
-    offsets_to_utc = offsets.take(indices, nullify=True)
-    return data - offsets_to_utc
+    return ambiguous, nonexistent
diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py
index 33cec21caa5..272abdece9e 100644
--- a/python/cudf/cudf/core/algorithms.py
+++ b/python/cudf/cudf/core/algorithms.py
@@ -142,10 +142,10 @@ def _index_or_values_interpolation(column, index=None):
         BooleanMask(~mask, len(to_interp))
     )
 
-    known_x = known_x_and_y._index._column.values
+    known_x = known_x_and_y.index.to_cupy()
     known_y = known_x_and_y._data.columns[0].values
 
-    result = cp.interp(to_interp._index.values, known_x, known_y)
+    result = cp.interp(index.to_cupy(), known_x, known_y)
 
     # find the first nan
     first_nan_idx = (mask == 0).argmax().item()
diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py
index b2aba4f978b..5c2d77033b8 100644
--- a/python/cudf/cudf/core/buffer/buffer.py
+++ b/python/cudf/cudf/core/buffer/buffer.py
@@ -6,7 +6,7 @@
 import pickle
 import weakref
 from types import SimpleNamespace
-from typing import Any, Dict, Literal, Mapping, Optional, Sequence, Tuple
+from typing import Any, Dict, Literal, Mapping, Optional, Tuple
 
 import numpy
 from typing_extensions import Self
@@ -480,36 +480,6 @@ def __str__(self) -> str:
         )
 
 
-def is_c_contiguous(
-    shape: Sequence[int], strides: Sequence[int], itemsize: int
-) -> bool:
-    """Determine if shape and strides are C-contiguous
-
-    Parameters
-    ----------
-    shape : Sequence[int]
-        Number of elements in each dimension.
-    strides : Sequence[int]
-        The stride of each dimension in bytes.
-    itemsize : int
-        Size of an element in bytes.
-
-    Return
-    ------
-    bool
-        The boolean answer.
-    """
-
-    if any(dim == 0 for dim in shape):
-        return True
-    cumulative_stride = itemsize
-    for dim, stride in zip(reversed(shape), reversed(strides)):
-        if dim > 1 and stride != cumulative_stride:
-            return False
-        cumulative_stride *= dim
-    return True
-
-
 def get_ptr_and_size(array_interface: Mapping) -> Tuple[int, int]:
     """Retrieve the pointer and size from an array interface.
 
@@ -531,7 +501,9 @@ def get_ptr_and_size(array_interface: Mapping) -> Tuple[int, int]:
     shape = array_interface["shape"] or (1,)
     strides = array_interface["strides"]
     itemsize = cudf.dtype(array_interface["typestr"]).itemsize
-    if strides is None or is_c_contiguous(shape, strides, itemsize):
+    if strides is None or cudf._lib.pylibcudf.column.is_c_contiguous(
+        shape, strides, itemsize
+    ):
         nelem = math.prod(shape)
         ptr = array_interface["data"][0] or 0
         return ptr, nelem * itemsize
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index e3e73035046..dc51cd4f28f 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -1045,9 +1045,6 @@ def fillna(
         """
         Fill null values with *fill_value*
         """
-        if not self.nullable:
-            return self
-
         if fill_value is not None:
             fill_is_scalar = np.isscalar(fill_value)
 
@@ -1079,6 +1076,11 @@ def fillna(
                     self.codes.dtype
                 )
 
+        # `fill_value` must be validated above even when there is nothing
+        # to fill, so the non-nullable shortcut is only taken here.
+        if not self.nullable:
+            return self
+
         return super().fillna(fill_value, method=method)
 
     def indices_of(
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 7e48552742c..e23da59b883 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1101,11 +1101,27 @@ def __arrow_array__(self, type=None):
         )
 
     @property
-    def __cuda_array_interface__(self):
-        raise NotImplementedError(
-            f"dtype {self.dtype} is not yet supported via "
-            "`__cuda_array_interface__`"
-        )
+    def __cuda_array_interface__(self) -> abc.Mapping[str, Any]:
+        output = {
+            "shape": (len(self),),
+            "strides": (self.dtype.itemsize,),
+            "typestr": self.dtype.str,
+            "data": (self.data_ptr, False),  # (pointer, read-only flag)
+            "version": 1,
+        }
+        # Only publish a mask when the column actually contains nulls.
+        if self.nullable and self.has_nulls():
+            # Create a simple Python object that exposes the
+            # `__cuda_array_interface__` attribute here since we need to modify
+            # some of the attributes from the numba device array
+            output["mask"] = cuda_array_interface_wrapper(
+                ptr=self.mask_ptr,
+                size=len(self),
+                owner=self.mask,
+                readonly=True,
+                typestr="<t1",
+            )
+        return output
 
     def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
         return _array_ufunc(self, ufunc, method, inputs, kwargs)
@@ -2147,6 +2163,19 @@ def as_column(
                     nan_as_null=nan_as_null,
                     length=length,
                 )
+            elif (
+                isinstance(element, (pd.Timestamp, pd.Timedelta))
+                or element is pd.NaT
+            ):
+                # TODO: Remove this after
+                # https://github.com/apache/arrow/issues/26492
+                # is fixed.
+                return as_column(
+                    pd.Series(arbitrary),
+                    dtype=dtype,
+                    nan_as_null=nan_as_null,
+                    length=length,
+                )
             elif not any(element is na for na in (None, pd.NA, np.nan)):
                 # Might have NA + element like above, but short-circuit if
                 # an element pyarrow/pandas might be able to parse
@@ -2195,7 +2224,7 @@ def _mask_from_cuda_array_interface_desc(obj, cai_mask) -> Buffer:
         raise NotImplementedError(f"Cannot infer mask from typestr {typestr}")
 
 
-def serialize_columns(columns) -> Tuple[List[dict], List]:
+def serialize_columns(columns: list[ColumnBase]) -> Tuple[List[dict], List]:
     """
     Return the headers and frames resulting
     from serializing a list of Column
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index b84c1dc7ccd..9fe4e5da96d 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -7,7 +7,7 @@
 import locale
 import re
 from locale import nl_langinfo
-from typing import Any, Mapping, Optional, Sequence, cast
+from typing import TYPE_CHECKING, Any, Literal, Optional, Sequence, Tuple, cast
 
 import numpy as np
 import pandas as pd
@@ -16,6 +16,8 @@
 
 import cudf
 from cudf import _lib as libcudf
+from cudf._lib.labeling import label_bins
+from cudf._lib.search import search_sorted
 from cudf._typing import (
     ColumnBinaryOperand,
     DatetimeLikeScalar,
@@ -25,12 +27,15 @@
 )
 from cudf.api.types import is_datetime64_dtype, is_scalar, is_timedelta64_dtype
 from cudf.core._compat import PANDAS_GE_220
-from cudf.core.buffer import Buffer, cuda_array_interface_wrapper
+from cudf.core.buffer import Buffer
 from cudf.core.column import ColumnBase, as_column, column, string
 from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion
 from cudf.utils.dtypes import _get_base_dtype
 from cudf.utils.utils import _all_bools_with_nulls
 
+if TYPE_CHECKING:
+    from cudf.core.column.numerical import NumericalColumn
+
 if PANDAS_GE_220:
     _guess_datetime_format = pd.tseries.api.guess_datetime_format
 else:
@@ -399,29 +404,6 @@ def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike:
 
         return NotImplemented
 
-    @property
-    def __cuda_array_interface__(self) -> Mapping[str, Any]:
-        output = {
-            "shape": (len(self),),
-            "strides": (self.dtype.itemsize,),
-            "typestr": self.dtype.str,
-            "data": (self.data_ptr, False),
-            "version": 1,
-        }
-
-        if self.nullable and self.has_nulls():
-            # Create a simple Python object that exposes the
-            # `__cuda_array_interface__` attribute here since we need to modify
-            # some of the attributes from the numba device array
-            output["mask"] = cuda_array_interface_wrapper(
-                ptr=self.mask_ptr,
-                size=len(self),
-                owner=self.mask,
-                readonly=True,
-                typestr="<t1",
-            )
-        return output
-
     def as_datetime_column(
         self, dtype: Dtype, format: str | None = None
     ) -> DatetimeColumn:
@@ -688,6 +670,121 @@ def _with_type_metadata(self, dtype):
             )
         return self
 
+    def _find_ambiguous_and_nonexistent(
+        self, zone_name: str
+    ) -> Tuple[NumericalColumn, NumericalColumn] | Tuple[bool, bool]:
+        """
+        Recognize ambiguous and nonexistent timestamps for the given timezone.
+
+        Returns a tuple of columns, both of "bool" dtype and of the same
+        size as `self`, that respectively indicate ambiguous and
+        nonexistent timestamps in `self` with the value `True`.
+
+        Ambiguous and/or nonexistent timestamps are only possible if any
+        transitions occur in the time zone database for the given timezone.
+        If no transitions occur, the tuple `(False, False)` is returned.
+        """
+        from cudf.core._internals.timezones import get_tz_data
+
+        transition_times, offsets = get_tz_data(zone_name)
+        offsets = offsets.astype(f"timedelta64[{self.time_unit}]")  # type: ignore[assignment]
+
+        if len(offsets) == 1:  # no transitions
+            return False, False
+        # Pair each transition with its new offset and the preceding offset.
+        transition_times, offsets, old_offsets = (
+            transition_times.slice(1, len(transition_times)),
+            offsets.slice(1, len(offsets)),
+            offsets.slice(0, len(offsets) - 1),
+        )
+
+        # Assume we have two clocks at the moment of transition:
+        # - Clock 1 is turned forward or backwards correctly
+        # - Clock 2 makes no changes
+        clock_1 = transition_times + offsets
+        clock_2 = transition_times + old_offsets
+
+        # At the start of an ambiguous time period, Clock 1 (which has
+        # been turned back) reads less than Clock 2:
+        cond = clock_1 < clock_2
+        ambiguous_begin = clock_1.apply_boolean_mask(cond)
+
+        # The end of an ambiguous time period is what Clock 2 reads at
+        # the moment of transition:
+        ambiguous_end = clock_2.apply_boolean_mask(cond)
+        ambiguous = label_bins(
+            self,
+            left_edges=ambiguous_begin,
+            left_inclusive=True,
+            right_edges=ambiguous_end,
+            right_inclusive=False,
+        ).notnull()
+
+        # At the start of a non-existent time period, Clock 2 reads less
+        # than Clock 1 (which has been turned forward):
+        cond = clock_1 > clock_2
+        nonexistent_begin = clock_2.apply_boolean_mask(cond)
+
+        # The end of the non-existent time period is what Clock 1 reads
+        # at the moment of transition:
+        nonexistent_end = clock_1.apply_boolean_mask(cond)
+        nonexistent = label_bins(
+            self,
+            left_edges=nonexistent_begin,
+            left_inclusive=True,
+            right_edges=nonexistent_end,
+            right_inclusive=False,
+        ).notnull()
+
+        return ambiguous, nonexistent
+
+    def tz_localize(
+        self,
+        tz: str | None,
+        ambiguous: Literal["NaT"] = "NaT",
+        nonexistent: Literal["NaT"] = "NaT",
+    ):
+        from cudf.core._internals.timezones import (
+            check_ambiguous_and_nonexistent,
+            get_tz_data,
+        )
+        # tz=None leaves the column timezone-naive, so only a copy is needed.
+        if tz is None:
+            return self.copy()
+        ambiguous, nonexistent = check_ambiguous_and_nonexistent(
+            ambiguous, nonexistent
+        )
+        dtype = pd.DatetimeTZDtype(self.time_unit, tz)
+        ambiguous_col, nonexistent_col = self._find_ambiguous_and_nonexistent(
+            tz
+        )
+        localized = self._scatter_by_column(
+            self.isnull() | (ambiguous_col | nonexistent_col),
+            cudf.Scalar(cudf.NaT, dtype=self.dtype),
+        )
+        # Find the UTC offset in effect for each local time and subtract it.
+        transition_times, offsets = get_tz_data(tz)
+        transition_times_local = (transition_times + offsets).astype(
+            localized.dtype
+        )
+        indices = (
+            search_sorted([transition_times_local], [localized], "right") - 1
+        )
+        offsets_to_utc = offsets.take(indices, nullify=True)
+        gmt_data = localized - offsets_to_utc
+        return DatetimeTZColumn(
+            data=gmt_data.base_data,
+            dtype=dtype,
+            mask=localized.base_mask,
+            size=gmt_data.size,
+            offset=gmt_data.offset,
+        )
+
+    def tz_convert(self, tz: str | None):
+        raise TypeError(
+            "Cannot convert tz-naive timestamps, use tz_localize to localize"
+        )
+
 
 class DatetimeTZColumn(DatetimeColumn):
     def __init__(
@@ -754,9 +851,13 @@ def _utc_time(self):
     @property
     def _local_time(self):
         """Return the local time as naive timestamps."""
-        from cudf.core._internals.timezones import utc_to_local
+        from cudf.core._internals.timezones import get_tz_data
 
-        return utc_to_local(self, str(self.dtype.tz))
+        transition_times, offsets = get_tz_data(str(self.dtype.tz))
+        transition_times = transition_times.astype(_get_base_dtype(self.dtype))
+        indices = search_sorted([transition_times], [self], "right") - 1
+        offsets_from_utc = offsets.take(indices, nullify=True)
+        return self + offsets_from_utc
 
     def as_string_column(
         self, dtype: Dtype, format: str | None = None
@@ -779,3 +880,32 @@ def __repr__(self):
             f"{arr.to_string()}\n"
             f"dtype: {self.dtype}"
         )
+
+    def tz_localize(self, tz: str | None, ambiguous="NaT", nonexistent="NaT"):
+        from cudf.core._internals.timezones import (
+            check_ambiguous_and_nonexistent,
+        )
+
+        if tz is None:
+            return self._local_time  # strip the tz: naive local timestamps
+        ambiguous, nonexistent = check_ambiguous_and_nonexistent(
+            ambiguous, nonexistent
+        )  # option validation runs before the error below
+        raise ValueError(  # any real tz: data is already tz-aware
+            "Already localized. "
+            "Use `tz_convert` to convert between time zones."
+        )
+
+    def tz_convert(self, tz: str | None):
+        if tz is None:
+            return self._utc_time
+        if tz == str(self.dtype.tz):
+            return self.copy()  # already in the requested zone
+        as_utc = self._utc_time
+        return type(self)(  # same buffers; only the dtype's tz differs
+            data=as_utc.base_data,
+            dtype=pd.DatetimeTZDtype(self.time_unit, tz),
+            mask=as_utc.base_mask,
+            size=as_utc.size,
+            offset=as_utc.offset,
+        )
diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
index b83a6ded416..3a0f6649e21 100644
--- a/python/cudf/cudf/core/column/decimal.py
+++ b/python/cudf/cudf/core/column/decimal.py
@@ -38,6 +38,12 @@ class DecimalBaseColumn(NumericalBaseColumn):
     dtype: DecimalDtype
     _VALID_BINARY_OPERATIONS = BinaryOperand._SUPPORTED_BINARY_OPERATIONS
 
+    @property
+    def __cuda_array_interface__(self):  # unsupported: always raises
+        raise NotImplementedError(
+            "Decimals are not yet supported via `__cuda_array_interface__`"
+        )
+
     def as_decimal_column(
         self,
         dtype: Dtype,
@@ -342,12 +348,6 @@ def to_arrow(self):
             buffers=[mask_buf, data_buf],
         )
 
-    @property
-    def __cuda_array_interface__(self):
-        raise NotImplementedError(
-            "Decimals are not yet supported via `__cuda_array_interface__`"
-        )
-
     def _with_type_metadata(
         self: "cudf.core.column.Decimal64Column", dtype: Dtype
     ) -> "cudf.core.column.Decimal64Column":
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index f42c87de3fd..f6c7ca7675a 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -3,16 +3,7 @@
 from __future__ import annotations
 
 import functools
-from typing import (
-    Any,
-    Callable,
-    Mapping,
-    Optional,
-    Sequence,
-    Tuple,
-    Union,
-    cast,
-)
+from typing import Any, Callable, Optional, Sequence, Tuple, Union, cast
 
 import cupy as cp
 import numpy as np
@@ -37,7 +28,7 @@
     is_integer_dtype,
     is_scalar,
 )
-from cudf.core.buffer import Buffer, cuda_array_interface_wrapper
+from cudf.core.buffer import Buffer
 from cudf.core.column import (
     ColumnBase,
     as_column,
@@ -116,15 +107,14 @@ def __contains__(self, item: ScalarLike) -> bool:
         # Handles improper item types
         # Fails if item is of type None, so the handler.
         try:
-            if np.can_cast(item, self.dtype):
-                item = self.dtype.type(item)
-            else:
+            search_item = self.dtype.type(item)
+            if search_item != item and self.dtype.kind != "f":
                 return False
         except (TypeError, ValueError):
             return False
         # TODO: Use `scalar`-based `contains` wrapper
         return libcudf.search.contains(
-            self, column.as_column([item], dtype=self.dtype)
+            self, column.as_column([search_item], dtype=self.dtype)
         ).any()
 
     def indices_of(self, value: ScalarLike) -> NumericalColumn:
@@ -194,30 +184,6 @@ def __setitem__(self, key: Any, value: Any):
         if out:
             self._mimic_inplace(out, inplace=True)
 
-    @property
-    def __cuda_array_interface__(self) -> Mapping[str, Any]:
-        output = {
-            "shape": (len(self),),
-            "strides": (self.dtype.itemsize,),
-            "typestr": self.dtype.str,
-            "data": (self.data_ptr, False),
-            "version": 1,
-        }
-
-        if self.nullable and self.has_nulls():
-            # Create a simple Python object that exposes the
-            # `__cuda_array_interface__` attribute here since we need to modify
-            # some of the attributes from the numba device array
-            output["mask"] = cuda_array_interface_wrapper(
-                ptr=self.mask_ptr,
-                size=len(self),
-                owner=self.mask,
-                readonly=True,
-                typestr="<t1",
-            )
-
-        return output
-
     def unary_operator(self, unaryop: Union[str, Callable]) -> ColumnBase:
         if callable(unaryop):
             return libcudf.transform.transform(self, unaryop)
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 0862995bc46..3e941d60079 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -692,7 +692,7 @@ def contains(
 
         Returning an Index of booleans using only a literal pattern.
 
-        >>> data = ['Mouse', 'dog', 'house and parrot', '23.0', np.NaN]
+        >>> data = ['Mouse', 'dog', 'house and parrot', '23.0', np.nan]
         >>> idx = cudf.Index(data)
         >>> idx
         Index(['Mouse', 'dog', 'house and parrot', '23.0', None], dtype='object')
@@ -5600,6 +5600,13 @@ def data_array_view(
     ) -> cuda.devicearray.DeviceNDArray:
         raise ValueError("Cannot get an array view of a StringColumn")
 
+    @property
+    def __cuda_array_interface__(self):  # unsupported: always raises
+        raise NotImplementedError(
+            f"dtype {self.dtype} is not yet supported via "
+            "`__cuda_array_interface__`"
+        )
+
     def to_arrow(self) -> pa.Array:
         """Convert to PyArrow Array
 
diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index 33085bede78..fbce6e02330 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import itertools
+import sys
 from collections import abc
 from functools import cached_property, reduce
 from typing import (
@@ -174,6 +175,38 @@ def __repr__(self) -> str:
         )
         return f"{type_info}\n{column_info}"
 
+    def _from_columns_like_self(
+        self, columns: abc.Iterable[ColumnBase], verify: bool = True
+    ):
+        """
+        Return a new ColumnAccessor with columns and the properties of self.
+
+        Parameters
+        ----------
+        columns : iterable of Columns
+            New columns, matched in order to the existing labels.
+        verify : bool, optional
+            Whether to verify column length and type.
+        """
+        if sys.version_info >= (3, 10):  # zip(strict=) needs 3.10+
+            data = zip(self.names, columns, strict=True)
+        else:
+            columns = list(columns)
+            if len(columns) != len(self.names):
+                raise ValueError(  # same error type as strict zip
+                    f"The number of columns ({len(columns)}) must match "
+                    f"the number of existing column labels ({len(self.names)})."
+                )
+            data = zip(self.names, columns)
+        return type(self)(
+            data=dict(data),
+            multiindex=self.multiindex,
+            level_names=self.level_names,
+            rangeindex=self.rangeindex,
+            label_dtype=self.label_dtype,
+            verify=verify,
+        )
+
     @property
     def level_names(self) -> Tuple[Any, ...]:
         if self._level_names is None or len(self._level_names) == 0:
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 45bb66d5d4b..6928425a867 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -1215,7 +1215,7 @@ def dtypes(self):
         >>> df.dtypes
         float              float64
         int                  int64
-        datetime    datetime64[us]
+        datetime    datetime64[ns]
         string              object
         dtype: object
         """
@@ -1768,7 +1768,7 @@ def _concat(
                 indices[:first_data_column_position],
             )
             if not isinstance(out._index, MultiIndex) and isinstance(
-                out._index._values.dtype, cudf.CategoricalDtype
+                out._index.dtype, cudf.CategoricalDtype
             ):
                 out = out.set_index(
                     cudf.core.index.as_index(out.index._values)
@@ -3036,8 +3036,11 @@ def where(self, cond, other=None, inplace=False):
 
         # First process the condition.
         if isinstance(cond, Series):
-            cond = self._from_data_like_self(
-                {name: cond._column for name in self._column_names},
+            cond = self._from_data(
+                self._data._from_columns_like_self(
+                    itertools.repeat(cond._column, len(self._column_names)),
+                    verify=False,
+                )
             )
         elif hasattr(cond, "__cuda_array_interface__"):
             cond = DataFrame(
@@ -3078,7 +3081,7 @@ def where(self, cond, other=None, inplace=False):
                 should be equal to number of columns of self"""
             )
 
-        out = {}
+        out = []
         for (name, col), other_col in zip(self._data.items(), other_cols):
             col, other_col = _check_and_cast_columns_with_other(
                 source_col=col,
@@ -3091,16 +3094,17 @@ def where(self, cond, other=None, inplace=False):
                     col, other_col, cond_col
                 )
 
-                out[name] = _make_categorical_like(result, self._data[name])
+                out.append(_make_categorical_like(result, self._data[name]))
             else:
                 out_mask = cudf._lib.null_mask.create_null_mask(
                     len(col),
                     state=cudf._lib.null_mask.MaskState.ALL_NULL,
                 )
-                out[name] = col.set_mask(out_mask)
+                out.append(col.set_mask(out_mask))
 
         return self._mimic_inplace(
-            self._from_data_like_self(out), inplace=inplace
+            self._from_data_like_self(self._data._from_columns_like_self(out)),
+            inplace=inplace,
         )
 
     @docutils.doc_apply(
@@ -3578,7 +3582,7 @@ def rename(
         if index:
             if (
                 any(isinstance(item, str) for item in index.values())
-                and type(self.index._values) != cudf.core.column.StringColumn
+                and self.index.dtype != "object"
             ):
                 raise NotImplementedError(
                     "Implicit conversion of index to "
@@ -7556,6 +7560,12 @@ def interleave_columns(self):
         Returns
         -------
         The interleaved columns as a single column
+
+        .. pandas-compat::
+            **DataFrame.interleave_columns**
+
+            This method does not exist in pandas but it can be run
+            as ``pd.Series(np.vstack(df.to_numpy()).reshape((-1,)))``.
         """
         if ("category" == self.dtypes).any():
             raise ValueError(
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 01842b5f0a9..58932db2bda 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -762,10 +762,17 @@ def fillna(
             else:
                 replace_val = None
             should_fill = (
-                col_name in value
-                and col.has_nulls(include_nan=True)
-                and not libcudf.scalar._is_null_host_scalar(replace_val)
-            ) or method is not None
+                (
+                    col_name in value
+                    and col.has_nulls(include_nan=True)
+                    and not libcudf.scalar._is_null_host_scalar(replace_val)
+                )
+                or method is not None
+                or (
+                    isinstance(col, cudf.core.column.CategoricalColumn)
+                    and not libcudf.scalar._is_null_host_scalar(replace_val)
+                )
+            )
             if should_fill:
                 filled_data[col_name] = col.fillna(replace_val, method)
             else:
@@ -1077,7 +1084,7 @@ def isna(self):
         >>> import cudf
         >>> import numpy as np
         >>> import pandas as pd
-        >>> df = cudf.DataFrame({'age': [5, 6, np.NaN],
+        >>> df = cudf.DataFrame({'age': [5, 6, np.nan],
         ...                    'born': [pd.NaT, pd.Timestamp('1939-05-27'),
         ...                             pd.Timestamp('1940-04-25')],
         ...                    'name': ['Alfred', 'Batman', ''],
@@ -1095,7 +1102,7 @@ def isna(self):
 
         Show which entries in a Series are NA.
 
-        >>> ser = cudf.Series([5, 6, np.NaN, np.inf, -np.inf])
+        >>> ser = cudf.Series([5, 6, np.nan, np.inf, -np.inf])
         >>> ser
         0     5.0
         1     6.0
@@ -1113,14 +1120,16 @@ def isna(self):
 
         Show which entries in an Index are NA.
 
-        >>> idx = cudf.Index([1, 2, None, np.NaN, 0.32, np.inf])
+        >>> idx = cudf.Index([1, 2, None, np.nan, 0.32, np.inf])
         >>> idx
         Index([1.0, 2.0, <NA>, <NA>, 0.32, Inf], dtype='float64')
         >>> idx.isna()
         array([False, False,  True,  True, False, False])
         """
         data_columns = (col.isnull() for col in self._columns)
-        return self._from_data_like_self(zip(self._column_names, data_columns))
+        return self._from_data_like_self(
+            self._data._from_columns_like_self(data_columns)
+        )
 
     # Alias for isna
     isnull = isna
@@ -1156,7 +1165,7 @@ def notna(self):
         >>> import cudf
         >>> import numpy as np
         >>> import pandas as pd
-        >>> df = cudf.DataFrame({'age': [5, 6, np.NaN],
+        >>> df = cudf.DataFrame({'age': [5, 6, np.nan],
         ...                    'born': [pd.NaT, pd.Timestamp('1939-05-27'),
         ...                             pd.Timestamp('1940-04-25')],
         ...                    'name': ['Alfred', 'Batman', ''],
@@ -1174,7 +1183,7 @@ def notna(self):
 
         Show which entries in a Series are NA.
 
-        >>> ser = cudf.Series([5, 6, np.NaN, np.inf, -np.inf])
+        >>> ser = cudf.Series([5, 6, np.nan, np.inf, -np.inf])
         >>> ser
         0     5.0
         1     6.0
@@ -1192,14 +1201,16 @@ def notna(self):
 
         Show which entries in an Index are NA.
 
-        >>> idx = cudf.Index([1, 2, None, np.NaN, 0.32, np.inf])
+        >>> idx = cudf.Index([1, 2, None, np.nan, 0.32, np.inf])
         >>> idx
         Index([1.0, 2.0, <NA>, <NA>, 0.32, Inf], dtype='float64')
         >>> idx.notna()
         array([ True,  True, False, False,  True,  True])
         """
         data_columns = (col.notnull() for col in self._columns)
-        return self._from_data_like_self(zip(self._column_names, data_columns))
+        return self._from_data_like_self(
+            self._data._from_columns_like_self(data_columns)
+        )
 
     # Alias for notna
     notnull = notna
@@ -1506,7 +1517,9 @@ def _encode(self):
     @_cudf_nvtx_annotate
     def _unaryop(self, op):
         data_columns = (col.unary_operator(op) for col in self._columns)
-        return self._from_data_like_self(zip(self._column_names, data_columns))
+        return self._from_data_like_self(
+            self._data._from_columns_like_self(data_columns)
+        )
 
     @classmethod
     @_cudf_nvtx_annotate
@@ -1638,12 +1651,14 @@ def _apply_cupy_ufunc_to_operands(
     def __neg__(self):
         """Negate for integral dtypes, logical NOT for bools."""
         return self._from_data_like_self(
-            {
-                name: col.unary_operator("not")
-                if is_bool_dtype(col.dtype)
-                else -1 * col
-                for name, col in self._data.items()
-            }
+            self._data._from_columns_like_self(
+                (
+                    col.unary_operator("not")
+                    if col.dtype.kind == "b"
+                    else -1 * col
+                    for col in self._data.columns
+                )
+            )
         )
 
     @_cudf_nvtx_annotate
@@ -1897,10 +1912,9 @@ def __copy__(self):
     def __invert__(self):
         """Bitwise invert (~) for integral dtypes, logical NOT for bools."""
         return self._from_data_like_self(
-            {
-                name: _apply_inverse_column(col)
-                for name, col in self._data.items()
-            }
+            self._data._from_columns_like_self(
+                (_apply_inverse_column(col) for col in self._data.columns)
+            )
         )
 
     @_cudf_nvtx_annotate
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index eeaa6edb972..0622650cc29 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -22,6 +22,7 @@
 import cupy
 import numpy as np
 import pandas as pd
+import pyarrow as pa
 from typing_extensions import Self
 
 import cudf
@@ -38,7 +39,7 @@
     is_list_like,
     is_scalar,
 )
-from cudf.core._base_index import BaseIndex
+from cudf.core._base_index import BaseIndex, _return_get_indexer_result
 from cudf.core._compat import PANDAS_LT_300
 from cudf.core.column import (
     CategoricalColumn,
@@ -60,6 +61,7 @@
 from cudf.core.single_column_frame import SingleColumnFrame
 from cudf.utils.docutils import copy_docstring
 from cudf.utils.dtypes import (
+    _NUMPY_SCTYPES,
     _maybe_convert_to_default_type,
     find_common_type,
     is_mixed_with_object_dtype,
@@ -249,6 +251,15 @@ def searchsorted(
         ), "Invalid ascending flag"
         return search_range(value, self._range, side=side)
 
+    def factorize(self, sort: bool = False, use_na_sentinel: bool = True):
+        if sort and self.step < 0:  # descending range: reverse to sort
+            codes = cupy.arange(len(self) - 1, -1, -1, dtype=np.intp)
+            uniques = self[::-1]
+        else:  # values are unique and already in the order to return
+            codes = cupy.arange(len(self), dtype=np.intp)
+            uniques = self
+        return codes, uniques
+
     @property  # type: ignore
     @_cudf_nvtx_annotate
     def name(self):
@@ -261,7 +272,7 @@ def name(self, value):
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def start(self):
+    def start(self) -> int:
         """
         The value of the `start` parameter (0 if this was not supplied).
         """
@@ -269,7 +280,7 @@ def start(self):
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def stop(self):
+    def stop(self) -> int:
         """
         The value of the stop parameter.
         """
@@ -277,7 +288,7 @@ def stop(self):
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def step(self):
+    def step(self) -> int:
         """
         The value of the step parameter.
         """
@@ -285,7 +296,7 @@ def step(self):
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def _num_rows(self):
+    def _num_rows(self) -> int:
         return len(self)
 
     @cached_property  # type: ignore
@@ -296,33 +307,33 @@ def _values(self):
         else:
             return column.column_empty(0, masked=False, dtype=self.dtype)
 
-    def _clean_nulls_from_index(self):
+    def _clean_nulls_from_index(self) -> Self:
         return self
 
-    def _is_numeric(self):
+    def _is_numeric(self) -> bool:
         return True
 
-    def _is_boolean(self):
+    def _is_boolean(self) -> bool:
         return False
 
-    def _is_integer(self):
+    def _is_integer(self) -> bool:
         return True
 
-    def _is_floating(self):
+    def _is_floating(self) -> bool:
         return False
 
-    def _is_object(self):
+    def _is_object(self) -> bool:
         return False
 
-    def _is_categorical(self):
+    def _is_categorical(self) -> bool:
         return False
 
-    def _is_interval(self):
+    def _is_interval(self) -> bool:
         return False
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def hasnans(self):
+    def hasnans(self) -> bool:
         return False
 
     @property  # type: ignore
@@ -334,8 +345,12 @@ def _data(self):
 
     @_cudf_nvtx_annotate
     def __contains__(self, item):
+        hash(item)
         if isinstance(item, bool) or not isinstance(
-            item, tuple(np.sctypes["int"] + np.sctypes["float"] + [int, float])
+            item,
+            tuple(
+                _NUMPY_SCTYPES["int"] + _NUMPY_SCTYPES["float"] + [int, float]
+            ),
         ):
             return False
         try:
@@ -370,12 +385,15 @@ def astype(self, dtype, copy: bool = True):
             return self
         return self._as_int_index().astype(dtype, copy=copy)
 
+    def fillna(self, value, downcast=None):
+        return self.copy()  # a RangeIndex holds no nulls to fill
+
     @_cudf_nvtx_annotate
     def drop_duplicates(self, keep="first"):
         return self
 
     @_cudf_nvtx_annotate
-    def duplicated(self, keep="first"):
+    def duplicated(self, keep="first") -> cupy.ndarray:
         return cupy.zeros(len(self), dtype=bool)
 
     @_cudf_nvtx_annotate
@@ -391,6 +409,11 @@ def __repr__(self):
             + ")"
         )
 
+    @property
+    @_cudf_nvtx_annotate
+    def size(self) -> int:
+        return len(self)  # same count that __len__ reports
+
     @_cudf_nvtx_annotate
     def __len__(self):
         return len(self._range)
@@ -479,12 +502,12 @@ def to_pandas(
         )
 
     @property
-    def is_unique(self):
+    def is_unique(self) -> bool:
         return True
 
     @cached_property  # type: ignore
     @_cudf_nvtx_annotate
-    def is_monotonic_increasing(self):
+    def is_monotonic_increasing(self) -> bool:
         return self.step > 0 or len(self) <= 1
 
     @cached_property  # type: ignore
@@ -493,7 +516,7 @@ def is_monotonic_decreasing(self):
         return self.step < 0 or len(self) <= 1
 
     @_cudf_nvtx_annotate
-    def memory_usage(self, deep=False):
+    def memory_usage(self, deep: bool = False) -> int:
         if deep:
             warnings.warn(
                 "The deep parameter is ignored and is only included "
@@ -501,7 +524,7 @@ def memory_usage(self, deep=False):
             )
         return 0
 
-    def unique(self):
+    def unique(self) -> Self:
         # RangeIndex always has unique values
         return self
 
@@ -824,34 +847,37 @@ def _columns(self):
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def values_host(self):
-        return self.to_pandas().values
+    def values_host(self) -> np.ndarray:
+        return np.arange(start=self.start, stop=self.stop, step=self.step)
 
     @_cudf_nvtx_annotate
     def argsort(
         self,
         ascending=True,
         na_position="last",
-    ):
+    ) -> cupy.ndarray:
         if na_position not in {"first", "last"}:
             raise ValueError(f"invalid na_position: {na_position}")
-
-        indices = cupy.arange(0, len(self))
         if (ascending and self.step < 0) or (not ascending and self.step > 0):
-            indices = indices[::-1]
-        return indices
+            return cupy.arange(len(self) - 1, -1, -1)
+        else:
+            return cupy.arange(len(self))
 
     @_cudf_nvtx_annotate
     def where(self, cond, other=None, inplace=False):
         return self._as_int_index().where(cond, other, inplace)
 
     @_cudf_nvtx_annotate
-    def to_numpy(self):
+    def to_numpy(self) -> np.ndarray:
         return self.values_host
 
     @_cudf_nvtx_annotate
-    def to_arrow(self):
-        return self._as_int_index().to_arrow()
+    def to_cupy(self) -> cupy.ndarray:
+        return self.values
+
+    @_cudf_nvtx_annotate
+    def to_arrow(self) -> pa.Array:  # Arrow array typed to match self.dtype
+        return pa.array(self._range, type=pa.from_numpy_dtype(self.dtype))
 
     def __array__(self, dtype=None):
         raise TypeError(
@@ -862,17 +888,17 @@ def __array__(self, dtype=None):
         )
 
     @_cudf_nvtx_annotate
-    def nunique(self):
+    def nunique(self) -> int:
         return len(self)
 
     @_cudf_nvtx_annotate
-    def isna(self):
+    def isna(self) -> cupy.ndarray:
         return cupy.zeros(len(self), dtype=bool)
 
     isnull = isna
 
     @_cudf_nvtx_annotate
-    def notna(self):
+    def notna(self) -> cupy.ndarray:
         return cupy.ones(len(self), dtype=bool)
 
     notnull = isna
@@ -896,12 +922,15 @@ def max(self):
         return self._minmax("max")
 
     @property
-    def values(self):
+    def values(self) -> cupy.ndarray:
         return cupy.arange(self.start, self.stop, self.step)
 
-    def any(self):
+    def any(self) -> bool:
         return any(self._range)
 
+    def all(self) -> bool:
+        return 0 not in self._range  # only a zero element is falsy
+
     def append(self, other):
         result = self._as_int_index().append(other)
         return self._try_reconstruct_range_index(result)
@@ -927,14 +956,20 @@ def isin(self, values):
 
         return self._values.isin(values).values
 
-    def __neg__(self):
-        return -self._as_int_index()
+    def __pos__(self) -> Self:
+        return self.copy()  # unary plus changes nothing
 
-    def __pos__(self):
-        return +self._as_int_index()
+    def __neg__(self) -> Self:
+        flipped = range(-self.start, -self.stop, -self.step)  # values negated
+        return type(self)(flipped, name=self.name)
 
-    def __abs__(self):
-        return abs(self._as_int_index())
+    def __abs__(self) -> Self | Index:
+        if len(self) == 0 or self.min() >= 0:
+            return self.copy()  # nothing negative: values are unchanged
+        if self.max() <= 0:
+            return -self  # nothing positive: plain negation is enough
+        # Mixed signs: fall back to the integer index returned by the helper.
+        return abs(self._as_int_index())
 
     def _columns_for_reset_index(
         self, levels: tuple | None
@@ -1093,14 +1128,26 @@ def _concat(cls, objs):
             assert (
                 PANDAS_LT_300
             ), "Need to drop after pandas-3.0 support is added."
-            warnings.warn(
+            warning_msg = (
                 "The behavior of array concatenation with empty entries is "
                 "deprecated. In a future version, this will no longer exclude "
                 "empty items when determining the result dtype. "
                 "To retain the old behavior, exclude the empty entries before "
-                "the concat operation.",
-                FutureWarning,
+                "the concat operation."
             )
+            # Warn only if the type might _actually_ change
+            if len(non_empties) == 0:
+                if not all(objs[0].dtype == index.dtype for index in objs[1:]):
+                    warnings.warn(warning_msg, FutureWarning)
+            else:
+                common_all_type = find_common_type(
+                    [index.dtype for index in objs]
+                )
+                common_non_empty_type = find_common_type(
+                    [index.dtype for index in non_empties]
+                )
+                if common_all_type != common_non_empty_type:
+                    warnings.warn(warning_msg, FutureWarning)
         if all(isinstance(obj, RangeIndex) for obj in non_empties):
             result = _concat_range_index(non_empties)
         else:
@@ -1218,11 +1265,11 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
         )
 
         if not len(self):
-            return result.values
+            return _return_get_indexer_result(result.values)
         try:
             lcol, rcol = _match_join_keys(needle, self._column, "inner")
         except ValueError:
-            return result.values
+            return _return_get_indexer_result(result.values)
 
         scatter_map, indices = libcudf.join.join([lcol], [rcol], how="inner")
         (result,) = libcudf.copying.scatter([indices], scatter_map, [result])
@@ -1249,7 +1296,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
                 "{['ffill'/'pad', 'bfill'/'backfill', 'nearest', None]}"
             )
 
-        return result_series.to_cupy()
+        return _return_get_indexer_result(result_series.to_cupy())
 
     @_cudf_nvtx_annotate
     def get_loc(self, key):
@@ -1485,6 +1532,7 @@ def values(self):
         return self._column.values
 
     def __contains__(self, item):
+        hash(item)
         return item in self._values
 
     def _clean_nulls_from_index(self):
@@ -2232,7 +2280,12 @@ def round(self, freq):
 
         return self.__class__._from_data({self.name: out_column})
 
-    def tz_localize(self, tz, ambiguous="NaT", nonexistent="NaT"):
+    def tz_localize(
+        self,
+        tz: str | None,
+        ambiguous: Literal["NaT"] = "NaT",
+        nonexistent: Literal["NaT"] = "NaT",
+    ):
         """
         Localize timezone-naive data to timezone-aware data.
 
@@ -2274,17 +2327,12 @@ def tz_localize(self, tz, ambiguous="NaT", nonexistent="NaT"):
         ambiguous or nonexistent timestamps are converted
         to 'NaT'.
         """  # noqa: E501
-        from cudf.core._internals.timezones import delocalize, localize
-
-        if tz is None:
-            result_col = delocalize(self._column)
-        else:
-            result_col = localize(self._column, tz, ambiguous, nonexistent)
+        result_col = self._column.tz_localize(tz, ambiguous, nonexistent)
         return DatetimeIndex._from_data(
             {self.name: result_col}, freq=self._freq
         )
 
-    def tz_convert(self, tz):
+    def tz_convert(self, tz: str | None):
         """
         Convert tz-aware datetimes from one time zone to another.
 
@@ -2316,12 +2364,7 @@ def tz_convert(self, tz):
                        '2018-03-03 14:00:00+00:00'],
                       dtype='datetime64[ns, Europe/London]')
         """  # noqa: E501
-        from cudf.core._internals.timezones import convert
-
-        if tz is None:
-            result_col = self._column._utc_time
-        else:
-            result_col = convert(self._column, tz)
+        result_col = self._column.tz_convert(tz)
         return DatetimeIndex._from_data({self.name: result_col})
 
 
@@ -2847,7 +2890,7 @@ def __init__(
 
     @property
     def closed(self):
-        return self._values.dtype.closed
+        return self.dtype.closed
 
     @classmethod
     @_cudf_nvtx_annotate
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index e161723747b..dc261707867 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -174,7 +174,7 @@ def _indices_from_labels(obj, labels):
 
         if isinstance(obj.index.dtype, cudf.CategoricalDtype):
             labels = labels.astype("category")
-            codes = labels.codes.astype(obj.index._values.codes.dtype)
+            codes = labels.codes.astype(obj.index.codes.dtype)
             labels = cudf.core.column.build_categorical_column(
                 categories=labels.dtype.categories,
                 codes=codes,
@@ -1903,13 +1903,15 @@ def nans_to_nulls(self):
         1  <NA>  3.14
         2  <NA>  <NA>
         """
-        result_data = {}
-        for name, col in self._data.items():
-            try:
-                result_data[name] = col.nans_to_nulls()
-            except AttributeError:
-                result_data[name] = col.copy()
-        return self._from_data_like_self(result_data)
+        result = (
+            col.nans_to_nulls()
+            if isinstance(col, cudf.core.column.NumericalColumn)
+            else col.copy()
+            for col in self._data.columns
+        )
+        return self._from_data_like_self(
+            self._data._from_columns_like_self(result)
+        )
 
     def _copy_type_metadata(
         self,
@@ -6301,7 +6303,12 @@ def __dask_tokenize__(self):
 
         return [
             type(self),
-            normalize_token(self._dtypes),
+            str(self._dtypes),
+            *[
+                normalize_token(cat.categories)
+                for cat in self._dtypes.values()
+                if cat == "category"
+            ],
             normalize_token(self.index),
             normalize_token(self.hash_values().values_host),
         ]
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index cd89cb74db6..58a2846bf43 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -24,6 +24,7 @@
 from cudf.api.extensions import no_default
 from cudf.api.types import is_integer, is_list_like, is_object_dtype
 from cudf.core import column
+from cudf.core._base_index import _return_get_indexer_result
 from cudf.core.frame import Frame
 from cudf.core.index import (
     BaseIndex,
@@ -1859,11 +1860,11 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
             dtype=libcudf.types.size_type_dtype,
         )
         if not len(self):
-            return result.values
+            return _return_get_indexer_result(result.values)
         try:
             target = cudf.MultiIndex.from_tuples(target)
         except TypeError:
-            return result.values
+            return _return_get_indexer_result(result.values)
 
         join_keys = [
             _match_join_keys(lcol, rcol, "inner")
@@ -1893,7 +1894,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
                 "{['ffill'/'pad', 'bfill'/'backfill', None]}"
             )
 
-        return result_series.to_cupy()
+        return _return_get_indexer_result(result_series.to_cupy())
 
     @_cudf_nvtx_annotate
     def get_loc(self, key):
@@ -2112,6 +2113,8 @@ def _columns_for_reset_index(
             yield from self._split_columns_by_levels(levels, in_levels=True)
 
     def repeat(self, repeats, axis=None):
-        return self._from_columns_like_self(
-            Frame._repeat([*self._columns], repeats, axis), self._column_names
+        return self._from_data(
+            self._data._from_columns_like_self(
+                super()._repeat([*self._columns], repeats, axis)
+            )
         )
diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
index 9008d2f3a1b..26d91bed173 100644
--- a/python/cudf/cudf/core/reshape.py
+++ b/python/cudf/cudf/core/reshape.py
@@ -122,9 +122,10 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
 
     Parameters
     ----------
-    objs : list of DataFrame, Series, or Index
+    objs : list or dictionary of DataFrame, Series, or Index
     axis : {0/'index', 1/'columns'}, default 0
         The axis to concatenate along.
+        `axis=1` must be passed if a dictionary is passed.
     join : {'inner', 'outer'}, default 'outer'
         How to handle indexes on other axis (or axes).
     ignore_index : bool, default False
@@ -231,27 +232,71 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
       letter  number  animal    name
     0      a       1    bird   polly
     1      b       2  monkey  george
+
+    Combine a dictionary of DataFrame objects horizontally:
+
+    >>> d = {'first': df1, 'second': df2}
+    >>> cudf.concat(d, axis=1)
+      first           second
+      letter  number  letter  number
+    0      a       1       c       3
+    1      b       2       d       4
     """
     # TODO: Do we really need to have different error messages for an empty
     # list and a list of None?
     if not objs:
         raise ValueError("No objects to concatenate")
 
-    objs = [obj for obj in objs if obj is not None]
-
-    if not objs:
-        raise ValueError("All objects passed were None")
-
     axis = _AXIS_MAP.get(axis, None)
     if axis is None:
         raise ValueError(
             f'`axis` must be 0 / "index" or 1 / "columns", got: {axis}'
         )
 
+    if isinstance(objs, dict):
+        if axis != 1:
+            raise NotImplementedError(
+                f"Can only concatenate dictionary input along axis=1, not {axis}"
+            )
+        objs = {k: obj for k, obj in objs.items() if obj is not None}
+        keys = list(objs)
+        objs = list(objs.values())
+        if any(isinstance(o, cudf.BaseIndex) for o in objs):
+            raise TypeError(
+                "cannot concatenate a dictionary containing indices"
+            )
+    else:
+        objs = [obj for obj in objs if obj is not None]
+        keys = None
+
+    if not objs:
+        raise ValueError("All objects passed were None")
+
+    # Retrieve the base types of `objs`. In order to support sub-types
+    # and object wrappers, we use `isinstance()` instead of comparing
+    # types directly
+    allowed_typs = {
+        cudf.Series,
+        cudf.DataFrame,
+        cudf.BaseIndex,
+    }
+    if not all(isinstance(o, tuple(allowed_typs)) for o in objs):
+        raise TypeError(
+            f"can only concatenate objects which are instances of "
+            f"{allowed_typs}, instead received {[type(o) for o in objs]}"
+        )
+
+    if any(isinstance(o, cudf.BaseIndex) for o in objs):
+        if not all(isinstance(o, cudf.BaseIndex) for o in objs):
+            raise TypeError(
+                "when concatenating indices you must provide ONLY indices"
+            )
+
+    only_series = all(isinstance(o, cudf.Series) for o in objs)
+
     # Return for single object
     if len(objs) == 1:
         obj = objs[0]
-
         if ignore_index:
             if axis == 1:
                 result = cudf.DataFrame._from_data(
@@ -290,6 +335,15 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
                 result = cudf.DataFrame._from_data(
                     data, index=obj.index.copy(deep=True)
                 )
+                if keys is not None:
+                    if isinstance(result, cudf.DataFrame):
+                        k = keys[0]
+                        result.columns = cudf.MultiIndex.from_tuples(
+                            [
+                                (k, *c) if isinstance(c, tuple) else (k, c)
+                                for c in result._column_names
+                            ]
+                        )
 
         if isinstance(result, cudf.Series) and axis == 0:
             # sort has no effect for series concatted along axis 0
@@ -297,27 +351,9 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
         else:
             return result.sort_index(axis=(1 - axis)) if sort else result
 
-    # Retrieve the base types of `objs`. In order to support sub-types
-    # and object wrappers, we use `isinstance()` instead of comparing
-    # types directly
-    typs = set()
-    for o in objs:
-        if isinstance(o, cudf.MultiIndex):
-            typs.add(cudf.MultiIndex)
-        elif isinstance(o, cudf.BaseIndex):
-            typs.add(type(o))
-        elif isinstance(o, cudf.DataFrame):
-            typs.add(cudf.DataFrame)
-        elif isinstance(o, cudf.Series):
-            typs.add(cudf.Series)
-        else:
-            raise TypeError(f"cannot concatenate object of type {type(o)}")
-
-    allowed_typs = {cudf.Series, cudf.DataFrame}
-
     # when axis is 1 (column) we can concat with Series and Dataframes
     if axis == 1:
-        if not typs.issubset(allowed_typs):
+        if not all(isinstance(o, (cudf.Series, cudf.DataFrame)) for o in objs):
             raise TypeError(
                 "Can only concatenate Series and DataFrame objects when axis=1"
             )
@@ -353,35 +389,71 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
             objs = _align_objs(objs, how=join, sort=sort)
             df.index = objs[0].index
 
-        for o in objs:
-            for name, col in o._data.items():
-                if name in df._data:
-                    raise NotImplementedError(
-                        f"A Column with duplicate name found: {name}, cuDF "
-                        f"doesn't support having multiple columns with "
-                        f"same names yet."
-                    )
-                if empty_inner:
-                    # if join is inner and it contains an empty df
-                    # we return an empty df, hence creating an empty
-                    # column with dtype metadata retained.
-                    df[name] = cudf.core.column.column_empty_like(
-                        col, newsize=0
-                    )
-                else:
-                    df[name] = col
-
-        result_columns = (
-            objs[0]
-            ._data.to_pandas_index()
-            .append([obj._data.to_pandas_index() for obj in objs[1:]])
-        )
+        if keys is None:
+            for o in objs:
+                for name, col in o._data.items():
+                    if name in df._data:
+                        raise NotImplementedError(
+                            f"A Column with duplicate name found: {name}, cuDF "
+                            f"doesn't support having multiple columns with "
+                            f"same names yet."
+                        )
+                    if empty_inner:
+                        # if join is inner and it contains an empty df
+                        # we return an empty df, hence creating an empty
+                        # column with dtype metadata retained.
+                        df[name] = cudf.core.column.column_empty_like(
+                            col, newsize=0
+                        )
+                    else:
+                        df[name] = col
+
+            result_columns = (
+                objs[0]
+                ._data.to_pandas_index()
+                .append([obj._data.to_pandas_index() for obj in objs[1:]])
+                .unique()
+            )
 
-        if ignore_index:
-            # with ignore_index the column names change to numbers
-            df.columns = pd.RangeIndex(len(result_columns.unique()))
+        # need to create a MultiIndex column
         else:
+            # All levels in the multiindex label must have the same type
+            has_multiple_level_types = (
+                len({type(name) for o in objs for name in o._data.keys()}) > 1
+            )
+            if has_multiple_level_types:
+                raise NotImplementedError(
+                    "Cannot construct a MultiIndex column with multiple "
+                    "label types in cuDF at this time. You must convert "
+                    "the labels to the same type."
+                )
+            for k, o in zip(keys, objs):
+                for name, col in o._data.items():
+                    # if only series, then only keep keys as column labels
+                    # if the existing column is multiindex, prepend it
+                    # to handle cases where dfs and srs are concatenated
+                    if only_series:
+                        col_label = k
+                    elif isinstance(name, tuple):
+                        col_label = (k, *name)
+                    else:
+                        col_label = (k, name)
+                    if empty_inner:
+                        df[col_label] = cudf.core.column.column_empty_like(
+                            col, newsize=0
+                        )
+                    else:
+                        df[col_label] = col
+
+        if keys is None:
             df.columns = result_columns.unique()
+            if ignore_index:
+                df.columns = cudf.RangeIndex(len(result_columns.unique()))
+        elif ignore_index:
+            # number the columns; result_columns is unset when keys are given
+            df.columns = cudf.RangeIndex(len(df._column_names))
+        elif not only_series:
+            df.columns = cudf.MultiIndex.from_tuples(df._column_names)
 
         if empty_inner:
             # if join is inner and it contains an empty df
@@ -391,18 +463,10 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
         return df
 
     # If we get here, we are always concatenating along axis 0 (the rows).
-    typ = list(typs)[0]
-    if len(typs) > 1:
-        if allowed_typs == typs:
-            # This block of code will run when `objs` has
-            # both Series & DataFrame kind of inputs.
-            _normalize_series_and_dataframe(objs, axis=axis)
-            typ = cudf.DataFrame
-        else:
-            raise TypeError(
-                f"`concat` cannot concatenate objects of "
-                f"types: {sorted([t.__name__ for t in typs])}."
-            )
+    typ = type(objs[0])
+    if len({type(o) for o in objs}) > 1:
+        _normalize_series_and_dataframe(objs, axis=axis)
+        typ = cudf.DataFrame
 
     if typ is cudf.DataFrame:
         old_objs = objs
diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py
index f7d05e53ce7..29460d8c67e 100644
--- a/python/cudf/cudf/core/scalar.py
+++ b/python/cudf/cudf/core/scalar.py
@@ -223,6 +223,9 @@ def _preprocess_host_value(self, value, dtype):
 
         if dtype is None:
             if not valid:
+                if value is NaT:
+                    value = value.to_numpy()
+
                 if isinstance(value, (np.datetime64, np.timedelta64)):
                     unit, _ = np.datetime_data(value)
                     if unit == "generic":
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 275dc664175..63a49a898f4 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -39,11 +39,9 @@
     _is_scalar_or_zero_d_array,
     is_bool_dtype,
     is_dict_like,
-    is_float_dtype,
     is_integer,
     is_integer_dtype,
     is_scalar,
-    is_string_dtype,
 )
 from cudf.core import indexing_utils
 from cudf.core._compat import PANDAS_LT_300
@@ -205,19 +203,10 @@ def __setitem__(self, key, value):
         if is_scalar(value):
             value = to_cudf_compatible_scalar(value)
             if (
-                not isinstance(
-                    self._frame._column,
-                    (
-                        cudf.core.column.DatetimeColumn,
-                        cudf.core.column.TimeDeltaColumn,
-                    ),
-                )
+                self._frame.dtype.kind not in "mM"
                 and cudf.utils.utils._isnat(value)
                 and not (
-                    isinstance(
-                        self._frame._column, cudf.core.column.StringColumn
-                    )
-                    and isinstance(value, str)
+                    self._frame.dtype == "object" and isinstance(value, str)
                 )
             ):
                 raise MixedTypeError(
@@ -226,14 +215,10 @@ def __setitem__(self, key, value):
                 )
             elif (
                 not (
-                    is_float_dtype(self._frame._column.dtype)
+                    self._frame.dtype.kind == "f"
                     or (
-                        isinstance(
-                            self._frame._column.dtype, cudf.CategoricalDtype
-                        )
-                        and is_float_dtype(
-                            self._frame._column.dtype.categories.dtype
-                        )
+                        isinstance(self._frame.dtype, cudf.CategoricalDtype)
+                        and self._frame.dtype.categories.dtype.kind == "f"
                     )
                 )
                 and isinstance(value, (np.float32, np.float64))
@@ -241,40 +226,37 @@ def __setitem__(self, key, value):
             ):
                 raise MixedTypeError(
                     f"Cannot assign {value=} to "
-                    f"non-float dtype={self._frame._column.dtype}"
+                    f"non-float dtype={self._frame.dtype}"
                 )
             elif (
-                is_bool_dtype(self._frame._column.dtype)
+                self._frame.dtype.kind == "b"
                 and not is_bool_dtype(value)
                 and value not in {None, cudf.NA}
             ):
                 raise MixedTypeError(
                     f"Cannot assign {value=} to "
-                    f"bool dtype={self._frame._column.dtype}"
+                    f"bool dtype={self._frame.dtype}"
                 )
         elif not (
             isinstance(value, (list, dict))
             and isinstance(
-                self._frame._column.dtype, (cudf.ListDtype, cudf.StructDtype)
+                self._frame.dtype, (cudf.ListDtype, cudf.StructDtype)
             )
         ):
             value = as_column(value)
 
         if (
-            (
-                _is_non_decimal_numeric_dtype(self._frame._column.dtype)
-                or is_string_dtype(self._frame._column.dtype)
-            )
+            (self._frame.dtype.kind in "uifb" or self._frame.dtype == "object")
             and hasattr(value, "dtype")
-            and _is_non_decimal_numeric_dtype(value.dtype)
+            and value.dtype.kind in "uifb"
         ):
             # normalize types if necessary:
             # In contrast to Column.__setitem__ (which downcasts the value to
             # the dtype of the column) here we upcast the series to the
             # larger data type mimicking pandas
-            to_dtype = np.result_type(value.dtype, self._frame._column.dtype)
+            to_dtype = np.result_type(value.dtype, self._frame.dtype)
             value = value.astype(to_dtype)
-            if to_dtype != self._frame._column.dtype:
+            if to_dtype != self._frame.dtype:
                 # Do not remove until pandas-3.0 support is added.
                 assert (
                     PANDAS_LT_300
@@ -283,7 +265,7 @@ def __setitem__(self, key, value):
                     f"Setting an item of incompatible dtype is deprecated "
                     "and will raise in a future error of pandas. "
                     f"Value '{value}' has dtype incompatible with "
-                    f"{self._frame._column.dtype}, "
+                    f"{self._frame.dtype}, "
                     "please explicitly cast to a compatible dtype first.",
                     FutureWarning,
                 )
@@ -336,27 +318,27 @@ def __setitem__(self, key, value):
                 and not isinstance(self._frame.index, cudf.MultiIndex)
                 and is_scalar(value)
             ):
-                # TODO: Modifying index in place is bad because
-                # our index are immutable, but columns are not (which
-                # means our index are mutable with internal APIs).
-                # Get rid of the deep copy once columns too are
-                # immutable.
-                idx_copy = self._frame._index.copy(deep=True)
-                if (
-                    isinstance(idx_copy, cudf.RangeIndex)
-                    and isinstance(key, int)
-                    and (key == idx_copy[-1] + idx_copy.step)
-                ):
-                    idx_copy = cudf.RangeIndex(
-                        start=idx_copy.start,
-                        stop=idx_copy.stop + idx_copy.step,
-                        step=idx_copy.step,
-                        name=idx_copy.name,
-                    )
+                idx = self._frame._index
+                if isinstance(idx, cudf.RangeIndex):
+                    if isinstance(key, int) and (key == idx[-1] + idx.step):
+                        idx_copy = cudf.RangeIndex(
+                            start=idx.start,
+                            stop=idx.stop + idx.step,
+                            step=idx.step,
+                            name=idx.name,
+                        )
+                    else:
+                        idx_copy = idx._as_int_index()
+                        _append_new_row_inplace(idx_copy._column, key)
                 else:
-                    if isinstance(idx_copy, cudf.RangeIndex):
-                        idx_copy = idx_copy._as_int_index()
-                    _append_new_row_inplace(idx_copy._values, key)
+                    # TODO: Modifying index in place is bad because
+                    # our index are immutable, but columns are not (which
+                    # means our index are mutable with internal APIs).
+                    # Get rid of the deep copy once columns too are
+                    # immutable.
+                    idx_copy = idx.copy(deep=True)
+                    _append_new_row_inplace(idx_copy._column, key)
+
                 self._frame._index = idx_copy
                 _append_new_row_inplace(self._frame._column, value)
                 return
@@ -1407,34 +1389,23 @@ def __repr__(self):
                     cudf.core.dtypes.DecimalDtype,
                 ),
             )
-        ) or isinstance(
-            preprocess._column,
-            cudf.core.column.timedelta.TimeDeltaColumn,
-        ):
+        ) or preprocess.dtype.kind == "m":
             fill_value = (
                 str(cudf.NaT)
-                if isinstance(
-                    preprocess._column,
-                    (
-                        cudf.core.column.TimeDeltaColumn,
-                        cudf.core.column.DatetimeColumn,
-                    ),
-                )
+                if preprocess.dtype.kind in "mM"
                 else str(cudf.NA)
             )
             output = repr(
                 preprocess.astype("str").fillna(fill_value).to_pandas()
             )
-        elif isinstance(
-            preprocess._column, cudf.core.column.CategoricalColumn
-        ):
+        elif isinstance(preprocess.dtype, cudf.CategoricalDtype):
             min_rows = (
                 height
                 if pd.get_option("display.min_rows") == 0
                 else pd.get_option("display.min_rows")
             )
             show_dimensions = pd.get_option("display.show_dimensions")
-            if preprocess._column.categories.dtype.kind == "f":
+            if preprocess.dtype.categories.dtype.kind == "f":
                 pd_series = (
                     preprocess.astype("str")
                     .to_pandas()
@@ -1461,13 +1432,13 @@ def __repr__(self):
             output = repr(preprocess.to_pandas())
 
         lines = output.split("\n")
-        if isinstance(preprocess._column, cudf.core.column.CategoricalColumn):
+        if isinstance(preprocess.dtype, cudf.CategoricalDtype):
             category_memory = lines[-1]
-            if preprocess._column.categories.dtype.kind == "f":
+            if preprocess.dtype.categories.dtype.kind == "f":
                 category_memory = category_memory.replace("'", "").split(": ")
                 category_memory = (
                     category_memory[0].replace(
-                        "object", preprocess._column.categories.dtype.name
+                        "object", preprocess.dtype.categories.dtype.name
                     )
                     + ": "
                     + category_memory[1]
@@ -3654,7 +3625,9 @@ def pct_change(
     def where(self, cond, other=None, inplace=False):
         result_col = super().where(cond, other, inplace)
         return self._mimic_inplace(
-            self._from_data_like_self({self.name: result_col}),
+            self._from_data_like_self(
+                self._data._from_columns_like_self([result_col])
+            ),
             inplace=inplace,
         )
 
@@ -4753,22 +4726,22 @@ def strftime(self, date_format, *args, **kwargs):
         )
 
     @copy_docstring(DatetimeIndex.tz_localize)
-    def tz_localize(self, tz, ambiguous="NaT", nonexistent="NaT"):
-        from cudf.core._internals.timezones import delocalize, localize
-
-        if tz is None:
-            result_col = delocalize(self.series._column)
-        else:
-            result_col = localize(
-                self.series._column, tz, ambiguous, nonexistent
-            )
+    def tz_localize(
+        self,
+        tz: str | None,
+        ambiguous: Literal["NaT"] = "NaT",
+        nonexistent: Literal["NaT"] = "NaT",
+    ):
+        result_col = self.series._column.tz_localize(
+            tz, ambiguous, nonexistent
+        )
         return Series._from_data(
             data={self.series.name: result_col},
             index=self.series._index,
         )
 
     @copy_docstring(DatetimeIndex.tz_convert)
-    def tz_convert(self, tz):
+    def tz_convert(self, tz: str | None):
         """
         Parameters
         ----------
@@ -4778,12 +4751,7 @@ def tz_convert(self, tz):
             A `tz` of None will convert to UTC and remove the
             timezone information.
         """
-        from cudf.core._internals.timezones import convert
-
-        if tz is None:
-            result_col = self.series._column._utc_time
-        else:
-            result_col = convert(self.series._column, tz)
+        result_col = self.series._column.tz_convert(tz)
         return Series._from_data(
             {self.series.name: result_col}, index=self.series._index
         )
diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index 907f3b586d1..12a1ecc68e0 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -317,9 +317,6 @@ def _process_col(
     format: Optional[str],
     utc: bool,
 ):
-    # Causes circular import
-    from cudf.core._internals.timezones import localize
-
     if col.dtype.kind == "f":
         if unit not in (None, "ns"):
             factor = cudf.Scalar(
@@ -396,7 +393,7 @@ def _process_col(
             f"dtype {col.dtype} cannot be converted to {_unit_dtype_map[unit]}"
         )
     if utc and not isinstance(col.dtype, pd.DatetimeTZDtype):
-        return localize(col, "UTC", ambiguous="NaT", nonexistent="NaT")
+        return col.tz_localize("UTC")
     return col
 
 
@@ -1061,8 +1058,7 @@ def _to_iso_calendar(arg):
         )
     if isinstance(arg, cudf.Index):
         iso_params = [
-            arg._column.as_string_column(arg._values.dtype, fmt)
-            for fmt in formats
+            arg._column.as_string_column(arg.dtype, fmt) for fmt in formats
         ]
         index = arg._column
     elif isinstance(arg.series, cudf.Series):
diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py
index 5ef25a99590..03d07fc3a50 100644
--- a/python/cudf/cudf/io/json.py
+++ b/python/cudf/cudf/io/json.py
@@ -26,6 +26,7 @@ def read_json(
     keep_quotes=False,
     storage_options=None,
     mixed_types_as_string=False,
+    prune_columns=False,
     *args,
     **kwargs,
 ):
@@ -101,6 +102,7 @@ def read_json(
             False,
             keep_quotes,
             mixed_types_as_string,
+            prune_columns,
         )
     else:
         warnings.warn(
diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py
index 3c82d571939..93bef66de4f 100644
--- a/python/cudf/cudf/pandas/_wrappers/pandas.py
+++ b/python/cudf/cudf/pandas/_wrappers/pandas.py
@@ -103,6 +103,49 @@ def __get__(self, obj, cls=None):
             raise AttributeError()
 
 
+def Timestamp_Timedelta__new__(cls, *args, **kwargs):
+    # Build the object through the fast/slow constructor call.
+    # That call takes care of running __init__ as well, so it must be
+    # paired with a removal of the defaulted __init__ that
+    # make_final_proxy_type provides ("__init__": _DELETE).
+    # Timestamp & Timedelta don't always return the same type as self,
+    # hence this custom __new__ is needed.
+    self, _ = _fast_slow_function_call(
+        lambda cls, args, kwargs: cls(*args, **kwargs),
+        cls,
+        args,
+        kwargs,
+    )
+    return self
+
+
+Timedelta = make_final_proxy_type(  # proxy type for pd.Timedelta
+    "Timedelta",
+    _Unusable,
+    pd.Timedelta,
+    fast_to_slow=_Unusable(),
+    slow_to_fast=_Unusable(),
+    additional_attributes={
+        "__hash__": _FastSlowAttribute("__hash__"),
+        "__new__": Timestamp_Timedelta__new__,
+        "__init__": _DELETE,  # the custom __new__ already runs __init__
+    },
+)
+
+
+Timestamp = make_final_proxy_type(  # proxy type for pd.Timestamp
+    "Timestamp",
+    _Unusable,
+    pd.Timestamp,
+    fast_to_slow=_Unusable(),
+    slow_to_fast=_Unusable(),
+    additional_attributes={
+        "__hash__": _FastSlowAttribute("__hash__"),
+        "__new__": Timestamp_Timedelta__new__,
+        "__init__": _DELETE,  # the custom __new__ already runs __init__
+    },
+)
+
 DatetimeProperties = make_intermediate_proxy_type(
     "DatetimeProperties",
     cudf.core.series.DatetimeProperties,
@@ -310,6 +353,18 @@ def Index__new__(cls, *args, **kwargs):
     additional_attributes={"__init__": _DELETE},
 )
 
+NumpyExtensionArray = make_final_proxy_type(  # proxy for the pandas array
+    "NumpyExtensionArray",
+    _Unusable,
+    pd.arrays.NumpyExtensionArray,
+    fast_to_slow=_Unusable(),
+    slow_to_fast=_Unusable(),
+    additional_attributes={  # private attrs wired via _FastSlowAttribute
+        "_ndarray": _FastSlowAttribute("_ndarray"),
+        "_dtype": _FastSlowAttribute("_dtype"),
+    },
+)
+
 TimedeltaArray = make_final_proxy_type(
     "TimedeltaArray",
     _Unusable,
diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py
index 9d8c174b297..835cfa89133 100644
--- a/python/cudf/cudf/pandas/fast_slow_proxy.py
+++ b/python/cudf/cudf/pandas/fast_slow_proxy.py
@@ -1093,7 +1093,7 @@ def _replace_closurevars(
     f: types.FunctionType,
     attribute_name: Literal["_fsproxy_slow", "_fsproxy_fast"],
     seen: Set[int],
-) -> types.FunctionType:
+) -> Callable[..., Any]:
     """
     Return a copy of `f` with its closure variables replaced with
     their corresponding slow (or fast) types.
@@ -1133,12 +1133,11 @@ def _replace_closurevars(
         argdefs=f.__defaults__,
         closure=g_closure,
     )
-    g = functools.update_wrapper(
+    return functools.update_wrapper(
         g,
         f,
         assigned=functools.WRAPPER_ASSIGNMENTS + ("__kwdefaults__",),
     )
-    return g
 
 
 _SPECIAL_METHODS: Set[str] = {
diff --git a/python/cudf/cudf/pylibcudf_tests/test_column_from_device.py b/python/cudf/cudf/pylibcudf_tests/test_column_from_device.py
new file mode 100644
index 00000000000..764720d9de1
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/test_column_from_device.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pytest
+from utils import assert_column_eq
+
+import cudf
+from cudf._lib import pylibcudf as plc
+
+VALID_TYPES = [  # int, uint, float, bool, timestamp and duration types
+    pa.int8(),
+    pa.int16(),
+    pa.int32(),
+    pa.int64(),
+    pa.uint8(),
+    pa.uint16(),
+    pa.uint32(),
+    pa.uint64(),
+    pa.float32(),
+    pa.float64(),
+    pa.bool_(),
+    pa.timestamp("s"),
+    pa.timestamp("ms"),
+    pa.timestamp("us"),
+    pa.timestamp("ns"),
+    pa.duration("s"),
+    pa.duration("ms"),
+    pa.duration("us"),
+    pa.duration("ns"),
+]
+
+
+@pytest.fixture(params=VALID_TYPES, ids=repr)
+def valid_type(request):
+    return request.param  # one VALID_TYPES entry per parametrized run
+
+
+@pytest.fixture
+def valid_column(valid_type):
+    """Three-element, null-free pyarrow array of ``valid_type``."""
+    data = [True, False, True] if valid_type == pa.bool_() else [1, 2, 3]
+    return pa.array(data, type=valid_type)
+
+
+def test_from_cuda_array_interface(valid_column):
+    """A cudf Series converts to a pylibcudf Column with equal contents."""
+    source = cudf.Series(valid_column)
+    result = plc.column.Column.from_cuda_array_interface_obj(source)
+
+    # The arrow array the Series was built from is the expected value.
+    assert_column_eq(result, valid_column)
diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_find.py b/python/cudf/cudf/pylibcudf_tests/test_string_find.py
new file mode 100644
index 00000000000..f44c4af9bfc
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/test_string_find.py
@@ -0,0 +1,262 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pytest
+from utils import assert_column_eq
+
+import cudf._lib.pylibcudf as plc
+
+
+@pytest.fixture(scope="module")
+def pa_data_col():
+    return pa.array(
+        [
+            "abc123",
+            "ABC123",
+            "aBc123",
+            "",
+            " ",
+            None,
+            "a",
+            None,
+            "abc123",
+            "ABC123",
+            "aBc123",
+            "",
+            " ",
+            None,
+            "a",
+            None,
+            "abc123",
+            "ABC123",
+            "aBc123",
+            "",
+            " ",
+            None,
+            "a",
+            None,
+            "abc123",
+            "ABC123",
+            "aBc123",
+            "",
+            " ",
+            None,
+            "a",
+            None,
+            "abc123",
+            "ABC123",
+            "aBc123",
+            "",
+            " ",
+            None,
+            "a",
+            None,
+        ]
+    )
+
+
+@pytest.fixture(scope="module")
+def plc_data_col(pa_data_col):
+    return plc.interop.from_arrow(pa_data_col)
+
+
+@pytest.fixture(scope="module")
+def pa_target_col():
+    return pa.array(
+        [
+            "a",
+            "B",
+            "x",
+            "1",
+            " ",
+            "a",
+            None,
+            None,  # find
+            "a",
+            "B",
+            "x",
+            "1",
+            " ",
+            "a",
+            None,
+            None,  # rfind
+            "ab",
+            "12",
+            "BC",
+            "",
+            " ",
+            "a",
+            None,
+            None,  # contains
+            "ab",
+            "ABC",
+            "AB",
+            "",
+            " ",
+            "a",
+            None,
+            None,  # starts_with
+            "3",
+            "23",
+            "a23",
+            "",
+            " ",
+            "a",
+            None,
+            None,  # ends_with
+        ]
+    )
+
+
+@pytest.fixture(scope="module")
+def plc_target_col(pa_target_col):
+    return plc.interop.from_arrow(pa_target_col)
+
+
+@pytest.fixture(params=["a", " ", "A", "Ab", "23"], scope="module")
+def pa_target_scalar(request):
+    return pa.scalar(request.param, type=pa.string())
+
+
+@pytest.fixture(scope="module")
+def plc_target_scalar(pa_target_scalar):
+    return plc.interop.from_arrow(pa_target_scalar)
+
+
+def test_find(pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar):
+    got = plc.strings.find.find(plc_data_col, plc_target_scalar, 0, -1)
+
+    expected = pa.array(
+        [
+            elem.find(pa_target_scalar.as_py()) if elem is not None else None
+            for elem in pa_data_col.to_pylist()
+        ],
+        type=pa.int32(),
+    )
+
+    assert_column_eq(got, expected)
+
+
+def colwise_apply(pa_data_col, pa_target_col, operator):
+    def handle_none(st, target):
+        # Match libcudf handling of nulls
+        if st is None:
+            return None
+        elif target is None:
+            return False
+        else:
+            return operator(st, target)
+
+    expected = pa.array(
+        [
+            handle_none(elem, target)
+            for elem, target in zip(
+                pa_data_col.to_pylist(),
+                pa_target_col.to_pylist(),
+            )
+        ],
+        type=pa.bool_(),
+    )
+
+    return expected
+
+
+def test_find_column(pa_data_col, pa_target_col, plc_data_col, plc_target_col):
+    expected = pa.array(
+        [
+            elem.find(target) if not (elem is None or target is None) else None
+            for elem, target in zip(
+                pa_data_col.to_pylist(),
+                pa_target_col.to_pylist(),
+            )
+        ],
+        type=pa.int32(),
+    )
+
+    got = plc.strings.find.find(plc_data_col, plc_target_col, 0)
+    assert_column_eq(got, expected)
+
+
+def test_rfind(pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar):
+    py_target = pa_target_scalar.as_py()
+
+    got = plc.strings.find.rfind(plc_data_col, plc_target_scalar, 0, -1)
+
+    expected = pa.array(
+        [
+            elem.rfind(py_target)
+            if not (elem is None or py_target is None)
+            else None
+            for elem in pa_data_col.to_pylist()
+        ],
+        type=pa.int32(),
+    )
+
+    assert_column_eq(got, expected)
+
+
+def test_contains(
+    pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar
+):
+    py_target = pa_target_scalar.as_py()
+
+    got = plc.strings.find.contains(plc_data_col, plc_target_scalar)
+    expected = pa.array(
+        [
+            py_target in elem
+            if not (elem is None or py_target is None)
+            else None
+            for elem in pa_data_col.to_pylist()
+        ],
+        type=pa.bool_(),
+    )
+
+    assert_column_eq(got, expected)
+
+
+def test_contains_column(
+    pa_data_col, pa_target_col, plc_data_col, plc_target_col
+):
+    expected = colwise_apply(
+        pa_data_col, pa_target_col, lambda st, target: target in st
+    )
+    got = plc.strings.find.contains(plc_data_col, plc_target_col)
+    assert_column_eq(got, expected)
+
+
+def test_starts_with(
+    pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar
+):
+    py_target = pa_target_scalar.as_py()
+    got = plc.strings.find.starts_with(plc_data_col, plc_target_scalar)
+    expected = pa.compute.starts_with(pa_data_col, py_target)
+    assert_column_eq(got, expected)
+
+
+def test_starts_with_column(
+    pa_data_col, pa_target_col, plc_data_col, plc_target_col
+):
+    expected = colwise_apply(
+        pa_data_col, pa_target_col, lambda st, target: st.startswith(target)
+    )
+    got = plc.strings.find.starts_with(plc_data_col, plc_target_col)
+    assert_column_eq(got, expected)
+
+
+def test_ends_with(
+    pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar
+):
+    py_target = pa_target_scalar.as_py()
+    got = plc.strings.find.ends_with(plc_data_col, plc_target_scalar)
+    expected = pa.compute.ends_with(pa_data_col, py_target)
+    assert_column_eq(got, expected)
+
+
+def test_ends_with_column(
+    pa_data_col, pa_target_col, plc_data_col, plc_target_col
+):
+    expected = colwise_apply(
+        pa_data_col, pa_target_col, lambda st, target: st.endswith(target)
+    )
+    got = plc.strings.find.ends_with(plc_data_col, plc_target_col)
+    assert_column_eq(got, expected)
diff --git a/python/cudf/cudf/tests/series/test_datetimelike.py b/python/cudf/cudf/tests/series/test_datetimelike.py
index 6ee339ee3ea..7ef55761b2b 100644
--- a/python/cudf/cudf/tests/series/test_datetimelike.py
+++ b/python/cudf/cudf/tests/series/test_datetimelike.py
@@ -218,3 +218,8 @@ def test_contains_tz_aware(item, expected):
     dti = cudf.date_range("2020", periods=2, freq="D").tz_localize("UTC")
     result = item in dti
     assert result == expected
+
+
+def test_tz_convert_naive_typeerror():
+    with pytest.raises(TypeError):
+        cudf.date_range("2020", periods=2, freq="D").tz_convert(None)
diff --git a/python/cudf/cudf/tests/test_api_types.py b/python/cudf/cudf/tests/test_api_types.py
index 9436d65e0b7..4abe210c6ea 100644
--- a/python/cudf/cudf/tests/test_api_types.py
+++ b/python/cudf/cudf/tests/test_api_types.py
@@ -33,7 +33,6 @@
         (np.float64, False),
         (np.complex128, False),
         (np.str_, False),
-        (np.unicode_, False),
         (np.datetime64, False),
         (np.timedelta64, False),
         # NumPy scalars.
@@ -42,7 +41,6 @@
         (np.float64(), False),
         (np.complex128(), False),
         (np.str_(), False),
-        (np.unicode_(), False),
         (np.datetime64(), False),
         (np.timedelta64(), False),
         # NumPy dtype objects.
@@ -61,7 +59,6 @@
         (np.array([], dtype=np.float64), False),
         (np.array([], dtype=np.complex128), False),
         (np.array([], dtype=np.str_), False),
-        (np.array([], dtype=np.unicode_), False),
         (np.array([], dtype=np.datetime64), False),
         (np.array([], dtype=np.timedelta64), False),
         (np.array([], dtype=object), False),
@@ -142,7 +139,6 @@ def test_is_categorical_dtype(obj, expect):
         (np.float64, True),
         (np.complex128, True),
         (np.str_, False),
-        (np.unicode_, False),
         (np.datetime64, False),
         (np.timedelta64, False),
         # NumPy scalars.
@@ -151,7 +147,6 @@ def test_is_categorical_dtype(obj, expect):
         (np.float64(), True),
         (np.complex128(), True),
         (np.str_(), False),
-        (np.unicode_(), False),
         (np.datetime64(), False),
         (np.timedelta64(), False),
         # NumPy dtype objects.
@@ -170,7 +165,6 @@ def test_is_categorical_dtype(obj, expect):
         (np.array([], dtype=np.float64), True),
         (np.array([], dtype=np.complex128), True),
         (np.array([], dtype=np.str_), False),
-        (np.array([], dtype=np.unicode_), False),
         (np.array([], dtype=np.datetime64), False),
         (np.array([], dtype=np.timedelta64), False),
         (np.array([], dtype=object), False),
@@ -247,7 +241,6 @@ def test_is_numeric_dtype(obj, expect):
         (np.float64, False),
         (np.complex128, False),
         (np.str_, False),
-        (np.unicode_, False),
         (np.datetime64, False),
         (np.timedelta64, False),
         # NumPy scalars.
@@ -256,7 +249,6 @@ def test_is_numeric_dtype(obj, expect):
         (np.float64(), False),
         (np.complex128(), False),
         (np.str_(), False),
-        (np.unicode_(), False),
         (np.datetime64(), False),
         (np.timedelta64(), False),
         # NumPy dtype objects.
@@ -275,7 +267,6 @@ def test_is_numeric_dtype(obj, expect):
         (np.array([], dtype=np.float64), False),
         (np.array([], dtype=np.complex128), False),
         (np.array([], dtype=np.str_), False),
-        (np.array([], dtype=np.unicode_), False),
         (np.array([], dtype=np.datetime64), False),
         (np.array([], dtype=np.timedelta64), False),
         (np.array([], dtype=object), False),
@@ -352,7 +343,6 @@ def test_is_integer_dtype(obj, expect):
         (np.float64, False),
         (np.complex128, False),
         (np.str_, False),
-        (np.unicode_, False),
         (np.datetime64, False),
         (np.timedelta64, False),
         # NumPy scalars.
@@ -361,7 +351,6 @@ def test_is_integer_dtype(obj, expect):
         (np.float64(), False),
         (np.complex128(), False),
         (np.str_(), False),
-        (np.unicode_(), False),
         (np.datetime64(), False),
         (np.timedelta64(), False),
         # NumPy dtype objects.
@@ -380,7 +369,6 @@ def test_is_integer_dtype(obj, expect):
         (np.array([], dtype=np.float64), False),
         (np.array([], dtype=np.complex128), False),
         (np.array([], dtype=np.str_), False),
-        (np.array([], dtype=np.unicode_), False),
         (np.array([], dtype=np.datetime64), False),
         (np.array([], dtype=np.timedelta64), False),
         (np.array([], dtype=object), False),
@@ -458,7 +446,6 @@ def test_is_integer(obj, expect):
         (np.float64, False),
         (np.complex128, False),
         (np.str_, True),
-        (np.unicode_, True),
         (np.datetime64, False),
         (np.timedelta64, False),
         # NumPy scalars.
@@ -467,7 +454,6 @@ def test_is_integer(obj, expect):
         (np.float64(), False),
         (np.complex128(), False),
         (np.str_(), True),
-        (np.unicode_(), True),
         (np.datetime64(), False),
         (np.timedelta64(), False),
         # NumPy dtype objects.
@@ -486,7 +472,6 @@ def test_is_integer(obj, expect):
         (np.array([], dtype=np.float64), False),
         (np.array([], dtype=np.complex128), False),
         (np.array([], dtype=np.str_), True),
-        (np.array([], dtype=np.unicode_), True),
         (np.array([], dtype=np.datetime64), False),
         (np.array([], dtype=np.timedelta64), False),
         # (np.array([], dtype=object), False),
@@ -577,7 +562,6 @@ def test_is_string_dtype(obj, expect):
         (np.float64, False),
         (np.complex128, False),
         (np.str_, False),
-        (np.unicode_, False),
         (np.datetime64, True),
         (np.timedelta64, False),
         # NumPy scalars.
@@ -586,7 +570,6 @@ def test_is_string_dtype(obj, expect):
         (np.float64(), False),
         (np.complex128(), False),
         (np.str_(), False),
-        (np.unicode_(), False),
         (np.datetime64(), True),
         (np.timedelta64(), False),
         # NumPy dtype objects.
@@ -605,7 +588,6 @@ def test_is_string_dtype(obj, expect):
         (np.array([], dtype=np.float64), False),
         (np.array([], dtype=np.complex128), False),
         (np.array([], dtype=np.str_), False),
-        (np.array([], dtype=np.unicode_), False),
         (np.array([], dtype=np.datetime64), True),
         (np.array([], dtype=np.timedelta64), False),
         (np.array([], dtype=object), False),
@@ -682,7 +664,6 @@ def test_is_datetime_dtype(obj, expect):
         (np.float64, False),
         (np.complex128, False),
         (np.str_, False),
-        (np.unicode_, False),
         (np.datetime64, False),
         (np.timedelta64, False),
         # NumPy scalars.
@@ -691,7 +672,6 @@ def test_is_datetime_dtype(obj, expect):
         (np.float64(), False),
         (np.complex128(), False),
         (np.str_(), False),
-        (np.unicode_(), False),
         (np.datetime64(), False),
         (np.timedelta64(), False),
         # NumPy dtype objects.
@@ -710,7 +690,6 @@ def test_is_datetime_dtype(obj, expect):
         (np.array([], dtype=np.float64), False),
         (np.array([], dtype=np.complex128), False),
         (np.array([], dtype=np.str_), False),
-        (np.array([], dtype=np.unicode_), False),
         (np.array([], dtype=np.datetime64), False),
         (np.array([], dtype=np.timedelta64), False),
         (np.array([], dtype=object), False),
@@ -787,7 +766,6 @@ def test_is_list_dtype(obj, expect):
         (np.float64, False),
         (np.complex128, False),
         (np.str_, False),
-        (np.unicode_, False),
         (np.datetime64, False),
         (np.timedelta64, False),
         # NumPy scalars.
@@ -796,7 +774,6 @@ def test_is_list_dtype(obj, expect):
         (np.float64(), False),
         (np.complex128(), False),
         (np.str_(), False),
-        (np.unicode_(), False),
         (np.datetime64(), False),
         (np.timedelta64(), False),
         # NumPy dtype objects.
@@ -815,7 +792,6 @@ def test_is_list_dtype(obj, expect):
         (np.array([], dtype=np.float64), False),
         (np.array([], dtype=np.complex128), False),
         (np.array([], dtype=np.str_), False),
-        (np.array([], dtype=np.unicode_), False),
         (np.array([], dtype=np.datetime64), False),
         (np.array([], dtype=np.timedelta64), False),
         (np.array([], dtype=object), False),
@@ -895,7 +871,6 @@ def test_is_struct_dtype(obj, expect):
         (np.float64, False),
         (np.complex128, False),
         (np.str_, False),
-        (np.unicode_, False),
         (np.datetime64, False),
         (np.timedelta64, False),
         # NumPy scalars.
@@ -904,7 +879,6 @@ def test_is_struct_dtype(obj, expect):
         (np.float64(), False),
         (np.complex128(), False),
         (np.str_(), False),
-        (np.unicode_(), False),
         (np.datetime64(), False),
         (np.timedelta64(), False),
         # NumPy dtype objects.
@@ -923,7 +897,6 @@ def test_is_struct_dtype(obj, expect):
         (np.array([], dtype=np.float64), False),
         (np.array([], dtype=np.complex128), False),
         (np.array([], dtype=np.str_), False),
-        (np.array([], dtype=np.unicode_), False),
         (np.array([], dtype=np.datetime64), False),
         (np.array([], dtype=np.timedelta64), False),
         (np.array([], dtype=object), False),
@@ -1004,7 +977,6 @@ def test_is_decimal_dtype(obj, expect):
         np.float64,
         np.complex128,
         np.str_,
-        np.unicode_,
         np.datetime64,
         np.timedelta64,
         # NumPy scalars.
@@ -1013,7 +985,6 @@ def test_is_decimal_dtype(obj, expect):
         np.float64(),
         np.complex128(),
         np.str_(),
-        np.unicode_(),
         np.datetime64(),
         np.timedelta64(),
         # NumPy dtype objects.
@@ -1032,7 +1003,6 @@ def test_is_decimal_dtype(obj, expect):
         np.array([], dtype=np.float64),
         np.array([], dtype=np.complex128),
         np.array([], dtype=np.str_),
-        np.array([], dtype=np.unicode_),
         np.array([], dtype=np.datetime64),
         np.array([], dtype=np.timedelta64),
         np.array([], dtype=object),
@@ -1088,7 +1058,6 @@ def test_pandas_agreement(obj):
         np.float64,
         np.complex128,
         np.str_,
-        np.unicode_,
         np.datetime64,
         np.timedelta64,
         # NumPy scalars.
@@ -1097,7 +1066,6 @@ def test_pandas_agreement(obj):
         np.float64(),
         np.complex128(),
         np.str_(),
-        np.unicode_(),
         np.datetime64(),
         np.timedelta64(),
         # NumPy dtype objects.
@@ -1116,7 +1084,6 @@ def test_pandas_agreement(obj):
         np.array([], dtype=np.float64),
         np.array([], dtype=np.complex128),
         np.array([], dtype=np.str_),
-        np.array([], dtype=np.unicode_),
         np.array([], dtype=np.datetime64),
         np.array([], dtype=np.timedelta64),
         np.array([], dtype=object),
diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py
index e21fd53bee4..07ce81e3c39 100644
--- a/python/cudf/cudf/tests/test_categorical.py
+++ b/python/cudf/cudf/tests/test_categorical.py
@@ -460,7 +460,7 @@ def test_categorical_dataframe_slice_copy():
         pd.Series(["1.0", "2.5", "3.001", None, "9"], dtype="category"),
         pd.Series(["a", "b", "c", "c", "b", "a", "b", "b"]),
         pd.Series(["aa", "b", "c", "c", "bb", "bb", "a", "b", "b"]),
-        pd.Series([1, 2, 3, 89, None, np.nan, np.NaN], dtype="float64"),
+        pd.Series([1, 2, 3, 89, None, np.nan, np.nan], dtype="float64"),
         pd.Series([1, 2, 3, 89], dtype="float64"),
         pd.Series([1, 2.5, 3.001, 89], dtype="float64"),
         pd.Series([None, None, None]),
@@ -493,7 +493,7 @@ def test_categorical_typecast(data, cat_type):
         pd.Series([1, 2, 3, 89]),
         pd.Series(["a", "b", "c", "c", "b", "a", "b", "b"]),
         pd.Series(["aa", "b", "c", "c", "bb", "bb", "a", "b", "b"]),
-        pd.Series([1, 2, 3, 89, None, np.nan, np.NaN], dtype="float64"),
+        pd.Series([1, 2, 3, 89, None, np.nan, np.nan], dtype="float64"),
         pd.Series([1, 2, 3, 89], dtype="float64"),
         pd.Series([1, 2.5, 3.001, 89], dtype="float64"),
         pd.Series([None, None, None]),
@@ -859,3 +859,19 @@ def test_cat_from_scalar(scalar):
     gs = cudf.Series(scalar, dtype="category")
 
     assert_eq(ps, gs)
+
+
+def test_cat_groupby_fillna():
+    ps = pd.Series(["a", "b", "c"], dtype="category")
+    gs = cudf.from_pandas(ps)
+
+    with pytest.warns(FutureWarning):
+        pg = ps.groupby(ps)
+    gg = gs.groupby(gs)
+
+    assert_exceptions_equal(
+        lfunc=pg.fillna,
+        rfunc=gg.fillna,
+        lfunc_args_and_kwargs=(("d",), {}),
+        rfunc_args_and_kwargs=(("d",), {}),
+    )
diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py
index 87b3beb5589..4b43a33c8c8 100644
--- a/python/cudf/cudf/tests/test_concat.py
+++ b/python/cudf/cudf/tests/test_concat.py
@@ -218,7 +218,8 @@ def test_concat_columns(axis):
     assert_eq(expect, got, check_index_type=True)
 
 
-def test_concat_multiindex_dataframe():
+@pytest.mark.parametrize("axis", [0, 1])
+def test_concat_multiindex_dataframe(axis):
     gdf = cudf.DataFrame(
         {
             "w": np.arange(4),
@@ -233,14 +234,11 @@ def test_concat_multiindex_dataframe():
     pdg2 = pdg.iloc[:, 1:]
     gdg1 = cudf.from_pandas(pdg1)
     gdg2 = cudf.from_pandas(pdg2)
+    expected = pd.concat([pdg1, pdg2], axis=axis)
+    result = cudf.concat([gdg1, gdg2], axis=axis)
     assert_eq(
-        cudf.concat([gdg1, gdg2]).astype("float64"),
-        pd.concat([pdg1, pdg2]),
-        check_index_type=True,
-    )
-    assert_eq(
-        cudf.concat([gdg1, gdg2], axis=1),
-        pd.concat([pdg1, pdg2], axis=1),
+        expected,
+        result,
         check_index_type=True,
     )
 
@@ -1865,3 +1863,137 @@ def test_concat_mixed_list_types_error(s1, s2):
 
     with pytest.raises(NotImplementedError):
         cudf.concat([s1, s2], ignore_index=True)
+
+
+@pytest.mark.parametrize(
+    "axis",
+    [
+        pytest.param(
+            0,
+            marks=pytest.mark.xfail(
+                reason="concat dictionaries with axis=0 not implemented"
+            ),
+        ),
+        1,
+        "columns",
+    ],
+)
+@pytest.mark.parametrize(
+    "d",
+    [
+        {"first": (cudf.DataFrame, {"data": {"A": [1, 2], "B": [3, 4]}})},
+        {
+            "first": (cudf.DataFrame, {"data": {"A": [1, 2], "B": [3, 4]}}),
+            "second": (cudf.DataFrame, {"data": {"A": [5, 6], "B": [7, 8]}}),
+            "third": (cudf.DataFrame, {"data": {"C": [1, 2, 3]}}),
+        },
+        {
+            "first": (cudf.DataFrame, {"data": {"A": [1, 2], "B": [3, 4]}}),
+            "second": (cudf.DataFrame, {"data": {"C": [1, 2, 3]}}),
+            "third": (cudf.DataFrame, {"data": {"A": [5, 6], "B": [7, 8]}}),
+        },
+        {
+            "first": (cudf.DataFrame, {"data": {"A": [1, 2], "B": [3, 4]}}),
+            "second": (cudf.DataFrame, {"data": {"C": [1, 2, 3]}}),
+            "third": (cudf.DataFrame, {"data": {"A": [5, 6], "C": [7, 8]}}),
+            "fourth": (cudf.DataFrame, {"data": {"B": [9, 10]}}),
+        },
+        pytest.param(
+            {
+                "first": (cudf.DataFrame, {"data": {2.0: [1, 1]}}),
+                "second": (cudf.DataFrame, {"data": {"test": ["abc", "def"]}}),
+            },
+            marks=pytest.mark.xfail(
+                reason=(
+                    "Cannot construct a MultiIndex column with multiple "
+                    "label types in cuDF at this time. You must convert "
+                    "the labels to the same type."
+                )
+            ),
+        ),
+        {
+            "first": (cudf.Series, {"data": [1, 2, 3]}),
+            "second": (cudf.Series, {"data": [4, 5, 6]}),
+        },
+        {
+            "first": (cudf.DataFrame, {"data": {"A": [1, 2], "B": [3, 4]}}),
+            "second": (cudf.Series, {"data": [5, 6], "name": "C"}),
+        },
+        pytest.param(
+            {
+                "first": (
+                    cudf.DataFrame,
+                    {"data": {("A", "B"): [1, 2], "C": [3, 4]}},
+                ),
+                "second": (
+                    cudf.DataFrame,
+                    {"data": {"D": [5, 6], ("A", "B"): [7, 8]}},
+                ),
+            },
+            marks=pytest.mark.xfail(
+                reason=(
+                    "Cannot construct a MultiIndex column with multiple "
+                    "label types in cuDF at this time. You must convert "
+                    "the labels to the same type."
+                )
+            ),
+        ),
+        pytest.param(
+            {
+                "first": (
+                    cudf.DataFrame,
+                    {"data": {("A", "B"): [3, 4], 2.0: [1, 1]}},
+                ),
+                "second": (
+                    cudf.DataFrame,
+                    {"data": {("C", "D"): [3, 4], 3.0: [5, 6]}},
+                ),
+            },
+            marks=pytest.mark.xfail(
+                reason=(
+                    "Cannot construct a MultiIndex column with multiple "
+                    "label types in cuDF at this time. You must convert "
+                    "the labels to the same type."
+                )
+            ),
+        ),
+        {
+            "first": (
+                cudf.DataFrame,
+                {"data": {(1, 2): [1, 2], (3, 4): [3, 4]}},
+            ),
+            "second": (
+                cudf.DataFrame,
+                {"data": {(1, 2): [5, 6], (5, 6): [7, 8]}},
+            ),
+        },
+    ],
+)
+def test_concat_dictionary(d, axis):
+    _dict = {k: c(**v) for k, (c, v) in d.items()}
+    result = cudf.concat(_dict, axis=axis)
+    expected = cudf.from_pandas(
+        pd.concat({k: df.to_pandas() for k, df in _dict.items()}, axis=axis)
+    )
+    assert_eq(expected, result)
+
+
+@pytest.mark.parametrize(
+    "d",
+    [
+        {"first": cudf.Index([1, 2, 3])},
+        {
+            "first": cudf.MultiIndex(
+                levels=[[1, 2], ["blue", "red"]],
+                codes=[[0, 0, 1, 1], [1, 0, 1, 0]],
+            )
+        },
+        {"first": cudf.CategoricalIndex([1, 2, 3])},
+    ],
+)
+def test_concat_dict_incorrect_type_index(d):
+    with pytest.raises(
+        TypeError,
+        match="cannot concatenate a dictionary containing indices",
+    ):
+        cudf.concat(d, axis=1)
diff --git a/python/cudf/cudf/tests/test_cuda_array_interface.py b/python/cudf/cudf/tests/test_cuda_array_interface.py
index 213c6c2c1f9..f98c3ad0475 100644
--- a/python/cudf/cudf/tests/test_cuda_array_interface.py
+++ b/python/cudf/cudf/tests/test_cuda_array_interface.py
@@ -11,7 +11,12 @@
 
 import cudf
 from cudf.core.buffer.spill_manager import get_global_manager
-from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq
+from cudf.testing._utils import (
+    DATETIME_TYPES,
+    NUMERIC_TYPES,
+    TIMEDELTA_TYPES,
+    assert_eq,
+)
 
 
 @pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES)
@@ -42,7 +47,9 @@ def test_cuda_array_interface_interop_in(dtype, module):
         assert_eq(pd_data, gdf["test"])
 
 
-@pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES + ["str"])
+@pytest.mark.parametrize(
+    "dtype", NUMERIC_TYPES + DATETIME_TYPES + TIMEDELTA_TYPES + ["str"]
+)
 @pytest.mark.parametrize("module", ["cupy", "numba"])
 def test_cuda_array_interface_interop_out(dtype, module):
     expectation = does_not_raise()
@@ -73,7 +80,9 @@ def to_host_function(x):
         assert_eq(expect, got)
 
 
-@pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES)
+@pytest.mark.parametrize(
+    "dtype", NUMERIC_TYPES + DATETIME_TYPES + TIMEDELTA_TYPES
+)
 @pytest.mark.parametrize("module", ["cupy", "numba"])
 def test_cuda_array_interface_interop_out_masked(dtype, module):
     expectation = does_not_raise()
@@ -104,7 +113,9 @@ def to_host_function(x):
         module_data = module_constructor(cudf_data)  # noqa: F841
 
 
-@pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES)
+@pytest.mark.parametrize(
+    "dtype", NUMERIC_TYPES + DATETIME_TYPES + TIMEDELTA_TYPES
+)
 @pytest.mark.parametrize("nulls", ["all", "some", "bools", "none"])
 @pytest.mark.parametrize("mask_type", ["bits", "bools"])
 def test_cuda_array_interface_as_column(dtype, nulls, mask_type):
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 2527f9356f7..c3af0106cc7 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -5207,20 +5207,20 @@ def test_df_constructor_dtype(dtype):
         cudf.DataFrame(
             {
                 "a": [1, 2, 3, 4],
-                "b": [7, np.NaN, 9, 10],
+                "b": [7, np.nan, 9, 10],
                 "c": cudf.Series(
-                    [np.NaN, np.NaN, np.NaN, np.NaN], nan_as_null=False
+                    [np.nan, np.nan, np.nan, np.nan], nan_as_null=False
                 ),
                 "d": cudf.Series([None, None, None, None], dtype="int64"),
                 "e": [100, None, 200, None],
-                "f": cudf.Series([10, None, np.NaN, 11], nan_as_null=False),
+                "f": cudf.Series([10, None, np.nan, 11], nan_as_null=False),
             }
         ),
         cudf.DataFrame(
             {
                 "a": [10, 11, 12, 13, 14, 15],
                 "b": cudf.Series(
-                    [10, None, np.NaN, 2234, None, np.NaN], nan_as_null=False
+                    [10, None, np.nan, 2234, None, np.nan], nan_as_null=False
                 ),
             }
         ),
@@ -5272,11 +5272,11 @@ def test_rowwise_ops_nullable_dtypes_all_null(op):
     gdf = cudf.DataFrame(
         {
             "a": [1, 2, 3, 4],
-            "b": [7, np.NaN, 9, 10],
-            "c": cudf.Series([np.NaN, np.NaN, np.NaN, np.NaN], dtype=float),
+            "b": [7, np.nan, 9, 10],
+            "c": cudf.Series([np.nan, np.nan, np.nan, np.nan], dtype=float),
             "d": cudf.Series([None, None, None, None], dtype="int64"),
             "e": [100, None, 200, None],
-            "f": cudf.Series([10, None, np.NaN, 11], nan_as_null=False),
+            "f": cudf.Series([10, None, np.nan, 11], nan_as_null=False),
         }
     )
 
@@ -5308,7 +5308,7 @@ def test_rowwise_ops_nullable_dtypes_partial_null(op):
         {
             "a": [10, 11, 12, 13, 14, 15],
             "b": cudf.Series(
-                [10, None, np.NaN, 2234, None, np.NaN],
+                [10, None, np.nan, 2234, None, np.nan],
                 nan_as_null=False,
             ),
         }
@@ -10994,3 +10994,23 @@ def test_squeeze(axis, data):
     result = df.squeeze(axis=axis)
     expected = df.to_pandas().squeeze(axis=axis)
     assert_eq(result, expected)
+
+
+@pytest.mark.parametrize("column", [range(1), np.array([1], dtype=np.int8)])
+@pytest.mark.parametrize(
+    "operation",
+    [
+        lambda df: df.where(df < 2, 2),
+        lambda df: df.nans_to_nulls(),
+        lambda df: df.isna(),
+        lambda df: df.notna(),
+        lambda df: abs(df),
+        lambda df: -df,
+        lambda df: ~df,
+    ],
+)
+def test_op_preserves_column_metadata(column, operation):
+    df = cudf.DataFrame([1], columns=cudf.Index(column))
+    result = operation(df).columns
+    expected = pd.Index(column)
+    pd.testing.assert_index_equal(result, expected, exact=True)
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
index 08a7a9148dd..baa839ecd72 100644
--- a/python/cudf/cudf/tests/test_index.py
+++ b/python/cudf/cudf/tests/test_index.py
@@ -1039,7 +1039,9 @@ def test_index_append(data, other):
         (len(data) == 0 or len(other) == 0) and pd_data.dtype != pd_other.dtype
     ):
         expected = pd_data.append(pd_other)
-    with expect_warning_if(len(data) == 0 or len(other) == 0):
+    with expect_warning_if(
+        (len(data) == 0 or len(other) == 0) and gd_data.dtype != gd_other.dtype
+    ):
         actual = gd_data.append(gd_other)
     if len(data) == 0 and len(other) == 0:
         # Pandas default dtype to "object" for empty list
@@ -1237,7 +1239,10 @@ def test_index_append_list(data, other):
         and (any(d.dtype != data.dtype for d in other))
     ):
         expected = pd_data.append(pd_other)
-    with expect_warning_if(len(data) == 0 or any(len(d) == 0 for d in other)):
+    with expect_warning_if(
+        (len(data) == 0 or any(len(d) == 0 for d in other))
+        and (any(d.dtype != data.dtype for d in other))
+    ):
         actual = gd_data.append(gd_other)
 
     assert_eq(expected, actual)
@@ -1736,6 +1741,10 @@ def test_get_indexer_single_unique_numeric(idx, key, method):
 
         assert_eq(expected, got)
 
+        with cudf.option_context("mode.pandas_compatible", True):
+            got = gi.get_indexer(key, method=method)
+        assert_eq(expected, got, check_dtype=True)
+
 
 @pytest.mark.parametrize(
     "idx",
@@ -1765,6 +1774,12 @@ def test_get_indexer_rangeindex(idx, key, method, tolerance):
 
     assert_eq(expected, got)
 
+    with cudf.option_context("mode.pandas_compatible", True):
+        got = gi.get_indexer(
+            key, method=method, tolerance=None if method is None else tolerance
+        )
+    assert_eq(expected, got, check_dtype=True)
+
 
 @pytest.mark.parametrize(
     "idx",
@@ -1945,6 +1960,11 @@ def test_get_indexer_single_duplicate_string(idx, key, method):
 
         assert_eq(expected, got)
 
+        with cudf.option_context("mode.pandas_compatible", True):
+            got = gi.get_indexer(key, method=method)
+
+        assert_eq(expected, got, check_dtype=True)
+
 
 @pytest.mark.parametrize(
     "idx",
@@ -2004,6 +2024,11 @@ def test_get_indexer_multi_numeric(idx, key, method):
 
     assert_eq(expected, got)
 
+    with cudf.option_context("mode.pandas_compatible", True):
+        got = gi.get_indexer(key, method=method)
+
+    assert_eq(expected, got, check_dtype=True)
+
 
 @pytest.mark.parametrize(
     "idx",
@@ -2817,8 +2842,7 @@ def test_index_methods(index, func):
 
     if func == "append":
         expected = pidx.append(other=pidx)
-        with expect_warning_if(len(gidx) == 0):
-            actual = gidx.append(other=gidx)
+        actual = gidx.append(other=gidx)
     else:
         expected = getattr(pidx, func)()
         actual = getattr(gidx, func)()
@@ -3176,3 +3200,52 @@ def test_index_to_pandas_arrow_type(scalar):
     result = idx.to_pandas(arrow_type=True)
     expected = pd.Index(pd.arrays.ArrowExtensionArray(pa_array))
     pd.testing.assert_index_equal(result, expected)
+
+
+@pytest.mark.parametrize("data", [range(-3, 3), range(1, 3), range(0)])
+def test_rangeindex_all(data):
+    result = cudf.RangeIndex(data).all()
+    expected = cudf.Index(list(data)).all()
+    assert result == expected
+
+
+@pytest.mark.parametrize("sort", [True, False])
+@pytest.mark.parametrize("data", [range(2), range(2, -1, -1)])
+def test_rangeindex_factorize(sort, data):
+    res_codes, res_uniques = cudf.RangeIndex(data).factorize(sort=sort)
+    exp_codes, exp_uniques = cudf.Index(list(data)).factorize(sort=sort)
+    assert_eq(res_codes, exp_codes)
+    assert_eq(res_uniques, exp_uniques)
+
+
+def test_rangeindex_dropna():
+    ri = cudf.RangeIndex(range(2))
+    result = ri.dropna()
+    expected = ri.copy()
+    assert_eq(result, expected)
+
+
+@pytest.mark.parametrize("data", [range(2), [10, 11, 12]])
+def test_index_contains_hashable(data):
+    gidx = cudf.Index(data)
+    pidx = gidx.to_pandas()
+
+    assert_exceptions_equal(
+        lambda: [] in gidx,
+        lambda: [] in pidx,
+        lfunc_args_and_kwargs=((),),
+        rfunc_args_and_kwargs=((),),
+    )
+
+
+@pytest.mark.parametrize("data", [[0, 1, 2], [1.1, 2.3, 4.5]])
+@pytest.mark.parametrize("dtype", ["int32", "float32", "float64"])
+@pytest.mark.parametrize("needle", [0, 1, 2.3])
+def test_index_contains_float_int(data, dtype, needle):
+    gidx = cudf.Index(data=data, dtype=dtype)
+    pidx = gidx.to_pandas()
+
+    actual = needle in gidx
+    expected = needle in pidx
+
+    assert_eq(actual, expected)
diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py
index 76a82afb78e..dd731fab8f3 100644
--- a/python/cudf/cudf/tests/test_multiindex.py
+++ b/python/cudf/cudf/tests/test_multiindex.py
@@ -2153,3 +2153,15 @@ def test_index_to_pandas_arrow_type(scalar):
         levels=[pd.arrays.ArrowExtensionArray(pa_array)], codes=[[0]]
     )
     pd.testing.assert_index_equal(result, expected)
+
+
+def test_multi_index_contains_hashable():
+    gidx = cudf.MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3]))
+    pidx = gidx.to_pandas()
+
+    assert_exceptions_equal(
+        lambda: [] in gidx,
+        lambda: [] in pidx,
+        lfunc_args_and_kwargs=((),),
+        rfunc_args_and_kwargs=((),),
+    )
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 56a4281aad9..1e175f5ff0d 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -2,6 +2,7 @@
 
 import datetime
 import glob
+import hashlib
 import math
 import os
 import pathlib
@@ -211,7 +212,7 @@ def make_pdf(nrows, ncolumns=1, nvalids=0, dtype=np.int64):
         # Randomly but reproducibly mark subset of rows as invalid
         random.seed(1337)
         mask = random.sample(range(nrows), nvalids)
-        test_pdf[test_pdf.index.isin(mask)] = np.NaN
+        test_pdf[test_pdf.index.isin(mask)] = np.nan
     if dtype:
         test_pdf = test_pdf.astype(dtype)
 
@@ -1310,8 +1311,19 @@ def test_parquet_delta_byte_array(datadir):
     assert_eq(cudf.read_parquet(fname), pd.read_parquet(fname))
 
 
+# values chosen to exercise:
+#    1 - header only, no bitpacked values
+#    2 - one bitpacked value
+#   23 - one partially filled miniblock
+#   32 - almost full miniblock
+#   33 - one full miniblock
+#   34 - one full miniblock plus one value in new miniblock
+#  128 - almost full block
+#  129 - one full block
+#  130 - one full block plus one value in new block
+# 1000 - multiple blocks
 def delta_num_rows():
-    return [1, 2, 23, 32, 33, 34, 64, 65, 66, 128, 129, 130, 20000, 50000]
+    return [1, 2, 23, 32, 33, 34, 128, 129, 130, 1000]
 
 
 @pytest.mark.parametrize("nrows", delta_num_rows())
@@ -1411,17 +1423,16 @@ def test_delta_byte_array_roundtrip(
     pcdf = cudf.from_pandas(test_pdf)
     assert_eq(cdf, pcdf)
 
-    # Test DELTA_LENGTH_BYTE_ARRAY writing as well
-    if str_encoding == "DELTA_LENGTH_BYTE_ARRAY":
-        cudf_fname = tmpdir.join("cdfdeltaba.parquet")
-        pcdf.to_parquet(
-            cudf_fname,
-            compression="snappy",
-            header_version="2.0",
-            use_dictionary=False,
-        )
-        cdf2 = cudf.from_pandas(pd.read_parquet(cudf_fname))
-        assert_eq(cdf2, cdf)
+    # Write back out with cudf and make sure pyarrow can read it
+    cudf_fname = tmpdir.join("cdfdeltaba.parquet")
+    pcdf.to_parquet(
+        cudf_fname,
+        compression="snappy",
+        header_version="2.0",
+        use_dictionary=False,
+    )
+    cdf2 = cudf.from_pandas(pd.read_parquet(cudf_fname))
+    assert_eq(cdf2, cdf)
 
 
 @pytest.mark.parametrize("nrows", delta_num_rows())
@@ -1478,17 +1489,16 @@ def string_list_gen_wrapped(x, y):
     pcdf = cudf.from_pandas(test_pdf)
     assert_eq(cdf, pcdf)
 
-    # Test DELTA_LENGTH_BYTE_ARRAY writing as well
-    if str_encoding == "DELTA_LENGTH_BYTE_ARRAY":
-        cudf_fname = tmpdir.join("cdfdeltaba.parquet")
-        pcdf.to_parquet(
-            cudf_fname,
-            compression="snappy",
-            header_version="2.0",
-            use_dictionary=False,
-        )
-        cdf2 = cudf.from_pandas(pd.read_parquet(cudf_fname))
-        assert_eq(cdf2, cdf)
+    # Write back out with cudf and make sure pyarrow can read it
+    cudf_fname = tmpdir.join("cdfdeltaba.parquet")
+    pcdf.to_parquet(
+        cudf_fname,
+        compression="snappy",
+        header_version="2.0",
+        use_dictionary=False,
+    )
+    cdf2 = cudf.from_pandas(pd.read_parquet(cudf_fname))
+    assert_eq(cdf2, cdf)
 
 
 @pytest.mark.parametrize(
@@ -2807,6 +2817,24 @@ def test_parquet_reader_fixed_bin(datadir):
     assert_eq(expect, got)
 
 
+def test_parquet_reader_fixed_len_with_dict(tmpdir):
+    def flba(i):
+        hasher = hashlib.sha256()
+        hasher.update(i.to_bytes(4, "little"))
+        return hasher.digest()
+
+    # use pyarrow to write table of fixed_len_byte_array
+    num_rows = 200
+    data = pa.array([flba(i) for i in range(num_rows)], type=pa.binary(32))
+    padf = pa.Table.from_arrays([data], names=["flba"])
+    padf_fname = tmpdir.join("padf.parquet")
+    pq.write_table(padf, padf_fname, use_dictionary=True)
+
+    expect = pd.read_parquet(padf_fname)
+    got = cudf.read_parquet(padf_fname)
+    assert_eq(expect, got)
+
+
 def test_parquet_reader_rle_boolean(datadir):
     fname = datadir / "rle_boolean_encoding.parquet"
 
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index 642dbde3790..6a9de197374 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -2786,3 +2786,31 @@ def test_squeeze(axis, data):
 def test_squeeze_invalid_axis(axis):
     with pytest.raises(ValueError):
         cudf.Series([1]).squeeze(axis=axis)
+
+
+@pytest.mark.parametrize("data", [None, 123, 33243243232423, 0])
+def test_timestamp_series_init(data):
+    scalar = pd.Timestamp(data)
+    expected = pd.Series([scalar])
+    actual = cudf.Series([scalar])
+
+    assert_eq(expected, actual)
+
+    expected = pd.Series(scalar)
+    actual = cudf.Series(scalar)
+
+    assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize("data", [None, 123, 33243243232423, 0])
+def test_timedelta_series_init(data):
+    scalar = pd.Timedelta(data)
+    expected = pd.Series([scalar])
+    actual = cudf.Series([scalar])
+
+    assert_eq(expected, actual)
+
+    expected = pd.Series(scalar)
+    actual = cudf.Series(scalar)
+
+    assert_eq(expected, actual)
diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py
index b9eb42906e8..27811d0fcde 100644
--- a/python/cudf/cudf/tests/test_stats.py
+++ b/python/cudf/cudf/tests/test_stats.py
@@ -507,7 +507,7 @@ def test_df_corr(method):
 @pytest.mark.parametrize(
     "data",
     [
-        [0.0, 1, 3, 6, np.NaN, 7, 5.0, np.nan, 5, 2, 3, -100],
+        [0.0, 1, 3, 6, np.nan, 7, 5.0, np.nan, 5, 2, 3, -100],
         [np.nan] * 3,
         [1, 5, 3],
         [],
@@ -555,7 +555,7 @@ def test_nans_stats(data, ops, skipna):
 @pytest.mark.parametrize(
     "data",
     [
-        [0.0, 1, 3, 6, np.NaN, 7, 5.0, np.nan, 5, 2, 3, -100],
+        [0.0, 1, 3, 6, np.nan, 7, 5.0, np.nan, 5, 2, 3, -100],
         [np.nan] * 3,
         [1, 5, 3],
     ],
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index a33b5ca139c..2aa3129ab30 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -91,6 +91,10 @@
 BOOL_TYPES = {"bool"}
 ALL_TYPES = NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | OTHER_TYPES
 
+# The NumPy scalar types are a bit of a mess as they align with the C types
+# so for now we use the `sctypes` dict (although it was made private in 2.0)
+_NUMPY_SCTYPES = np.sctypes if hasattr(np, "sctypes") else np._core.sctypes
+
 
 def np_to_pa_dtype(dtype):
     """Util to convert numpy dtype to PyArrow dtype."""
@@ -335,7 +339,7 @@ def min_signed_type(x, min_size=8):
     Return the smallest *signed* integer dtype
     that can represent the integer ``x``
     """
-    for int_dtype in np.sctypes["int"]:
+    for int_dtype in _NUMPY_SCTYPES["int"]:
         if (cudf.dtype(int_dtype).itemsize * 8) >= min_size:
             if np.iinfo(int_dtype).min <= x <= np.iinfo(int_dtype).max:
                 return int_dtype
@@ -348,7 +352,7 @@ def min_unsigned_type(x, min_size=8):
     Return the smallest *unsigned* integer dtype
     that can represent the integer ``x``
     """
-    for int_dtype in np.sctypes["uint"]:
+    for int_dtype in _NUMPY_SCTYPES["uint"]:
         if (cudf.dtype(int_dtype).itemsize * 8) >= min_size:
             if 0 <= x <= np.iinfo(int_dtype).max:
                 return int_dtype
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 66e14f4b9de..6bd7558d322 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -692,7 +692,6 @@
 
        This parameter is only supported with ``engine='cudf'``.
 
-    This parameter is only supported in ``cudf`` engine.
     If `True`, any string values are read literally (and wrapped in an
     additional set of quotes).
     If `False` string values are parsed into Python strings.
@@ -703,7 +702,22 @@
     For other URLs (e.g. starting with "s3://", and "gcs://") the key-value
     pairs are forwarded to ``fsspec.open``. Please see ``fsspec`` and
     ``urllib`` for more details.
+mixed_types_as_string : bool, default False
 
+    .. admonition:: GPU-accelerated feature
+
+       This parameter is only supported with ``engine='cudf'``.
+
+    If True, mixed type columns are returned as string columns.
+    If `False`, parsing mixed type columns will throw an error.
+prune_columns : bool, default False
+
+    .. admonition:: GPU-accelerated feature
+
+       This parameter is only supported with ``engine='cudf'``.
+
+    If True, only return those columns mentioned in the dtype argument.
+    If `False`, the dtype argument is used as a type inference suggestion.
 Returns
 -------
 result : Series or DataFrame, depending on the value of `typ`.
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index 90356a01404..8d319cfe640 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -1205,6 +1205,14 @@ def test_pickle_groupby(dataframe):
     tm.assert_equal(pgb.sum(), gb.sum())
 
 
+def test_numpy_extension_array():
+    np_array = np.array([0, 1, 2, 3])
+    xarray = xpd.arrays.NumpyExtensionArray(np_array)
+    array = pd.arrays.NumpyExtensionArray(np_array)
+
+    tm.assert_equal(xarray, array)
+
+
 def test_isinstance_base_offset():
     offset = xpd.tseries.frequencies.to_offset("1s")
     assert isinstance(offset, xpd.tseries.offsets.BaseOffset)
@@ -1220,3 +1228,17 @@ def my_apply(df, unused):
     result = df.apply(my_apply, axis=1, unused=True)
     expected = xpd.Series([1])
     tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("data", [pd.NaT, 1234, "nat"])
+def test_timestamp(data):
+    xtimestamp = xpd.Timestamp(data)
+    timestamp = pd.Timestamp(data)
+    tm.assert_equal(xtimestamp, timestamp)
+
+
+@pytest.mark.parametrize("data", [pd.NaT, 1234, "nat"])
+def test_timedelta(data):
+    xtimedelta = xpd.Timedelta(data)
+    timedelta = pd.Timedelta(data)
+    tm.assert_equal(xtimedelta, timedelta)
diff --git a/python/cudf/cudf_pandas_tests/test_profiler.py b/python/cudf/cudf_pandas_tests/test_profiler.py
index 4921446ab6b..dd8d9287972 100644
--- a/python/cudf/cudf_pandas_tests/test_profiler.py
+++ b/python/cudf/cudf_pandas_tests/test_profiler.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES.
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
@@ -30,11 +30,13 @@ def test_profiler():
 
     per_function_stats = profiler.per_function_stats
     assert set(per_function_stats) == {
+        "Timestamp",
         "DataFrame",
         "DataFrame.groupby",
         "DataFrameGroupBy.sum",
         "DataFrame.sum",
         "Series.__getitem__",
+        "Timedelta",
     }
     for name, func in per_function_stats.items():
         assert (
diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml
index e6c86351ac9..7786bf98bef 100644
--- a/python/custreamz/pyproject.toml
+++ b/python/custreamz/pyproject.toml
@@ -103,3 +103,13 @@ skip = [
     "dist",
     "__init__.py",
 ]
+
+[tool.pytest.ini_options]
+filterwarnings = [
+    "error",
+    "ignore:unclosed <socket.socket:ResourceWarning",
+    "ignore:Port .* is already in use.:UserWarning:distributed",
+    # Should be fixed in the next streamz release
+    # https://github.com/python-streamz/streamz/commit/2812f1f961dfcb3f17e948d8b12a12472975558e
+    "ignore:pkg_resources is deprecated as an API:DeprecationWarning:streamz",
+]
diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py
index bfe58531a73..3f0cfeb6d2c 100644
--- a/python/dask_cudf/dask_cudf/core.py
+++ b/python/dask_cudf/dask_cudf/core.py
@@ -55,9 +55,20 @@ def __repr__(self):
 
     @_dask_cudf_nvtx_annotate
     def to_dask_dataframe(self, **kwargs):
-        """Create a dask.dataframe object from a dask_cudf object"""
-        nullable = kwargs.get("nullable", False)
-        return self.map_partitions(M.to_pandas, nullable=nullable)
+        """Create a dask.dataframe object from a dask_cudf object
+
+        WARNING: This API is deprecated, and may not work properly
+        when query-planning is active. Please use `*.to_backend("pandas")`
+        to convert the underlying data to pandas.
+        """
+
+        warnings.warn(
+            "The `to_dask_dataframe` API is now deprecated. "
+            "Please use `*.to_backend('pandas')` instead.",
+            FutureWarning,
+        )
+
+        return self.to_backend("pandas", **kwargs)
 
 
 concat = dd.concat
@@ -733,6 +744,10 @@ def from_dask_dataframe(df):
     Convert a Dask :class:`dask.dataframe.DataFrame` to a Dask-cuDF
     one.
 
+    WARNING: This API is deprecated, and may not work properly
+    when query-planning is active. Please use `*.to_backend("cudf")`
+    to convert the underlying data to cudf.
+
     Parameters
     ----------
     df : dask.dataframe.DataFrame
@@ -742,7 +757,14 @@ def from_dask_dataframe(df):
     -------
     dask_cudf.DataFrame : A new Dask collection backed by cuDF objects
     """
-    return df.map_partitions(cudf.from_pandas)
+
+    warnings.warn(
+        "The `from_dask_dataframe` API is now deprecated. "
+        "Please use `*.to_backend('cudf')` instead.",
+        FutureWarning,
+    )
+
+    return df.to_backend("cudf")
 
 
 for name in (
diff --git a/python/dask_cudf/dask_cudf/expr/_collection.py b/python/dask_cudf/dask_cudf/expr/_collection.py
index 516e35a4335..d50dfb24256 100644
--- a/python/dask_cudf/dask_cudf/expr/_collection.py
+++ b/python/dask_cudf/dask_cudf/expr/_collection.py
@@ -1,5 +1,6 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
+import warnings
 from functools import cached_property
 
 from dask_expr import (
@@ -17,14 +18,39 @@
 
 import cudf
 
+_LEGACY_WORKAROUND = (
+    "To enable the 'legacy' dask-cudf API, set the "
+    "global 'dataframe.query-planning' config to "
+    "`False` before dask is imported. This can also "
+    "be done by setting an environment variable: "
+    "`DASK_DATAFRAME__QUERY_PLANNING=False` "
+)
+
+
 ##
 ## Custom collection classes
 ##
 
 
-# VarMixin can be removed if cudf#15179 is addressed.
-# See: https://github.com/rapidsai/cudf/issues/15179
-class VarMixin:
+class CudfFrameBase(FrameBase):
+    def to_dask_dataframe(self, **kwargs):
+        """Create a dask.dataframe object from a dask_cudf object
+
+        WARNING: This API is deprecated, and may not work properly.
+        Please use `*.to_backend("pandas")` to convert the
+        underlying data to pandas.
+        """
+
+        warnings.warn(
+            "The `to_dask_dataframe` API is now deprecated. "
+            "Please use `*.to_backend('pandas')` instead.",
+            FutureWarning,
+        )
+
+        return self.to_backend("pandas", **kwargs)
+
+    # var can be removed if cudf#15179 is addressed.
+    # See: https://github.com/rapidsai/cudf/issues/15179
     def var(
         self,
         axis=0,
@@ -49,7 +75,7 @@ def var(
         )
 
 
-class DataFrame(VarMixin, DXDataFrame):
+class DataFrame(DXDataFrame, CudfFrameBase):
     @classmethod
     def from_dict(cls, *args, **kwargs):
         with config.set({"dataframe.backend": "cudf"}):
@@ -71,6 +97,21 @@ def groupby(
                 f"`by` must be a column name or list of columns, got {by}."
             )
 
+        if "as_index" in kwargs:
+            msg = (
+                "The `as_index` argument is now deprecated. All groupby "
+                "results will be consistent with `as_index=True`."
+            )
+
+            if kwargs.pop("as_index") is not True:
+                raise NotImplementedError(
+                    f"{msg} Please reset the index after aggregating, or "
+                    "use the legacy API if `as_index=False` is required.\n"
+                    f"{_LEGACY_WORKAROUND}"
+                )
+            else:
+                warnings.warn(msg, FutureWarning)
+
         return GroupBy(
             self,
             by,
@@ -94,7 +135,7 @@ def read_text(*args, **kwargs):
         return from_legacy_dataframe(ddf)
 
 
-class Series(VarMixin, DXSeries):
+class Series(DXSeries, CudfFrameBase):
     def groupby(self, by, **kwargs):
         from dask_cudf.expr._groupby import SeriesGroupBy
 
@@ -113,7 +154,7 @@ def struct(self):
         return StructMethods(self)
 
 
-class Index(DXIndex):
+class Index(DXIndex, CudfFrameBase):
     pass  # Same as pandas (for now)
 
 
diff --git a/python/dask_cudf/dask_cudf/expr/_groupby.py b/python/dask_cudf/dask_cudf/expr/_groupby.py
index 7f275151f75..116893891e3 100644
--- a/python/dask_cudf/dask_cudf/expr/_groupby.py
+++ b/python/dask_cudf/dask_cudf/expr/_groupby.py
@@ -3,13 +3,55 @@
 from dask_expr._groupby import (
     GroupBy as DXGroupBy,
     SeriesGroupBy as DXSeriesGroupBy,
+    SingleAggregation,
 )
 from dask_expr._util import is_scalar
 
+from dask.dataframe.groupby import Aggregation
+
 ##
 ## Custom groupby classes
 ##
 
+
+class Collect(SingleAggregation):
+    @staticmethod
+    def groupby_chunk(arg):
+        return arg.agg("collect")
+
+    @staticmethod
+    def groupby_aggregate(arg):
+        gb = arg.agg("collect")
+        if gb.ndim > 1:
+            for col in gb.columns:
+                gb[col] = gb[col].list.concat()
+            return gb
+        else:
+            return gb.list.concat()
+
+
+collect_aggregation = Aggregation(
+    name="collect",
+    chunk=Collect.groupby_chunk,
+    agg=Collect.groupby_aggregate,
+)
+
+
+def _translate_arg(arg):
+    # Helper function to translate args so that
+    # they can be processed correctly by upstream
+    # dask & dask-expr. Right now, the only necessary
+    # translation is "collect" aggregations.
+    if isinstance(arg, dict):
+        return {k: _translate_arg(v) for k, v in arg.items()}
+    elif isinstance(arg, list):
+        return [_translate_arg(x) for x in arg]
+    elif arg in ("collect", "list", list):
+        return collect_aggregation
+    else:
+        return arg
+
+
 # TODO: These classes are mostly a work-around for missing
 # `observed=False` support.
 # See: https://github.com/rapidsai/cudf/issues/15173
@@ -41,8 +83,20 @@ def __getitem__(self, key):
         )
         return g
 
+    def collect(self, **kwargs):
+        return self._single_agg(Collect, **kwargs)
+
+    def aggregate(self, arg, **kwargs):
+        return super().aggregate(_translate_arg(arg), **kwargs)
+
 
 class SeriesGroupBy(DXSeriesGroupBy):
     def __init__(self, *args, observed=None, **kwargs):
         observed = observed if observed is not None else True
         super().__init__(*args, observed=observed, **kwargs)
+
+    def collect(self, **kwargs):
+        return self._single_agg(Collect, **kwargs)
+
+    def aggregate(self, arg, **kwargs):
+        return super().aggregate(_translate_arg(arg), **kwargs)
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_json.py b/python/dask_cudf/dask_cudf/io/tests/test_json.py
index a09dfbff188..dc780478794 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_json.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_json.py
@@ -12,8 +12,8 @@
 import dask_cudf
 from dask_cudf.tests.utils import skip_dask_expr
 
-# No dask-expr support for dask_expr<1.0.6
-pytestmark = skip_dask_expr(lt_version="1.0.6")
+# No dask-expr support for dask<2024.4.0
+pytestmark = skip_dask_expr(lt_version="2024.4.0")
 
 
 def test_read_json_backend_dispatch(tmp_path):
@@ -84,9 +84,8 @@ def test_read_json_nested(tmp_path):
         }
     )
     kwargs = dict(orient="records", lines=True)
-    with tmp_path / "data.json" as f, dask.config.set(
-        {"dataframe.convert-string": False}
-    ):
+    f = tmp_path / "data.json"
+    with dask.config.set({"dataframe.convert-string": False}):
         df.to_json(f, **kwargs)
         # Ensure engine='cudf' is tested.
         actual = dask_cudf.read_json(f, engine="cudf", **kwargs)
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_orc.py b/python/dask_cudf/dask_cudf/io/tests/test_orc.py
index 7be6c712511..457e5546bd9 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_orc.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_orc.py
@@ -14,8 +14,8 @@
 import dask_cudf
 from dask_cudf.tests.utils import skip_dask_expr
 
-# No dask-expr support for dask_expr<1.0.6
-pytestmark = skip_dask_expr(lt_version="1.0.6")
+# No dask-expr support for dask<2024.4.0
+pytestmark = skip_dask_expr(lt_version="2024.4.0")
 
 cur_dir = os.path.dirname(__file__)
 sample_orc = os.path.join(cur_dir, "data/orc/sample.orc")
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
index 68460653119..6f4737db5be 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
@@ -113,7 +113,7 @@ def test_roundtrip_from_dask_none_index_false(tmpdir):
 @pytest.mark.parametrize("write_meta", [True, False])
 def test_roundtrip_from_dask_cudf(tmpdir, write_meta):
     tmpdir = str(tmpdir)
-    gddf = dask_cudf.from_dask_dataframe(ddf)
+    gddf = ddf.to_backend("cudf")
     gddf.to_parquet(tmpdir, write_metadata_file=write_meta)
 
     gddf2 = dask_cudf.read_parquet(tmpdir, calculate_divisions=True)
@@ -536,7 +536,7 @@ def test_check_file_size(tmpdir):
         dask_cudf.io.read_parquet(fn, check_file_size=1).compute()
 
 
-@xfail_dask_expr("HivePartitioning cannot be hashed", lt_version="1.0")
+@xfail_dask_expr("HivePartitioning cannot be hashed", lt_version="2024.3.0")
 def test_null_partition(tmpdir):
     import pyarrow as pa
     from pyarrow.dataset import HivePartitioning
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_text.py b/python/dask_cudf/dask_cudf/io/tests/test_text.py
index e3a9d380857..8912b7d5da6 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_text.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_text.py
@@ -11,8 +11,8 @@
 import dask_cudf
 from dask_cudf.tests.utils import skip_dask_expr
 
-# No dask-expr support for dask_expr<1.0.6
-pytestmark = skip_dask_expr(lt_version="1.0.6")
+# No dask-expr support for dask<2024.4.0
+pytestmark = skip_dask_expr(lt_version="2024.4.0")
 
 cur_dir = os.path.dirname(__file__)
 text_file = os.path.join(cur_dir, "data/text/sample.pgn")
diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py
index ebb8e4be187..035b73094e7 100644
--- a/python/dask_cudf/dask_cudf/tests/test_accessor.py
+++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py
@@ -111,7 +111,7 @@ def test_categorical_accessor_initialization2(data):
         dsr.cat
 
 
-@xfail_dask_expr("TODO: Unexplained dask-expr failure")
+@xfail_dask_expr(lt_version="2024.5.0")
 @pytest.mark.parametrize("data", [data_cat_1()])
 def test_categorical_basic(data):
     cat = data.copy()
@@ -203,7 +203,6 @@ def test_categorical_compare_unordered(data):
         dsr < dsr
 
 
-@xfail_dask_expr("TODO: Unexplained dask-expr failure")
 @pytest.mark.parametrize("data", [data_cat_3()])
 def test_categorical_compare_ordered(data):
     cat1 = data[0].copy()
@@ -274,7 +273,6 @@ def test_categorical_categories():
     )
 
 
-@xfail_dask_expr("TODO: Unexplained dask-expr failure")
 def test_categorical_as_known():
     df = dask_cudf.from_cudf(DataFrame({"col_1": [0, 1, 2, 3]}), npartitions=2)
     df["col_1"] = df["col_1"].astype("category")
@@ -283,7 +281,19 @@ def test_categorical_as_known():
     pdf = dd.from_pandas(pd.DataFrame({"col_1": [0, 1, 2, 3]}), npartitions=2)
     pdf["col_1"] = pdf["col_1"].astype("category")
     expected = pdf["col_1"].cat.as_known()
-    dd.assert_eq(expected, actual)
+
+    # Note: Categories may be ordered differently in
+    # cudf and pandas. Therefore, we need to compare
+    # the global set of categories (before and after
+    # calling `compute`), then we need to check that
+    # the initial order of rows was preserved.
+    assert set(expected.cat.categories) == set(
+        actual.cat.categories.values_host
+    )
+    assert set(expected.compute().cat.categories) == set(
+        actual.compute().cat.categories.values_host
+    )
+    dd.assert_eq(expected, actual.astype(expected.dtype))
 
 
 def test_str_slice():
@@ -533,7 +543,7 @@ def test_struct_explode(data):
 
 
 def test_tz_localize():
-    data = Series(date_range("2000-04-01", "2000-04-03", freq="H"))
+    data = Series(date_range("2000-04-01", "2000-04-03", freq="h"))
     expect = data.dt.tz_localize(
         "US/Eastern", ambiguous="NaT", nonexistent="NaT"
     )
@@ -550,8 +560,8 @@ def test_tz_localize():
 @pytest.mark.parametrize(
     "data",
     [
-        date_range("2000-04-01", "2000-04-03", freq="H").tz_localize("UTC"),
-        date_range("2000-04-01", "2000-04-03", freq="H").tz_localize(
+        date_range("2000-04-01", "2000-04-03", freq="h").tz_localize("UTC"),
+        date_range("2000-04-01", "2000-04-03", freq="h").tz_localize(
             "US/Eastern"
         ),
     ],
diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py
index c6918c94559..4878d44d636 100644
--- a/python/dask_cudf/dask_cudf/tests/test_core.py
+++ b/python/dask_cudf/dask_cudf/tests/test_core.py
@@ -32,6 +32,30 @@ def test_from_dict_backend_dispatch():
     dd.assert_eq(expect, ddf)
 
 
+def test_to_dask_dataframe_deprecated():
+    gdf = cudf.DataFrame({"a": range(100)})
+    ddf = dd.from_pandas(gdf, npartitions=2)
+    assert isinstance(ddf._meta, cudf.DataFrame)
+
+    with pytest.warns(FutureWarning, match="API is now deprecated"):
+        assert isinstance(
+            ddf.to_dask_dataframe()._meta,
+            pd.DataFrame,
+        )
+
+
+def test_from_dask_dataframe_deprecated():
+    gdf = pd.DataFrame({"a": range(100)})
+    ddf = dd.from_pandas(gdf, npartitions=2)
+    assert isinstance(ddf._meta, pd.DataFrame)
+
+    with pytest.warns(FutureWarning, match="API is now deprecated"):
+        assert isinstance(
+            dask_cudf.from_dask_dataframe(ddf)._meta,
+            cudf.DataFrame,
+        )
+
+
 def test_to_backend():
     np.random.seed(0)
     data = {
diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py
index 3bb3e3b0bb8..f96b5b760d8 100644
--- a/python/dask_cudf/dask_cudf/tests/test_groupby.py
+++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py
@@ -14,16 +14,6 @@
 from dask_cudf.groupby import OPTIMIZED_AGGS, _aggs_optimized
 from dask_cudf.tests.utils import QUERY_PLANNING_ON, xfail_dask_expr
 
-# XFAIL "collect" tests for now
-agg_params = [agg for agg in OPTIMIZED_AGGS if agg != "collect"]
-if QUERY_PLANNING_ON:
-    agg_params.append(
-        # TODO: "collect" not supported with dask-expr yet
-        pytest.param("collect", marks=pytest.mark.xfail)
-    )
-else:
-    agg_params.append("collect")
-
 
 def assert_cudf_groupby_layers(ddf):
     for prefix in ("cudf-aggregate-chunk", "cudf-aggregate-agg"):
@@ -57,7 +47,7 @@ def pdf(request):
     return pdf
 
 
-@pytest.mark.parametrize("aggregation", agg_params)
+@pytest.mark.parametrize("aggregation", OPTIMIZED_AGGS)
 @pytest.mark.parametrize("series", [False, True])
 def test_groupby_basic(series, aggregation, pdf):
     gdf = cudf.DataFrame.from_pandas(pdf)
@@ -110,7 +100,7 @@ def test_groupby_cumulative(aggregation, pdf, series):
     dd.assert_eq(a, b)
 
 
-@pytest.mark.parametrize("aggregation", agg_params)
+@pytest.mark.parametrize("aggregation", OPTIMIZED_AGGS)
 @pytest.mark.parametrize(
     "func",
     [
@@ -243,7 +233,7 @@ def test_groupby_split_out(split_out, column):
     gddf = dask_cudf.from_cudf(gdf, npartitions=3)
 
     ddf_result = (
-        ddf.groupby(column)
+        ddf.groupby(column, observed=True)
         .a.mean(split_out=split_out)
         .compute()
         .sort_values()
@@ -378,10 +368,10 @@ def test_groupby_dropna_dask(dropna, by):
 
     if dropna is None:
         dask_cudf_result = gddf.groupby(by).e.sum()
-        dask_result = ddf.groupby(by).e.sum()
+        dask_result = ddf.groupby(by, observed=True).e.sum()
     else:
         dask_cudf_result = gddf.groupby(by, dropna=dropna).e.sum()
-        dask_result = ddf.groupby(by, dropna=dropna).e.sum()
+        dask_result = ddf.groupby(by, dropna=dropna, observed=True).e.sum()
 
     dd.assert_eq(dask_cudf_result, dask_result)
 
@@ -515,7 +505,7 @@ def test_groupby_reset_index_dtype():
     a = df.groupby("a").agg({"b": ["count"]})
 
     assert a.index.dtype == "int8"
-    assert a.reset_index().dtypes[0] == "int8"
+    assert a.reset_index().dtypes.iloc[0] == "int8"
 
 
 def test_groupby_reset_index_names():
@@ -562,9 +552,9 @@ def test_groupby_reset_index_string_name():
 def test_groupby_categorical_key():
     # See https://github.com/rapidsai/cudf/issues/4608
     df = dask.datasets.timeseries()
-    gddf = dask_cudf.from_dask_dataframe(df)
+    gddf = df.to_backend("cudf")
     gddf["name"] = gddf["name"].astype("category")
-    ddf = gddf.to_dask_dataframe()
+    ddf = gddf.to_backend("pandas")
 
     got = gddf.groupby("name", sort=True).agg(
         {"x": ["mean", "max"], "y": ["mean", "count"]}
@@ -573,14 +563,22 @@ def test_groupby_categorical_key():
     # (See: https://github.com/dask/dask/issues/9515)
     expect = (
         ddf.compute()
-        .groupby("name", sort=True)
+        .groupby("name", sort=True, observed=True)
         .agg({"x": ["mean", "max"], "y": ["mean", "count"]})
     )
     dd.assert_eq(expect, got)
 
 
-@xfail_dask_expr("as_index not supported in dask-expr")
-@pytest.mark.parametrize("as_index", [True, False])
+@pytest.mark.parametrize(
+    "as_index",
+    [
+        True,
+        pytest.param(
+            False,
+            marks=xfail_dask_expr("as_index not supported in dask-expr"),
+        ),
+    ],
+)
 @pytest.mark.parametrize("split_out", ["use_dask_default", 1, 2])
 @pytest.mark.parametrize("split_every", [False, 4])
 @pytest.mark.parametrize("npartitions", [1, 10])
@@ -603,10 +601,19 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index):
     if split_out == "use_dask_default":
         split_kwargs.pop("split_out")
 
+    # Avoid using as_index when query-planning is enabled
+    if QUERY_PLANNING_ON:
+        with pytest.warns(FutureWarning, match="argument is now deprecated"):
+            # Should warn when `as_index` is used
+            ddf.groupby(["name", "a"], sort=False, as_index=as_index)
+        maybe_as_index = {"as_index": as_index} if as_index is False else {}
+    else:
+        maybe_as_index = {"as_index": as_index}
+
     # Check `sort=True` behavior
     if split_out == 1:
         gf = (
-            ddf.groupby(["name", "a"], sort=True, as_index=as_index)
+            ddf.groupby(["name", "a"], sort=True, **maybe_as_index)
             .aggregate(
                 agg_dict,
                 **split_kwargs,
@@ -628,7 +635,7 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index):
             )
 
     # Full check (`sort=False`)
-    gr = ddf.groupby(["name", "a"], sort=False, as_index=as_index).aggregate(
+    gr = ddf.groupby(["name", "a"], sort=False, **maybe_as_index).aggregate(
         agg_dict,
         **split_kwargs,
     )
diff --git a/python/dask_cudf/dask_cudf/tests/test_join.py b/python/dask_cudf/dask_cudf/tests/test_join.py
index 42ecc130298..ed291ef31a7 100644
--- a/python/dask_cudf/dask_cudf/tests/test_join.py
+++ b/python/dask_cudf/dask_cudf/tests/test_join.py
@@ -66,8 +66,12 @@ def test_join_inner(left_nrows, right_nrows, left_nkeys, right_nkeys):
     def gather(df, grows):
         grows[df["x"].values[0]] = (set(df.al), set(df.ar))
 
-    expect.reset_index().groupby("x").apply(partial(gather, grows=expect_rows))
-    expect.reset_index().groupby("x").apply(partial(gather, grows=got_rows))
+    expect.reset_index().groupby("x")[["x", "al", "ar"]].apply(
+        partial(gather, grows=expect_rows)
+    )
+    expect.reset_index().groupby("x")[["x", "al", "ar"]].apply(
+        partial(gather, grows=got_rows)
+    )
 
     assert got_rows == expect_rows
 
@@ -127,9 +131,13 @@ def gather(df, grows):
 
         grows[df["x"].values[0]] = (cola, colb)
 
-    expect.reset_index().groupby("x").apply(partial(gather, grows=expect_rows))
+    expect.reset_index().groupby("x")[["x", "al", "ar"]].apply(
+        partial(gather, grows=expect_rows)
+    )
 
-    expect.reset_index().groupby("x").apply(partial(gather, grows=got_rows))
+    expect.reset_index().groupby("x")[["x", "al", "ar"]].apply(
+        partial(gather, grows=got_rows)
+    )
 
     for k in expect_rows:
         np.testing.assert_array_equal(expect_rows[k][0], got_rows[k][0])
diff --git a/python/dask_cudf/dask_cudf/tests/utils.py b/python/dask_cudf/dask_cudf/tests/utils.py
index 1ca1758736b..c7dedbb6b4a 100644
--- a/python/dask_cudf/dask_cudf/tests/utils.py
+++ b/python/dask_cudf/dask_cudf/tests/utils.py
@@ -5,6 +5,7 @@
 import pytest
 from packaging.version import Version
 
+import dask
 import dask.dataframe as dd
 
 import cudf
@@ -12,11 +13,9 @@
 from dask_cudf.expr import QUERY_PLANNING_ON
 
 if QUERY_PLANNING_ON:
-    import dask_expr
-
-    DASK_EXPR_VERSION = Version(dask_expr.__version__)
+    DASK_VERSION = Version(dask.__version__)
 else:
-    DASK_EXPR_VERSION = None
+    DASK_VERSION = None
 
 
 def _make_random_frame(nelem, npartitions=2, include_na=False):
@@ -37,7 +36,7 @@ def _make_random_frame(nelem, npartitions=2, include_na=False):
 
 def skip_dask_expr(reason=_default_reason, lt_version=None):
     if lt_version is not None:
-        skip = QUERY_PLANNING_ON and DASK_EXPR_VERSION < Version(lt_version)
+        skip = QUERY_PLANNING_ON and DASK_VERSION < Version(lt_version)
     else:
         skip = QUERY_PLANNING_ON
     return pytest.mark.skipif(skip, reason=reason)
@@ -45,7 +44,7 @@ def skip_dask_expr(reason=_default_reason, lt_version=None):
 
 def xfail_dask_expr(reason=_default_reason, lt_version=None):
     if lt_version is not None:
-        xfail = QUERY_PLANNING_ON and DASK_EXPR_VERSION < Version(lt_version)
+        xfail = QUERY_PLANNING_ON and DASK_VERSION < Version(lt_version)
     else:
         xfail = QUERY_PLANNING_ON
     return pytest.mark.xfail(xfail, reason=reason)
diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml
index fcf83e82989..5fbdd98225e 100644
--- a/python/dask_cudf/pyproject.toml
+++ b/python/dask_cudf/pyproject.toml
@@ -107,3 +107,13 @@ skip = [
     "build",
     "dist",
 ]
+
+[tool.pytest.ini_options]
+filterwarnings = [
+    "error::FutureWarning",
+    "error::DeprecationWarning",
+    "ignore:create_block_manager_from_blocks is deprecated and will be removed in a future version. Use public APIs instead.:DeprecationWarning",
+    # https://github.com/dask/partd/blob/main/partd/pandas.py#L198
+    "ignore:Passing a BlockManager to DataFrame is deprecated and will raise in a future version. Use public APIs instead.:DeprecationWarning",
+    "ignore:String support for `aggregate_files` is experimental. Behavior may change in the future.:FutureWarning:dask",
+]