diff --git a/README.md b/README.md index 996e5ff4800..0cf168123cf 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,10 @@ print(tips_df.groupby("size").tip_percentage.mean()) - [libcudf (C++/CUDA) documentation](https://docs.rapids.ai/api/libcudf/stable/) - [RAPIDS Community](https://rapids.ai/learn-more/#get-involved): Get help, contribute, and collaborate. +See the [RAPIDS install page](https://docs.rapids.ai/install) for +the most up-to-date information and commands for installing cuDF +and other RAPIDS packages. + ## Installation ### CUDA/GPU requirements @@ -64,6 +68,24 @@ print(tips_df.groupby("size").tip_percentage.mean()) * NVIDIA driver 450.80.02+ * Volta architecture or better (Compute Capability >=7.0) +### Pip + +cuDF can be installed via `pip` from the NVIDIA Python Package Index. +Be sure to select the appropriate cuDF package depending +on the major version of CUDA available in your environment: + +For CUDA 11.x: + +```bash +pip install --extra-index-url=https://pypi.nvidia.com cudf-cu11 +``` + +For CUDA 12.x: + +```bash +pip install --extra-index-url=https://pypi.nvidia.com cudf-cu12 +``` + ### Conda cuDF can be installed with conda (via [miniconda](https://docs.conda.io/projects/miniconda/en/latest/) or the full [Anaconda distribution](https://www.anaconda.com/download) from the `rapidsai` channel: diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index f1ad8ee7778..740a6409ccd 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -1,9 +1,13 @@ #!/bin/bash -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. set -euo pipefail -source rapids-env-update +rapids-configure-conda-channels + +source rapids-configure-sccache + +source rapids-date-string export CMAKE_GENERATOR=Ninja diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 67d611340c5..78518cdad53 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2023, NVIDIA CORPORATION. 
+# Copyright (c) 2023-2024, NVIDIA CORPORATION. set -euo pipefail diff --git a/ci/build_python.sh b/ci/build_python.sh index 32fe7b6b3ce..3c2a7761e1a 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -1,9 +1,13 @@ #!/bin/bash -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. set -euo pipefail -source rapids-env-update +rapids-configure-conda-channels + +source rapids-configure-sccache + +source rapids-date-string export CMAKE_GENERATOR=Ninja diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 9c674518810..c4b794e81f7 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -48,7 +48,7 @@ fi if [[ $PACKAGE_CUDA_SUFFIX == "-cu12" ]]; then sed -i "s/cuda-python[<=>\.,0-9a]*/cuda-python>=12.0,<13.0a0/g" ${pyproject_file} sed -i "s/cupy-cuda11x/cupy-cuda12x/g" ${pyproject_file} - sed -i "/ptxcompiler/d" ${pyproject_file} + sed -i "s/ptxcompiler/pynvjitlink/g" ${pyproject_file} sed -i "/cubinlinker/d" ${pyproject_file} fi diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh index e79b9a35aa2..cde22bb70d1 100755 --- a/ci/build_wheel_cudf.sh +++ b/ci/build_wheel_cudf.sh @@ -1,11 +1,11 @@ #!/bin/bash -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
set -euo pipefail package_dir="python/cudf" -export SKBUILD_CONFIGURE_OPTIONS="-DUSE_LIBARROW_FROM_PYARROW=ON" +export SKBUILD_CMAKE_ARGS="-DUSE_LIBARROW_FROM_PYARROW=ON" ./ci/build_wheel.sh cudf ${package_dir} diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 21b540e24ab..47b377013ce 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -12,6 +12,7 @@ dependencies: - benchmark==1.8.0 - boto3>=1.21.21 - botocore>=1.24.21 +- breathe>=4.35.0 - c-compiler - cachetools - clang-tools=16.0.6 @@ -74,7 +75,7 @@ dependencies: - pydata-sphinx-theme!=0.14.2 - pytest - pytest-benchmark -- pytest-cases +- pytest-cases>=3.8.2 - pytest-cov - pytest-xdist - python-confluent-kafka>=1.9.0,<1.10.0a0 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index c109dcca625..4cf1d5427f4 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -12,6 +12,7 @@ dependencies: - benchmark==1.8.0 - boto3>=1.21.21 - botocore>=1.24.21 +- breathe>=4.35.0 - c-compiler - cachetools - clang-tools=16.0.6 @@ -69,9 +70,10 @@ dependencies: - protobuf>=4.21,<5 - pyarrow==14.0.1.* - pydata-sphinx-theme!=0.14.2 +- pynvjitlink - pytest - pytest-benchmark -- pytest-cases +- pytest-cases>=3.8.2 - pytest-cov - pytest-xdist - python-confluent-kafka>=1.9.0,<1.10.0a0 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index bc91ee61f6f..4f39a9fe452 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. {% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' 
+ version.split('.')[1] %} @@ -98,6 +98,7 @@ requirements: # xref: https://github.com/rapidsai/cudf/issues/12822 - cuda-nvrtc - cuda-python >=12.0,<13.0a0 + - pynvjitlink {% endif %} - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} - nvtx >=0.2.1 diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index cb1fdb1f557..2c0f601ca74 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -380,6 +380,8 @@ add_library( src/io/orc/dict_enc.cu src/io/orc/orc.cpp src/io/orc/reader_impl.cu + src/io/orc/reader_impl_helpers.cpp + src/io/orc/reader_impl_preprocess.cu src/io/orc/stats_enc.cu src/io/orc/stripe_data.cu src/io/orc/stripe_enc.cu diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 9c3a05a2f5f..35b03fa33d0 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -183,6 +183,11 @@ ConfigureNVBench( sort/sort_lists.cpp sort/sort_structs.cpp ) +# ################################################################################################## +# * structs benchmark +# -------------------------------------------------------------------------------- +ConfigureNVBench(STRUCT_CREATION_NVBENCH structs/create_structs.cpp) + # ################################################################################################## # * quantiles benchmark # -------------------------------------------------------------------------------- diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index bb7529bb37a..0ea13957868 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -540,7 +540,7 @@ std::unique_ptr create_random_utf8_string_column(data_profile cons return cudf::make_strings_column( num_rows, std::make_unique(std::move(offsets), rmm::device_buffer{}, 0), - std::make_unique(std::move(chars), rmm::device_buffer{}, 0), + chars.release(), null_count, profile.get_null_probability().has_value() ? std::move(result_bitmask) : rmm::device_buffer{}); } diff --git a/cpp/benchmarks/fixture/benchmark_fixture.hpp b/cpp/benchmarks/fixture/benchmark_fixture.hpp index bc6c2e52da8..36370560727 100644 --- a/cpp/benchmarks/fixture/benchmark_fixture.hpp +++ b/cpp/benchmarks/fixture/benchmark_fixture.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,6 +17,7 @@ #pragma once #include +#include #include #include #include @@ -33,7 +34,8 @@ inline auto make_pool_instance() { static rmm::mr::cuda_memory_resource cuda_mr; static auto pool_mr = - std::make_shared>(&cuda_mr); + std::make_shared>( + &cuda_mr, rmm::percent_of_free_device_memory(50)); return pool_mr; } } // namespace diff --git a/cpp/benchmarks/fixture/nvbench_fixture.hpp b/cpp/benchmarks/fixture/nvbench_fixture.hpp index e08f9101522..701ed67e666 100644 --- a/cpp/benchmarks/fixture/nvbench_fixture.hpp +++ b/cpp/benchmarks/fixture/nvbench_fixture.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include +#include #include #include #include @@ -42,7 +43,8 @@ struct nvbench_base_fixture { inline auto make_pool() { - return rmm::mr::make_owning_wrapper(make_cuda()); + return rmm::mr::make_owning_wrapper( + make_cuda(), rmm::percent_of_free_device_memory(50)); } inline auto make_async() { return std::make_shared(); } @@ -56,7 +58,8 @@ struct nvbench_base_fixture { inline auto make_managed_pool() { - return rmm::mr::make_owning_wrapper(make_managed()); + return rmm::mr::make_owning_wrapper( + make_managed(), rmm::percent_of_free_device_memory(50)); } inline std::shared_ptr create_memory_resource( diff --git a/cpp/benchmarks/hashing/hash.cpp b/cpp/benchmarks/hashing/hash.cpp index e679b4b62d2..4930fc59ac3 100644 --- a/cpp/benchmarks/hashing/hash.cpp +++ b/cpp/benchmarks/hashing/hash.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -43,7 +43,7 @@ static void bench_hash(nvbench::state& state) // collect statistics cudf::strings_column_view input(data->get_column(1).view()); - auto const chars_size = input.chars_size(); + auto const chars_size = input.chars_size(stream); // add memory read from string column state.add_global_memory_reads(chars_size); // add memory read from int64_t column diff --git a/cpp/benchmarks/join/generate_input_tables.cuh b/cpp/benchmarks/join/generate_input_tables.cuh index 84e607a9f28..b14541564dd 100644 --- a/cpp/benchmarks/join/generate_input_tables.cuh +++ b/cpp/benchmarks/join/generate_input_tables.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,7 +31,7 @@ #include -__global__ static void init_curand(curandState* state, int const nstates) +CUDF_KERNEL void init_curand(curandState* state, int const nstates) { int ithread = threadIdx.x + blockIdx.x * blockDim.x; @@ -39,11 +39,11 @@ __global__ static void init_curand(curandState* state, int const nstates) } template -__global__ static void init_build_tbl(key_type* const build_tbl, - size_type const build_tbl_size, - int const multiplicity, - curandState* state, - int const num_states) +CUDF_KERNEL void init_build_tbl(key_type* const build_tbl, + size_type const build_tbl_size, + int const multiplicity, + curandState* state, + int const num_states) { auto const start_idx = blockIdx.x * blockDim.x + threadIdx.x; auto const stride = blockDim.x * gridDim.x; @@ -61,14 +61,14 @@ __global__ static void init_build_tbl(key_type* const build_tbl, } template -__global__ void init_probe_tbl(key_type* const probe_tbl, - size_type const probe_tbl_size, - size_type const build_tbl_size, - key_type const rand_max, - double const selectivity, - int const multiplicity, - curandState* state, - int const num_states) 
+CUDF_KERNEL void init_probe_tbl(key_type* const probe_tbl, + size_type const probe_tbl_size, + size_type const build_tbl_size, + key_type const rand_max, + double const selectivity, + int const multiplicity, + curandState* state, + int const num_states) { auto const start_idx = blockIdx.x * blockDim.x + threadIdx.x; auto const stride = blockDim.x * gridDim.x; diff --git a/cpp/benchmarks/json/json.cu b/cpp/benchmarks/json/json.cu index 5dc30aebe38..020c8e413b3 100644 --- a/cpp/benchmarks/json/json.cu +++ b/cpp/benchmarks/json/json.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -177,10 +177,10 @@ auto build_json_string_column(int desired_bytes, int num_rows) auto d_store_order = cudf::column_device_view::create(float_2bool_columns->get_column(2)); json_benchmark_row_builder jb{ desired_bytes, num_rows, {*d_books, *d_bicycles}, *d_book_pct, *d_misc_order, *d_store_order}; - auto children = cudf::strings::detail::make_strings_children( + auto [offsets, chars] = cudf::strings::detail::make_strings_children( jb, num_rows, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); return cudf::make_strings_column( - num_rows, std::move(children.first), std::move(children.second), 0, {}); + num_rows, std::move(offsets), std::move(chars->release().data.release()[0]), 0, {}); } void BM_case(benchmark::State& state, std::string query_arg) @@ -190,7 +190,7 @@ void BM_case(benchmark::State& state, std::string query_arg) int desired_bytes = state.range(1); auto input = build_json_string_column(desired_bytes, num_rows); cudf::strings_column_view scv(input->view()); - size_t num_chars = scv.chars().size(); + size_t num_chars = scv.chars_size(cudf::get_default_stream()); std::string json_path(query_arg); diff --git a/cpp/benchmarks/string/case.cpp 
b/cpp/benchmarks/string/case.cpp index 385bb7630f8..639a3dc1181 100644 --- a/cpp/benchmarks/string/case.cpp +++ b/cpp/benchmarks/string/case.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,18 +43,18 @@ void bench_case(nvbench::state& state) if (encoding == "ascii") { data_profile ascii_profile = data_profile_builder().no_validity().distribution( cudf::type_id::INT8, distribution_id::UNIFORM, 32, 126); // nice ASCII range - auto input = cudf::strings_column_view(col_view); - auto ascii_column = - create_random_column(cudf::type_id::INT8, row_count{input.chars_size()}, ascii_profile); + auto input = cudf::strings_column_view(col_view); + auto ascii_column = create_random_column( + cudf::type_id::INT8, row_count{input.chars_size(cudf::get_default_stream())}, ascii_profile); auto ascii_data = ascii_column->view(); col_view = cudf::column_view(col_view.type(), col_view.size(), - nullptr, + ascii_data.data(), col_view.null_mask(), col_view.null_count(), 0, - {input.offsets(), ascii_data}); + {input.offsets()}); ascii_contents = ascii_column->release(); } @@ -62,9 +62,9 @@ void bench_case(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - state.add_element_count(input.chars_size(), "chars_size"); - state.add_global_memory_reads(input.chars_size()); - state.add_global_memory_writes(input.chars_size()); + state.add_element_count(input.chars_size(cudf::get_default_stream()), "chars_size"); + state.add_global_memory_reads(input.chars_size(cudf::get_default_stream())); + state.add_global_memory_writes(input.chars_size(cudf::get_default_stream())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = cudf::strings::to_lower(input); }); diff --git 
a/cpp/benchmarks/string/char_types.cpp b/cpp/benchmarks/string/char_types.cpp index 59e6245fd41..eec9a5f54d7 100644 --- a/cpp/benchmarks/string/char_types.cpp +++ b/cpp/benchmarks/string/char_types.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -42,7 +42,7 @@ static void bench_char_types(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); // gather some throughput statistics as well - auto chars_size = input.chars_size(); + auto chars_size = input.chars_size(cudf::get_default_stream()); state.add_global_memory_reads(chars_size); // all bytes are read; if (api_type == "all") { state.add_global_memory_writes(num_rows); // output is a bool8 per row diff --git a/cpp/benchmarks/string/combine.cpp b/cpp/benchmarks/string/combine.cpp index 4ed54a38a48..7acfb1ffb0d 100644 --- a/cpp/benchmarks/string/combine.cpp +++ b/cpp/benchmarks/string/combine.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -44,7 +44,8 @@ static void BM_combine(benchmark::State& state) cudf::strings::concatenate(table->view(), separator); } - state.SetBytesProcessed(state.iterations() * (input1.chars_size() + input2.chars_size())); + state.SetBytesProcessed(state.iterations() * (input1.chars_size(cudf::get_default_stream()) + + input2.chars_size(cudf::get_default_stream()))); } static void generate_bench_args(benchmark::internal::Benchmark* b) diff --git a/cpp/benchmarks/string/contains.cpp b/cpp/benchmarks/string/contains.cpp index af45d5d8fee..6d839c1de64 100644 --- a/cpp/benchmarks/string/contains.cpp +++ b/cpp/benchmarks/string/contains.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -100,7 +100,7 @@ static void bench_contains(nvbench::state& state) auto pattern = patterns[pattern_index]; auto program = cudf::strings::regex_program::create(pattern); - auto chars_size = input.chars_size(); + auto chars_size = input.chars_size(cudf::get_default_stream()); state.add_element_count(chars_size, "chars_size"); state.add_global_memory_reads(chars_size); state.add_global_memory_writes(input.size()); diff --git a/cpp/benchmarks/string/convert_datetime.cpp b/cpp/benchmarks/string/convert_datetime.cpp index 5f332a3e1a0..5deca3664b7 100644 --- a/cpp/benchmarks/string/convert_datetime.cpp +++ b/cpp/benchmarks/string/convert_datetime.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -48,7 +48,8 @@ void BM_convert_datetime(benchmark::State& state, direction dir) cudf::strings::from_timestamps(input, "%Y-%m-%d %H:%M:%S"); } - auto const bytes = dir == direction::to ? 
source_string.chars_size() : n_rows * sizeof(TypeParam); + auto const bytes = dir == direction::to ? source_string.chars_size(cudf::get_default_stream()) + : n_rows * sizeof(TypeParam); state.SetBytesProcessed(state.iterations() * bytes); } diff --git a/cpp/benchmarks/string/convert_fixed_point.cpp b/cpp/benchmarks/string/convert_fixed_point.cpp index 0cc98ee146c..e5bd794e405 100644 --- a/cpp/benchmarks/string/convert_fixed_point.cpp +++ b/cpp/benchmarks/string/convert_fixed_point.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -49,8 +49,9 @@ void convert_to_fixed_point(benchmark::State& state) } // bytes_processed = bytes_input + bytes_output - state.SetBytesProcessed(state.iterations() * - (strings_view.chars_size() + rows * cudf::size_of(dtype))); + state.SetBytesProcessed( + state.iterations() * + (strings_view.chars_size(cudf::get_default_stream()) + rows * cudf::size_of(dtype))); } class StringsFromFixedPoint : public cudf::benchmark {}; @@ -74,7 +75,8 @@ void convert_from_fixed_point(benchmark::State& state) // bytes_processed = bytes_input + bytes_output state.SetBytesProcessed( state.iterations() * - (cudf::strings_column_view(results->view()).chars_size() + rows * cudf::size_of(dtype))); + (cudf::strings_column_view(results->view()).chars_size(cudf::get_default_stream()) + + rows * cudf::size_of(dtype))); } #define CONVERT_TO_FIXED_POINT_BMD(name, fixed_point_type) \ diff --git a/cpp/benchmarks/string/convert_numerics.cpp b/cpp/benchmarks/string/convert_numerics.cpp index cce5d0f6a4d..8f875c5c80f 100644 --- a/cpp/benchmarks/string/convert_numerics.cpp +++ b/cpp/benchmarks/string/convert_numerics.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -63,8 +63,9 @@ void convert_to_number(benchmark::State& state) } // bytes_processed = bytes_input + bytes_output - state.SetBytesProcessed(state.iterations() * - (strings_view.chars_size() + rows * sizeof(NumericType))); + state.SetBytesProcessed( + state.iterations() * + (strings_view.chars_size(cudf::get_default_stream()) + rows * sizeof(NumericType))); } class StringsFromNumeric : public cudf::benchmark {}; @@ -90,7 +91,8 @@ void convert_from_number(benchmark::State& state) // bytes_processed = bytes_input + bytes_output state.SetBytesProcessed( state.iterations() * - (cudf::strings_column_view(results->view()).chars_size() + rows * sizeof(NumericType))); + (cudf::strings_column_view(results->view()).chars_size(cudf::get_default_stream()) + + rows * sizeof(NumericType))); } #define CONVERT_TO_NUMERICS_BD(name, type) \ diff --git a/cpp/benchmarks/string/copy.cu b/cpp/benchmarks/string/copy.cu index 27438f80f92..6b2f6c3a0a7 100644 --- a/cpp/benchmarks/string/copy.cu +++ b/cpp/benchmarks/string/copy.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -64,8 +64,9 @@ static void BM_copy(benchmark::State& state, copy_type ct) } } - state.SetBytesProcessed(state.iterations() * - cudf::strings_column_view(source->view().column(0)).chars_size()); + state.SetBytesProcessed( + state.iterations() * + cudf::strings_column_view(source->view().column(0)).chars_size(cudf::get_default_stream())); } static void generate_bench_args(benchmark::internal::Benchmark* b) diff --git a/cpp/benchmarks/string/count.cpp b/cpp/benchmarks/string/count.cpp index 08406462632..a656010dca5 100644 --- a/cpp/benchmarks/string/count.cpp +++ b/cpp/benchmarks/string/count.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -47,7 +47,7 @@ static void bench_count(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); // gather some throughput statistics as well - auto chars_size = input.chars_size(); + auto chars_size = input.chars_size(cudf::get_default_stream()); state.add_element_count(chars_size, "chars_size"); state.add_global_memory_reads(chars_size); state.add_global_memory_writes(input.size()); diff --git a/cpp/benchmarks/string/extract.cpp b/cpp/benchmarks/string/extract.cpp index 135dadabbe4..af4fedb5799 100644 --- a/cpp/benchmarks/string/extract.cpp +++ b/cpp/benchmarks/string/extract.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -67,7 +67,7 @@ static void bench_extract(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); // gather some throughput statistics as well - auto chars_size = strings_view.chars_size(); + auto chars_size = strings_view.chars_size(cudf::get_default_stream()); state.add_element_count(chars_size, "chars_size"); // number of bytes; state.add_global_memory_reads(chars_size); // all bytes are read; state.add_global_memory_writes(chars_size); // all bytes are written diff --git a/cpp/benchmarks/string/factory.cu b/cpp/benchmarks/string/factory.cu index c73bcb0b0ad..c4e74c4d97e 100644 --- a/cpp/benchmarks/string/factory.cu +++ b/cpp/benchmarks/string/factory.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -67,7 +67,7 @@ static void BM_factory(benchmark::State& state) } cudf::strings_column_view input(column->view()); - state.SetBytesProcessed(state.iterations() * input.chars_size()); + state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream())); } static void generate_bench_args(benchmark::internal::Benchmark* b) diff --git a/cpp/benchmarks/string/filter.cpp b/cpp/benchmarks/string/filter.cpp index b935fc4a11f..613834b1f3e 100644 --- a/cpp/benchmarks/string/filter.cpp +++ b/cpp/benchmarks/string/filter.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -57,7 +57,7 @@ static void BM_filter_chars(benchmark::State& state, FilterAPI api) } } - state.SetBytesProcessed(state.iterations() * input.chars_size()); + state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream())); } static void generate_bench_args(benchmark::internal::Benchmark* b) diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp index 5f2e6946b8b..e866092f3a3 100644 --- a/cpp/benchmarks/string/find.cpp +++ b/cpp/benchmarks/string/find.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -56,7 +56,7 @@ static void BM_find_scalar(benchmark::State& state, FindAPI find_api) } } - state.SetBytesProcessed(state.iterations() * input.chars_size()); + state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream())); } static void generate_bench_args(benchmark::internal::Benchmark* b) diff --git a/cpp/benchmarks/string/gather.cpp b/cpp/benchmarks/string/gather.cpp index 530b09b7d6a..5b1c679be7d 100644 --- a/cpp/benchmarks/string/gather.cpp +++ b/cpp/benchmarks/string/gather.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -43,7 +43,8 @@ static void bench_gather(nvbench::state& state) create_random_table({cudf::type_id::INT32}, row_count{num_rows}, map_profile); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - auto chars_size = cudf::strings_column_view(input_table->view().column(0)).chars_size(); + auto chars_size = + cudf::strings_column_view(input_table->view().column(0)).chars_size(cudf::get_default_stream()); state.add_global_memory_reads(chars_size); // all bytes are read; state.add_global_memory_writes(chars_size); diff --git a/cpp/benchmarks/string/join_strings.cpp b/cpp/benchmarks/string/join_strings.cpp index a122c0022a9..6dcf731ad3c 100644 --- a/cpp/benchmarks/string/join_strings.cpp +++ b/cpp/benchmarks/string/join_strings.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,7 +40,7 @@ static void bench_join(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); // gather some throughput statistics as well - auto const chars_size = input.chars_size(); + auto const chars_size = input.chars_size(cudf::get_default_stream()); state.add_element_count(chars_size, "chars_size"); // number of bytes; state.add_global_memory_reads(chars_size); // all bytes are read; state.add_global_memory_writes(chars_size); // all bytes are written diff --git a/cpp/benchmarks/string/lengths.cpp b/cpp/benchmarks/string/lengths.cpp index 36c4bf64a00..a19060ead3b 100644 --- a/cpp/benchmarks/string/lengths.cpp +++ b/cpp/benchmarks/string/lengths.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -40,7 +40,7 @@ static void bench_lengths(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); // gather some throughput statistics as well - auto chars_size = input.chars_size(); + auto chars_size = input.chars_size(cudf::get_default_stream()); state.add_global_memory_reads(chars_size); // all bytes are read; state.add_global_memory_writes(num_rows); // output is an integer per row diff --git a/cpp/benchmarks/string/like.cpp b/cpp/benchmarks/string/like.cpp index 6ac832471a5..99cef640dc3 100644 --- a/cpp/benchmarks/string/like.cpp +++ b/cpp/benchmarks/string/like.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -99,7 +99,7 @@ static void bench_like(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); // gather some throughput statistics as well - auto chars_size = input.chars_size(); + auto chars_size = input.chars_size(cudf::get_default_stream()); state.add_element_count(chars_size, "chars_size"); // number of bytes; state.add_global_memory_reads(chars_size); // all bytes are read; state.add_global_memory_writes(n_rows); // writes are BOOL8 diff --git a/cpp/benchmarks/string/repeat_strings.cpp b/cpp/benchmarks/string/repeat_strings.cpp index 92645524efb..f1d1516f248 100644 --- a/cpp/benchmarks/string/repeat_strings.cpp +++ b/cpp/benchmarks/string/repeat_strings.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -59,7 +59,7 @@ static void BM_repeat_strings_scalar_times(benchmark::State& state) cudf::strings::repeat_strings(strings_col, default_repeat_times); } - state.SetBytesProcessed(state.iterations() * strings_col.chars_size()); + state.SetBytesProcessed(state.iterations() * strings_col.chars_size(cudf::get_default_stream())); } static void BM_repeat_strings_column_times(benchmark::State& state) @@ -75,8 +75,8 @@ static void BM_repeat_strings_column_times(benchmark::State& state) cudf::strings::repeat_strings(strings_col, repeat_times_col); } - state.SetBytesProcessed(state.iterations() * - (strings_col.chars_size() + repeat_times_col.size() * sizeof(int32_t))); + state.SetBytesProcessed(state.iterations() * (strings_col.chars_size(cudf::get_default_stream()) + + repeat_times_col.size() * sizeof(int32_t))); } static void generate_bench_args(benchmark::internal::Benchmark* b) diff --git a/cpp/benchmarks/string/replace.cpp b/cpp/benchmarks/string/replace.cpp index 5ddf09f5cec..c8f26142193 100644 --- a/cpp/benchmarks/string/replace.cpp +++ b/cpp/benchmarks/string/replace.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -58,7 +58,7 @@ static void BM_replace(benchmark::State& state, replace_type rt) } } - state.SetBytesProcessed(state.iterations() * input.chars_size()); + state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream())); } static void generate_bench_args(benchmark::internal::Benchmark* b) diff --git a/cpp/benchmarks/string/replace_re.cpp b/cpp/benchmarks/string/replace_re.cpp index b8efd76ab41..4dcf1314f83 100644 --- a/cpp/benchmarks/string/replace_re.cpp +++ b/cpp/benchmarks/string/replace_re.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -42,7 +42,7 @@ static void bench_replace(nvbench::state& state) auto program = cudf::strings::regex_program::create("(\\d+)"); - auto chars_size = input.chars_size(); + auto chars_size = input.chars_size(cudf::get_default_stream()); state.add_element_count(chars_size, "chars_size"); state.add_global_memory_reads(chars_size); state.add_global_memory_writes(chars_size); diff --git a/cpp/benchmarks/string/reverse.cpp b/cpp/benchmarks/string/reverse.cpp index 31cd4639115..a2676609a40 100644 --- a/cpp/benchmarks/string/reverse.cpp +++ b/cpp/benchmarks/string/reverse.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,7 +40,7 @@ static void bench_reverse(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); // gather some throughput statistics as well - auto chars_size = input.chars_size(); + auto chars_size = input.chars_size(cudf::get_default_stream()); state.add_element_count(chars_size, "chars_size"); // number of bytes; state.add_global_memory_reads(chars_size); // all bytes are read; state.add_global_memory_writes(chars_size); // all bytes are written diff --git a/cpp/benchmarks/string/slice.cpp b/cpp/benchmarks/string/slice.cpp index 6c1d7d98d3a..0f973a7c8b5 100644 --- a/cpp/benchmarks/string/slice.cpp +++ b/cpp/benchmarks/string/slice.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -58,7 +58,7 @@ static void BM_slice(benchmark::State& state, slice_type rt) } } - state.SetBytesProcessed(state.iterations() * input.chars_size()); + state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream())); } static void generate_bench_args(benchmark::internal::Benchmark* b) diff --git a/cpp/benchmarks/string/split.cpp b/cpp/benchmarks/string/split.cpp index eb724fabfd1..9ef58daf0fc 100644 --- a/cpp/benchmarks/string/split.cpp +++ b/cpp/benchmarks/string/split.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -44,7 +44,7 @@ static void bench_split(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); // gather some throughput statistics as well - auto chars_size = input.chars_size(); + auto chars_size = input.chars_size(cudf::get_default_stream()); state.add_element_count(chars_size, "chars_size"); // number of bytes; state.add_global_memory_reads(chars_size); // all bytes are read; state.add_global_memory_writes(chars_size); // all bytes are written diff --git a/cpp/benchmarks/string/split_re.cpp b/cpp/benchmarks/string/split_re.cpp index 67aa6f0e008..1fdb6e67109 100644 --- a/cpp/benchmarks/string/split_re.cpp +++ b/cpp/benchmarks/string/split_re.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -44,7 +44,7 @@ static void bench_split(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); // gather some throughput statistics as well - auto chars_size = input.chars_size(); + auto chars_size = input.chars_size(cudf::get_default_stream()); state.add_element_count(chars_size, "chars_size"); // number of bytes; state.add_global_memory_reads(chars_size); // all bytes are read; state.add_global_memory_writes(chars_size); // all bytes are written diff --git a/cpp/benchmarks/string/translate.cpp b/cpp/benchmarks/string/translate.cpp index 00ca7459964..dc3c8c71488 100644 --- a/cpp/benchmarks/string/translate.cpp +++ b/cpp/benchmarks/string/translate.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -56,7 +56,7 @@ static void BM_translate(benchmark::State& state, int entry_count) cudf::strings::translate(input, entries); } - state.SetBytesProcessed(state.iterations() * input.chars_size()); + state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream())); } static void generate_bench_args(benchmark::internal::Benchmark* b) diff --git a/cpp/benchmarks/string/url_decode.cu b/cpp/benchmarks/string/url_decode.cu index 9ede89bee43..b3aeb69e5ea 100644 --- a/cpp/benchmarks/string/url_decode.cu +++ b/cpp/benchmarks/string/url_decode.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -67,7 +67,7 @@ auto generate_column(cudf::size_type num_rows, cudf::size_type chars_per_row, do
   auto col_1a     = cudf::test::strings_column_wrapper(strings.begin(), strings.end());
   auto table_a    = cudf::repeat(cudf::table_view{{col_1a}}, num_rows);
   auto result_col = std::move(table_a->release()[0]);  // string column with num_rows aaa...
-  auto chars_col  = result_col->child(cudf::strings_column_view::chars_column_index).mutable_view();
+  auto chars_data = static_cast<char*>(result_col->mutable_view().head());
   auto offset_col = result_col->child(cudf::strings_column_view::offsets_column_index).view();

   auto engine = thrust::default_random_engine{};
@@ -75,7 +75,7 @@
     thrust::make_zip_iterator(offset_col.begin<cudf::size_type>(),
                               offset_col.begin<cudf::size_type>() + 1),
     num_rows,
-    url_string_generator{chars_col.begin<char>(), esc_seq_chance, engine});
+    url_string_generator{chars_data, esc_seq_chance, engine});
   return result_col;
 }
diff --git a/cpp/benchmarks/structs/create_structs.cpp b/cpp/benchmarks/structs/create_structs.cpp
new file mode 100644
index 00000000000..480a719461e
--- /dev/null
+++ b/cpp/benchmarks/structs/create_structs.cpp
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include + +#include + +void nvbench_create_structs(nvbench::state& state) +{ + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto const table_ptr = create_structs_data(state); }); +} + +NVBENCH_BENCH(nvbench_create_structs) + .set_name("create_structs") + .add_int64_power_of_two_axis("NumRows", {10, 18, 26}) + .add_int64_axis("Depth", {1, 8, 16}) + .add_int64_axis("Nulls", {0, 1}); diff --git a/cpp/benchmarks/text/edit_distance.cpp b/cpp/benchmarks/text/edit_distance.cpp index 8a8bd9ae586..0a1ea52c415 100644 --- a/cpp/benchmarks/text/edit_distance.cpp +++ b/cpp/benchmarks/text/edit_distance.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,7 +43,8 @@ static void bench_edit_distance(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - auto chars_size = input1.chars_size() + input2.chars_size(); + auto chars_size = + input1.chars_size(cudf::get_default_stream()) + input2.chars_size(cudf::get_default_stream()); state.add_global_memory_reads(chars_size); // output are integers (one per row) state.add_global_memory_writes(num_rows); diff --git a/cpp/benchmarks/text/hash_ngrams.cpp b/cpp/benchmarks/text/hash_ngrams.cpp index 5bbd2fc6819..3df0c61fc31 100644 --- a/cpp/benchmarks/text/hash_ngrams.cpp +++ b/cpp/benchmarks/text/hash_ngrams.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -43,7 +43,7 @@ static void bench_hash_ngrams(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - auto chars_size = input.chars_size(); + auto chars_size = input.chars_size(cudf::get_default_stream()); state.add_global_memory_reads(chars_size); // output are hashes: approximate total number of hashes state.add_global_memory_writes(num_rows * ngrams); diff --git a/cpp/benchmarks/text/jaccard.cpp b/cpp/benchmarks/text/jaccard.cpp index 70470b829bd..60251c96096 100644 --- a/cpp/benchmarks/text/jaccard.cpp +++ b/cpp/benchmarks/text/jaccard.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include +#include #include @@ -44,9 +45,10 @@ static void bench_jaccard(nvbench::state& state) cudf::strings_column_view input1(input_table->view().column(0)); cudf::strings_column_view input2(input_table->view().column(1)); - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); - auto chars_size = input1.chars_size() + input2.chars_size(); + auto chars_size = input1.chars_size(stream) + input2.chars_size(stream); state.add_global_memory_reads(chars_size); state.add_global_memory_writes(num_rows); diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp index 1b60caa24de..d10d0d307d7 100644 --- a/cpp/benchmarks/text/minhash.cpp +++ b/cpp/benchmarks/text/minhash.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -52,7 +52,7 @@ static void bench_minhash(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - auto chars_size = input.chars_size(); + auto chars_size = input.chars_size(cudf::get_default_stream()); state.add_global_memory_reads(chars_size); state.add_global_memory_writes(num_rows); // output are hashes diff --git a/cpp/benchmarks/text/ngrams.cpp b/cpp/benchmarks/text/ngrams.cpp index f3fd5cc5729..8e48f8e9a05 100644 --- a/cpp/benchmarks/text/ngrams.cpp +++ b/cpp/benchmarks/text/ngrams.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -46,7 +46,7 @@ static void BM_ngrams(benchmark::State& state, ngrams_type nt) } } - state.SetBytesProcessed(state.iterations() * input.chars_size()); + state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream())); } static void generate_bench_args(benchmark::internal::Benchmark* b) diff --git a/cpp/benchmarks/text/normalize.cpp b/cpp/benchmarks/text/normalize.cpp index 6878fa4f8b6..71bccd80d39 100644 --- a/cpp/benchmarks/text/normalize.cpp +++ b/cpp/benchmarks/text/normalize.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -43,7 +43,7 @@ static void bench_normalize(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - auto chars_size = input.chars_size(); + auto chars_size = input.chars_size(cudf::get_default_stream()); state.add_global_memory_reads(chars_size); state.add_global_memory_writes(chars_size); diff --git a/cpp/benchmarks/text/replace.cpp b/cpp/benchmarks/text/replace.cpp index 257f62aa728..767ebab3eee 100644 --- a/cpp/benchmarks/text/replace.cpp +++ b/cpp/benchmarks/text/replace.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -59,7 +59,7 @@ static void bench_replace(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - auto chars_size = view.chars_size(); + auto chars_size = view.chars_size(cudf::get_default_stream()); state.add_global_memory_reads(chars_size); state.add_global_memory_writes(chars_size); diff --git a/cpp/benchmarks/text/tokenize.cpp b/cpp/benchmarks/text/tokenize.cpp index b556a84c541..2151b28d637 100644 --- a/cpp/benchmarks/text/tokenize.cpp +++ b/cpp/benchmarks/text/tokenize.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -46,7 +46,7 @@ static void bench_tokenize(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - auto chars_size = input.chars_size(); + auto chars_size = input.chars_size(cudf::get_default_stream()); state.add_global_memory_reads(chars_size); state.add_global_memory_writes(chars_size); diff --git a/cpp/benchmarks/text/vocab.cpp b/cpp/benchmarks/text/vocab.cpp index 80942e2697d..770519294ad 100644 --- a/cpp/benchmarks/text/vocab.cpp +++ b/cpp/benchmarks/text/vocab.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,6 +31,7 @@ static void bench_vocab_tokenize(nvbench::state& state) { + auto const stream = cudf::get_default_stream(); auto const num_rows = static_cast(state.get_int64("num_rows")); auto const row_width = static_cast(state.get_int64("row_width")); @@ -63,16 +64,16 @@ static void bench_vocab_tokenize(nvbench::state& state) }(); auto const vocab = nvtext::load_vocabulary(cudf::strings_column_view(vocab_col->view())); - auto token_count = [input] { + auto token_count = [input, stream] { auto const counts = nvtext::count_tokens(input); auto const agg = cudf::make_sum_aggregation(); auto const count = cudf::reduce(counts->view(), *agg, counts->type()); - return static_cast*>(count.get()) - ->value(cudf::get_default_stream()); + return static_cast*>(count.get())->value(stream); }(); - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - auto chars_size = input.chars_size() + cudf::strings_column_view(vocab_col->view()).chars_size(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + auto chars_size = + input.chars_size(stream) + cudf::strings_column_view(vocab_col->view()).chars_size(stream); 
state.add_global_memory_reads(chars_size); state.add_global_memory_writes(token_count); diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu index 3f985cffb1f..161328ae088 100644 --- a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu +++ b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -58,7 +58,7 @@ constexpr int block_size = 256; // This is for NO_DISPATCHING template -__global__ void no_dispatching_kernel(T** A, cudf::size_type n_rows, cudf::size_type n_cols) +CUDF_KERNEL void no_dispatching_kernel(T** A, cudf::size_type n_rows, cudf::size_type n_cols) { using F = Functor; cudf::size_type index = blockIdx.x * blockDim.x + threadIdx.x; @@ -72,7 +72,7 @@ __global__ void no_dispatching_kernel(T** A, cudf::size_type n_rows, cudf::size_ // This is for HOST_DISPATCHING template -__global__ void host_dispatching_kernel(cudf::mutable_column_device_view source_column) +CUDF_KERNEL void host_dispatching_kernel(cudf::mutable_column_device_view source_column) { using F = Functor; T* A = source_column.data(); @@ -124,7 +124,7 @@ struct RowHandle { // This is for DEVICE_DISPATCHING template -__global__ void device_dispatching_kernel(cudf::mutable_table_device_view source) +CUDF_KERNEL void device_dispatching_kernel(cudf::mutable_table_device_view source) { cudf::size_type const n_rows = source.num_rows(); cudf::size_type index = threadIdx.x + blockIdx.x * blockDim.x; diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md index fc2f72de33c..c38151d7518 100644 --- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md +++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md @@ -1197,17 +1197,15 @@ This is related to 
[Arrow's "Variable-Size List" memory layout](https://arrow.ap ## Strings columns -Strings are represented in much the same way as lists, except that the data child column is always -a non-nullable column of `INT8` data. The parent column's type is `STRING` and contains no data, +Strings are represented as a column with a data device buffer and a child offsets column. +The parent column's type is `STRING` and its data holds all the characters across all the strings packed together but its size represents the number of strings in the column, and its null mask represents the validity of each string. To summarize, the strings column children are: 1. A non-nullable column of [`size_type`](#cudfsize_type) elements that indicates the offset to the beginning of each - string in a dense column of all characters. -2. A non-nullable column of `INT8` elements of all the characters across all the strings packed - together. + string in a dense data buffer of all characters. -With this representation, `characters[offsets[i]]` is the first character of string `i`, and the +With this representation, `data[offsets[i]]` is the first character of string `i`, and the size of string `i` is given by `offsets[i+1] - offsets[i]`. The following image shows an example of this compound column representation of strings. diff --git a/cpp/doxygen/developer_guide/TESTING.md b/cpp/doxygen/developer_guide/TESTING.md index c19976a956b..a4ffe0f575b 100644 --- a/cpp/doxygen/developer_guide/TESTING.md +++ b/cpp/doxygen/developer_guide/TESTING.md @@ -464,9 +464,9 @@ the host (`to_host`). 
### Background -libcudf employs a custom-built [preload library -docs](https://man7.org/linux/man-pages/man8/ld.so.8.html) to validate its internal stream usage (the -code may be found +libcudf employs a custom-built [preload +library](https://man7.org/linux/man-pages/man8/ld.so.8.html) to validate its internal stream usage +(the code may be found [`here`](https://github.com/rapidsai/cudf/blob/main/cpp/tests/utilities/identify_stream_usage.cpp)). This library wraps every asynchronous CUDA runtime API call that accepts a stream with a check to ensure that the passed CUDA stream is a valid one, immediately throwing an exception if an invalid diff --git a/cpp/doxygen/developer_guide/strings.png b/cpp/doxygen/developer_guide/strings.png index 85ffef283b6..1d18ea8a407 100644 Binary files a/cpp/doxygen/developer_guide/strings.png and b/cpp/doxygen/developer_guide/strings.png differ diff --git a/cpp/examples/basic/src/process_csv.cpp b/cpp/examples/basic/src/process_csv.cpp index edd14d9ee5f..0d2b6b099ac 100644 --- a/cpp/examples/basic/src/process_csv.cpp +++ b/cpp/examples/basic/src/process_csv.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -82,7 +83,7 @@ int main(int argc, char** argv) // Construct a memory pool using the CUDA memory resource // Using a memory pool for device memory allocations is important for good performance in libcudf. // The pool defaults to allocating half of the available GPU memory. 
- rmm::mr::pool_memory_resource mr{&cuda_mr}; + rmm::mr::pool_memory_resource mr{&cuda_mr, rmm::percent_of_free_device_memory(50)}; // Set the pool resource to be used by default for all device memory allocations // Note: It is the user's responsibility to ensure the `mr` object stays alive for the duration of diff --git a/cpp/examples/nested_types/deduplication.cpp b/cpp/examples/nested_types/deduplication.cpp index 5969985cc72..c7c54592b70 100644 --- a/cpp/examples/nested_types/deduplication.cpp +++ b/cpp/examples/nested_types/deduplication.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,7 @@ #include #include +#include #include #include #include @@ -57,7 +58,10 @@ std::shared_ptr create_memory_resource(bool pool) { auto cuda_mr = std::make_shared(); - if (pool) { return rmm::mr::make_owning_wrapper(cuda_mr); } + if (pool) { + return rmm::mr::make_owning_wrapper( + cuda_mr, rmm::percent_of_free_device_memory(50)); + } return cuda_mr; } diff --git a/cpp/examples/strings/common.hpp b/cpp/examples/strings/common.hpp index 2fd9daf9339..0dbe6fe2b7b 100644 --- a/cpp/examples/strings/common.hpp +++ b/cpp/examples/strings/common.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -60,7 +61,8 @@ auto make_cuda_mr() { return std::make_shared(); */ auto make_pool_mr() { - return rmm::mr::make_owning_wrapper(make_cuda_mr()); + return rmm::mr::make_owning_wrapper( + make_cuda_mr(), rmm::percent_of_free_device_memory(50)); } /** diff --git a/cpp/examples/strings/custom_optimized.cu b/cpp/examples/strings/custom_optimized.cu index 36521871ad8..522093bc647 100644 --- a/cpp/examples/strings/custom_optimized.cu +++ b/cpp/examples/strings/custom_optimized.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -37,9 +37,9 @@ * @param d_visibilities Column of visibilities * @param d_sizes Output sizes for each row */ -__global__ void sizes_kernel(cudf::column_device_view const d_names, - cudf::column_device_view const d_visibilities, - cudf::size_type* d_sizes) +__global__ static void sizes_kernel(cudf::column_device_view const d_names, + cudf::column_device_view const d_visibilities, + cudf::size_type* d_sizes) { // The row index is resolved from the CUDA thread/block objects auto index = threadIdx.x + blockIdx.x * blockDim.x; @@ -74,10 +74,10 @@ __global__ void sizes_kernel(cudf::column_device_view const d_names, * @param d_offsets Byte offset in `d_chars` for each row * @param d_chars Output memory for all rows */ -__global__ void redact_kernel(cudf::column_device_view const d_names, - cudf::column_device_view const d_visibilities, - cudf::size_type const* d_offsets, - char* d_chars) +__global__ static void redact_kernel(cudf::column_device_view const d_names, + cudf::column_device_view const d_visibilities, + cudf::size_type const* d_offsets, + char* d_chars) { // The row index is resolved from the CUDA thread/block objects auto index = threadIdx.x + blockIdx.x * blockDim.x; 
diff --git a/cpp/examples/strings/custom_prealloc.cu b/cpp/examples/strings/custom_prealloc.cu index 0af4c47e947..93194899fe1 100644 --- a/cpp/examples/strings/custom_prealloc.cu +++ b/cpp/examples/strings/custom_prealloc.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -37,12 +37,12 @@ * @param d_offsets Byte offset in `d_chars` for each row * @param d_output Output array of string_view objects */ -__global__ void redact_kernel(cudf::column_device_view const d_names, - cudf::column_device_view const d_visibilities, - cudf::string_view redaction, - char* working_memory, - cudf::size_type const* d_offsets, - cudf::string_view* d_output) +__global__ static void redact_kernel(cudf::column_device_view const d_names, + cudf::column_device_view const d_visibilities, + cudf::string_view redaction, + char* working_memory, + cudf::size_type const* d_offsets, + cudf::string_view* d_output) { // The row index is resolved from the CUDA thread/block objects auto index = threadIdx.x + blockIdx.x * blockDim.x; @@ -101,7 +101,7 @@ std::unique_ptr redact_strings(cudf::column_view const& names, auto const offsets = scv.offsets_begin(); // create working memory to hold the output of each string - auto working_memory = rmm::device_uvector(scv.chars_size(), stream); + auto working_memory = rmm::device_uvector(scv.chars_size(stream), stream); // create a vector for the output strings' pointers auto str_ptrs = rmm::device_uvector(names.size(), stream); diff --git a/cpp/examples/strings/custom_with_malloc.cu b/cpp/examples/strings/custom_with_malloc.cu index 32f7bf7cbd0..e02fb52cd76 100644 --- a/cpp/examples/strings/custom_with_malloc.cu +++ b/cpp/examples/strings/custom_with_malloc.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. 
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -64,10 +64,10 @@ void set_malloc_heap_size(size_t heap_size = 1073741824) // 1GB * @param redaction Redacted string replacement * @param d_output Output array of string_view objects */ -__global__ void redact_kernel(cudf::column_device_view const d_names, - cudf::column_device_view const d_visibilities, - cudf::string_view redaction, - cudf::string_view* d_output) +__global__ static void redact_kernel(cudf::column_device_view const d_names, + cudf::column_device_view const d_visibilities, + cudf::string_view redaction, + cudf::string_view* d_output) { // The row index is resolved from the CUDA thread/block objects auto index = threadIdx.x + blockIdx.x * blockDim.x; @@ -107,7 +107,9 @@ __global__ void redact_kernel(cudf::column_device_view const d_names, * @param redaction Redacted string replacement (not to be freed) * @param d_output Output array of string_view objects to free */ -__global__ void free_kernel(cudf::string_view redaction, cudf::string_view* d_output, int count) +__global__ static void free_kernel(cudf::string_view redaction, + cudf::string_view* d_output, + int count) { auto index = threadIdx.x + blockIdx.x * blockDim.x; if (index >= count) return; diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index daee443a5f3..19722d127cb 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -445,7 +445,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { __device__ T element(size_type element_index) const noexcept { size_type index = element_index + offset(); // account for this view's _offset - char const* d_strings = d_children[strings_column_view::chars_column_index].data(); + char const* d_strings = static_cast(_data); auto const offsets = d_children[strings_column_view::offsets_column_index]; auto const itr = cudf::detail::input_offsetalator(offsets.head(), offsets.type()); auto const offset = itr[index]; diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp index ce5772dcf3c..a6167d983c5 100644 --- a/cpp/include/cudf/column/column_factories.hpp +++ b/cpp/include/cudf/column/column_factories.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -462,9 +462,31 @@ std::unique_ptr make_strings_column( * nulls is used for interpreting this bitmask. * @return Constructed strings column */ +[[deprecated]] std::unique_ptr make_strings_column(size_type num_strings, + std::unique_ptr offsets_column, + std::unique_ptr chars_column, + size_type null_count, + rmm::device_buffer&& null_mask); +/** + * @brief Construct a STRING type column given offsets column, chars columns, and null mask and null + * count. + * + * The columns and mask are moved into the resulting strings column. + * + * @param num_strings The number of strings the column represents. + * @param offsets_column The column of offset values for this column. The number of elements is + * one more than the total number of strings so the `offset[last] - offset[0]` is the total number + * of bytes in the strings vector. + * @param chars_buffer The buffer of char bytes for all the strings for this column. 
Individual + * strings are identified by the offsets and the nullmask. + * @param null_count The number of null string entries. + * @param null_mask The bits specifying the null strings in device memory. Arrow format for + * nulls is used for interpreting this bitmask. + * @return Constructed strings column + */ std::unique_ptr make_strings_column(size_type num_strings, std::unique_ptr offsets_column, - std::unique_ptr chars_column, + rmm::device_buffer&& chars_buffer, size_type null_count, rmm::device_buffer&& null_mask); diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index ebe7e052b6d..1d051ea32ff 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -52,10 +52,10 @@ namespace detail { // Compute the count of elements that pass the mask within each block template -__global__ void compute_block_counts(cudf::size_type* __restrict__ block_counts, - cudf::size_type size, - cudf::size_type per_thread, - Filter filter) +CUDF_KERNEL void compute_block_counts(cudf::size_type* __restrict__ block_counts, + cudf::size_type size, + cudf::size_type per_thread, + Filter filter) { int tid = threadIdx.x + per_thread * block_size * blockIdx.x; int count = 0; @@ -96,7 +96,7 @@ __device__ cudf::size_type block_scan_mask(bool mask_true, cudf::size_type& bloc // // Note: `filter` is not run on indices larger than the input column size template -__launch_bounds__(block_size) __global__ +__launch_bounds__(block_size) CUDF_KERNEL void scatter_kernel(cudf::mutable_column_device_view output_view, cudf::size_type* output_null_count, cudf::column_device_view input_view, diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh 
index 04ad1f20196..6162fa5ecf1 100644 --- a/cpp/include/cudf/detail/copy_if_else.cuh +++ b/cpp/include/cudf/detail/copy_if_else.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -37,7 +37,7 @@ template -__launch_bounds__(block_size) __global__ +__launch_bounds__(block_size) CUDF_KERNEL void copy_if_else_kernel(LeftIter lhs, RightIter rhs, Filter filter, diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh index 16e4e7a1297..4bfdaa94c53 100644 --- a/cpp/include/cudf/detail/copy_range.cuh +++ b/cpp/include/cudf/detail/copy_range.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,12 +40,12 @@ template -__global__ void copy_range_kernel(SourceValueIterator source_value_begin, - SourceValidityIterator source_validity_begin, - cudf::mutable_column_device_view target, - cudf::size_type target_begin, - cudf::size_type target_end, - cudf::size_type* __restrict__ const null_count) +CUDF_KERNEL void copy_range_kernel(SourceValueIterator source_value_begin, + SourceValidityIterator source_validity_begin, + cudf::mutable_column_device_view target, + cudf::size_type target_begin, + cudf::size_type target_end, + cudf::size_type* __restrict__ const null_count) { using cudf::detail::warp_size; diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index ae05d4c6954..e57d85f2998 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -61,12 +61,12 @@ namespace detail { * @param count_ptr Pointer to counter of set bits */ template -__global__ void offset_bitmask_binop(Binop op, - device_span destination, - device_span source, - device_span source_begin_bits, - size_type source_size_bits, - size_type* count_ptr) +CUDF_KERNEL void offset_bitmask_binop(Binop op, + device_span destination, + device_span source, + device_span source_begin_bits, + size_type source_size_bits, + size_type* count_ptr) { auto const tid = threadIdx.x + blockIdx.x * blockDim.x; @@ -214,11 +214,11 @@ enum class count_bits_policy : bool { * in each range is updated. */ template -__global__ void subtract_set_bits_range_boundaries_kernel(bitmask_type const* bitmask, - size_type num_ranges, - OffsetIterator first_bit_indices, - OffsetIterator last_bit_indices, - OutputIterator null_counts) +CUDF_KERNEL void subtract_set_bits_range_boundaries_kernel(bitmask_type const* bitmask, + size_type num_ranges, + OffsetIterator first_bit_indices, + OffsetIterator last_bit_indices, + OutputIterator null_counts) { constexpr size_type const word_size_in_bits{detail::size_in_bits()}; diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh index 264302df0e9..86c85ca8d06 100644 --- a/cpp/include/cudf/detail/utilities/cuda.cuh +++ b/cpp/include/cudf/detail/utilities/cuda.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -211,7 +211,7 @@ __device__ inline T round_up_pow2(T number_to_round, T modulus) } template -__global__ void single_thread_kernel(F f) +CUDF_KERNEL void single_thread_kernel(F f) { f(); } diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh index f3f95dad017..d0073177445 100644 --- a/cpp/include/cudf/detail/valid_if.cuh +++ b/cpp/include/cudf/detail/valid_if.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -44,7 +44,7 @@ namespace detail { * @param[out] valid_count The count of set bits in the output bitmask */ template -__global__ void valid_if_kernel( +CUDF_KERNEL void valid_if_kernel( bitmask_type* output, InputIterator begin, size_type size, Predicate p, size_type* valid_count) { constexpr size_type leader_lane{0}; @@ -151,13 +151,13 @@ template -__global__ void valid_if_n_kernel(InputIterator1 begin1, - InputIterator2 begin2, - BinaryPredicate p, - bitmask_type* masks[], - size_type mask_count, - size_type mask_num_bits, - size_type* valid_counts) +CUDF_KERNEL void valid_if_n_kernel(InputIterator1 begin1, + InputIterator2 begin2, + BinaryPredicate p, + bitmask_type* masks[], + size_type mask_count, + size_type mask_num_bits, + size_type* valid_counts) { for (size_type mask_idx = 0; mask_idx < mask_count; mask_idx++) { auto const mask = masks[mask_idx]; diff --git a/cpp/include/cudf/hashing/detail/helper_functions.cuh b/cpp/include/cudf/hashing/detail/helper_functions.cuh index cd58ec5f57d..3489fdeccee 100644 --- a/cpp/include/cudf/hashing/detail/helper_functions.cuh +++ b/cpp/include/cudf/hashing/detail/helper_functions.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023, NVIDIA CORPORATION. + * Copyright (c) 2017-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -130,10 +130,10 @@ __forceinline__ __device__ void store_pair_vectorized(pair_type* __restrict__ co } template -__global__ void init_hashtbl(value_type* __restrict__ const hashtbl_values, - size_type const n, - key_type const key_val, - elem_type const elem_val) +CUDF_KERNEL void init_hashtbl(value_type* __restrict__ const hashtbl_values, + size_type const n, + key_type const key_val, + elem_type const elem_val) { size_type const idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < n) { diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 3ef356bed1b..a3f76817f8a 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -393,6 +393,7 @@ class orc_reader_options_builder { * @endcode * * @param options Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the table in the returned * table_with_metadata. * @@ -400,6 +401,7 @@ class orc_reader_options_builder { */ table_with_metadata read_orc( orc_reader_options const& options, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group @@ -864,8 +866,10 @@ class orc_writer_options_builder { * @endcode * * @param options Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches */ -void write_orc(orc_writer_options const& options); +void write_orc(orc_writer_options const& options, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Builds settings to use for `write_orc_chunked()`. 
@@ -1287,8 +1291,10 @@ class orc_chunked_writer { * @brief Constructor with chunked writer options * * @param[in] options options used to write table + * @param[in] stream CUDA stream used for device memory operations and kernel launches */ - orc_chunked_writer(chunked_orc_writer_options const& options); + orc_chunked_writer(chunked_orc_writer_options const& options, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Writes table to output. diff --git a/cpp/include/cudf/io/orc_metadata.hpp b/cpp/include/cudf/io/orc_metadata.hpp index 25e0c130dff..19d44263d1b 100644 --- a/cpp/include/cudf/io/orc_metadata.hpp +++ b/cpp/include/cudf/io/orc_metadata.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -63,10 +63,12 @@ struct raw_orc_statistics { * @endcode * * @param src_info Dataset source + * @param stream CUDA stream used for device memory operations and kernel launches * * @return Column names and encoded ORC statistics */ -raw_orc_statistics read_raw_orc_statistics(source_info const& src_info); +raw_orc_statistics read_raw_orc_statistics( + source_info const& src_info, rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Monostate type alias for the statistics variant. @@ -207,10 +209,12 @@ struct parsed_orc_statistics { * @ingroup io_readers * * @param src_info Dataset source + * @param stream CUDA stream used for device memory operations and kernel launches * * @return Column names and decoded ORC statistics */ -parsed_orc_statistics read_parsed_orc_statistics(source_info const& src_info); +parsed_orc_statistics read_parsed_orc_statistics( + source_info const& src_info, rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Schema of an ORC column, including the nested columns. 
@@ -368,10 +372,12 @@ class orc_metadata { * @ingroup io_readers * * @param src_info Dataset source + * @param stream CUDA stream used for device memory operations and kernel launches * * @return orc_metadata with ORC schema, number of rows and number of stripes. */ -orc_metadata read_orc_metadata(source_info const& src_info); +orc_metadata read_orc_metadata(source_info const& src_info, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** @} */ // end of group } // namespace io diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index 47a48f2175b..3208a81cd63 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -693,6 +693,11 @@ class column_in_metadata { column_in_metadata& set_output_as_binary(bool binary) noexcept { _output_as_binary = binary; + if (_output_as_binary and children.size() == 1) { + children.emplace_back(); + } else if (!_output_as_binary and children.size() == 2) { + children.pop_back(); + } return *this; } diff --git a/cpp/include/cudf/labeling/label_bins.hpp b/cpp/include/cudf/labeling/label_bins.hpp index 2776f50a939..d8ea262dfe1 100644 --- a/cpp/include/cudf/labeling/label_bins.hpp +++ b/cpp/include/cudf/labeling/label_bins.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -64,6 +64,7 @@ enum class inclusive { YES, NO }; * @param left_inclusive Whether or not the left edge is inclusive. * @param right_edges Value of the right edge of each bin. * @param right_inclusive Whether or not the right edge is inclusive. 
+ * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device. * @return The integer labels of the elements in `input` according to the specified bins. */ @@ -73,6 +74,7 @@ std::unique_ptr label_bins( inclusive left_inclusive, column_view const& right_edges, inclusive right_inclusive, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/strings/detail/copy_if_else.cuh b/cpp/include/cudf/strings/detail/copy_if_else.cuh index 6f0b199ff12..64e14dcc549 100644 --- a/cpp/include/cudf/strings/detail/copy_if_else.cuh +++ b/cpp/include/cudf/strings/detail/copy_if_else.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -109,7 +109,7 @@ std::unique_ptr copy_if_else(StringIterLeft lhs_begin, return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), null_count, std::move(null_mask)); } diff --git a/cpp/include/cudf/strings/detail/copy_range.cuh b/cpp/include/cudf/strings/detail/copy_range.cuh index 5da3addd9a4..567452bac4e 100644 --- a/cpp/include/cudf/strings/detail/copy_range.cuh +++ b/cpp/include/cudf/strings/detail/copy_range.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -205,7 +205,7 @@ std::unique_ptr copy_range(SourceValueIterator source_value_begin, return make_strings_column(target.size(), std::move(p_offsets_column), - std::move(p_chars_column), + std::move(p_chars_column->release().data.release()[0]), null_count, std::move(null_mask)); } diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index e681373e6e0..442155380a2 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -78,11 +78,11 @@ __forceinline__ __device__ uint4 load_uint4(char const* ptr) * @param total_out_strings Number of output strings to be gathered. */ template -__global__ void gather_chars_fn_string_parallel(StringIterator strings_begin, - char* out_chars, - cudf::detail::input_offsetalator const out_offsets, - MapIterator string_indices, - size_type total_out_strings) +CUDF_KERNEL void gather_chars_fn_string_parallel(StringIterator strings_begin, + char* out_chars, + cudf::detail::input_offsetalator const out_offsets, + MapIterator string_indices, + size_type total_out_strings) { constexpr size_t out_datatype_size = sizeof(uint4); constexpr size_t in_datatype_size = sizeof(uint); @@ -160,11 +160,11 @@ __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin, * @param total_out_strings Number of output strings to be gathered. 
*/ template -__global__ void gather_chars_fn_char_parallel(StringIterator strings_begin, - char* out_chars, - cudf::detail::input_offsetalator const out_offsets, - MapIterator string_indices, - size_type total_out_strings) +CUDF_KERNEL void gather_chars_fn_char_parallel(StringIterator strings_begin, + char* out_chars, + cudf::detail::input_offsetalator const out_offsets, + MapIterator string_indices, + size_type total_out_strings) { __shared__ int64_t out_offsets_threadblock[strings_per_threadblock + 1]; @@ -321,7 +321,7 @@ std::unique_ptr gather(strings_column_view const& strings, return make_strings_column(output_count, std::move(out_offsets_column), - std::move(out_chars_column), + std::move(out_chars_column->release().data.release()[0]), 0, // caller sets these rmm::device_buffer{}); } diff --git a/cpp/include/cudf/strings/detail/merge.cuh b/cpp/include/cudf/strings/detail/merge.cuh index aef1fe93792..8049895c3c2 100644 --- a/cpp/include/cudf/strings/detail/merge.cuh +++ b/cpp/include/cudf/strings/detail/merge.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -89,9 +89,8 @@ std::unique_ptr merge(strings_column_view const& lhs, auto d_offsets = offsets_column->view().template data(); // create the chars column - auto chars_column = strings::detail::create_chars_child_column(bytes, stream, mr); - // merge the strings - auto d_chars = chars_column->mutable_view().template data(); + rmm::device_uvector chars(bytes, stream, mr); + auto d_chars = chars.data(); thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), strings_count, @@ -103,11 +102,8 @@ std::unique_ptr merge(strings_column_view const& lhs, memcpy(d_chars + d_offsets[idx], d_str.data(), d_str.size_bytes()); }); - return make_strings_column(strings_count, - std::move(offsets_column), - std::move(chars_column), - null_count, - std::move(null_mask)); + return make_strings_column( + strings_count, std::move(offsets_column), chars.release(), null_count, std::move(null_mask)); } } // namespace detail diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh index de7db4ce47b..fcbdfa619f4 100644 --- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh +++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh @@ -137,7 +137,7 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), null_count, std::move(null_mask)); } @@ -187,13 +187,12 @@ std::unique_ptr make_strings_column(CharIterator chars_begin, [] __device__(auto offset) { return static_cast(offset); })); // build chars column - auto chars_column = strings::detail::create_chars_child_column(bytes, stream, mr); - auto chars_view = chars_column->mutable_view(); - thrust::copy(rmm::exec_policy(stream), chars_begin, chars_end, chars_view.data()); + rmm::device_uvector chars_data(bytes, stream, mr); + thrust::copy(rmm::exec_policy(stream), 
chars_begin, chars_end, chars_data.begin()); return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + chars_data.release(), null_count, std::move(null_mask)); } diff --git a/cpp/include/cudf/strings/detail/utf8.hpp b/cpp/include/cudf/strings/detail/utf8.hpp index e04572535de..5587597cb51 100644 --- a/cpp/include/cudf/strings/detail/utf8.hpp +++ b/cpp/include/cudf/strings/detail/utf8.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,9 +23,6 @@ */ namespace cudf { - -using char_utf8 = uint32_t; ///< UTF-8 characters are 1-4 bytes - namespace strings { namespace detail { diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp index e27d32fceb9..e6546777f3f 100644 --- a/cpp/include/cudf/strings/strings_column_view.hpp +++ b/cpp/include/cudf/strings/strings_column_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,6 +16,7 @@ #pragma once #include +#include /** * @file @@ -58,7 +59,6 @@ class strings_column_view : private column_view { strings_column_view& operator=(strings_column_view&&) = default; static constexpr size_type offsets_column_index{0}; ///< Child index of the offsets column - static constexpr size_type chars_column_index{1}; ///< Child index of the characters column using column_view::has_nulls; using column_view::is_empty; @@ -107,9 +107,11 @@ class strings_column_view : private column_view { * @brief Returns the internal column of chars * * @throw cudf::logic_error if this is an empty column + * @param stream CUDA stream used for device memory operations and kernel launches * @return The chars column */ - [[nodiscard]] column_view chars() const; + [[deprecated]] [[nodiscard]] column_view chars( + rmm::cuda_stream_view stream = cudf::get_default_stream()) const; /** * @brief Returns the number of bytes in the chars child column. @@ -117,9 +119,10 @@ class strings_column_view : private column_view { * This accounts for empty columns but does not reflect a sliced parent column * view (i.e.: non-zero offset or reduced row count). * + * @param stream CUDA stream used for device memory operations and kernel launches * @return Number of bytes in the chars child column */ - [[nodiscard]] size_type chars_size() const noexcept; + [[nodiscard]] size_type chars_size(rmm::cuda_stream_view stream) const noexcept; /** * @brief Return an iterator for the chars child column. @@ -128,11 +131,11 @@ class strings_column_view : private column_view { * The offsets child must be used to properly address the char bytes. * * For example, to access the first character of string `i` (accounting for - * a sliced column offset) use: `chars_begin()[offsets_begin()[i]]`. + * a sliced column offset) use: `chars_begin(stream)[offsets_begin()[i]]`. * * @return Iterator pointing to the first char byte. 
*/ - [[nodiscard]] chars_iterator chars_begin() const; + [[nodiscard]] chars_iterator chars_begin(rmm::cuda_stream_view) const; /** * @brief Return an end iterator for the offsets child column. @@ -140,9 +143,10 @@ class strings_column_view : private column_view { * This does not apply the offset of the parent. * The offsets child must be used to properly address the char bytes. * + * @param stream CUDA stream used for device memory operations and kernel launches * @return Iterator pointing 1 past the last char byte. */ - [[nodiscard]] chars_iterator chars_end() const; + [[nodiscard]] chars_iterator chars_end(rmm::cuda_stream_view stream) const; }; //! Strings column APIs. diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index addab160b6e..baf07fa3db6 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,9 +17,23 @@ #pragma once #ifdef __CUDACC__ +/** + * @brief Indicates that the function or method is usable on host and device + */ #define CUDF_HOST_DEVICE __host__ __device__ +/** + * @brief Indicates that the function is a CUDA kernel + */ +#define CUDF_KERNEL __global__ static #else +/** + * @brief Indicates that the function or method is usable on host and device + */ #define CUDF_HOST_DEVICE +/** + * @brief Indicates that the function is a CUDA kernel + */ +#define CUDF_KERNEL static #endif #include @@ -48,7 +62,6 @@ class mutable_column_view; class string_view; class list_view; class struct_view; - class scalar; // clang-format off @@ -81,6 +94,7 @@ using size_type = int32_t; ///< Row index type for columns and tables using bitmask_type = uint32_t; ///< Bitmask type stored as 32-bit unsigned integer using valid_type = uint8_t; ///< Valid type in host memory using thread_index_type = int64_t; ///< Thread index type in kernels +using char_utf8 = uint32_t; ///< UTF-8 characters are 1-4 bytes /** * @brief Similar to `std::distance` but returns `cudf::size_type` and performs `static_cast` diff --git a/cpp/include/cudf_test/column_utilities.hpp b/cpp/include/cudf_test/column_utilities.hpp index 6231f8207f9..49d5098f823 100644 --- a/cpp/include/cudf_test/column_utilities.hpp +++ b/cpp/include/cudf_test/column_utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -226,15 +226,15 @@ template <> inline std::pair, std::vector> to_host(column_view c) { thrust::host_vector host_data(c.size()); + auto stream = cudf::get_default_stream(); if (c.size() > c.null_count()) { auto const scv = strings_column_view(c); auto const h_chars = cudf::detail::make_std_vector_sync( - cudf::device_span(scv.chars().data(), scv.chars().size()), - cudf::get_default_stream()); + cudf::device_span(scv.chars_begin(stream), scv.chars_size(stream)), stream); auto const h_offsets = cudf::detail::make_std_vector_sync( cudf::device_span(scv.offsets().data() + scv.offset(), scv.size() + 1), - cudf::get_default_stream()); + stream); // build std::string vector from chars and offsets std::transform( diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index abcd89c3035..c4fa4be0f89 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -757,20 +757,21 @@ class strings_column_wrapper : public detail::column_wrapper { strings_column_wrapper(StringsIterator begin, StringsIterator end) : column_wrapper{} { size_type num_strings = std::distance(begin, end); + if (num_strings == 0) { + wrapped = cudf::make_empty_column(cudf::type_id::STRING); + return; + } auto all_valid = thrust::make_constant_iterator(true); auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, all_valid); - auto d_chars = std::make_unique( - cudf::detail::make_device_uvector_sync( - chars, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), - rmm::device_buffer{}, - 0); + auto d_chars = cudf::detail::make_device_uvector_async( + chars, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_offsets = std::make_unique( cudf::detail::make_device_uvector_sync( offsets, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), rmm::device_buffer{}, 0); wrapped = - cudf::make_strings_column(num_strings, std::move(d_offsets), std::move(d_chars), 0, {}); + cudf::make_strings_column(num_strings, std::move(d_offsets), d_chars.release(), 0, {}); } /** @@ -805,23 +806,24 @@ class strings_column_wrapper : public detail::column_wrapper { strings_column_wrapper(StringsIterator begin, StringsIterator end, ValidityIterator v) : column_wrapper{} { - size_type num_strings = std::distance(begin, end); + size_type num_strings = std::distance(begin, end); + if (num_strings == 0) { + wrapped = cudf::make_empty_column(cudf::type_id::STRING); + return; + } auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, v); auto [null_mask, null_count] = detail::make_null_mask_vector(v, v + num_strings); - auto d_chars = std::make_unique( - cudf::detail::make_device_uvector_sync( - chars, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), - rmm::device_buffer{}, - 0); + auto d_chars = cudf::detail::make_device_uvector_async( + chars, 
cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_offsets = std::make_unique( - cudf::detail::make_device_uvector_sync( + cudf::detail::make_device_uvector_async( offsets, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), rmm::device_buffer{}, 0); auto d_bitmask = cudf::detail::make_device_uvector_sync( null_mask, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); wrapped = cudf::make_strings_column( - num_strings, std::move(d_offsets), std::move(d_chars), null_count, d_bitmask.release()); + num_strings, std::move(d_offsets), d_chars.release(), null_count, d_bitmask.release()); } /** diff --git a/cpp/include/cudf_test/print_utilities.cuh b/cpp/include/cudf_test/print_utilities.cuh index 37ffcd401fc..ae6c8cef029 100644 --- a/cpp/include/cudf_test/print_utilities.cuh +++ b/cpp/include/cudf_test/print_utilities.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -103,7 +103,7 @@ CUDF_HOST_DEVICE void print_values(int32_t width, char delimiter, T arg, Ts... a } template -__global__ void print_array_kernel(std::size_t count, int32_t width, char delimiter, Ts... args) +CUDF_KERNEL void print_array_kernel(std::size_t count, int32_t width, char delimiter, Ts... 
args) { if (threadIdx.x == 0 && blockIdx.x == 0) { for (std::size_t i = 0; i < count; i++) { diff --git a/cpp/include/cudf_test/testing_main.hpp b/cpp/include/cudf_test/testing_main.hpp index 12dbb4c7851..88e3088d794 100644 --- a/cpp/include/cudf_test/testing_main.hpp +++ b/cpp/include/cudf_test/testing_main.hpp @@ -21,6 +21,7 @@ #include +#include #include #include #include @@ -43,9 +44,9 @@ inline auto make_managed() { return std::make_shared(make_cuda(), min_alloc); } diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu index 464c15dac9d..73ba15e39f3 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cu +++ b/cpp/src/binaryop/compiled/binary_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -86,11 +86,11 @@ scalar_as_column_view::return_type scalar_as_column_view::operator()(s.validity_data()), static_cast(!s.is_valid(stream)), 0, - {offsets_column->view(), chars_column_v}); + {offsets_column->view()}); return std::pair{col_v, std::move(offsets_column)}; } diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh index 9a50eb0d0ec..d605c877d3f 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cuh +++ b/cpp/src/binaryop/compiled/binary_ops.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -237,7 +237,7 @@ struct binary_op_double_device_dispatcher { * @param f Functor object to call for each element. 
*/ template -__global__ void for_each_kernel(cudf::size_type size, Functor f) +CUDF_KERNEL void for_each_kernel(cudf::size_type size, Functor f) { int tid = threadIdx.x; int blkid = blockIdx.x; diff --git a/cpp/src/binaryop/jit/kernel.cu b/cpp/src/binaryop/jit/kernel.cu index c9cc61a4f34..39735a43474 100644 --- a/cpp/src/binaryop/jit/kernel.cu +++ b/cpp/src/binaryop/jit/kernel.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Copyright 2018-2019 BlazingDB, Inc. * Copyright 2018 Christian Noboa Mardini @@ -43,10 +43,10 @@ struct UserDefinedOp { }; template -__global__ void kernel_v_v(cudf::size_type size, - TypeOut* out_data, - TypeLhs* lhs_data, - TypeRhs* rhs_data) +CUDF_KERNEL void kernel_v_v(cudf::size_type size, + TypeOut* out_data, + TypeLhs* lhs_data, + TypeRhs* rhs_data) { int tid = threadIdx.x; int blkid = blockIdx.x; @@ -62,15 +62,15 @@ __global__ void kernel_v_v(cudf::size_type size, } template -__global__ void kernel_v_v_with_validity(cudf::size_type size, - TypeOut* out_data, - TypeLhs* lhs_data, - TypeRhs* rhs_data, - cudf::bitmask_type* output_mask, - cudf::bitmask_type const* lhs_mask, - cudf::size_type lhs_offset, - cudf::bitmask_type const* rhs_mask, - cudf::size_type rhs_offset) +CUDF_KERNEL void kernel_v_v_with_validity(cudf::size_type size, + TypeOut* out_data, + TypeLhs* lhs_data, + TypeRhs* rhs_data, + cudf::bitmask_type* output_mask, + cudf::bitmask_type const* lhs_mask, + cudf::size_type lhs_offset, + cudf::bitmask_type const* rhs_mask, + cudf::size_type rhs_offset) { int tid = threadIdx.x; int blkid = blockIdx.x; diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 1a1cbb17d15..bb320e4b81a 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -98,11 +98,11 @@ rmm::device_buffer create_null_mask(size_type size, } namespace { -__global__ void set_null_mask_kernel(bitmask_type* __restrict__ destination, - size_type begin_bit, - size_type end_bit, - bool valid, - size_type number_of_mask_words) +CUDF_KERNEL void set_null_mask_kernel(bitmask_type* __restrict__ destination, + size_type begin_bit, + size_type end_bit, + bool valid, + size_type number_of_mask_words) { auto x = destination + word_index(begin_bit); thread_index_type const last_word = word_index(end_bit) - word_index(begin_bit); @@ -190,11 +190,11 @@ namespace { * @param number_of_mask_words The number of `cudf::bitmask_type` words to copy */ // TODO: Also make binops test that uses offset in column_view -__global__ void copy_offset_bitmask(bitmask_type* __restrict__ destination, - bitmask_type const* __restrict__ source, - size_type source_begin_bit, - size_type source_end_bit, - size_type number_of_mask_words) +CUDF_KERNEL void copy_offset_bitmask(bitmask_type* __restrict__ destination, + bitmask_type const* __restrict__ source, + size_type source_begin_bit, + size_type source_end_bit, + size_type number_of_mask_words) { auto const stride = cudf::detail::grid_1d::grid_stride(); for (thread_index_type destination_word_index = grid_1d::global_thread_id(); @@ -260,10 +260,10 @@ namespace { * @param[out] global_count The number of non-zero bits in the specified range */ template -__global__ void count_set_bits_kernel(bitmask_type const* bitmask, - size_type first_bit_index, - size_type last_bit_index, - size_type* global_count) +CUDF_KERNEL void count_set_bits_kernel(bitmask_type const* bitmask, + size_type first_bit_index, + size_type last_bit_index, + size_type* global_count) { constexpr auto const word_size{detail::size_in_bits()}; diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp 
index 75722ede9d2..4d16298c605 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -51,7 +51,9 @@ column_view_base::column_view_base(data_type type, CUDF_EXPECTS(nullptr == data, "EMPTY column should have no data."); CUDF_EXPECTS(nullptr == null_mask, "EMPTY column should have no null mask."); } else if (is_compound(type)) { - CUDF_EXPECTS(nullptr == data, "Compound (parent) columns cannot have data"); + if (type.id() != type_id::STRING) { + CUDF_EXPECTS(nullptr == data, "Compound (parent) columns cannot have data"); + } } else if (size > 0) { CUDF_EXPECTS(nullptr != data, "Null data pointer."); } diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index ddf39e21685..b1d850e0b27 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -111,12 +111,12 @@ auto create_device_views(host_span views, rmm::cuda_stream_vi * @param out_valid_count To hold the total number of valid bits set */ template -__global__ void concatenate_masks_kernel(column_device_view const* views, - size_t const* output_offsets, - size_type number_of_views, - bitmask_type* dest_mask, - size_type number_of_mask_bits, - size_type* out_valid_count) +CUDF_KERNEL void concatenate_masks_kernel(column_device_view const* views, + size_t const* output_offsets, + size_type number_of_views, + bitmask_type* dest_mask, + size_type number_of_mask_bits, + size_type* out_valid_count) { auto tidx = cudf::detail::grid_1d::global_thread_id(); auto const stride = cudf::detail::grid_1d::grid_stride(); @@ -187,11 +187,11 @@ size_type concatenate_masks(host_span views, namespace { template -__global__ void fused_concatenate_kernel(column_device_view const* input_views, - size_t const* input_offsets, - size_type num_input_views, - mutable_column_device_view output_view, - size_type* out_valid_count) +CUDF_KERNEL void fused_concatenate_kernel(column_device_view const* input_views, + size_t const* input_offsets, + size_type num_input_views, + mutable_column_device_view output_view, + size_type* out_valid_count) { auto const output_size = output_view.size(); auto* output_data = output_view.data(); diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index dd4af236ecf..d711f40605a 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -280,9 +280,9 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst, * @param buf_info Information on the range of values to be copied for each destination buffer */ template -__global__ void copy_partitions(IndexToDstBuf index_to_buffer, - uint8_t const** src_bufs, - dst_buf_info* buf_info) +CUDF_KERNEL void copy_partitions(IndexToDstBuf index_to_buffer, + uint8_t const** src_bufs, + dst_buf_info* buf_info) { auto const buf_index = blockIdx.x; auto const src_buf_index = buf_info[buf_index].src_buf_index; @@ -502,23 +502,34 @@ std::pair buf_info_functor::operator() 0; + + // string columns contain the underlying chars data. + *current = src_buf_info(type_id::STRING, + nullptr, + offset_stack_pos, + // if I have an offsets child, it's index will be my parent_offset_index + has_offsets_child ? ((current + 1) - head) : parent_offset_index, + false, + col.offset()); + + // if I have offsets, I need to include that in the stack size + offset_stack_pos += has_offsets_child ? offset_depth + 1 : offset_depth; current++; - offset_stack_pos += offset_depth; - // string columns don't necessarily have children - if (col.num_children() > 0) { - CUDF_EXPECTS(col.num_children() == 2, "Encountered malformed string column"); + if (has_offsets_child) { + CUDF_EXPECTS(col.num_children() == 1, "Encountered malformed string column"); strings_column_view scv(col); // info for the offsets buffer @@ -539,15 +550,6 @@ std::pair buf_info_functor::operator() build_output_column_metadata( }(); // size/data pointer for the column - auto const col_size = static_cast(current_info->num_elements); - int64_t const data_offset = src.num_children() > 0 || col_size == 0 || src.head() == nullptr - ? -1 - : static_cast(current_info->dst_offset); + auto const col_size = [&]() { + // if I am a string column, I need to use the number of rows from my child offset column. the + // number of rows in my dst_buf_info struct will be equal to the number of chars, which is + // incorrect. 
this is a quirk of how cudf stores strings. + if (src.type().id() == type_id::STRING) { + // if I have no children (no offsets), then I must have a row count of 0 + if (src.num_children() == 0) { return 0; } + + // otherwise my actual number of rows will be the num_rows field of the next dst_buf_info + // struct (our child offsets column) + return (current_info + 1)->num_rows; + } + + // otherwise the number of rows is the number of elements + return static_cast(current_info->num_elements); + }(); + int64_t const data_offset = + col_size == 0 || src.head() == nullptr ? -1 : static_cast(current_info->dst_offset); mb.add_column_info_to_meta( src.type(), col_size, null_count, data_offset, bitmask_offset, src.num_children()); @@ -902,11 +918,19 @@ struct dst_valid_count_output_iterator { */ struct size_of_helper { template - constexpr std::enable_if_t(), int> __device__ operator()() const + constexpr std::enable_if_t() && !std::is_same_v, int> + __device__ operator()() const { return 0; } + template + constexpr std::enable_if_t() && std::is_same_v, int> + __device__ operator()() const + { + return sizeof(cudf::device_storage_type_t); + } + template constexpr std::enable_if_t(), int> __device__ operator()() const noexcept { @@ -1236,7 +1260,7 @@ std::unique_ptr compute_splits( } // final element indices and row count - int const out_element_index = src_info.is_validity ? row_start / 32 : row_start; + int const src_element_index = src_info.is_validity ? row_start / 32 : row_start; int const num_rows = row_end - row_start; // if I am an offsets column, all my values need to be shifted int const value_shift = src_info.offsets == nullptr ? 
0 : src_info.offsets[row_start]; @@ -1259,7 +1283,7 @@ std::unique_ptr compute_splits( num_elements, element_size, num_rows, - out_element_index, + src_element_index, 0, value_shift, bit_shift, diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index 8f326184012..517435503ee 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -50,9 +50,9 @@ namespace detail { namespace { template -__global__ void marking_bitmask_kernel(mutable_column_device_view destination, - MapIterator scatter_map, - size_type num_scatter_rows) +CUDF_KERNEL void marking_bitmask_kernel(mutable_column_device_view destination, + MapIterator scatter_map, + size_type num_scatter_rows) { auto row = cudf::detail::grid_1d::global_thread_id(); auto const stride = cudf::detail::grid_1d::grid_stride(); diff --git a/cpp/src/hash/md5_hash.cu b/cpp/src/hash/md5_hash.cu index 8fc3e63bc59..002c9a9137b 100644 --- a/cpp/src/hash/md5_hash.cu +++ b/cpp/src/hash/md5_hash.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -333,9 +333,8 @@ std::unique_ptr md5(table_view const& input, auto [offsets_column, bytes] = cudf::detail::make_offsets_child_column(begin, begin + input.num_rows(), stream, mr); - auto chars_column = strings::detail::create_chars_child_column(bytes, stream, mr); - auto chars_view = chars_column->mutable_view(); - auto d_chars = chars_view.data(); + rmm::device_uvector chars(bytes, stream, mr); + auto d_chars = chars.data(); auto const device_input = table_device_view::create(input, stream); @@ -366,8 +365,7 @@ std::unique_ptr md5(table_view const& input, } }); - return make_strings_column( - input.num_rows(), std::move(offsets_column), std::move(chars_column), 0, {}); + return make_strings_column(input.num_rows(), std::move(offsets_column), chars.release(), 0, {}); } } // namespace detail diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index e39625c92e7..7b44fb41288 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -290,7 +290,7 @@ std::unique_ptr dispatch_to_cudf_column::operator()( auto const num_rows = offsets_column->size() - 1; auto out_col = make_strings_column(num_rows, std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), array.null_count(), std::move(*get_mask_buffer(array, stream, mr))); diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 3a9fe50d25b..04ca1250ed5 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include #include #include @@ -49,16 +51,16 @@ namespace { * @brief Create arrow data buffer from given cudf column */ template -std::shared_ptr fetch_data_buffer(column_view input_view, +std::shared_ptr fetch_data_buffer(device_span input, arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { - int64_t const data_size_in_bytes = sizeof(T) * input_view.size(); + int64_t const data_size_in_bytes = sizeof(T) * input.size(); auto data_buffer = allocate_arrow_buffer(data_size_in_bytes, ar_mr); CUDF_CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(), - input_view.data(), + input.data(), data_size_in_bytes, cudaMemcpyDefault, stream.value())); @@ -136,11 +138,13 @@ struct dispatch_to_arrow { arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { - return to_arrow_array(id, - static_cast(input_view.size()), - fetch_data_buffer(input_view, ar_mr, stream), - fetch_mask_buffer(input_view, ar_mr, stream), - static_cast(input_view.null_count())); + return to_arrow_array( + id, + static_cast(input_view.size()), + fetch_data_buffer( + device_span(input_view.data(), input_view.size()), ar_mr, stream), + fetch_mask_buffer(input_view, ar_mr, stream), + static_cast(input_view.null_count())); } }; @@ -280,7 +284,7 @@ std::shared_ptr dispatch_to_arrow::operator()( { std::unique_ptr tmp_column = ((input.offset() != 0) or - ((input.num_children() == 2) and (input.child(0).size() - 1 != input.size()))) + ((input.num_children() == 1) and (input.child(0).size() - 1 != input.size()))) ? 
std::make_unique(input, stream) : nullptr; @@ -295,8 +299,13 @@ std::shared_ptr dispatch_to_arrow::operator()( return std::make_shared( 0, std::move(tmp_offset_buffer), std::move(tmp_data_buffer)); } - auto offset_buffer = child_arrays[0]->data()->buffers[1]; - auto data_buffer = child_arrays[1]->data()->buffers[1]; + auto offset_buffer = child_arrays[strings_column_view::offsets_column_index]->data()->buffers[1]; + auto const sview = strings_column_view{input_view}; + auto data_buffer = fetch_data_buffer( + device_span{sview.chars_begin(stream), + static_cast(sview.chars_size(stream))}, + ar_mr, + stream); return std::make_shared(static_cast(input_view.size()), offset_buffer, data_buffer, diff --git a/cpp/src/io/avro/avro_gpu.cu b/cpp/src/io/avro/avro_gpu.cu index 365f6d6875c..59177a68ee7 100644 --- a/cpp/src/io/avro/avro_gpu.cu +++ b/cpp/src/io/avro/avro_gpu.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -324,7 +324,7 @@ avro_decode_row(schemadesc_s const* schema, * @param[in] min_row_size Minimum size in bytes of a row */ // blockDim {32,num_warps,1} -__global__ void __launch_bounds__(num_warps * 32, 2) +CUDF_KERNEL void __launch_bounds__(num_warps * 32, 2) gpuDecodeAvroColumnData(device_span blocks, schemadesc_s* schema_g, device_span global_dictionary, diff --git a/cpp/src/io/comp/debrotli.cu b/cpp/src/io/comp/debrotli.cu index 8bafd054bdb..9c936fefd6c 100644 --- a/cpp/src/io/comp/debrotli.cu +++ b/cpp/src/io/comp/debrotli.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -1911,7 +1911,7 @@ static __device__ void ProcessCommands(debrotli_state_s* s, brotli_dictionary_s * @param scratch_size Size of scratch heap space (smaller sizes may result in serialization between * blocks) */ -__global__ void __launch_bounds__(block_size, 2) +CUDF_KERNEL void __launch_bounds__(block_size, 2) gpu_debrotli_kernel(device_span const> inputs, device_span const> outputs, device_span results, diff --git a/cpp/src/io/comp/gpuinflate.cu b/cpp/src/io/comp/gpuinflate.cu index 8993815e560..cd50545afbd 100644 --- a/cpp/src/io/comp/gpuinflate.cu +++ b/cpp/src/io/comp/gpuinflate.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -1024,7 +1024,7 @@ __device__ int parse_gzip_header(uint8_t const* src, size_t src_size) * @param parse_hdr If nonzero, indicates that the compressed bitstream includes a GZIP header */ template -__global__ void __launch_bounds__(block_size) +CUDF_KERNEL void __launch_bounds__(block_size) inflate_kernel(device_span const> inputs, device_span const> outputs, device_span results, @@ -1152,7 +1152,7 @@ __global__ void __launch_bounds__(block_size) * * @param inputs Source and destination information per block */ -__global__ void __launch_bounds__(1024) +CUDF_KERNEL void __launch_bounds__(1024) copy_uncompressed_kernel(device_span const> inputs, device_span const> outputs) { diff --git a/cpp/src/io/comp/snap.cu b/cpp/src/io/comp/snap.cu index 0428f4edaf2..a45e8b2083b 100644 --- a/cpp/src/io/comp/snap.cu +++ b/cpp/src/io/comp/snap.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -257,7 +257,7 @@ static __device__ uint32_t Match60(uint8_t const* src1, * @param[out] outputs Compression status per block * @param[in] count Number of blocks to compress */ -__global__ void __launch_bounds__(128) +CUDF_KERNEL void __launch_bounds__(128) snap_kernel(device_span const> inputs, device_span const> outputs, device_span results) diff --git a/cpp/src/io/comp/unsnap.cu b/cpp/src/io/comp/unsnap.cu index 504a2fe377c..46555a97e9c 100644 --- a/cpp/src/io/comp/unsnap.cu +++ b/cpp/src/io/comp/unsnap.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -628,7 +628,7 @@ __device__ void snappy_process_symbols(unsnap_state_s* s, int t, Storage& temp_s * @param[out] outputs Decompression status per block */ template -__global__ void __launch_bounds__(block_size) +CUDF_KERNEL void __launch_bounds__(block_size) unsnap_kernel(device_span const> inputs, device_span const> outputs, device_span results) diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu index 248e17669bc..8252cccbdb9 100644 --- a/cpp/src/io/csv/csv_gpu.cu +++ b/cpp/src/io/csv/csv_gpu.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -168,7 +168,7 @@ __device__ __inline__ bool is_floatingpoint(long len, * @param row_offsets The start the CSV data of interest * @param d_column_data The count for each column data type */ -__global__ void __launch_bounds__(csvparse_block_dim) +CUDF_KERNEL void __launch_bounds__(csvparse_block_dim) data_type_detection(parse_options_view const opts, device_span csv_text, device_span const column_flags, @@ -305,7 +305,7 @@ __global__ void __launch_bounds__(csvparse_block_dim) * @param[out] valids The bitmaps indicating whether column fields are valid * @param[out] valid_counts The number of valid fields in each column */ -__global__ void __launch_bounds__(csvparse_block_dim) +CUDF_KERNEL void __launch_bounds__(csvparse_block_dim) convert_csv_to_cudf(cudf::io::parse_options_view options, device_span data, device_span column_flags, @@ -622,7 +622,7 @@ static inline __device__ rowctx32_t rowctx_inverse_merge_transform(uint64_t ctxt * @param escapechar Delimiter escape character * @param commentchar Comment line character (skip rows starting with this character) */ -__global__ void __launch_bounds__(rowofs_block_dim) +CUDF_KERNEL void __launch_bounds__(rowofs_block_dim) gather_row_offsets_gpu(uint64_t* row_ctx, device_span offsets_out, device_span const data, diff --git a/cpp/src/io/csv/durations.cu b/cpp/src/io/csv/durations.cu index 66143d3fdee..f4d32edac89 100644 --- a/cpp/src/io/csv/durations.cu +++ b/cpp/src/io/csv/durations.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -202,7 +202,7 @@ struct dispatch_from_durations_fn { // return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), durations.null_count(), std::move(null_mask)); } diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index aad761acdba..65473073e31 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -181,11 +181,12 @@ struct column_to_strings_fn { auto d_column = column_device_view::create(column_v, stream_); escape_strings_fn fn{*d_column, delimiter.value(stream_)}; - auto children = cudf::strings::detail::make_strings_children(fn, column_v.size(), stream_, mr_); + auto [offsets_column, chars_column] = + cudf::strings::detail::make_strings_children(fn, column_v.size(), stream_, mr_); return make_strings_column(column_v.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), column_v.null_count(), cudf::detail::copy_bitmask(column_v, stream_, mr_)); } @@ -377,8 +378,8 @@ void write_chunked(data_sink* out_sink, rmm::mr::get_current_device_resource()); strings_column_view strings_column{p_str_col_w_nl->view()}; - auto total_num_bytes = strings_column.chars_size(); - char const* ptr_all_bytes = strings_column.chars_begin(); + auto total_num_bytes = strings_column.chars_size(stream); + char const* ptr_all_bytes = strings_column.chars_begin(stream); if (out_sink->is_device_write_preferred(total_num_bytes)) { // Direct write from device memory diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh index f867a95a864..9bb087e788d 100644 --- 
a/cpp/src/io/fst/agent_dfa.cuh +++ b/cpp/src/io/fst/agent_dfa.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -493,7 +493,7 @@ template -__launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) __global__ +__launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL void SimulateDFAKernel(DfaT dfa, SymbolItT d_chars, OffsetT const num_chars, diff --git a/cpp/src/io/fst/dispatch_dfa.cuh b/cpp/src/io/fst/dispatch_dfa.cuh index a5c1a4f4f5c..be63ec6539f 100644 --- a/cpp/src/io/fst/dispatch_dfa.cuh +++ b/cpp/src/io/fst/dispatch_dfa.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -67,7 +67,7 @@ struct DeviceFSMPolicy { * @return */ template -__global__ void initialization_pass_kernel(TileState items_state, uint32_t num_tiles) +CUDF_KERNEL void initialization_pass_kernel(TileState items_state, uint32_t num_tiles) { items_state.InitializeStatus(num_tiles); } diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index e5489963618..42f2fd02d52 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -269,9 +269,9 @@ void write_csv(csv_writer_options const& options, mr); } -raw_orc_statistics read_raw_orc_statistics(source_info const& src_info) +raw_orc_statistics read_raw_orc_statistics(source_info const& src_info, + rmm::cuda_stream_view stream) { - auto stream = cudf::get_default_stream(); // Get source to read statistics from std::unique_ptr source; if (src_info.type() == io_type::FILEPATH) { @@ -342,9 +342,10 @@ column_statistics::column_statistics(orc::column_statistics&& cs) } } -parsed_orc_statistics read_parsed_orc_statistics(source_info const& src_info) +parsed_orc_statistics read_parsed_orc_statistics(source_info const& src_info, + rmm::cuda_stream_view stream) { - auto const raw_stats = read_raw_orc_statistics(src_info); + auto const raw_stats = read_raw_orc_statistics(src_info, stream); parsed_orc_statistics result; result.column_names = raw_stats.column_names; @@ -395,12 +396,12 @@ orc_column_schema make_orc_column_schema(host_span orc_sc } }; // namespace -orc_metadata read_orc_metadata(source_info const& src_info) +orc_metadata read_orc_metadata(source_info const& src_info, rmm::cuda_stream_view stream) { auto sources = make_datasources(src_info); CUDF_EXPECTS(sources.size() == 1, "Only a single source is currently supported."); - auto const footer = orc::metadata(sources.front().get(), cudf::detail::default_stream_value).ff; + auto const footer = orc::metadata(sources.front().get(), stream).ff; return {{make_orc_column_schema(footer.types, 0, "")}, static_cast(footer.numberOfRows), @@ -410,21 +411,21 
@@ orc_metadata read_orc_metadata(source_info const& src_info) /** * @copydoc cudf::io::read_orc */ -table_with_metadata read_orc(orc_reader_options const& options, rmm::mr::device_memory_resource* mr) +table_with_metadata read_orc(orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); auto datasources = make_datasources(options.get_source()); - auto reader = std::make_unique( - std::move(datasources), options, cudf::get_default_stream(), mr); - + auto reader = std::make_unique(std::move(datasources), options, stream, mr); return reader->read(options); } /** * @copydoc cudf::io::write_orc */ -void write_orc(orc_writer_options const& options) +void write_orc(orc_writer_options const& options, rmm::cuda_stream_view stream) { namespace io_detail = cudf::io::detail; @@ -434,8 +435,7 @@ void write_orc(orc_writer_options const& options) CUDF_EXPECTS(sinks.size() == 1, "Multiple sinks not supported for ORC writing"); auto writer = std::make_unique( - std::move(sinks[0]), options, io_detail::single_write_mode::YES, cudf::get_default_stream()); - + std::move(sinks[0]), options, io_detail::single_write_mode::YES, stream); try { writer->write(options.get_table()); } catch (...) 
{ @@ -451,7 +451,8 @@ void write_orc(orc_writer_options const& options) /** * @copydoc cudf::io::orc_chunked_writer::orc_chunked_writer */ -orc_chunked_writer::orc_chunked_writer(chunked_orc_writer_options const& options) +orc_chunked_writer::orc_chunked_writer(chunked_orc_writer_options const& options, + rmm::cuda_stream_view stream) { namespace io_detail = cudf::io::detail; @@ -459,7 +460,7 @@ orc_chunked_writer::orc_chunked_writer(chunked_orc_writer_options const& options CUDF_EXPECTS(sinks.size() == 1, "Multiple sinks not supported for ORC writing"); writer = std::make_unique( - std::move(sinks[0]), options, io_detail::single_write_mode::NO, cudf::get_default_stream()); + std::move(sinks[0]), options, io_detail::single_write_mode::NO, stream); } /** diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 056cce18a52..f1296daca26 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -363,7 +363,7 @@ std::vector copy_strings_to_host(device_span input, if (col.is_empty()) return std::vector{}; auto const scv = cudf::strings_column_view(col); auto const h_chars = cudf::detail::make_std_vector_sync( - cudf::device_span(scv.chars().data(), scv.chars().size()), stream); + cudf::device_span(scv.chars_begin(stream), scv.chars_size(stream)), stream); auto const h_offsets = cudf::detail::make_std_vector_sync( cudf::device_span(scv.offsets().data() + scv.offset(), scv.size() + 1), diff --git a/cpp/src/io/json/legacy/json_gpu.cu b/cpp/src/io/json/legacy/json_gpu.cu index b358cc2071b..4d5293e12fd 100644 --- a/cpp/src/io/json/legacy/json_gpu.cu +++ b/cpp/src/io/json/legacy/json_gpu.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. 
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -245,14 +245,14 @@ __device__ std::pair get_row_data_range( * @param[out] valid_fields The bitmaps indicating whether column fields are valid * @param[out] num_valid_fields The numbers of valid fields in columns */ -__global__ void convert_data_to_columns_kernel(parse_options_view opts, - device_span const data, - device_span const row_offsets, - device_span const column_types, - col_map_type col_map, - device_span const output_columns, - device_span const valid_fields, - device_span const num_valid_fields) +CUDF_KERNEL void convert_data_to_columns_kernel(parse_options_view opts, + device_span const data, + device_span const row_offsets, + device_span const column_types, + col_map_type col_map, + device_span const output_columns, + device_span const valid_fields, + device_span const num_valid_fields) { auto const rec_id = grid_1d::global_thread_id(); if (rec_id >= row_offsets.size()) return; @@ -321,7 +321,7 @@ __global__ void convert_data_to_columns_kernel(parse_options_view opts, * @param[in] num_columns The number of columns of input data * @param[out] column_infos The count for each column data type */ -__global__ void detect_data_types_kernel( +CUDF_KERNEL void detect_data_types_kernel( parse_options_view const opts, device_span const data, device_span const row_offsets, @@ -481,11 +481,11 @@ __device__ key_value_range get_next_key_value_range(char const* begin, * @param[out] keys_cnt Number of keys found in the file * @param[out] keys_info optional, information (offset, length, hash) for each found key */ -__global__ void collect_keys_info_kernel(parse_options_view const options, - device_span const data, - device_span const row_offsets, - unsigned long long int* keys_cnt, - thrust::optional keys_info) +CUDF_KERNEL void collect_keys_info_kernel(parse_options_view const 
options, + device_span const data, + device_span const row_offsets, + unsigned long long int* keys_cnt, + thrust::optional keys_info) { auto const rec_id = grid_1d::global_thread_id(); if (rec_id >= row_offsets.size()) return; diff --git a/cpp/src/io/json/legacy/reader_impl.cu b/cpp/src/io/json/legacy/reader_impl.cu index 5580628b0fe..d461f27c921 100644 --- a/cpp/src/io/json/legacy/reader_impl.cu +++ b/cpp/src/io/json/legacy/reader_impl.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -530,29 +530,27 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, auto repl_chars = std::vector{'"', '\\', '\t', '\r', '\b'}; auto repl_offsets = std::vector{0, 1, 2, 3, 4, 5}; - auto target = make_strings_column( - static_cast(target_offsets.size() - 1), - std::make_unique( - cudf::detail::make_device_uvector_async( - target_offsets, stream, rmm::mr::get_current_device_resource()), - rmm::device_buffer{}, - 0), - std::make_unique(cudf::detail::make_device_uvector_async( - target_chars, stream, rmm::mr::get_current_device_resource()), - rmm::device_buffer{}, - 0), - 0, - {}); + auto target = + make_strings_column(static_cast(target_offsets.size() - 1), + std::make_unique( + cudf::detail::make_device_uvector_async( + target_offsets, stream, rmm::mr::get_current_device_resource()), + rmm::device_buffer{}, + 0), + cudf::detail::make_device_uvector_async( + target_chars, stream, rmm::mr::get_current_device_resource()) + .release(), + 0, + {}); auto repl = make_strings_column( static_cast(repl_offsets.size() - 1), std::make_unique(cudf::detail::make_device_uvector_async( repl_offsets, stream, rmm::mr::get_current_device_resource()), rmm::device_buffer{}, 0), - std::make_unique(cudf::detail::make_device_uvector_async( - repl_chars, stream, 
rmm::mr::get_current_device_resource()), - rmm::device_buffer{}, - 0), + cudf::detail::make_device_uvector_async( + repl_chars, stream, rmm::mr::get_current_device_resource()) + .release(), 0, {}); diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index b2017ee513f..84e0ac9e74d 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -170,12 +170,12 @@ struct escape_strings_fn { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto children = + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children(*this, column_v.size(), stream, mr); return make_strings_column(column_v.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), column_v.null_count(), cudf::detail::copy_bitmask(column_v, stream, mr)); } @@ -347,10 +347,11 @@ std::unique_ptr struct_to_strings(table_view const& strings_columns, d_strview_offsets + row_string_offsets.size(), old_offsets.begin(), row_string_offsets.begin()); + auto chars_data = joined_col->release().data; return make_strings_column( strings_columns.num_rows(), std::make_unique(std::move(row_string_offsets), rmm::device_buffer{}, 0), - std::move(joined_col->release().children[strings_column_view::chars_column_index]), + std::move(chars_data.release()[0]), 0, {}); } @@ -469,10 +470,11 @@ std::unique_ptr join_list_of_strings(lists_column_view const& lists_stri d_strview_offsets.end(), old_offsets.begin(), row_string_offsets.begin()); + auto chars_data = joined_col->release().data; return make_strings_column( num_lists, std::make_unique(std::move(row_string_offsets), rmm::device_buffer{}, 0), - 
std::move(joined_col->release().children[strings_column_view::chars_column_index]), + std::move(chars_data.release()[0]), lists_strings.null_count(), cudf::detail::copy_bitmask(lists_strings.parent(), stream, mr)); } @@ -774,11 +776,7 @@ std::unique_ptr make_strings_column_from_host(host_span(std::move(d_chars), rmm::device_buffer{}, 0), - 0, - {}); + host_strings.size(), std::move(d_offsets), d_chars.release(), 0, {}); } std::unique_ptr make_column_names_column(host_span column_names, @@ -812,8 +810,8 @@ void write_chunked(data_sink* out_sink, CUDF_FUNC_RANGE(); CUDF_EXPECTS(str_column_view.size() > 0, "Unexpected empty strings column."); - auto const total_num_bytes = str_column_view.chars_size() - skip_last_chars; - char const* ptr_all_bytes = str_column_view.chars_begin(); + auto const total_num_bytes = str_column_view.chars_size(stream) - skip_last_chars; + char const* ptr_all_bytes = str_column_view.chars_begin(stream); if (out_sink->is_device_write_preferred(total_num_bytes)) { // Direct write from device memory diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp index 2e5eeab7298..8cae1ff5309 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.cpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,7 +20,6 @@ #include #include -#include namespace cudf::io::orc::detail { @@ -220,27 +219,25 @@ aggregate_orc_metadata::select_stripes( } // Read each stripe's stripefooter metadata - if (not selected_stripes_mapping.empty()) { - for (auto& mapping : selected_stripes_mapping) { - // Resize to all stripe_info for the source level - per_file_metadata[mapping.source_idx].stripefooters.resize(mapping.stripe_info.size()); - - for (size_t i = 0; i < mapping.stripe_info.size(); i++) { - auto const stripe = mapping.stripe_info[i].first; - auto const sf_comp_offset = stripe->offset + stripe->indexLength + stripe->dataLength; - auto const sf_comp_length = stripe->footerLength; - CUDF_EXPECTS( - sf_comp_offset + sf_comp_length < per_file_metadata[mapping.source_idx].source->size(), - "Invalid stripe information"); - auto const buffer = - per_file_metadata[mapping.source_idx].source->host_read(sf_comp_offset, sf_comp_length); - auto sf_data = per_file_metadata[mapping.source_idx].decompressor->decompress_blocks( - {buffer->data(), buffer->size()}, stream); - ProtobufReader(sf_data.data(), sf_data.size()) - .read(per_file_metadata[mapping.source_idx].stripefooters[i]); - mapping.stripe_info[i].second = &per_file_metadata[mapping.source_idx].stripefooters[i]; - if (stripe->indexLength == 0) { row_grp_idx_present = false; } - } + for (auto& mapping : selected_stripes_mapping) { + // Resize to all stripe_info for the source level + per_file_metadata[mapping.source_idx].stripefooters.resize(mapping.stripe_info.size()); + + for (size_t i = 0; i < mapping.stripe_info.size(); i++) { + auto const stripe = mapping.stripe_info[i].first; + auto const sf_comp_offset = stripe->offset + stripe->indexLength + stripe->dataLength; + auto const sf_comp_length = stripe->footerLength; + CUDF_EXPECTS( + sf_comp_offset + sf_comp_length < per_file_metadata[mapping.source_idx].source->size(), + "Invalid stripe information"); + auto const buffer = + 
per_file_metadata[mapping.source_idx].source->host_read(sf_comp_offset, sf_comp_length); + auto sf_data = per_file_metadata[mapping.source_idx].decompressor->decompress_blocks( + {buffer->data(), buffer->size()}, stream); + ProtobufReader(sf_data.data(), sf_data.size()) + .read(per_file_metadata[mapping.source_idx].stripefooters[i]); + mapping.stripe_info[i].second = &per_file_metadata[mapping.source_idx].stripefooters[i]; + if (stripe->indexLength == 0) { row_grp_idx_present = false; } } } @@ -270,7 +267,7 @@ column_hierarchy aggregate_orc_metadata::select_columns( CUDF_EXPECTS(name_found, "Unknown column name: " + std::string(path)); } } - return {std::move(selected_columns)}; + return column_hierarchy{std::move(selected_columns)}; } } // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/aggregate_orc_metadata.hpp b/cpp/src/io/orc/aggregate_orc_metadata.hpp index 587684ccc0d..f05946a4346 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.hpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ * limitations under the License. */ +#pragma once + #include "orc.hpp" #include @@ -33,8 +35,8 @@ struct column_hierarchy { // Each element contains column at the given nesting level std::vector> levels; - column_hierarchy(nesting_map child_map); - auto num_levels() const { return levels.size(); } + explicit column_hierarchy(nesting_map child_map); + [[nodiscard]] auto num_levels() const { return levels.size(); } }; /** @@ -50,11 +52,6 @@ class aggregate_orc_metadata { */ [[nodiscard]] int64_t calc_num_rows() const; - /** - * @brief Number of columns in a ORC file. 
- */ - [[nodiscard]] size_type calc_num_cols() const; - /** * @brief Sums up the number of stripes of each source */ @@ -69,22 +66,23 @@ class aggregate_orc_metadata { aggregate_orc_metadata(std::vector> const& sources, rmm::cuda_stream_view stream); - [[nodiscard]] auto const& get_schema(int schema_idx) const + [[nodiscard]] auto get_col_type(int col_idx) const { - return per_file_metadata[0].ff.types[schema_idx]; + return per_file_metadata[0].ff.types[col_idx]; } - auto get_col_type(int col_idx) const { return per_file_metadata[0].ff.types[col_idx]; } - [[nodiscard]] auto get_num_rows() const { return num_rows; } - auto get_num_cols() const { return per_file_metadata[0].get_num_columns(); } + [[nodiscard]] auto get_num_cols() const { return per_file_metadata[0].get_num_columns(); } [[nodiscard]] auto get_num_stripes() const { return num_stripes; } [[nodiscard]] auto const& get_types() const { return per_file_metadata[0].ff.types; } - [[nodiscard]] int get_row_index_stride() const { return per_file_metadata[0].ff.rowIndexStride; } + [[nodiscard]] int get_row_index_stride() const + { + return static_cast(per_file_metadata[0].ff.rowIndexStride); + } [[nodiscard]] auto is_row_grp_idx_present() const { return row_grp_idx_present; } @@ -115,11 +113,11 @@ class aggregate_orc_metadata { * * Stripes are potentially selected from multiple files. */ - std::tuple> select_stripes( - std::vector> const& user_specified_stripes, - uint64_t skip_rows, - std::optional const& num_rows, - rmm::cuda_stream_view stream); + [[nodiscard]] std::tuple> + select_stripes(std::vector> const& user_specified_stripes, + uint64_t skip_rows, + std::optional const& num_rows, + rmm::cuda_stream_view stream); /** * @brief Filters ORC file to a selection of columns, based on their paths in the file. 
@@ -131,7 +129,7 @@ class aggregate_orc_metadata { * `nullopt` if user did not select columns to read * @return Columns hierarchy - lists of children columns and sorted columns in each nesting level */ - column_hierarchy select_columns( + [[nodiscard]] column_hierarchy select_columns( std::optional> const& column_paths) const; }; diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index 1d2262a1ccc..5971482f80c 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,10 +27,10 @@ namespace cudf::io::orc::gpu { /** * @brief Counts the number of characters in each rowgroup of each string column. */ -__global__ void rowgroup_char_counts_kernel(device_2dspan char_counts, - device_span orc_columns, - device_2dspan rowgroup_bounds, - device_span str_col_indexes) +CUDF_KERNEL void rowgroup_char_counts_kernel(device_2dspan char_counts, + device_span orc_columns, + device_2dspan rowgroup_bounds, + device_span str_col_indexes) { // Index of the column in the `str_col_indexes` array auto const str_col_idx = blockIdx.y; @@ -75,7 +75,7 @@ void rowgroup_char_counts(device_2dspan counts, } template -__global__ void __launch_bounds__(block_size) +CUDF_KERNEL void __launch_bounds__(block_size) initialize_dictionary_hash_maps_kernel(device_span dictionaries) { auto const dict_map = dictionaries[blockIdx.x].map_slots; @@ -107,7 +107,7 @@ struct hash_functor { }; template -__global__ void __launch_bounds__(block_size) +CUDF_KERNEL void __launch_bounds__(block_size) populate_dictionary_hash_maps_kernel(device_2dspan dictionaries, device_span columns) { @@ -162,7 +162,7 @@ __global__ void __launch_bounds__(block_size) } template -__global__ void __launch_bounds__(block_size) +CUDF_KERNEL void 
__launch_bounds__(block_size) collect_map_entries_kernel(device_2dspan dictionaries) { auto const col_idx = blockIdx.x; @@ -196,7 +196,7 @@ __global__ void __launch_bounds__(block_size) } template -__global__ void __launch_bounds__(block_size) +CUDF_KERNEL void __launch_bounds__(block_size) get_dictionary_indices_kernel(device_2dspan dictionaries, device_span columns) { diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 46f6861e789..cf3121fe659 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -14,930 +14,11 @@ * limitations under the License. */ -/** - * @file reader_impl.cu - * @brief cuDF-IO ORC reader class implementation - */ - -#include "orc.hpp" -#include "orc_gpu.hpp" - #include "reader_impl.hpp" - -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include +#include "reader_impl_chunking.hpp" +#include "reader_impl_helpers.hpp" namespace cudf::io::orc::detail { -using namespace cudf::io::detail; - -namespace { - -/** - * @brief Keeps track of orc mapping and child column details. - */ -struct reader_column_meta { - // Mapping between column id in orc to processing order. - std::vector> orc_col_map; - - // Number of rows in child columns. - std::vector num_child_rows; - - // Consists of parent column valid_map and null count. - std::vector parent_column_data; - - std::vector parent_column_index; - - // Start row of child columns [stripe][column]. - std::vector child_start_row; - - // Number of rows of child columns [stripe][column]. - std::vector num_child_rows_per_stripe; - - struct row_group_meta { - uint32_t num_rows; // number of rows in a column in a row group - uint32_t start_row; // start row in a column in a row group - }; - - // Row group metadata [rowgroup][column]. 
- std::vector rwgrp_meta; -}; - -/** - * @brief Struct that maps ORC streams to columns - */ -struct orc_stream_info { - explicit orc_stream_info(uint64_t offset_, - std::size_t dst_pos_, - uint32_t length_, - uint32_t stripe_idx_) - : offset(offset_), dst_pos(dst_pos_), length(length_), stripe_idx(stripe_idx_) - { - } - uint64_t offset; // offset in file - std::size_t dst_pos; // offset in memory relative to start of compressed stripe data - std::size_t length; // length in file - uint32_t stripe_idx; // stripe index -}; - -/** - * @brief Function that populates column descriptors stream/chunk - */ -std::size_t gather_stream_info(std::size_t stripe_index, - orc::StripeInformation const* stripeinfo, - orc::StripeFooter const* stripefooter, - host_span orc2gdf, - host_span types, - bool use_index, - bool apply_struct_map, - std::size_t* num_dictionary_entries, - std::vector& stream_info, - cudf::detail::hostdevice_2dvector& chunks) -{ - uint64_t src_offset = 0; - uint64_t dst_offset = 0; - - auto const get_stream_index_type = [](orc::StreamKind kind) { - switch (kind) { - case orc::DATA: return gpu::CI_DATA; - case orc::LENGTH: - case orc::SECONDARY: return gpu::CI_DATA2; - case orc::DICTIONARY_DATA: return gpu::CI_DICTIONARY; - case orc::PRESENT: return gpu::CI_PRESENT; - case orc::ROW_INDEX: return gpu::CI_INDEX; - default: - // Skip this stream as it's not strictly required - return gpu::CI_NUM_STREAMS; - } - }; - - for (auto const& stream : stripefooter->streams) { - if (!stream.column_id || *stream.column_id >= orc2gdf.size()) { - dst_offset += stream.length; - continue; - } - - auto const column_id = *stream.column_id; - auto col = orc2gdf[column_id]; - - if (col == -1 and apply_struct_map) { - // A struct-type column has no data itself, but rather child columns - // for each of its fields. There is only a PRESENT stream, which - // needs to be included for the reader. 
- auto const schema_type = types[column_id]; - if (not schema_type.subtypes.empty()) { - if (schema_type.kind == orc::STRUCT && stream.kind == orc::PRESENT) { - for (auto const& idx : schema_type.subtypes) { - auto child_idx = (idx < orc2gdf.size()) ? orc2gdf[idx] : -1; - if (child_idx >= 0) { - col = child_idx; - auto& chunk = chunks[stripe_index][col]; - chunk.strm_id[gpu::CI_PRESENT] = stream_info.size(); - chunk.strm_len[gpu::CI_PRESENT] = stream.length; - } - } - } - } - } - if (col != -1) { - if (src_offset >= stripeinfo->indexLength || use_index) { - auto& chunk = chunks[stripe_index][col]; - auto const index_type = get_stream_index_type(stream.kind); - if (index_type < gpu::CI_NUM_STREAMS) { - chunk.strm_id[index_type] = stream_info.size(); - chunk.strm_len[index_type] = stream.length; - // NOTE: skip_count field is temporarily used to track the presence of index streams - chunk.skip_count |= 1 << index_type; - - if (index_type == gpu::CI_DICTIONARY) { - chunk.dictionary_start = *num_dictionary_entries; - chunk.dict_len = stripefooter->columns[column_id].dictionarySize; - *num_dictionary_entries += stripefooter->columns[column_id].dictionarySize; - } - } - } - stream_info.emplace_back( - stripeinfo->offset + src_offset, dst_offset, stream.length, stripe_index); - dst_offset += stream.length; - } - src_offset += stream.length; - } - - return dst_offset; -} - -/** - * @brief Decompresses the stripe data, at stream granularity. 
- * - * @param decompressor Block decompressor - * @param stripe_data List of source stripe column data - * @param stream_info List of stream to column mappings - * @param chunks Vector of list of column chunk descriptors - * @param row_groups Vector of list of row index descriptors - * @param num_stripes Number of stripes making up column chunks - * @param row_index_stride Distance between each row index - * @param use_base_stride Whether to use base stride obtained from meta or use the computed value - * @param stream CUDA stream used for device memory operations and kernel launches - * @return Device buffer to decompressed page data - */ -rmm::device_buffer decompress_stripe_data( - OrcDecompressor const& decompressor, - host_span stripe_data, - host_span stream_info, - cudf::detail::hostdevice_2dvector& chunks, - cudf::detail::hostdevice_2dvector& row_groups, - std::size_t num_stripes, - std::size_t row_index_stride, - bool use_base_stride, - rmm::cuda_stream_view stream) -{ - // Parse the columns' compressed info - cudf::detail::hostdevice_vector compinfo( - 0, stream_info.size(), stream); - for (auto const& info : stream_info) { - compinfo.push_back(gpu::CompressedStreamInfo( - static_cast(stripe_data[info.stripe_idx].data()) + info.dst_pos, - info.length)); - } - compinfo.host_to_device_async(stream); - - gpu::ParseCompressedStripeData(compinfo.device_ptr(), - compinfo.size(), - decompressor.GetBlockSize(), - decompressor.GetLog2MaxCompressionRatio(), - stream); - compinfo.device_to_host_sync(stream); - - // Count the exact number of compressed blocks - std::size_t num_compressed_blocks = 0; - std::size_t num_uncompressed_blocks = 0; - std::size_t total_decomp_size = 0; - for (std::size_t i = 0; i < compinfo.size(); ++i) { - num_compressed_blocks += compinfo[i].num_compressed_blocks; - num_uncompressed_blocks += compinfo[i].num_uncompressed_blocks; - total_decomp_size += compinfo[i].max_uncompressed_size; - } - CUDF_EXPECTS( - not((num_uncompressed_blocks + 
num_compressed_blocks > 0) and (total_decomp_size == 0)), - "Inconsistent info on compression blocks"); - - // Buffer needs to be padded. - // Required by `gpuDecodeOrcColumnData`. - rmm::device_buffer decomp_data( - cudf::util::round_up_safe(total_decomp_size, BUFFER_PADDING_MULTIPLE), stream); - if (decomp_data.is_empty()) { return decomp_data; } - - rmm::device_uvector> inflate_in( - num_compressed_blocks + num_uncompressed_blocks, stream); - rmm::device_uvector> inflate_out( - num_compressed_blocks + num_uncompressed_blocks, stream); - rmm::device_uvector inflate_res(num_compressed_blocks, stream); - thrust::fill(rmm::exec_policy(stream), - inflate_res.begin(), - inflate_res.end(), - compression_result{0, compression_status::FAILURE}); - - // Parse again to populate the decompression input/output buffers - std::size_t decomp_offset = 0; - uint32_t max_uncomp_block_size = 0; - uint32_t start_pos = 0; - auto start_pos_uncomp = (uint32_t)num_compressed_blocks; - for (std::size_t i = 0; i < compinfo.size(); ++i) { - auto dst_base = static_cast(decomp_data.data()); - compinfo[i].uncompressed_data = dst_base + decomp_offset; - compinfo[i].dec_in_ctl = inflate_in.data() + start_pos; - compinfo[i].dec_out_ctl = inflate_out.data() + start_pos; - compinfo[i].dec_res = {inflate_res.data() + start_pos, compinfo[i].num_compressed_blocks}; - compinfo[i].copy_in_ctl = inflate_in.data() + start_pos_uncomp; - compinfo[i].copy_out_ctl = inflate_out.data() + start_pos_uncomp; - - stream_info[i].dst_pos = decomp_offset; - decomp_offset += compinfo[i].max_uncompressed_size; - start_pos += compinfo[i].num_compressed_blocks; - start_pos_uncomp += compinfo[i].num_uncompressed_blocks; - max_uncomp_block_size = - std::max(max_uncomp_block_size, compinfo[i].max_uncompressed_block_size); - } - compinfo.host_to_device_async(stream); - gpu::ParseCompressedStripeData(compinfo.device_ptr(), - compinfo.size(), - decompressor.GetBlockSize(), - decompressor.GetLog2MaxCompressionRatio(), - 
stream); - - // Value for checking whether we decompress successfully. - // It doesn't need to be atomic as there is no race condition: we only write `true` if needed. - cudf::detail::hostdevice_vector any_block_failure(1, stream); - any_block_failure[0] = false; - any_block_failure.host_to_device_async(stream); - - // Dispatch batches of blocks to decompress - if (num_compressed_blocks > 0) { - device_span> inflate_in_view{inflate_in.data(), - num_compressed_blocks}; - device_span> inflate_out_view{inflate_out.data(), num_compressed_blocks}; - switch (decompressor.compression()) { - case compression_type::ZLIB: - if (nvcomp::is_decompression_disabled(nvcomp::compression_type::DEFLATE)) { - gpuinflate( - inflate_in_view, inflate_out_view, inflate_res, gzip_header_included::NO, stream); - } else { - nvcomp::batched_decompress(nvcomp::compression_type::DEFLATE, - inflate_in_view, - inflate_out_view, - inflate_res, - max_uncomp_block_size, - total_decomp_size, - stream); - } - break; - case compression_type::SNAPPY: - if (nvcomp::is_decompression_disabled(nvcomp::compression_type::SNAPPY)) { - gpu_unsnap(inflate_in_view, inflate_out_view, inflate_res, stream); - } else { - nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, - inflate_in_view, - inflate_out_view, - inflate_res, - max_uncomp_block_size, - total_decomp_size, - stream); - } - break; - case compression_type::ZSTD: - if (auto const reason = nvcomp::is_decompression_disabled(nvcomp::compression_type::ZSTD); - reason) { - CUDF_FAIL("Decompression error: " + reason.value()); - } - nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, - inflate_in_view, - inflate_out_view, - inflate_res, - max_uncomp_block_size, - total_decomp_size, - stream); - break; - default: CUDF_FAIL("Unexpected decompression dispatch"); break; - } - - // Check if any block has been failed to decompress. - // Not using `thrust::any` or `thrust::count_if` to defer stream sync. 
- thrust::for_each( - rmm::exec_policy(stream), - thrust::make_counting_iterator(std::size_t{0}), - thrust::make_counting_iterator(inflate_res.size()), - [results = inflate_res.begin(), - any_block_failure = any_block_failure.device_ptr()] __device__(auto const idx) { - if (results[idx].status != compression_status::SUCCESS) { *any_block_failure = true; } - }); - } - - if (num_uncompressed_blocks > 0) { - device_span> copy_in_view{inflate_in.data() + num_compressed_blocks, - num_uncompressed_blocks}; - device_span> copy_out_view{inflate_out.data() + num_compressed_blocks, - num_uncompressed_blocks}; - gpu_copy_uncompressed_blocks(copy_in_view, copy_out_view, stream); - } - - // Copy without stream sync, thus need to wait for stream sync below to access. - any_block_failure.device_to_host_async(stream); - - gpu::PostDecompressionReassemble(compinfo.device_ptr(), compinfo.size(), stream); - compinfo.device_to_host_sync(stream); // This also sync stream for `any_block_failure`. - - // We can check on host after stream synchronize - CUDF_EXPECTS(not any_block_failure[0], "Error during decompression"); - - auto const num_columns = chunks.size().second; - - // Update the stream information with the updated uncompressed info - // TBD: We could update the value from the information we already - // have in stream_info[], but using the gpu results also updates - // max_uncompressed_size to the actual uncompressed size, or zero if - // decompression failed. 
- for (std::size_t i = 0; i < num_stripes; ++i) { - for (std::size_t j = 0; j < num_columns; ++j) { - auto& chunk = chunks[i][j]; - for (int k = 0; k < gpu::CI_NUM_STREAMS; ++k) { - if (chunk.strm_len[k] > 0 && chunk.strm_id[k] < compinfo.size()) { - chunk.streams[k] = compinfo[chunk.strm_id[k]].uncompressed_data; - chunk.strm_len[k] = compinfo[chunk.strm_id[k]].max_uncompressed_size; - } - } - } - } - - if (row_groups.size().first) { - chunks.host_to_device_async(stream); - row_groups.host_to_device_async(stream); - gpu::ParseRowGroupIndex(row_groups.base_device_ptr(), - compinfo.device_ptr(), - chunks.base_device_ptr(), - num_columns, - num_stripes, - row_groups.size().first, - row_index_stride, - use_base_stride, - stream); - } - - return decomp_data; -} - -/** - * @brief Updates null mask of columns whose parent is a struct column. - * - * If struct column has null element, that row would be skipped while writing child column in ORC, - * so we need to insert the missing null elements in child column. There is another behavior from - * pyspark, where if the child column doesn't have any null elements, it will not have present - * stream, so in that case parent null mask need to be copied to child column. - * - * @param chunks Vector of list of column chunk descriptors - * @param out_buffers Output columns' device buffers - * @param stream CUDA stream used for device memory operations and kernel launches. 
- * @param mr Device memory resource to use for device memory allocation - */ -void update_null_mask(cudf::detail::hostdevice_2dvector& chunks, - host_span out_buffers, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const num_stripes = chunks.size().first; - auto const num_columns = chunks.size().second; - bool is_mask_updated = false; - - for (std::size_t col_idx = 0; col_idx < num_columns; ++col_idx) { - if (chunks[0][col_idx].parent_validity_info.valid_map_base != nullptr) { - if (not is_mask_updated) { - chunks.device_to_host_sync(stream); - is_mask_updated = true; - } - - auto parent_valid_map_base = chunks[0][col_idx].parent_validity_info.valid_map_base; - auto child_valid_map_base = out_buffers[col_idx].null_mask(); - auto child_mask_len = - chunks[0][col_idx].column_num_rows - chunks[0][col_idx].parent_validity_info.null_count; - auto parent_mask_len = chunks[0][col_idx].column_num_rows; - - if (child_valid_map_base != nullptr) { - rmm::device_uvector dst_idx(child_mask_len, stream); - // Copy indexes at which the parent has valid value. 
- thrust::copy_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(0) + parent_mask_len, - dst_idx.begin(), - [parent_valid_map_base] __device__(auto idx) { - return bit_is_set(parent_valid_map_base, idx); - }); - - auto merged_null_mask = cudf::detail::create_null_mask( - parent_mask_len, mask_state::ALL_NULL, rmm::cuda_stream_view(stream), mr); - auto merged_mask = static_cast(merged_null_mask.data()); - uint32_t* dst_idx_ptr = dst_idx.data(); - // Copy child valid bits from child column to valid indexes, this will merge both child - // and parent null masks - thrust::for_each(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(0) + dst_idx.size(), - [child_valid_map_base, dst_idx_ptr, merged_mask] __device__(auto idx) { - if (bit_is_set(child_valid_map_base, idx)) { - cudf::set_bit(merged_mask, dst_idx_ptr[idx]); - }; - }); - - out_buffers[col_idx].set_null_mask(std::move(merged_null_mask)); - - } else { - // Since child column doesn't have a mask, copy parent null mask - auto mask_size = bitmask_allocation_size_bytes(parent_mask_len); - out_buffers[col_idx].set_null_mask( - rmm::device_buffer(static_cast(parent_valid_map_base), mask_size, stream, mr)); - } - } - } - - if (is_mask_updated) { - // Update chunks with pointers to column data which might have been changed. - for (std::size_t stripe_idx = 0; stripe_idx < num_stripes; ++stripe_idx) { - for (std::size_t col_idx = 0; col_idx < num_columns; ++col_idx) { - auto& chunk = chunks[stripe_idx][col_idx]; - chunk.valid_map_base = out_buffers[col_idx].null_mask(); - } - } - chunks.host_to_device_sync(stream); - } -} - -/** - * @brief Converts the stripe column data and outputs to columns. 
- * - * @param num_dicts Number of dictionary entries required - * @param skip_rows Number of rows to offset from start - * @param row_index_stride Distance between each row index - * @param level Current nesting level being processed - * @param tz_table Local time to UTC conversion table - * @param chunks Vector of list of column chunk descriptors - * @param row_groups Vector of list of row index descriptors - * @param out_buffers Output columns' device buffers - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ -void decode_stream_data(std::size_t num_dicts, - std::size_t skip_rows, - std::size_t row_index_stride, - std::size_t level, - table_view const& tz_table, - cudf::detail::hostdevice_2dvector& chunks, - cudf::detail::device_2dspan row_groups, - std::vector& out_buffers, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const num_stripes = chunks.size().first; - auto const num_columns = chunks.size().second; - thrust::counting_iterator col_idx_it(0); - thrust::counting_iterator stripe_idx_it(0); - - // Update chunks with pointers to column data - std::for_each(stripe_idx_it, stripe_idx_it + num_stripes, [&](auto stripe_idx) { - std::for_each(col_idx_it, col_idx_it + num_columns, [&](auto col_idx) { - auto& chunk = chunks[stripe_idx][col_idx]; - chunk.column_data_base = out_buffers[col_idx].data(); - chunk.valid_map_base = out_buffers[col_idx].null_mask(); - }); - }); - - // Allocate global dictionary for deserializing - rmm::device_uvector global_dict(num_dicts, stream); - - chunks.host_to_device_sync(stream); - gpu::DecodeNullsAndStringDictionaries( - chunks.base_device_ptr(), global_dict.data(), num_columns, num_stripes, skip_rows, stream); - - if (level > 0) { - // Update nullmasks for children if parent was a struct and had null mask - update_null_mask(chunks, out_buffers, stream, mr); - } - - auto const 
tz_table_dptr = table_device_view::create(tz_table, stream); - rmm::device_scalar error_count(0, stream); - // Update the null map for child columns - gpu::DecodeOrcColumnData(chunks.base_device_ptr(), - global_dict.data(), - row_groups, - num_columns, - num_stripes, - skip_rows, - *tz_table_dptr, - row_groups.size().first, - row_index_stride, - level, - error_count.data(), - stream); - chunks.device_to_host_async(stream); - // `value` synchronizes - auto const num_errors = error_count.value(stream); - CUDF_EXPECTS(num_errors == 0, "ORC data decode failed"); - - std::for_each(col_idx_it + 0, col_idx_it + num_columns, [&](auto col_idx) { - out_buffers[col_idx].null_count() = - std::accumulate(stripe_idx_it + 0, - stripe_idx_it + num_stripes, - 0, - [&](auto null_count, auto const stripe_idx) { - return null_count + chunks[stripe_idx][col_idx].null_count; - }); - }); -} - -/** - * @brief Compute the per-stripe prefix sum of null count, for each struct column in the current - * layer. - */ -void scan_null_counts(cudf::detail::hostdevice_2dvector const& chunks, - cudf::host_span> prefix_sums, - rmm::cuda_stream_view stream) -{ - auto const num_stripes = chunks.size().first; - if (num_stripes == 0) return; - - auto const num_columns = chunks.size().second; - std::vector>> prefix_sums_to_update; - for (auto col_idx = 0ul; col_idx < num_columns; ++col_idx) { - // Null counts sums are only needed for children of struct columns - if (chunks[0][col_idx].type_kind == STRUCT) { - prefix_sums_to_update.emplace_back(col_idx, prefix_sums[col_idx]); - } - } - auto const d_prefix_sums_to_update = cudf::detail::make_device_uvector_async( - prefix_sums_to_update, stream, rmm::mr::get_current_device_resource()); - - thrust::for_each(rmm::exec_policy(stream), - d_prefix_sums_to_update.begin(), - d_prefix_sums_to_update.end(), - [chunks = cudf::detail::device_2dspan{chunks}] __device__( - auto const& idx_psums) { - auto const col_idx = idx_psums.first; - auto const psums = 
idx_psums.second; - - thrust::transform( - thrust::seq, - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(0) + psums.size(), - psums.begin(), - [&](auto stripe_idx) { return chunks[stripe_idx][col_idx].null_count; }); - - thrust::inclusive_scan(thrust::seq, psums.begin(), psums.end(), psums.begin()); - }); - // `prefix_sums_to_update` goes out of scope, copy has to be done before we return - stream.synchronize(); -} - -/** - * @brief Aggregate child metadata from parent column chunks. - */ -void aggregate_child_meta(std::size_t level, - column_hierarchy const& selected_columns, - cudf::detail::host_2dspan chunks, - cudf::detail::host_2dspan row_groups, - host_span list_col, - host_span out_buffers, - reader_column_meta& col_meta) -{ - auto const num_of_stripes = chunks.size().first; - auto const num_of_rowgroups = row_groups.size().first; - auto const num_child_cols = selected_columns.levels[level + 1].size(); - auto const number_of_child_chunks = num_child_cols * num_of_stripes; - auto& num_child_rows = col_meta.num_child_rows; - auto& parent_column_data = col_meta.parent_column_data; - - // Reset the meta to store child column details. 
- num_child_rows.resize(selected_columns.levels[level + 1].size()); - std::fill(num_child_rows.begin(), num_child_rows.end(), 0); - parent_column_data.resize(number_of_child_chunks); - col_meta.parent_column_index.resize(number_of_child_chunks); - col_meta.child_start_row.resize(number_of_child_chunks); - col_meta.num_child_rows_per_stripe.resize(number_of_child_chunks); - col_meta.rwgrp_meta.resize(num_of_rowgroups * num_child_cols); - - auto child_start_row = cudf::detail::host_2dspan( - col_meta.child_start_row.data(), num_of_stripes, num_child_cols); - auto num_child_rows_per_stripe = cudf::detail::host_2dspan( - col_meta.num_child_rows_per_stripe.data(), num_of_stripes, num_child_cols); - auto rwgrp_meta = cudf::detail::host_2dspan( - col_meta.rwgrp_meta.data(), num_of_rowgroups, num_child_cols); - - int index = 0; // number of child column processed - - // For each parent column, update its child column meta for each stripe. - std::for_each(list_col.begin(), list_col.end(), [&](auto const p_col) { - auto const parent_col_idx = col_meta.orc_col_map[level][p_col.id]; - auto start_row = 0; - auto processed_row_groups = 0; - - for (std::size_t stripe_id = 0; stripe_id < num_of_stripes; stripe_id++) { - // Aggregate num_rows and start_row from processed parent columns per row groups - if (num_of_rowgroups) { - auto stripe_num_row_groups = chunks[stripe_id][parent_col_idx].num_rowgroups; - auto processed_child_rows = 0; - - for (std::size_t rowgroup_id = 0; rowgroup_id < stripe_num_row_groups; - rowgroup_id++, processed_row_groups++) { - auto const child_rows = row_groups[processed_row_groups][parent_col_idx].num_child_rows; - for (size_type id = 0; id < p_col.num_children; id++) { - auto const child_col_idx = index + id; - rwgrp_meta[processed_row_groups][child_col_idx].start_row = processed_child_rows; - rwgrp_meta[processed_row_groups][child_col_idx].num_rows = child_rows; - } - processed_child_rows += child_rows; - } - } - - // Aggregate start row, number of 
rows per chunk and total number of rows in a column - auto const child_rows = chunks[stripe_id][parent_col_idx].num_child_rows; - for (size_type id = 0; id < p_col.num_children; id++) { - auto const child_col_idx = index + id; - - num_child_rows[child_col_idx] += child_rows; - num_child_rows_per_stripe[stripe_id][child_col_idx] = child_rows; - // start row could be different for each column when there is nesting at each stripe level - child_start_row[stripe_id][child_col_idx] = (stripe_id == 0) ? 0 : start_row; - } - start_row += child_rows; - } - - // Parent column null mask and null count would be required for child column - // to adjust its nullmask. - auto type = out_buffers[parent_col_idx].type.id(); - auto parent_null_count = static_cast(out_buffers[parent_col_idx].null_count()); - auto parent_valid_map = out_buffers[parent_col_idx].null_mask(); - auto num_rows = out_buffers[parent_col_idx].size; - - for (size_type id = 0; id < p_col.num_children; id++) { - auto const child_col_idx = index + id; - col_meta.parent_column_index[child_col_idx] = parent_col_idx; - if (type == type_id::STRUCT) { - parent_column_data[child_col_idx] = {parent_valid_map, parent_null_count}; - // Number of rows in child will remain same as parent in case of struct column - num_child_rows[child_col_idx] = num_rows; - } else { - parent_column_data[child_col_idx] = {nullptr, 0}; - } - } - index += p_col.num_children; - }); -} - -/** - * @brief struct to store buffer data and size of list buffer - */ -struct list_buffer_data { - size_type* data; - size_type size; -}; - -// Generates offsets for list buffer from number of elements in a row. 
-void generate_offsets_for_list(host_span buff_data, rmm::cuda_stream_view stream) -{ - for (auto& list_data : buff_data) { - thrust::exclusive_scan(rmm::exec_policy_nosync(stream), - list_data.data, - list_data.data + list_data.size, - list_data.data); - } -} - -/** - * @brief Function that translates ORC data kind to cuDF type enum - */ -constexpr type_id to_cudf_type(orc::TypeKind kind, - bool use_np_dtypes, - type_id timestamp_type_id, - type_id decimal_type_id) -{ - switch (kind) { - case orc::BOOLEAN: return type_id::BOOL8; - case orc::BYTE: return type_id::INT8; - case orc::SHORT: return type_id::INT16; - case orc::INT: return type_id::INT32; - case orc::LONG: return type_id::INT64; - case orc::FLOAT: return type_id::FLOAT32; - case orc::DOUBLE: return type_id::FLOAT64; - case orc::STRING: - case orc::BINARY: - case orc::VARCHAR: - case orc::CHAR: - // Variable-length types can all be mapped to STRING - return type_id::STRING; - case orc::TIMESTAMP: - return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id - : type_id::TIMESTAMP_NANOSECONDS; - case orc::DATE: - // There isn't a (DAYS -> np.dtype) mapping - return (use_np_dtypes) ? type_id::TIMESTAMP_MILLISECONDS : type_id::TIMESTAMP_DAYS; - case orc::DECIMAL: return decimal_type_id; - // Need to update once cuDF plans to support map type - case orc::MAP: - case orc::LIST: return type_id::LIST; - case orc::STRUCT: return type_id::STRUCT; - default: break; - } - - return type_id::EMPTY; -} - -/** - * @brief Determines cuDF type of an ORC Decimal column. 
- */ -type_id to_cudf_decimal_type(host_span decimal128_columns, - aggregate_orc_metadata const& metadata, - int column_index) -{ - if (metadata.get_col_type(column_index).kind != DECIMAL) { return type_id::EMPTY; } - - if (std::find(decimal128_columns.begin(), - decimal128_columns.end(), - metadata.column_path(0, column_index)) != decimal128_columns.end()) { - return type_id::DECIMAL128; - } - - auto const precision = metadata.get_col_type(column_index) - .precision.value_or(cuda::std::numeric_limits::digits10); - if (precision <= cuda::std::numeric_limits::digits10) { return type_id::DECIMAL32; } - if (precision <= cuda::std::numeric_limits::digits10) { return type_id::DECIMAL64; } - return type_id::DECIMAL128; -} - -std::string get_map_child_col_name(std::size_t const idx) { return (idx == 0) ? "key" : "value"; } - -/** - * @brief Create empty columns and respective schema information from the buffer. - */ -std::unique_ptr create_empty_column(size_type orc_col_id, - aggregate_orc_metadata const& metadata, - host_span decimal128_columns, - bool use_np_dtypes, - data_type timestamp_type, - column_name_info& schema_info, - rmm::cuda_stream_view stream) -{ - schema_info.name = metadata.column_name(0, orc_col_id); - auto const kind = metadata.get_col_type(orc_col_id).kind; - auto const type = to_cudf_type(kind, - use_np_dtypes, - timestamp_type.id(), - to_cudf_decimal_type(decimal128_columns, metadata, orc_col_id)); - - switch (kind) { - case orc::LIST: { - schema_info.children.emplace_back("offsets"); - schema_info.children.emplace_back(""); - return make_lists_column(0, - make_empty_column(type_id::INT32), - create_empty_column(metadata.get_col_type(orc_col_id).subtypes[0], - metadata, - decimal128_columns, - use_np_dtypes, - timestamp_type, - schema_info.children.back(), - stream), - 0, - rmm::device_buffer{0, stream}, - stream); - } - case orc::MAP: { - schema_info.children.emplace_back("offsets"); - schema_info.children.emplace_back("struct"); - auto const 
child_column_ids = metadata.get_col_type(orc_col_id).subtypes; - auto& children_schema = schema_info.children.back().children; - std::vector> child_columns; - for (std::size_t idx = 0; idx < metadata.get_col_type(orc_col_id).subtypes.size(); idx++) { - children_schema.emplace_back(""); - child_columns.push_back(create_empty_column(child_column_ids[idx], - metadata, - decimal128_columns, - use_np_dtypes, - timestamp_type, - schema_info.children.back().children.back(), - stream)); - children_schema[idx].name = get_map_child_col_name(idx); - } - return make_lists_column( - 0, - make_empty_column(type_id::INT32), - make_structs_column(0, std::move(child_columns), 0, rmm::device_buffer{0, stream}, stream), - 0, - rmm::device_buffer{0, stream}, - stream); - } - - case orc::STRUCT: { - std::vector> child_columns; - for (auto const col : metadata.get_col_type(orc_col_id).subtypes) { - schema_info.children.emplace_back(""); - child_columns.push_back(create_empty_column(col, - metadata, - decimal128_columns, - use_np_dtypes, - timestamp_type, - schema_info.children.back(), - stream)); - } - return make_structs_column( - 0, std::move(child_columns), 0, rmm::device_buffer{0, stream}, stream); - } - - case orc::DECIMAL: { - int32_t scale = 0; - if (type == type_id::DECIMAL32 or type == type_id::DECIMAL64 or type == type_id::DECIMAL128) { - scale = -static_cast(metadata.get_types()[orc_col_id].scale.value_or(0)); - } - return make_empty_column(data_type(type, scale)); - } - - default: return make_empty_column(type); - } -} - -/** - * @brief Assemble the buffer with child columns. 
- */ -column_buffer assemble_buffer(size_type orc_col_id, - std::size_t level, - reader_column_meta const& col_meta, - aggregate_orc_metadata const& metadata, - column_hierarchy const& selected_columns, - std::vector>& col_buffers, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const col_id = col_meta.orc_col_map[level][orc_col_id]; - auto& col_buffer = col_buffers[level][col_id]; - - col_buffer.name = metadata.column_name(0, orc_col_id); - auto kind = metadata.get_col_type(orc_col_id).kind; - switch (kind) { - case orc::LIST: - case orc::STRUCT: { - auto const& children_indices = selected_columns.children.at(orc_col_id); - for (auto const child_id : children_indices) { - col_buffer.children.emplace_back(assemble_buffer( - child_id, level + 1, col_meta, metadata, selected_columns, col_buffers, stream, mr)); - } - } break; - - case orc::MAP: { - std::vector child_col_buffers; - // Get child buffers - auto const& children_indices = selected_columns.children.at(orc_col_id); - for (std::size_t idx = 0; idx < children_indices.size(); idx++) { - auto const col = children_indices[idx]; - child_col_buffers.emplace_back(assemble_buffer( - col, level + 1, col_meta, metadata, selected_columns, col_buffers, stream, mr)); - child_col_buffers.back().name = get_map_child_col_name(idx); - } - // Create a struct buffer - auto num_rows = child_col_buffers[0].size; - auto struct_buffer = - column_buffer(cudf::data_type(type_id::STRUCT), num_rows, false, stream, mr); - struct_buffer.children = std::move(child_col_buffers); - struct_buffer.name = "struct"; - - col_buffer.children.emplace_back(std::move(struct_buffer)); - } break; - - default: break; - } - - return std::move(col_buffer); -} - -} // namespace reader::impl::impl(std::vector>&& sources, orc_reader_options const& options, @@ -945,14 +26,14 @@ reader::impl::impl(std::vector>&& sources, rmm::mr::device_memory_resource* mr) : _stream(stream), _mr(mr), - _sources(std::move(sources)), - 
_metadata{_sources, stream}, - _selected_columns{_metadata.select_columns(options.get_columns())}, _timestamp_type{options.get_timestamp_type()}, _use_index{options.is_enabled_use_index()}, _use_np_dtypes{options.is_enabled_use_np_dtypes()}, _decimal128_columns{options.get_decimal128_columns()}, - _col_meta{std::make_unique()} + _col_meta{std::make_unique()}, + _sources(std::move(sources)), + _metadata{_sources, stream}, + _selected_columns{_metadata.select_columns(options.get_columns())} { } @@ -960,23 +41,21 @@ table_with_metadata reader::impl::read(uint64_t skip_rows, std::optional const& num_rows_opt, std::vector> const& stripes) { - // Selected columns at different levels of nesting are stored in different elements - // of `selected_columns`; thus, size == 1 means no nested columns - CUDF_EXPECTS(skip_rows == 0 or _selected_columns.num_levels() == 1, - "skip_rows is not supported by nested columns"); - - // There are no columns in the table - if (_selected_columns.num_levels() == 0) { return {std::make_unique(), table_metadata{}}; } + prepare_data(skip_rows, num_rows_opt, stripes); + return read_chunk_internal(); +} - std::vector> out_buffers(_selected_columns.num_levels()); - std::vector> out_columns; - table_metadata out_metadata; +table_metadata reader::impl::make_output_metadata() +{ + if (_output_metadata) { return table_metadata{*_output_metadata}; } // Copy user data to the output metadata. 
+ table_metadata out_metadata; + out_metadata.per_file_user_data.reserve(_metadata.per_file_metadata.size()); std::transform(_metadata.per_file_metadata.cbegin(), _metadata.per_file_metadata.cend(), std::back_inserter(out_metadata.per_file_user_data), - [](auto& meta) { + [](auto const& meta) { std::unordered_map kv_map; std::transform(meta.ff.metadata.cbegin(), meta.ff.metadata.cend(), @@ -989,12 +68,22 @@ table_with_metadata reader::impl::read(uint64_t skip_rows, out_metadata.user_data = {out_metadata.per_file_user_data[0].begin(), out_metadata.per_file_user_data[0].end()}; - // Select only stripes required (aka row groups) - auto const [rows_to_skip, rows_to_read, selected_stripes] = - _metadata.select_stripes(stripes, skip_rows, num_rows_opt, _stream); + // Save the output table metadata into `_output_metadata` for reuse next time. + _output_metadata = std::make_unique(out_metadata); + + return out_metadata; +} + +table_with_metadata reader::impl::read_chunk_internal() +{ + // There is no columns in the table. + if (_selected_columns.num_levels() == 0) { return {std::make_unique
(), table_metadata{}}; } + + std::vector> out_columns; + auto out_metadata = make_output_metadata(); // If no rows or stripes to read, return empty columns - if (rows_to_read == 0 || selected_stripes.empty()) { + if (_file_itm_data->rows_to_read == 0 || _file_itm_data->selected_stripes.empty()) { std::transform(_selected_columns.levels[0].begin(), _selected_columns.levels[0].end(), std::back_inserter(out_columns), @@ -1011,324 +100,6 @@ table_with_metadata reader::impl::read(uint64_t skip_rows, return {std::make_unique
(std::move(out_columns)), std::move(out_metadata)}; } - // Set up table for converting timestamp columns from local to UTC time - auto const tz_table = [&, &selected_stripes = selected_stripes] { - auto const has_timestamp_column = std::any_of( - _selected_columns.levels.cbegin(), _selected_columns.levels.cend(), [&](auto const& col_lvl) { - return std::any_of(col_lvl.cbegin(), col_lvl.cend(), [&](auto const& col_meta) { - return _metadata.get_col_type(col_meta.id).kind == TypeKind::TIMESTAMP; - }); - }); - - return has_timestamp_column - ? cudf::detail::make_timezone_transition_table( - {}, selected_stripes[0].stripe_info[0].second->writerTimezone, _stream) - : std::make_unique(); - }(); - - std::vector> lvl_stripe_data(_selected_columns.num_levels()); - std::vector>> null_count_prefix_sums; - - // Iterates through levels of nested columns, child column will be one level down - // compared to parent column. - auto& col_meta = *_col_meta; - for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { - auto& columns_level = _selected_columns.levels[level]; - // Association between each ORC column and its cudf::column - col_meta.orc_col_map.emplace_back(_metadata.get_num_cols(), -1); - std::vector nested_col; - - // Get a list of column data types - std::vector column_types; - for (auto& col : columns_level) { - auto col_type = to_cudf_type(_metadata.get_col_type(col.id).kind, - _use_np_dtypes, - _timestamp_type.id(), - to_cudf_decimal_type(_decimal128_columns, _metadata, col.id)); - CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type"); - if (col_type == type_id::DECIMAL32 or col_type == type_id::DECIMAL64 or - col_type == type_id::DECIMAL128) { - // sign of the scale is changed since cuDF follows c++ libraries like CNL - // which uses negative scaling, but liborc and other libraries - // follow positive scaling. 
- auto const scale = - -static_cast(_metadata.get_col_type(col.id).scale.value_or(0)); - column_types.emplace_back(col_type, scale); - } else { - column_types.emplace_back(col_type); - } - - // Map each ORC column to its column - col_meta.orc_col_map[level][col.id] = column_types.size() - 1; - if (col_type == type_id::LIST or col_type == type_id::STRUCT) { - nested_col.emplace_back(col); - } - } - - // Get the total number of stripes across all input files. - std::size_t total_num_stripes = - std::accumulate(selected_stripes.begin(), - selected_stripes.end(), - 0, - [](std::size_t sum, auto& stripe_source_mapping) { - return sum + stripe_source_mapping.stripe_info.size(); - }); - auto const num_columns = columns_level.size(); - cudf::detail::hostdevice_2dvector chunks( - total_num_stripes, num_columns, _stream); - memset(chunks.base_host_ptr(), 0, chunks.size_bytes()); - - const bool use_index = - _use_index && - // Do stripes have row group index - _metadata.is_row_grp_idx_present() && - // Only use if we don't have much work with complete columns & stripes - // TODO: Consider nrows, gpu, and tune the threshold - (rows_to_read > _metadata.get_row_index_stride() && !(_metadata.get_row_index_stride() & 7) && - _metadata.get_row_index_stride() > 0 && num_columns * total_num_stripes < 8 * 128) && - // Only use if first row is aligned to a stripe boundary - // TODO: Fix logic to handle unaligned rows - (rows_to_skip == 0); - - // Logically view streams as columns - std::vector stream_info; - - null_count_prefix_sums.emplace_back(); - null_count_prefix_sums.back().reserve(_selected_columns.levels[level].size()); - std::generate_n(std::back_inserter(null_count_prefix_sums.back()), - _selected_columns.levels[level].size(), - [&]() { - return cudf::detail::make_zeroed_device_uvector_async( - total_num_stripes, _stream, rmm::mr::get_current_device_resource()); - }); - - // Tracker for eventually deallocating compressed and uncompressed data - auto& stripe_data = 
lvl_stripe_data[level]; - - std::size_t stripe_start_row = 0; - std::size_t num_dict_entries = 0; - std::size_t num_rowgroups = 0; - int stripe_idx = 0; - - std::vector, std::size_t>> read_tasks; - for (auto const& stripe_source_mapping : selected_stripes) { - // Iterate through the source files selected stripes - for (auto const& stripe : stripe_source_mapping.stripe_info) { - auto const stripe_info = stripe.first; - auto const stripe_footer = stripe.second; - - auto stream_count = stream_info.size(); - auto const total_data_size = gather_stream_info(stripe_idx, - stripe_info, - stripe_footer, - col_meta.orc_col_map[level], - _metadata.get_types(), - use_index, - level == 0, - &num_dict_entries, - stream_info, - chunks); - - auto const is_stripe_data_empty = total_data_size == 0; - CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, - "Invalid index rowgroup stream data"); - - // Buffer needs to be padded. - // Required by `copy_uncompressed_kernel`. - stripe_data.emplace_back( - cudf::util::round_up_safe(total_data_size, BUFFER_PADDING_MULTIPLE), _stream); - auto dst_base = static_cast(stripe_data.back().data()); - - // Coalesce consecutive streams into one read - while (not is_stripe_data_empty and stream_count < stream_info.size()) { - auto const d_dst = dst_base + stream_info[stream_count].dst_pos; - auto const offset = stream_info[stream_count].offset; - auto len = stream_info[stream_count].length; - stream_count++; - - while (stream_count < stream_info.size() && - stream_info[stream_count].offset == offset + len) { - len += stream_info[stream_count].length; - stream_count++; - } - if (_metadata.per_file_metadata[stripe_source_mapping.source_idx] - .source->is_device_read_preferred(len)) { - read_tasks.push_back( - std::pair(_metadata.per_file_metadata[stripe_source_mapping.source_idx] - .source->device_read_async(offset, len, d_dst, _stream), - len)); - - } else { - auto const buffer = - 
_metadata.per_file_metadata[stripe_source_mapping.source_idx].source->host_read( - offset, len); - CUDF_EXPECTS(buffer->size() == len, "Unexpected discrepancy in bytes read."); - CUDF_CUDA_TRY( - cudaMemcpyAsync(d_dst, buffer->data(), len, cudaMemcpyDefault, _stream.value())); - _stream.synchronize(); - } - } - - auto const num_rows_per_stripe = stripe_info->numberOfRows; - auto const rowgroup_id = num_rowgroups; - auto stripe_num_rowgroups = 0; - if (use_index) { - stripe_num_rowgroups = (num_rows_per_stripe + _metadata.get_row_index_stride() - 1) / - _metadata.get_row_index_stride(); - } - // Update chunks to reference streams pointers - for (std::size_t col_idx = 0; col_idx < num_columns; col_idx++) { - auto& chunk = chunks[stripe_idx][col_idx]; - // start row, number of rows in a each stripe and total number of rows - // may change in lower levels of nesting - chunk.start_row = (level == 0) - ? stripe_start_row - : col_meta.child_start_row[stripe_idx * num_columns + col_idx]; - chunk.num_rows = - (level == 0) ? stripe_info->numberOfRows - : col_meta.num_child_rows_per_stripe[stripe_idx * num_columns + col_idx]; - chunk.column_num_rows = (level == 0) ? rows_to_read : col_meta.num_child_rows[col_idx]; - chunk.parent_validity_info = - (level == 0) ? column_validity_info{} : col_meta.parent_column_data[col_idx]; - chunk.parent_null_count_prefix_sums = - (level == 0) - ? nullptr - : null_count_prefix_sums[level - 1][col_meta.parent_column_index[col_idx]].data(); - chunk.encoding_kind = stripe_footer->columns[columns_level[col_idx].id].kind; - chunk.type_kind = _metadata.per_file_metadata[stripe_source_mapping.source_idx] - .ff.types[columns_level[col_idx].id] - .kind; - // num_child_rows for a struct column will be same, for other nested types it will be - // calculated. - chunk.num_child_rows = (chunk.type_kind != orc::STRUCT) ? 
0 : chunk.num_rows; - chunk.dtype_id = column_types[col_idx].id(); - chunk.decimal_scale = _metadata.per_file_metadata[stripe_source_mapping.source_idx] - .ff.types[columns_level[col_idx].id] - .scale.value_or(0); - - chunk.rowgroup_id = rowgroup_id; - chunk.dtype_len = (column_types[col_idx].id() == type_id::STRING) - ? sizeof(string_index_pair) - : ((column_types[col_idx].id() == type_id::LIST) or - (column_types[col_idx].id() == type_id::STRUCT)) - ? sizeof(size_type) - : cudf::size_of(column_types[col_idx]); - chunk.num_rowgroups = stripe_num_rowgroups; - if (chunk.type_kind == orc::TIMESTAMP) { chunk.timestamp_type_id = _timestamp_type.id(); } - if (not is_stripe_data_empty) { - for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) { - chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos; - } - } - } - stripe_start_row += num_rows_per_stripe; - num_rowgroups += stripe_num_rowgroups; - - stripe_idx++; - } - } - for (auto& task : read_tasks) { - CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read."); - } - - if (stripe_data.empty()) { continue; } - - // Process dataset chunk pages into output columns - auto row_groups = - cudf::detail::hostdevice_2dvector(num_rowgroups, num_columns, _stream); - if (level > 0 and row_groups.size().first) { - cudf::host_span row_groups_span(row_groups.base_host_ptr(), - num_rowgroups * num_columns); - auto& rw_grp_meta = col_meta.rwgrp_meta; - - // Update start row and num rows per row group - std::transform(rw_grp_meta.begin(), - rw_grp_meta.end(), - row_groups_span.begin(), - rw_grp_meta.begin(), - [&](auto meta, auto& row_grp) { - row_grp.num_rows = meta.num_rows; - row_grp.start_row = meta.start_row; - return meta; - }); - } - // Setup row group descriptors if using indexes - if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { - auto decomp_data = decompress_stripe_data(*_metadata.per_file_metadata[0].decompressor, - stripe_data, - stream_info, - chunks, - row_groups, - 
total_num_stripes, - _metadata.get_row_index_stride(), - level == 0, - _stream); - stripe_data.clear(); - stripe_data.push_back(std::move(decomp_data)); - } else { - if (row_groups.size().first) { - chunks.host_to_device_async(_stream); - row_groups.host_to_device_async(_stream); - gpu::ParseRowGroupIndex(row_groups.base_device_ptr(), - nullptr, - chunks.base_device_ptr(), - num_columns, - total_num_stripes, - num_rowgroups, - _metadata.get_row_index_stride(), - level == 0, - _stream); - } - } - - for (std::size_t i = 0; i < column_types.size(); ++i) { - bool is_nullable = false; - for (std::size_t j = 0; j < total_num_stripes; ++j) { - if (chunks[j][i].strm_len[gpu::CI_PRESENT] != 0) { - is_nullable = true; - break; - } - } - auto is_list_type = (column_types[i].id() == type_id::LIST); - auto n_rows = (level == 0) ? rows_to_read : col_meta.num_child_rows[i]; - // For list column, offset column will be always size + 1 - if (is_list_type) n_rows++; - out_buffers[level].emplace_back(column_types[i], n_rows, is_nullable, _stream, _mr); - } - - decode_stream_data(num_dict_entries, - rows_to_skip, - _metadata.get_row_index_stride(), - level, - tz_table->view(), - chunks, - row_groups, - out_buffers[level], - _stream, - _mr); - - if (nested_col.size()) { - // Extract information to process nested child columns - scan_null_counts(chunks, null_count_prefix_sums[level], _stream); - - row_groups.device_to_host_sync(_stream); - aggregate_child_meta( - level, _selected_columns, chunks, row_groups, nested_col, out_buffers[level], col_meta); - - // ORC stores number of elements at each row, so we need to generate offsets from that - std::vector buff_data; - std::for_each( - out_buffers[level].begin(), out_buffers[level].end(), [&buff_data](auto& out_buffer) { - if (out_buffer.type.id() == type_id::LIST) { - auto data = static_cast(out_buffer.data()); - buff_data.emplace_back(list_buffer_data{data, out_buffer.size}); - } - }); - - if (not buff_data.empty()) { 
generate_offsets_for_list(buff_data, _stream); } - } - } - // Create columns from buffer with respective schema information. std::transform( _selected_columns.levels[0].begin(), @@ -1337,7 +108,7 @@ table_with_metadata reader::impl::read(uint64_t skip_rows, [&](auto const& orc_col_meta) { out_metadata.schema_info.emplace_back(""); auto col_buffer = assemble_buffer( - orc_col_meta.id, 0, col_meta, _metadata, _selected_columns, out_buffers, _stream, _mr); + orc_col_meta.id, 0, *_col_meta, _metadata, _selected_columns, _out_buffers, _stream, _mr); return make_column(col_buffer, &out_metadata.schema_info.back(), std::nullopt, _stream); }); diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 4a7771687f6..6561c08f2d9 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -17,11 +17,8 @@ #pragma once #include "aggregate_orc_metadata.hpp" -#include "orc.hpp" -#include "orc_gpu.hpp" #include -#include #include #include @@ -30,15 +27,13 @@ #include #include -#include -#include +#include #include namespace cudf::io::orc::detail { -namespace { struct reader_column_meta; -} +struct file_intermediate_data; /** * @brief Implementation for ORC reader. @@ -62,7 +57,7 @@ class reader::impl { * @brief Read an entire set or a subset of data and returns a set of columns * * @param skip_rows Number of rows to skip from the start - * @param num_rows_opt Optional number of rows to read + * @param num_rows_opt Optional number of rows to read, or `std::nullopt` to read all rows * @param stripes Indices of individual stripes to load if non-empty * @return The set of columns along with metadata */ @@ -71,18 +66,50 @@ class reader::impl { std::vector> const& stripes); private: + /** + * @brief Perform all the necessary data preprocessing before creating an output table. 
+ * + * @param skip_rows Number of rows to skip from the start + * @param num_rows_opt Optional number of rows to read, or `std::nullopt` to read all rows + * @param stripes Indices of individual stripes to load if non-empty + */ + void prepare_data(uint64_t skip_rows, + std::optional const& num_rows_opt, + std::vector> const& stripes); + + /** + * @brief Create the output table metadata from file metadata. + * + * @return Columns' metadata to output with the table read from file + */ + table_metadata make_output_metadata(); + + /** + * @brief Read a chunk of data from the input source and return an output table with metadata. + * + * This function is called internally and expects all preprocessing steps have already been done. + * + * @return The output table along with columns' metadata + */ + table_with_metadata read_chunk_internal(); + rmm::cuda_stream_view const _stream; rmm::mr::device_memory_resource* const _mr; - std::vector> const _sources; // Unused but owns data for `_metadata` - aggregate_orc_metadata _metadata; - column_hierarchy const _selected_columns; // Need to be after _metadata - + // Reader configs data_type const _timestamp_type; // Override output timestamp resolution bool const _use_index; // Enable or disable attempt to use row index for parsing bool const _use_np_dtypes; // Enable or disable the conversion to numpy-compatible dtypes std::vector const _decimal128_columns; // Control decimals conversion std::unique_ptr const _col_meta; // Track of orc mapping and child details + + // Intermediate data for internal processing. 
+ std::vector> const _sources; // Unused but owns data for `_metadata` + aggregate_orc_metadata _metadata; + column_hierarchy const _selected_columns; // Construct from `_metadata` thus declare after it + std::unique_ptr _file_itm_data; + std::unique_ptr _output_metadata; + std::vector> _out_buffers; }; } // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp new file mode 100644 index 00000000000..44ece671155 --- /dev/null +++ b/cpp/src/io/orc/reader_impl_chunking.hpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "orc_gpu.hpp" + +#include + +#include + +#include +#include + +namespace cudf::io::orc::detail { + +/** + * @brief Struct to store file-level data that remains constant for all chunks being read. + */ +struct file_intermediate_data { + std::vector> lvl_stripe_data; + std::vector>> null_count_prefix_sums; + + int64_t rows_to_skip; + size_type rows_to_read; + std::vector selected_stripes; +}; + +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/reader_impl_helpers.cpp b/cpp/src/io/orc/reader_impl_helpers.cpp new file mode 100644 index 00000000000..ea4e5dcfaab --- /dev/null +++ b/cpp/src/io/orc/reader_impl_helpers.cpp @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "reader_impl_helpers.hpp" + +namespace cudf::io::orc::detail { + +std::unique_ptr create_empty_column(size_type orc_col_id, + aggregate_orc_metadata const& metadata, + host_span decimal128_columns, + bool use_np_dtypes, + data_type timestamp_type, + column_name_info& schema_info, + rmm::cuda_stream_view stream) +{ + schema_info.name = metadata.column_name(0, orc_col_id); + auto const kind = metadata.get_col_type(orc_col_id).kind; + auto const type = to_cudf_type(kind, + use_np_dtypes, + timestamp_type.id(), + to_cudf_decimal_type(decimal128_columns, metadata, orc_col_id)); + + switch (kind) { + case orc::LIST: { + schema_info.children.emplace_back("offsets"); + schema_info.children.emplace_back(""); + return make_lists_column(0, + make_empty_column(type_id::INT32), + create_empty_column(metadata.get_col_type(orc_col_id).subtypes[0], + metadata, + decimal128_columns, + use_np_dtypes, + timestamp_type, + schema_info.children.back(), + stream), + 0, + rmm::device_buffer{0, stream}, + stream); + } + case orc::MAP: { + schema_info.children.emplace_back("offsets"); + schema_info.children.emplace_back("struct"); + auto const child_column_ids = metadata.get_col_type(orc_col_id).subtypes; + auto& children_schema = schema_info.children.back().children; + std::vector> child_columns; + for (std::size_t idx = 0; idx < metadata.get_col_type(orc_col_id).subtypes.size(); idx++) { + 
children_schema.emplace_back(""); + child_columns.push_back(create_empty_column(child_column_ids[idx], + metadata, + decimal128_columns, + use_np_dtypes, + timestamp_type, + schema_info.children.back().children.back(), + stream)); + children_schema[idx].name = get_map_child_col_name(idx); + } + return make_lists_column( + 0, + make_empty_column(type_id::INT32), + make_structs_column(0, std::move(child_columns), 0, rmm::device_buffer{0, stream}, stream), + 0, + rmm::device_buffer{0, stream}, + stream); + } + + case orc::STRUCT: { + std::vector> child_columns; + for (auto const col : metadata.get_col_type(orc_col_id).subtypes) { + schema_info.children.emplace_back(""); + child_columns.push_back(create_empty_column(col, + metadata, + decimal128_columns, + use_np_dtypes, + timestamp_type, + schema_info.children.back(), + stream)); + } + return make_structs_column( + 0, std::move(child_columns), 0, rmm::device_buffer{0, stream}, stream); + } + + case orc::DECIMAL: { + int32_t scale = 0; + if (type == type_id::DECIMAL32 or type == type_id::DECIMAL64 or type == type_id::DECIMAL128) { + scale = -static_cast(metadata.get_types()[orc_col_id].scale.value_or(0)); + } + return make_empty_column(data_type(type, scale)); + } + + default: return make_empty_column(type); + } +} + +column_buffer assemble_buffer(size_type orc_col_id, + std::size_t level, + reader_column_meta const& col_meta, + aggregate_orc_metadata const& metadata, + column_hierarchy const& selected_columns, + std::vector>& col_buffers, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const col_id = col_meta.orc_col_map[level][orc_col_id]; + auto& col_buffer = col_buffers[level][col_id]; + + col_buffer.name = metadata.column_name(0, orc_col_id); + auto kind = metadata.get_col_type(orc_col_id).kind; + switch (kind) { + case orc::LIST: + case orc::STRUCT: { + auto const& children_indices = selected_columns.children.at(orc_col_id); + for (auto const child_id : children_indices) { + 
col_buffer.children.emplace_back(assemble_buffer( + child_id, level + 1, col_meta, metadata, selected_columns, col_buffers, stream, mr)); + } + } break; + + case orc::MAP: { + std::vector child_col_buffers; + // Get child buffers + auto const& children_indices = selected_columns.children.at(orc_col_id); + for (std::size_t idx = 0; idx < children_indices.size(); idx++) { + auto const col = children_indices[idx]; + child_col_buffers.emplace_back(assemble_buffer( + col, level + 1, col_meta, metadata, selected_columns, col_buffers, stream, mr)); + child_col_buffers.back().name = get_map_child_col_name(idx); + } + // Create a struct buffer + auto num_rows = child_col_buffers[0].size; + auto struct_buffer = + column_buffer(cudf::data_type(type_id::STRUCT), num_rows, false, stream, mr); + struct_buffer.children = std::move(child_col_buffers); + struct_buffer.name = "struct"; + + col_buffer.children.emplace_back(std::move(struct_buffer)); + } break; + + default: break; + } + + return std::move(col_buffer); +} + +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/reader_impl_helpers.hpp b/cpp/src/io/orc/reader_impl_helpers.hpp new file mode 100644 index 00000000000..f0d91c75fc3 --- /dev/null +++ b/cpp/src/io/orc/reader_impl_helpers.hpp @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "aggregate_orc_metadata.hpp" +#include "orc.hpp" + +#include + +#include + +#include + +#include +#include + +namespace cudf::io::orc::detail { +using namespace cudf::io::detail; + +/** + * @brief Keeps track of orc mapping and child column details. + */ +struct reader_column_meta { + // Mapping between column id in orc to processing order. + std::vector> orc_col_map; + + // Number of rows in child columns. + std::vector num_child_rows; + + // Consists of parent column valid_map and null count. + std::vector parent_column_data; + + std::vector parent_column_index; + + // Start row of child columns [stripe][column]. + std::vector child_start_row; + + // Number of rows of child columns [stripe][column]. + std::vector num_child_rows_per_stripe; + + struct row_group_meta { + uint32_t num_rows; // number of rows in a column in a row group + uint32_t start_row; // start row in a column in a row group + }; + + // Row group metadata [rowgroup][column]. + std::vector rwgrp_meta; +}; + +/** + * @brief Function that translates ORC data kind to cuDF type enum + */ +inline constexpr type_id to_cudf_type(orc::TypeKind kind, + bool use_np_dtypes, + type_id timestamp_type_id, + type_id decimal_type_id) +{ + switch (kind) { + case orc::BOOLEAN: return type_id::BOOL8; + case orc::BYTE: return type_id::INT8; + case orc::SHORT: return type_id::INT16; + case orc::INT: return type_id::INT32; + case orc::LONG: return type_id::INT64; + case orc::FLOAT: return type_id::FLOAT32; + case orc::DOUBLE: return type_id::FLOAT64; + case orc::STRING: + case orc::BINARY: + case orc::VARCHAR: + case orc::CHAR: + // Variable-length types can all be mapped to STRING + return type_id::STRING; + case orc::TIMESTAMP: + return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id + : type_id::TIMESTAMP_NANOSECONDS; + case orc::DATE: + // There isn't a (DAYS -> np.dtype) mapping + return (use_np_dtypes) ? 
type_id::TIMESTAMP_MILLISECONDS : type_id::TIMESTAMP_DAYS; + case orc::DECIMAL: return decimal_type_id; + // Need to update once cuDF plans to support map type + case orc::MAP: + case orc::LIST: return type_id::LIST; + case orc::STRUCT: return type_id::STRUCT; + default: break; + } + + return type_id::EMPTY; +} + +/** + * @brief Determines cuDF type of an ORC Decimal column. + */ +inline type_id to_cudf_decimal_type(host_span decimal128_columns, + aggregate_orc_metadata const& metadata, + int column_index) +{ + if (metadata.get_col_type(column_index).kind != DECIMAL) { return type_id::EMPTY; } + + if (std::find(decimal128_columns.begin(), + decimal128_columns.end(), + metadata.column_path(0, column_index)) != decimal128_columns.end()) { + return type_id::DECIMAL128; + } + + auto const precision = metadata.get_col_type(column_index) + .precision.value_or(cuda::std::numeric_limits::digits10); + if (precision <= cuda::std::numeric_limits::digits10) { return type_id::DECIMAL32; } + if (precision <= cuda::std::numeric_limits::digits10) { return type_id::DECIMAL64; } + return type_id::DECIMAL128; +} + +inline std::string get_map_child_col_name(std::size_t const idx) +{ + return (idx == 0) ? "key" : "value"; +} + +/** + * @brief Create empty columns and respective schema information from the buffer. + */ +std::unique_ptr create_empty_column(size_type orc_col_id, + aggregate_orc_metadata const& metadata, + host_span decimal128_columns, + bool use_np_dtypes, + data_type timestamp_type, + column_name_info& schema_info, + rmm::cuda_stream_view stream); + +/** + * @brief Assemble the buffer with child columns. 
+ */ +column_buffer assemble_buffer(size_type orc_col_id, + std::size_t level, + reader_column_meta const& col_meta, + aggregate_orc_metadata const& metadata, + column_hierarchy const& selected_columns, + std::vector>& col_buffers, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu new file mode 100644 index 00000000000..179afa12bd5 --- /dev/null +++ b/cpp/src/io/orc/reader_impl_preprocess.cu @@ -0,0 +1,1048 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "reader_impl.hpp" +#include "reader_impl_chunking.hpp" +#include "reader_impl_helpers.hpp" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace cudf::io::orc::detail { + +namespace { + +/** + * @brief Struct that maps ORC streams to columns + */ +struct orc_stream_info { + explicit orc_stream_info(uint64_t offset_, + std::size_t dst_pos_, + uint32_t length_, + uint32_t stripe_idx_) + : offset(offset_), dst_pos(dst_pos_), length(length_), stripe_idx(stripe_idx_) + { + } + uint64_t offset; // offset in file + std::size_t dst_pos; // offset in memory relative to start of compressed stripe data + std::size_t length; // length in file + uint32_t stripe_idx; // stripe index +}; + +/** + * @brief Function that populates column descriptors stream/chunk + */ +std::size_t gather_stream_info(std::size_t stripe_index, + orc::StripeInformation const* stripeinfo, + orc::StripeFooter const* stripefooter, + host_span orc2gdf, + host_span types, + bool use_index, + bool apply_struct_map, + std::size_t* num_dictionary_entries, + std::vector& stream_info, + cudf::detail::hostdevice_2dvector& chunks) +{ + uint64_t src_offset = 0; + uint64_t dst_offset = 0; + + auto const get_stream_index_type = [](orc::StreamKind kind) { + switch (kind) { + case orc::DATA: return gpu::CI_DATA; + case orc::LENGTH: + case orc::SECONDARY: return gpu::CI_DATA2; + case orc::DICTIONARY_DATA: return gpu::CI_DICTIONARY; + case orc::PRESENT: return gpu::CI_PRESENT; + case orc::ROW_INDEX: return gpu::CI_INDEX; + default: + // Skip this stream as it's not strictly required + return gpu::CI_NUM_STREAMS; + } + }; + + for (auto const& stream : stripefooter->streams) { + if (!stream.column_id || *stream.column_id >= orc2gdf.size()) { + dst_offset += stream.length; + continue; + } + + auto const 
column_id = *stream.column_id; + auto col = orc2gdf[column_id]; + + if (col == -1 and apply_struct_map) { + // A struct-type column has no data itself, but rather child columns + // for each of its fields. There is only a PRESENT stream, which + // needs to be included for the reader. + auto const schema_type = types[column_id]; + if (not schema_type.subtypes.empty()) { + if (schema_type.kind == orc::STRUCT && stream.kind == orc::PRESENT) { + for (auto const& idx : schema_type.subtypes) { + auto child_idx = (idx < orc2gdf.size()) ? orc2gdf[idx] : -1; + if (child_idx >= 0) { + col = child_idx; + auto& chunk = chunks[stripe_index][col]; + chunk.strm_id[gpu::CI_PRESENT] = stream_info.size(); + chunk.strm_len[gpu::CI_PRESENT] = stream.length; + } + } + } + } + } + if (col != -1) { + if (src_offset >= stripeinfo->indexLength || use_index) { + auto& chunk = chunks[stripe_index][col]; + auto const index_type = get_stream_index_type(stream.kind); + if (index_type < gpu::CI_NUM_STREAMS) { + chunk.strm_id[index_type] = stream_info.size(); + chunk.strm_len[index_type] = stream.length; + // NOTE: skip_count field is temporarily used to track the presence of index streams + chunk.skip_count |= 1 << index_type; + + if (index_type == gpu::CI_DICTIONARY) { + chunk.dictionary_start = *num_dictionary_entries; + chunk.dict_len = stripefooter->columns[column_id].dictionarySize; + *num_dictionary_entries += stripefooter->columns[column_id].dictionarySize; + } + } + } + stream_info.emplace_back( + stripeinfo->offset + src_offset, dst_offset, stream.length, stripe_index); + dst_offset += stream.length; + } + src_offset += stream.length; + } + + return dst_offset; +} + +/** + * @brief Decompresses the stripe data, at stream granularity. 
+ * + * @param decompressor Block decompressor + * @param stripe_data List of source stripe column data + * @param stream_info List of stream to column mappings + * @param chunks Vector of list of column chunk descriptors + * @param row_groups Vector of list of row index descriptors + * @param num_stripes Number of stripes making up column chunks + * @param row_index_stride Distance between each row index + * @param use_base_stride Whether to use base stride obtained from meta or use the computed value + * @param stream CUDA stream used for device memory operations and kernel launches + * @return Device buffer to decompressed page data + */ +rmm::device_buffer decompress_stripe_data( + OrcDecompressor const& decompressor, + host_span stripe_data, + host_span stream_info, + cudf::detail::hostdevice_2dvector& chunks, + cudf::detail::hostdevice_2dvector& row_groups, + std::size_t num_stripes, + std::size_t row_index_stride, + bool use_base_stride, + rmm::cuda_stream_view stream) +{ + // Parse the columns' compressed info + cudf::detail::hostdevice_vector compinfo( + 0, stream_info.size(), stream); + for (auto const& info : stream_info) { + compinfo.push_back(gpu::CompressedStreamInfo( + static_cast(stripe_data[info.stripe_idx].data()) + info.dst_pos, + info.length)); + } + compinfo.host_to_device_async(stream); + + gpu::ParseCompressedStripeData(compinfo.device_ptr(), + compinfo.size(), + decompressor.GetBlockSize(), + decompressor.GetLog2MaxCompressionRatio(), + stream); + compinfo.device_to_host_sync(stream); + + // Count the exact number of compressed blocks + std::size_t num_compressed_blocks = 0; + std::size_t num_uncompressed_blocks = 0; + std::size_t total_decomp_size = 0; + for (std::size_t i = 0; i < compinfo.size(); ++i) { + num_compressed_blocks += compinfo[i].num_compressed_blocks; + num_uncompressed_blocks += compinfo[i].num_uncompressed_blocks; + total_decomp_size += compinfo[i].max_uncompressed_size; + } + CUDF_EXPECTS( + not((num_uncompressed_blocks + 
num_compressed_blocks > 0) and (total_decomp_size == 0)), + "Inconsistent info on compression blocks"); + + // Buffer needs to be padded. + // Required by `gpuDecodeOrcColumnData`. + rmm::device_buffer decomp_data( + cudf::util::round_up_safe(total_decomp_size, BUFFER_PADDING_MULTIPLE), stream); + if (decomp_data.is_empty()) { return decomp_data; } + + rmm::device_uvector> inflate_in( + num_compressed_blocks + num_uncompressed_blocks, stream); + rmm::device_uvector> inflate_out( + num_compressed_blocks + num_uncompressed_blocks, stream); + rmm::device_uvector inflate_res(num_compressed_blocks, stream); + thrust::fill(rmm::exec_policy(stream), + inflate_res.begin(), + inflate_res.end(), + compression_result{0, compression_status::FAILURE}); + + // Parse again to populate the decompression input/output buffers + std::size_t decomp_offset = 0; + uint32_t max_uncomp_block_size = 0; + uint32_t start_pos = 0; + auto start_pos_uncomp = (uint32_t)num_compressed_blocks; + for (std::size_t i = 0; i < compinfo.size(); ++i) { + auto dst_base = static_cast(decomp_data.data()); + compinfo[i].uncompressed_data = dst_base + decomp_offset; + compinfo[i].dec_in_ctl = inflate_in.data() + start_pos; + compinfo[i].dec_out_ctl = inflate_out.data() + start_pos; + compinfo[i].dec_res = {inflate_res.data() + start_pos, compinfo[i].num_compressed_blocks}; + compinfo[i].copy_in_ctl = inflate_in.data() + start_pos_uncomp; + compinfo[i].copy_out_ctl = inflate_out.data() + start_pos_uncomp; + + stream_info[i].dst_pos = decomp_offset; + decomp_offset += compinfo[i].max_uncompressed_size; + start_pos += compinfo[i].num_compressed_blocks; + start_pos_uncomp += compinfo[i].num_uncompressed_blocks; + max_uncomp_block_size = + std::max(max_uncomp_block_size, compinfo[i].max_uncompressed_block_size); + } + compinfo.host_to_device_async(stream); + gpu::ParseCompressedStripeData(compinfo.device_ptr(), + compinfo.size(), + decompressor.GetBlockSize(), + decompressor.GetLog2MaxCompressionRatio(), + 
stream); + + // Value for checking whether we decompress successfully. + // It doesn't need to be atomic as there is no race condition: we only write `true` if needed. + cudf::detail::hostdevice_vector any_block_failure(1, stream); + any_block_failure[0] = false; + any_block_failure.host_to_device_async(stream); + + // Dispatch batches of blocks to decompress + if (num_compressed_blocks > 0) { + device_span> inflate_in_view{inflate_in.data(), + num_compressed_blocks}; + device_span> inflate_out_view{inflate_out.data(), num_compressed_blocks}; + switch (decompressor.compression()) { + case compression_type::ZLIB: + if (nvcomp::is_decompression_disabled(nvcomp::compression_type::DEFLATE)) { + gpuinflate( + inflate_in_view, inflate_out_view, inflate_res, gzip_header_included::NO, stream); + } else { + nvcomp::batched_decompress(nvcomp::compression_type::DEFLATE, + inflate_in_view, + inflate_out_view, + inflate_res, + max_uncomp_block_size, + total_decomp_size, + stream); + } + break; + case compression_type::SNAPPY: + if (nvcomp::is_decompression_disabled(nvcomp::compression_type::SNAPPY)) { + gpu_unsnap(inflate_in_view, inflate_out_view, inflate_res, stream); + } else { + nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, + inflate_in_view, + inflate_out_view, + inflate_res, + max_uncomp_block_size, + total_decomp_size, + stream); + } + break; + case compression_type::ZSTD: + if (auto const reason = nvcomp::is_decompression_disabled(nvcomp::compression_type::ZSTD); + reason) { + CUDF_FAIL("Decompression error: " + reason.value()); + } + nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, + inflate_in_view, + inflate_out_view, + inflate_res, + max_uncomp_block_size, + total_decomp_size, + stream); + break; + default: CUDF_FAIL("Unexpected decompression dispatch"); break; + } + + // Check if any block has been failed to decompress. + // Not using `thrust::any` or `thrust::count_if` to defer stream sync. 
+ thrust::for_each( + rmm::exec_policy(stream), + thrust::make_counting_iterator(std::size_t{0}), + thrust::make_counting_iterator(inflate_res.size()), + [results = inflate_res.begin(), + any_block_failure = any_block_failure.device_ptr()] __device__(auto const idx) { + if (results[idx].status != compression_status::SUCCESS) { *any_block_failure = true; } + }); + } + + if (num_uncompressed_blocks > 0) { + device_span> copy_in_view{inflate_in.data() + num_compressed_blocks, + num_uncompressed_blocks}; + device_span> copy_out_view{inflate_out.data() + num_compressed_blocks, + num_uncompressed_blocks}; + gpu_copy_uncompressed_blocks(copy_in_view, copy_out_view, stream); + } + + // Copy without stream sync, thus need to wait for stream sync below to access. + any_block_failure.device_to_host_async(stream); + + gpu::PostDecompressionReassemble(compinfo.device_ptr(), compinfo.size(), stream); + compinfo.device_to_host_sync(stream); // This also sync stream for `any_block_failure`. + + // We can check on host after stream synchronize + CUDF_EXPECTS(not any_block_failure[0], "Error during decompression"); + + auto const num_columns = chunks.size().second; + + // Update the stream information with the updated uncompressed info + // TBD: We could update the value from the information we already + // have in stream_info[], but using the gpu results also updates + // max_uncompressed_size to the actual uncompressed size, or zero if + // decompression failed. 
+ for (std::size_t i = 0; i < num_stripes; ++i) { + for (std::size_t j = 0; j < num_columns; ++j) { + auto& chunk = chunks[i][j]; + for (int k = 0; k < gpu::CI_NUM_STREAMS; ++k) { + if (chunk.strm_len[k] > 0 && chunk.strm_id[k] < compinfo.size()) { + chunk.streams[k] = compinfo[chunk.strm_id[k]].uncompressed_data; + chunk.strm_len[k] = compinfo[chunk.strm_id[k]].max_uncompressed_size; + } + } + } + } + + if (row_groups.size().first) { + chunks.host_to_device_async(stream); + row_groups.host_to_device_async(stream); + gpu::ParseRowGroupIndex(row_groups.base_device_ptr(), + compinfo.device_ptr(), + chunks.base_device_ptr(), + num_columns, + num_stripes, + row_groups.size().first, + row_index_stride, + use_base_stride, + stream); + } + + return decomp_data; +} + +/** + * @brief Updates null mask of columns whose parent is a struct column. + * + * If struct column has null element, that row would be skipped while writing child column in ORC, + * so we need to insert the missing null elements in child column. There is another behavior from + * pyspark, where if the child column doesn't have any null elements, it will not have present + * stream, so in that case parent null mask need to be copied to child column. + * + * @param chunks Vector of list of column chunk descriptors + * @param out_buffers Output columns' device buffers + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource to use for device memory allocation + */ +void update_null_mask(cudf::detail::hostdevice_2dvector& chunks, + host_span out_buffers, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const num_stripes = chunks.size().first; + auto const num_columns = chunks.size().second; + bool is_mask_updated = false; + + for (std::size_t col_idx = 0; col_idx < num_columns; ++col_idx) { + if (chunks[0][col_idx].parent_validity_info.valid_map_base != nullptr) { + if (not is_mask_updated) { + chunks.device_to_host_sync(stream); + is_mask_updated = true; + } + + auto parent_valid_map_base = chunks[0][col_idx].parent_validity_info.valid_map_base; + auto child_valid_map_base = out_buffers[col_idx].null_mask(); + auto child_mask_len = + chunks[0][col_idx].column_num_rows - chunks[0][col_idx].parent_validity_info.null_count; + auto parent_mask_len = chunks[0][col_idx].column_num_rows; + + if (child_valid_map_base != nullptr) { + rmm::device_uvector dst_idx(child_mask_len, stream); + // Copy indexes at which the parent has valid value. 
+ thrust::copy_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + parent_mask_len, + dst_idx.begin(), + [parent_valid_map_base] __device__(auto idx) { + return bit_is_set(parent_valid_map_base, idx); + }); + + auto merged_null_mask = cudf::detail::create_null_mask( + parent_mask_len, mask_state::ALL_NULL, rmm::cuda_stream_view(stream), mr); + auto merged_mask = static_cast(merged_null_mask.data()); + uint32_t* dst_idx_ptr = dst_idx.data(); + // Copy child valid bits from child column to valid indexes, this will merge both child + // and parent null masks + thrust::for_each(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + dst_idx.size(), + [child_valid_map_base, dst_idx_ptr, merged_mask] __device__(auto idx) { + if (bit_is_set(child_valid_map_base, idx)) { + cudf::set_bit(merged_mask, dst_idx_ptr[idx]); + }; + }); + + out_buffers[col_idx].set_null_mask(std::move(merged_null_mask)); + + } else { + // Since child column doesn't have a mask, copy parent null mask + auto mask_size = bitmask_allocation_size_bytes(parent_mask_len); + out_buffers[col_idx].set_null_mask( + rmm::device_buffer(static_cast(parent_valid_map_base), mask_size, stream, mr)); + } + } + } + + if (is_mask_updated) { + // Update chunks with pointers to column data which might have been changed. + for (std::size_t stripe_idx = 0; stripe_idx < num_stripes; ++stripe_idx) { + for (std::size_t col_idx = 0; col_idx < num_columns; ++col_idx) { + auto& chunk = chunks[stripe_idx][col_idx]; + chunk.valid_map_base = out_buffers[col_idx].null_mask(); + } + } + chunks.host_to_device_sync(stream); + } +} + +/** + * @brief Converts the stripe column data and outputs to columns. 
+ * + * @param num_dicts Number of dictionary entries required + * @param skip_rows Number of rows to offset from start + * @param row_index_stride Distance between each row index + * @param level Current nesting level being processed + * @param tz_table Local time to UTC conversion table + * @param chunks Vector of list of column chunk descriptors + * @param row_groups Vector of list of row index descriptors + * @param out_buffers Output columns' device buffers + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation + */ +void decode_stream_data(std::size_t num_dicts, + std::size_t skip_rows, + std::size_t row_index_stride, + std::size_t level, + table_view const& tz_table, + cudf::detail::hostdevice_2dvector& chunks, + cudf::detail::device_2dspan row_groups, + std::vector& out_buffers, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const num_stripes = chunks.size().first; + auto const num_columns = chunks.size().second; + thrust::counting_iterator col_idx_it(0); + thrust::counting_iterator stripe_idx_it(0); + + // Update chunks with pointers to column data + std::for_each(stripe_idx_it, stripe_idx_it + num_stripes, [&](auto stripe_idx) { + std::for_each(col_idx_it, col_idx_it + num_columns, [&](auto col_idx) { + auto& chunk = chunks[stripe_idx][col_idx]; + chunk.column_data_base = out_buffers[col_idx].data(); + chunk.valid_map_base = out_buffers[col_idx].null_mask(); + }); + }); + + // Allocate global dictionary for deserializing + rmm::device_uvector global_dict(num_dicts, stream); + + chunks.host_to_device_sync(stream); + gpu::DecodeNullsAndStringDictionaries( + chunks.base_device_ptr(), global_dict.data(), num_columns, num_stripes, skip_rows, stream); + + if (level > 0) { + // Update nullmasks for children if parent was a struct and had null mask + update_null_mask(chunks, out_buffers, stream, mr); + } + + auto const 
tz_table_dptr = table_device_view::create(tz_table, stream); + rmm::device_scalar error_count(0, stream); + // Update the null map for child columns + gpu::DecodeOrcColumnData(chunks.base_device_ptr(), + global_dict.data(), + row_groups, + num_columns, + num_stripes, + skip_rows, + *tz_table_dptr, + row_groups.size().first, + row_index_stride, + level, + error_count.data(), + stream); + chunks.device_to_host_async(stream); + // `value` synchronizes + auto const num_errors = error_count.value(stream); + CUDF_EXPECTS(num_errors == 0, "ORC data decode failed"); + + std::for_each(col_idx_it + 0, col_idx_it + num_columns, [&](auto col_idx) { + out_buffers[col_idx].null_count() = + std::accumulate(stripe_idx_it + 0, + stripe_idx_it + num_stripes, + 0, + [&](auto null_count, auto const stripe_idx) { + return null_count + chunks[stripe_idx][col_idx].null_count; + }); + }); +} + +/** + * @brief Compute the per-stripe prefix sum of null count, for each struct column in the current + * layer. + */ +void scan_null_counts(cudf::detail::hostdevice_2dvector const& chunks, + cudf::host_span> prefix_sums, + rmm::cuda_stream_view stream) +{ + auto const num_stripes = chunks.size().first; + if (num_stripes == 0) return; + + auto const num_columns = chunks.size().second; + std::vector>> prefix_sums_to_update; + for (auto col_idx = 0ul; col_idx < num_columns; ++col_idx) { + // Null counts sums are only needed for children of struct columns + if (chunks[0][col_idx].type_kind == STRUCT) { + prefix_sums_to_update.emplace_back(col_idx, prefix_sums[col_idx]); + } + } + auto const d_prefix_sums_to_update = cudf::detail::make_device_uvector_async( + prefix_sums_to_update, stream, rmm::mr::get_current_device_resource()); + + thrust::for_each(rmm::exec_policy(stream), + d_prefix_sums_to_update.begin(), + d_prefix_sums_to_update.end(), + [chunks = cudf::detail::device_2dspan{chunks}] __device__( + auto const& idx_psums) { + auto const col_idx = idx_psums.first; + auto const psums = 
idx_psums.second; + + thrust::transform( + thrust::seq, + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + psums.size(), + psums.begin(), + [&](auto stripe_idx) { return chunks[stripe_idx][col_idx].null_count; }); + + thrust::inclusive_scan(thrust::seq, psums.begin(), psums.end(), psums.begin()); + }); + // `prefix_sums_to_update` goes out of scope, copy has to be done before we return + stream.synchronize(); +} + +/** + * @brief Aggregate child metadata from parent column chunks. + */ +void aggregate_child_meta(std::size_t level, + cudf::io::orc::detail::column_hierarchy const& selected_columns, + cudf::detail::host_2dspan chunks, + cudf::detail::host_2dspan row_groups, + host_span nested_cols, + host_span out_buffers, + reader_column_meta& col_meta) +{ + auto const num_of_stripes = chunks.size().first; + auto const num_of_rowgroups = row_groups.size().first; + auto const num_child_cols = selected_columns.levels[level + 1].size(); + auto const number_of_child_chunks = num_child_cols * num_of_stripes; + auto& num_child_rows = col_meta.num_child_rows; + auto& parent_column_data = col_meta.parent_column_data; + + // Reset the meta to store child column details. 
+ num_child_rows.resize(selected_columns.levels[level + 1].size()); + std::fill(num_child_rows.begin(), num_child_rows.end(), 0); + parent_column_data.resize(number_of_child_chunks); + col_meta.parent_column_index.resize(number_of_child_chunks); + col_meta.child_start_row.resize(number_of_child_chunks); + col_meta.num_child_rows_per_stripe.resize(number_of_child_chunks); + col_meta.rwgrp_meta.resize(num_of_rowgroups * num_child_cols); + + auto child_start_row = cudf::detail::host_2dspan( + col_meta.child_start_row.data(), num_of_stripes, num_child_cols); + auto num_child_rows_per_stripe = cudf::detail::host_2dspan( + col_meta.num_child_rows_per_stripe.data(), num_of_stripes, num_child_cols); + auto rwgrp_meta = cudf::detail::host_2dspan( + col_meta.rwgrp_meta.data(), num_of_rowgroups, num_child_cols); + + int index = 0; // number of child column processed + + // For each parent column, update its child column meta for each stripe. + std::for_each(nested_cols.begin(), nested_cols.end(), [&](auto const p_col) { + auto const parent_col_idx = col_meta.orc_col_map[level][p_col.id]; + auto start_row = 0; + auto processed_row_groups = 0; + + for (std::size_t stripe_id = 0; stripe_id < num_of_stripes; stripe_id++) { + // Aggregate num_rows and start_row from processed parent columns per row groups + if (num_of_rowgroups) { + auto stripe_num_row_groups = chunks[stripe_id][parent_col_idx].num_rowgroups; + auto processed_child_rows = 0; + + for (std::size_t rowgroup_id = 0; rowgroup_id < stripe_num_row_groups; + rowgroup_id++, processed_row_groups++) { + auto const child_rows = row_groups[processed_row_groups][parent_col_idx].num_child_rows; + for (size_type id = 0; id < p_col.num_children; id++) { + auto const child_col_idx = index + id; + rwgrp_meta[processed_row_groups][child_col_idx].start_row = processed_child_rows; + rwgrp_meta[processed_row_groups][child_col_idx].num_rows = child_rows; + } + processed_child_rows += child_rows; + } + } + + // Aggregate start row, number 
of rows per chunk and total number of rows in a column + auto const child_rows = chunks[stripe_id][parent_col_idx].num_child_rows; + for (size_type id = 0; id < p_col.num_children; id++) { + auto const child_col_idx = index + id; + + num_child_rows[child_col_idx] += child_rows; + num_child_rows_per_stripe[stripe_id][child_col_idx] = child_rows; + // start row could be different for each column when there is nesting at each stripe level + child_start_row[stripe_id][child_col_idx] = (stripe_id == 0) ? 0 : start_row; + } + start_row += child_rows; + } + + // Parent column null mask and null count would be required for child column + // to adjust its nullmask. + auto type = out_buffers[parent_col_idx].type.id(); + auto parent_null_count = static_cast(out_buffers[parent_col_idx].null_count()); + auto parent_valid_map = out_buffers[parent_col_idx].null_mask(); + auto num_rows = out_buffers[parent_col_idx].size; + + for (size_type id = 0; id < p_col.num_children; id++) { + auto const child_col_idx = index + id; + col_meta.parent_column_index[child_col_idx] = parent_col_idx; + if (type == type_id::STRUCT) { + parent_column_data[child_col_idx] = {parent_valid_map, parent_null_count}; + // Number of rows in child will remain same as parent in case of struct column + num_child_rows[child_col_idx] = num_rows; + } else { + parent_column_data[child_col_idx] = {nullptr, 0}; + } + } + index += p_col.num_children; + }); +} + +/** + * @brief struct to store buffer data and size of list buffer + */ +struct list_buffer_data { + size_type* data; + size_type size; +}; + +// Generates offsets for list buffer from number of elements in a row. 
+void generate_offsets_for_list(host_span buff_data, rmm::cuda_stream_view stream) +{ + for (auto& list_data : buff_data) { + thrust::exclusive_scan(rmm::exec_policy_nosync(stream), + list_data.data, + list_data.data + list_data.size, + list_data.data); + } +} + +} // namespace + +void reader::impl::prepare_data(uint64_t skip_rows, + std::optional const& num_rows_opt, + std::vector> const& stripes) +{ + // Selected columns at different levels of nesting are stored in different elements + // of `selected_columns`; thus, size == 1 means no nested columns + CUDF_EXPECTS(skip_rows == 0 or _selected_columns.num_levels() == 1, + "skip_rows is not supported by nested columns"); + + // There are no columns in the table + if (_selected_columns.num_levels() == 0) { return; } + + _file_itm_data = std::make_unique(); + + // Select only stripes required (aka row groups) + std::tie( + _file_itm_data->rows_to_skip, _file_itm_data->rows_to_read, _file_itm_data->selected_stripes) = + _metadata.select_stripes(stripes, skip_rows, num_rows_opt, _stream); + auto const rows_to_skip = _file_itm_data->rows_to_skip; + auto const rows_to_read = _file_itm_data->rows_to_read; + auto const& selected_stripes = _file_itm_data->selected_stripes; + + // If no rows or stripes to read, return empty columns + if (rows_to_read == 0 || selected_stripes.empty()) { return; } + + // Set up table for converting timestamp columns from local to UTC time + auto const tz_table = [&, &selected_stripes = selected_stripes] { + auto const has_timestamp_column = std::any_of( + _selected_columns.levels.cbegin(), _selected_columns.levels.cend(), [&](auto const& col_lvl) { + return std::any_of(col_lvl.cbegin(), col_lvl.cend(), [&](auto const& col_meta) { + return _metadata.get_col_type(col_meta.id).kind == TypeKind::TIMESTAMP; + }); + }); + + return has_timestamp_column + ? 
cudf::detail::make_timezone_transition_table( + {}, selected_stripes[0].stripe_info[0].second->writerTimezone, _stream) + : std::make_unique(); + }(); + + auto& lvl_stripe_data = _file_itm_data->lvl_stripe_data; + auto& null_count_prefix_sums = _file_itm_data->null_count_prefix_sums; + lvl_stripe_data.resize(_selected_columns.num_levels()); + + _out_buffers.resize(_selected_columns.num_levels()); + + // Iterates through levels of nested columns, child column will be one level down + // compared to parent column. + auto& col_meta = *_col_meta; + for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) { + auto& columns_level = _selected_columns.levels[level]; + // Association between each ORC column and its cudf::column + col_meta.orc_col_map.emplace_back(_metadata.get_num_cols(), -1); + std::vector nested_cols; + + // Get a list of column data types + std::vector column_types; + for (auto& col : columns_level) { + auto col_type = to_cudf_type(_metadata.get_col_type(col.id).kind, + _use_np_dtypes, + _timestamp_type.id(), + to_cudf_decimal_type(_decimal128_columns, _metadata, col.id)); + CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type"); + if (col_type == type_id::DECIMAL32 or col_type == type_id::DECIMAL64 or + col_type == type_id::DECIMAL128) { + // sign of the scale is changed since cuDF follows c++ libraries like CNL + // which uses negative scaling, but liborc and other libraries + // follow positive scaling. + auto const scale = + -static_cast(_metadata.get_col_type(col.id).scale.value_or(0)); + column_types.emplace_back(col_type, scale); + } else { + column_types.emplace_back(col_type); + } + + // Map each ORC column to its column + col_meta.orc_col_map[level][col.id] = column_types.size() - 1; + if (col_type == type_id::LIST or col_type == type_id::STRUCT) { + nested_cols.emplace_back(col); + } + } + + // Get the total number of stripes across all input files. 
+ std::size_t total_num_stripes = + std::accumulate(selected_stripes.begin(), + selected_stripes.end(), + 0, + [](std::size_t sum, auto& stripe_source_mapping) { + return sum + stripe_source_mapping.stripe_info.size(); + }); + auto const num_columns = columns_level.size(); + cudf::detail::hostdevice_2dvector chunks( + total_num_stripes, num_columns, _stream); + memset(chunks.base_host_ptr(), 0, chunks.size_bytes()); + + const bool use_index = + _use_index && + // Do stripes have row group index + _metadata.is_row_grp_idx_present() && + // Only use if we don't have much work with complete columns & stripes + // TODO: Consider nrows, gpu, and tune the threshold + (rows_to_read > _metadata.get_row_index_stride() && !(_metadata.get_row_index_stride() & 7) && + _metadata.get_row_index_stride() > 0 && num_columns * total_num_stripes < 8 * 128) && + // Only use if first row is aligned to a stripe boundary + // TODO: Fix logic to handle unaligned rows + (rows_to_skip == 0); + + // Logically view streams as columns + std::vector stream_info; + + null_count_prefix_sums.emplace_back(); + null_count_prefix_sums.back().reserve(_selected_columns.levels[level].size()); + std::generate_n(std::back_inserter(null_count_prefix_sums.back()), + _selected_columns.levels[level].size(), + [&]() { + return cudf::detail::make_zeroed_device_uvector_async( + total_num_stripes, _stream, rmm::mr::get_current_device_resource()); + }); + + // Tracker for eventually deallocating compressed and uncompressed data + auto& stripe_data = lvl_stripe_data[level]; + + std::size_t stripe_start_row = 0; + std::size_t num_dict_entries = 0; + std::size_t num_rowgroups = 0; + int stripe_idx = 0; + + std::vector, std::size_t>> read_tasks; + for (auto const& stripe_source_mapping : selected_stripes) { + // Iterate through the source files selected stripes + for (auto const& stripe : stripe_source_mapping.stripe_info) { + auto const stripe_info = stripe.first; + auto const stripe_footer = stripe.second; + + auto 
stream_count = stream_info.size(); + auto const total_data_size = gather_stream_info(stripe_idx, + stripe_info, + stripe_footer, + col_meta.orc_col_map[level], + _metadata.get_types(), + use_index, + level == 0, + &num_dict_entries, + stream_info, + chunks); + + auto const is_stripe_data_empty = total_data_size == 0; + CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, + "Invalid index rowgroup stream data"); + + // Buffer needs to be padded. + // Required by `copy_uncompressed_kernel`. + stripe_data.emplace_back( + cudf::util::round_up_safe(total_data_size, BUFFER_PADDING_MULTIPLE), _stream); + auto dst_base = static_cast(stripe_data.back().data()); + + // Coalesce consecutive streams into one read + while (not is_stripe_data_empty and stream_count < stream_info.size()) { + auto const d_dst = dst_base + stream_info[stream_count].dst_pos; + auto const offset = stream_info[stream_count].offset; + auto len = stream_info[stream_count].length; + stream_count++; + + while (stream_count < stream_info.size() && + stream_info[stream_count].offset == offset + len) { + len += stream_info[stream_count].length; + stream_count++; + } + if (_metadata.per_file_metadata[stripe_source_mapping.source_idx] + .source->is_device_read_preferred(len)) { + read_tasks.push_back( + std::pair(_metadata.per_file_metadata[stripe_source_mapping.source_idx] + .source->device_read_async(offset, len, d_dst, _stream), + len)); + + } else { + auto const buffer = + _metadata.per_file_metadata[stripe_source_mapping.source_idx].source->host_read( + offset, len); + CUDF_EXPECTS(buffer->size() == len, "Unexpected discrepancy in bytes read."); + CUDF_CUDA_TRY( + cudaMemcpyAsync(d_dst, buffer->data(), len, cudaMemcpyDefault, _stream.value())); + _stream.synchronize(); + } + } + + auto const num_rows_per_stripe = stripe_info->numberOfRows; + auto const rowgroup_id = num_rowgroups; + auto stripe_num_rowgroups = 0; + if (use_index) { + stripe_num_rowgroups = (num_rows_per_stripe + 
_metadata.get_row_index_stride() - 1) / + _metadata.get_row_index_stride(); + } + // Update chunks to reference streams pointers + for (std::size_t col_idx = 0; col_idx < num_columns; col_idx++) { + auto& chunk = chunks[stripe_idx][col_idx]; + // start row, number of rows in a each stripe and total number of rows + // may change in lower levels of nesting + chunk.start_row = (level == 0) + ? stripe_start_row + : col_meta.child_start_row[stripe_idx * num_columns + col_idx]; + chunk.num_rows = + (level == 0) ? stripe_info->numberOfRows + : col_meta.num_child_rows_per_stripe[stripe_idx * num_columns + col_idx]; + chunk.column_num_rows = (level == 0) ? rows_to_read : col_meta.num_child_rows[col_idx]; + chunk.parent_validity_info = + (level == 0) ? column_validity_info{} : col_meta.parent_column_data[col_idx]; + chunk.parent_null_count_prefix_sums = + (level == 0) + ? nullptr + : null_count_prefix_sums[level - 1][col_meta.parent_column_index[col_idx]].data(); + chunk.encoding_kind = stripe_footer->columns[columns_level[col_idx].id].kind; + chunk.type_kind = _metadata.per_file_metadata[stripe_source_mapping.source_idx] + .ff.types[columns_level[col_idx].id] + .kind; + // num_child_rows for a struct column will be same, for other nested types it will be + // calculated. + chunk.num_child_rows = (chunk.type_kind != orc::STRUCT) ? 0 : chunk.num_rows; + chunk.dtype_id = column_types[col_idx].id(); + chunk.decimal_scale = _metadata.per_file_metadata[stripe_source_mapping.source_idx] + .ff.types[columns_level[col_idx].id] + .scale.value_or(0); + + chunk.rowgroup_id = rowgroup_id; + chunk.dtype_len = (column_types[col_idx].id() == type_id::STRING) + ? sizeof(string_index_pair) + : ((column_types[col_idx].id() == type_id::LIST) or + (column_types[col_idx].id() == type_id::STRUCT)) + ? 
sizeof(size_type) + : cudf::size_of(column_types[col_idx]); + chunk.num_rowgroups = stripe_num_rowgroups; + if (chunk.type_kind == orc::TIMESTAMP) { chunk.timestamp_type_id = _timestamp_type.id(); } + if (not is_stripe_data_empty) { + for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) { + chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos; + } + } + } + stripe_start_row += num_rows_per_stripe; + num_rowgroups += stripe_num_rowgroups; + + stripe_idx++; + } + } + for (auto& task : read_tasks) { + CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read."); + } + + if (stripe_data.empty()) { continue; } + + // Process dataset chunk pages into output columns + auto row_groups = + cudf::detail::hostdevice_2dvector(num_rowgroups, num_columns, _stream); + if (level > 0 and row_groups.size().first) { + cudf::host_span row_groups_span(row_groups.base_host_ptr(), + num_rowgroups * num_columns); + auto& rw_grp_meta = col_meta.rwgrp_meta; + + // Update start row and num rows per row group + std::transform(rw_grp_meta.begin(), + rw_grp_meta.end(), + row_groups_span.begin(), + rw_grp_meta.begin(), + [&](auto meta, auto& row_grp) { + row_grp.num_rows = meta.num_rows; + row_grp.start_row = meta.start_row; + return meta; + }); + } + // Setup row group descriptors if using indexes + if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { + auto decomp_data = decompress_stripe_data(*_metadata.per_file_metadata[0].decompressor, + stripe_data, + stream_info, + chunks, + row_groups, + total_num_stripes, + _metadata.get_row_index_stride(), + level == 0, + _stream); + stripe_data.clear(); + stripe_data.push_back(std::move(decomp_data)); + } else { + if (row_groups.size().first) { + chunks.host_to_device_async(_stream); + row_groups.host_to_device_async(_stream); + row_groups.host_to_device_async(_stream); + gpu::ParseRowGroupIndex(row_groups.base_device_ptr(), + nullptr, + chunks.base_device_ptr(), + num_columns, + total_num_stripes, + 
num_rowgroups, + _metadata.get_row_index_stride(), + level == 0, + _stream); + } + } + + for (std::size_t i = 0; i < column_types.size(); ++i) { + bool is_nullable = false; + for (std::size_t j = 0; j < total_num_stripes; ++j) { + if (chunks[j][i].strm_len[gpu::CI_PRESENT] != 0) { + is_nullable = true; + break; + } + } + auto is_list_type = (column_types[i].id() == type_id::LIST); + auto n_rows = (level == 0) ? rows_to_read : col_meta.num_child_rows[i]; + // For list column, offset column will be always size + 1 + if (is_list_type) n_rows++; + _out_buffers[level].emplace_back(column_types[i], n_rows, is_nullable, _stream, _mr); + } + + decode_stream_data(num_dict_entries, + rows_to_skip, + _metadata.get_row_index_stride(), + level, + tz_table->view(), + chunks, + row_groups, + _out_buffers[level], + _stream, + _mr); + + if (nested_cols.size()) { + // Extract information to process nested child columns + scan_null_counts(chunks, null_count_prefix_sums[level], _stream); + + row_groups.device_to_host_sync(_stream); + aggregate_child_meta( + level, _selected_columns, chunks, row_groups, nested_cols, _out_buffers[level], col_meta); + + // ORC stores number of elements at each row, so we need to generate offsets from that + std::vector buff_data; + std::for_each( + _out_buffers[level].begin(), _out_buffers[level].end(), [&buff_data](auto& out_buffer) { + if (out_buffer.type.id() == type_id::LIST) { + auto data = static_cast(out_buffer.data()); + buff_data.emplace_back(list_buffer_data{data, out_buffer.size}); + } + }); + + if (not buff_data.empty()) { generate_offsets_for_list(buff_data, _stream); } + } + } // end loop level +} + +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index 1afc0200bfa..31159ae0341 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -35,7 +35,7 @@ constexpr unsigned int init_threads_per_group = 32; constexpr unsigned int init_groups_per_block = 4; constexpr 
unsigned int init_threads_per_block = init_threads_per_group * init_groups_per_block; -__global__ void __launch_bounds__(init_threads_per_block) +CUDF_KERNEL void __launch_bounds__(init_threads_per_block) gpu_init_statistics_groups(statistics_group* groups, stats_column_desc const* cols, device_2dspan rowgroup_bounds) @@ -73,7 +73,7 @@ constexpr unsigned int pb_fldlen_common = pb_fld_hdrlen + (pb_fld_hdrlen + pb_fldlen_int64) + 2 * pb_fld_hdrlen; template -__global__ void __launch_bounds__(block_size, 1) +CUDF_KERNEL void __launch_bounds__(block_size, 1) gpu_init_statistics_buffersize(statistics_merge_group* groups, statistics_chunk const* chunks, uint32_t statistics_count) @@ -249,7 +249,7 @@ constexpr unsigned int encode_chunks_per_block = 4; constexpr unsigned int encode_threads_per_block = encode_threads_per_chunk * encode_chunks_per_block; -__global__ void __launch_bounds__(encode_threads_per_block) +CUDF_KERNEL void __launch_bounds__(encode_threads_per_block) gpu_encode_statistics(uint8_t* blob_bfr, statistics_merge_group* groups, statistics_chunk const* chunks, diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 0b249bbdafe..14072d79172 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -1082,7 +1082,7 @@ static __device__ int Decode_Decimals(orc_bytestream_s* bs, */ // blockDim {block_size,1,1} template -__global__ void __launch_bounds__(block_size) +CUDF_KERNEL void __launch_bounds__(block_size) gpuDecodeNullsAndStringDictionaries(ColumnDesc* chunks, DictionaryEntry* global_dictionary, uint32_t num_columns, @@ -1358,7 +1358,7 @@ static const __device__ __constant__ uint32_t kTimestampNanoScale[8] = { */ // blockDim {block_size,1,1} template -__global__ void __launch_bounds__(block_size) +CUDF_KERNEL void __launch_bounds__(block_size) gpuDecodeOrcColumnData(ColumnDesc* chunks, DictionaryEntry* global_dictionary, table_device_view tz_table, diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index b99826e070e..b7dd0ea9ec3 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -723,7 +723,7 @@ static __device__ void encode_null_mask(orcenc_state_s* s, */ // blockDim {`encode_block_size`,1,1} template -__global__ void __launch_bounds__(block_size) +CUDF_KERNEL void __launch_bounds__(block_size) gpuEncodeOrcColumnData(device_2dspan chunks, device_2dspan streams) { @@ -1008,7 +1008,7 @@ __global__ void __launch_bounds__(block_size) */ // blockDim {512,1,1} template -__global__ void __launch_bounds__(block_size) +CUDF_KERNEL void __launch_bounds__(block_size) gpuEncodeStringDictionaries(stripe_dictionary const* stripes, device_span columns, device_2dspan chunks, @@ -1091,7 +1091,7 @@ __global__ void __launch_bounds__(block_size) * @param[in,out] streams List of encoder chunk streams [column][rowgroup] */ // blockDim {compact_streams_block_size,1,1} -__global__ void __launch_bounds__(compact_streams_block_size) +CUDF_KERNEL void __launch_bounds__(compact_streams_block_size) gpuCompactOrcDataStreams(device_2dspan strm_desc, device_2dspan streams) { @@ -1136,7 +1136,7 @@ __global__ void __launch_bounds__(compact_streams_block_size) * @param[in] comp_block_align Required alignment for compressed blocks */ // blockDim {256,1,1} -__global__ void __launch_bounds__(256) +CUDF_KERNEL void __launch_bounds__(256) gpuInitCompressionBlocks(device_2dspan strm_desc, device_2dspan streams, // const? 
device_span> inputs, @@ -1191,7 +1191,7 @@ __global__ void __launch_bounds__(256) * @param[in] max_comp_blk_size Max size of any block after compression */ // blockDim {1024,1,1} -__global__ void __launch_bounds__(1024) +CUDF_KERNEL void __launch_bounds__(1024) gpuCompactCompressedBlocks(device_2dspan strm_desc, device_span const> inputs, device_span const> outputs, @@ -1274,8 +1274,8 @@ struct decimal_column_element_sizes { // Converts sizes of individual decimal elements to offsets within each row group // Conversion is done in-place template -__global__ void decimal_sizes_to_offsets_kernel(device_2dspan rg_bounds, - device_span sizes) +CUDF_KERNEL void decimal_sizes_to_offsets_kernel(device_2dspan rg_bounds, + device_span sizes) { using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage scan_storage; diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index b31a4a081d1..327b9557176 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -41,7 +41,7 @@ struct compressed_stream_s { }; // blockDim {128,1,1} -__global__ void __launch_bounds__(128, 8) gpuParseCompressedStripeData( +CUDF_KERNEL void __launch_bounds__(128, 8) gpuParseCompressedStripeData( CompressedStreamInfo* strm_info, int32_t num_streams, uint32_t block_size, uint32_t log2maxcr) { __shared__ compressed_stream_s strm_g[4]; @@ -138,7 +138,7 @@ __global__ void __launch_bounds__(128, 8) gpuParseCompressedStripeData( } // blockDim {128,1,1} -__global__ void __launch_bounds__(128, 8) +CUDF_KERNEL void __launch_bounds__(128, 8) gpuPostDecompressionReassemble(CompressedStreamInfo* strm_info, int32_t num_streams) { __shared__ compressed_stream_s strm_g[4]; @@ -442,14 +442,14 @@ static __device__ void gpuMapRowIndexToUncompressed(rowindex_state_s* s, * value */ // blockDim {128,1,1} -__global__ void __launch_bounds__(128, 8) gpuParseRowGroupIndex(RowGroup* row_groups, - CompressedStreamInfo* strm_info, - ColumnDesc* chunks, - uint32_t num_columns, - uint32_t num_stripes, - uint32_t num_rowgroups, - uint32_t rowidx_stride, - bool use_base_stride) +CUDF_KERNEL void __launch_bounds__(128, 8) gpuParseRowGroupIndex(RowGroup* row_groups, + CompressedStreamInfo* strm_info, + ColumnDesc* chunks, + uint32_t num_columns, + uint32_t num_stripes, + uint32_t num_rowgroups, + uint32_t rowidx_stride, + bool use_base_stride) { __shared__ __align__(16) rowindex_state_s state_g; rowindex_state_s* const s = &state_g; @@ -513,7 +513,7 @@ __global__ void __launch_bounds__(128, 8) gpuParseRowGroupIndex(RowGroup* row_gr } template -__global__ void __launch_bounds__(block_size) +CUDF_KERNEL void __launch_bounds__(block_size) gpu_reduce_pushdown_masks(device_span orc_columns, device_2dspan rowgroup_bounds, device_2dspan set_counts) diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index cef4915e0c9..edc40391bfa 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -357,10 +357,10 @@ struct string_length_functor { 
statistics_merge_group const* stripe_stat_merge; }; -__global__ void copy_string_data(char* string_pool, - size_type* offsets, - statistics_chunk* chunks, - statistics_merge_group const* groups) +CUDF_KERNEL void copy_string_data(char* string_pool, + size_type* offsets, + statistics_chunk* chunks, + statistics_merge_group const* groups) { auto const idx = blockIdx.x / 2; if (groups[idx].stats_dtype == dtype_string) { diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 53ff31ab0a7..a43c6d4cbb6 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,7 +31,7 @@ constexpr int DEFAULT_BLOCK_SIZE = 256; } template -__global__ void __launch_bounds__(block_size) +CUDF_KERNEL void __launch_bounds__(block_size) initialize_chunk_hash_maps_kernel(device_span chunks) { auto const chunk = chunks[blockIdx.x]; @@ -98,7 +98,7 @@ struct map_find_fn { }; template -__global__ void __launch_bounds__(block_size) +CUDF_KERNEL void __launch_bounds__(block_size) populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan frags) { auto col_idx = blockIdx.y; @@ -189,7 +189,7 @@ __global__ void __launch_bounds__(block_size) } template -__global__ void __launch_bounds__(block_size) +CUDF_KERNEL void __launch_bounds__(block_size) collect_map_entries_kernel(device_span chunks) { auto& chunk = chunks[blockIdx.x]; @@ -223,7 +223,7 @@ __global__ void __launch_bounds__(block_size) } template -__global__ void __launch_bounds__(block_size) +CUDF_KERNEL void __launch_bounds__(block_size) get_dictionary_indices_kernel(cudf::detail::device_2dspan frags) { auto col_idx = blockIdx.y; diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu index 
afe9a76a6d0..2d000600028 100644 --- a/cpp/src/io/parquet/decode_preprocess.cu +++ b/cpp/src/io/parquet/decode_preprocess.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -207,7 +207,7 @@ static __device__ void gpuUpdatePageSizes(page_state_s* s, * (PageInfo::str_bytes) as part of the pass */ template -__global__ void __launch_bounds__(preprocess_block_size) +CUDF_KERNEL void __launch_bounds__(preprocess_block_size) gpuComputePageSizes(PageInfo* pages, device_span chunks, size_t min_row, diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index d39edd70fcd..8d220e6fa96 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -421,7 +421,7 @@ static __device__ void gpuOutputGeneric( * @param error_code Error code to set if an error is encountered */ template -__global__ void __launch_bounds__(decode_block_size) +CUDF_KERNEL void __launch_bounds__(decode_block_size) gpuDecodePageData(PageInfo* pages, device_span chunks, size_t min_row, diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu index 44ec0e1e027..d0557446f14 100644 --- a/cpp/src/io/parquet/page_delta_decode.cu +++ b/cpp/src/io/parquet/page_delta_decode.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -305,7 +305,7 @@ struct delta_byte_array_decoder { // with V2 page headers; see https://www.mail-archive.com/dev@parquet.apache.org/msg11826.html). // this kernel only needs 96 threads (3 warps)(for now). template -__global__ void __launch_bounds__(96) +CUDF_KERNEL void __launch_bounds__(96) gpuDecodeDeltaBinary(PageInfo* pages, device_span chunks, size_t min_row, @@ -430,7 +430,7 @@ __global__ void __launch_bounds__(96) // suffixes are not encoded in the header, we're going to have to first do a quick pass through them // to find the start/end of each structure. template -__global__ void __launch_bounds__(decode_block_size) +CUDF_KERNEL void __launch_bounds__(decode_block_size) gpuDecodeDeltaByteArray(PageInfo* pages, device_span chunks, size_t min_row, @@ -587,7 +587,7 @@ __global__ void __launch_bounds__(decode_block_size) // Decode page data that is DELTA_LENGTH_BYTE_ARRAY packed. This encoding consists of a // DELTA_BINARY_PACKED array of string lengths, followed by the string data. 
template -__global__ void __launch_bounds__(decode_block_size) +CUDF_KERNEL void __launch_bounds__(decode_block_size) gpuDecodeDeltaLengthByteArray(PageInfo* pages, device_span chunks, size_t min_row, diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index e16551024d1..12af5888d2f 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -385,7 +385,7 @@ __device__ uint8_t const* delta_encode(page_enc_state_s<0>* s, uint64_t* buffer, // blockDim {512,1,1} template -__global__ void __launch_bounds__(block_size) +CUDF_KERNEL void __launch_bounds__(block_size) gpuInitRowGroupFragments(device_2dspan frag, device_span col_desc, device_span partitions, @@ -422,7 +422,7 @@ __global__ void __launch_bounds__(block_size) // blockDim {512,1,1} template -__global__ void __launch_bounds__(block_size) +CUDF_KERNEL void __launch_bounds__(block_size) gpuCalculatePageFragments(device_span frag, device_span column_frag_sizes) { @@ -449,7 +449,7 @@ __global__ void __launch_bounds__(block_size) } // blockDim {128,1,1} -__global__ void __launch_bounds__(128) +CUDF_KERNEL void __launch_bounds__(128) gpuInitFragmentStats(device_span groups, device_span fragments) { @@ -510,7 +510,7 @@ __device__ size_t delta_data_len(Type physical_type, } // blockDim {128,1,1} -__global__ void __launch_bounds__(128) +CUDF_KERNEL void __launch_bounds__(128) gpuInitPages(device_2dspan chunks, device_span pages, device_span page_sizes, @@ -1244,9 +1244,10 @@ __device__ auto julian_days_with_time(int64_t v) // the level data is encoded. 
// blockDim(128, 1, 1) template -__global__ void __launch_bounds__(block_size, 8) gpuEncodePageLevels(device_span pages, - bool write_v2_headers, - encode_kernel_mask kernel_mask) +CUDF_KERNEL void __launch_bounds__(block_size, 8) + gpuEncodePageLevels(device_span pages, + bool write_v2_headers, + encode_kernel_mask kernel_mask) { __shared__ __align__(8) rle_page_enc_state_s state_g; @@ -1504,7 +1505,7 @@ __device__ void finish_page_encode(state_buf* s, // PLAIN page data encoder // blockDim(128, 1, 1) template -__global__ void __launch_bounds__(block_size, 8) +CUDF_KERNEL void __launch_bounds__(block_size, 8) gpuEncodePages(device_span pages, device_span> comp_in, device_span> comp_out, @@ -1739,7 +1740,7 @@ __global__ void __launch_bounds__(block_size, 8) // DICTIONARY page data encoder // blockDim(128, 1, 1) template -__global__ void __launch_bounds__(block_size, 8) +CUDF_KERNEL void __launch_bounds__(block_size, 8) gpuEncodeDictPages(device_span pages, device_span> comp_in, device_span> comp_out, @@ -1871,7 +1872,7 @@ __global__ void __launch_bounds__(block_size, 8) // DELTA_BINARY_PACKED page data encoder // blockDim(128, 1, 1) template -__global__ void __launch_bounds__(block_size, 8) +CUDF_KERNEL void __launch_bounds__(block_size, 8) gpuEncodeDeltaBinaryPages(device_span pages, device_span> comp_in, device_span> comp_out, @@ -1975,7 +1976,7 @@ __global__ void __launch_bounds__(block_size, 8) // DELTA_LENGTH_BYTE_ARRAY page data encoder // blockDim(128, 1, 1) template -__global__ void __launch_bounds__(block_size, 8) +CUDF_KERNEL void __launch_bounds__(block_size, 8) gpuEncodeDeltaLengthByteArrayPages(device_span pages, device_span> comp_in, device_span> comp_out, @@ -2105,7 +2106,7 @@ constexpr int decide_compression_block_size = decide_compression_warps_in_block * cudf::detail::warp_size; // blockDim(decide_compression_block_size, 1, 1) -__global__ void __launch_bounds__(decide_compression_block_size) +CUDF_KERNEL void 
__launch_bounds__(decide_compression_block_size) gpuDecideCompression(device_span chunks) { __shared__ __align__(8) EncColumnChunk ck_g[decide_compression_warps_in_block]; @@ -2575,7 +2576,7 @@ __device__ uint8_t* EncodeStatistics(uint8_t* start, } // blockDim(128, 1, 1) -__global__ void __launch_bounds__(128) +CUDF_KERNEL void __launch_bounds__(128) gpuEncodePageHeaders(device_span pages, device_span comp_results, device_span page_stats, @@ -2670,7 +2671,7 @@ __global__ void __launch_bounds__(128) } // blockDim(1024, 1, 1) -__global__ void __launch_bounds__(1024) +CUDF_KERNEL void __launch_bounds__(1024) gpuGatherPages(device_span chunks, device_span pages) { __shared__ __align__(8) EncColumnChunk ck_g; @@ -2848,7 +2849,7 @@ struct mask_tform { } // namespace // blockDim(1, 1, 1) -__global__ void __launch_bounds__(1) +CUDF_KERNEL void __launch_bounds__(1) gpuEncodeColumnIndexes(device_span chunks, device_span column_stats, int32_t column_index_truncate_length) diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index cc3f584422d..4be4f45497d 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -348,9 +348,9 @@ struct gpuParsePageHeader { * @param[in] num_chunks Number of column chunks */ // blockDim {128,1,1} -__global__ void __launch_bounds__(128) gpuDecodePageHeaders(ColumnChunkDesc* chunks, - int32_t num_chunks, - kernel_error::pointer error_code) +CUDF_KERNEL void __launch_bounds__(128) gpuDecodePageHeaders(ColumnChunkDesc* chunks, + int32_t num_chunks, + kernel_error::pointer error_code) { using cudf::detail::warp_size; gpuParsePageHeader parse_page_header; @@ -480,7 +480,7 @@ __global__ void __launch_bounds__(128) gpuDecodePageHeaders(ColumnChunkDesc* chu * @param[in] num_chunks Number of column chunks */ // blockDim {128,1,1} -__global__ void __launch_bounds__(128) +CUDF_KERNEL void __launch_bounds__(128) gpuBuildStringDictionaryIndex(ColumnChunkDesc* chunks, int32_t num_chunks) { __shared__ 
ColumnChunkDesc chunk_g[4]; diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index d559f93f45b..37a8cabc182 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -584,7 +584,7 @@ __device__ thrust::pair totalDeltaByteArraySize(uint8_t const* d * @tparam level_t Type used to store decoded repetition and definition levels */ template -__global__ void __launch_bounds__(preprocess_block_size) gpuComputeStringPageBounds( +CUDF_KERNEL void __launch_bounds__(preprocess_block_size) gpuComputeStringPageBounds( PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) { __shared__ __align__(16) page_state_s state_g; @@ -653,7 +653,7 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputeStringPageBou * @param min_rows crop all rows below min_row * @param num_rows Maximum number of rows to read */ -__global__ void __launch_bounds__(delta_preproc_block_size) gpuComputeDeltaPageStringSizes( +CUDF_KERNEL void __launch_bounds__(delta_preproc_block_size) gpuComputeDeltaPageStringSizes( PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) { __shared__ __align__(16) page_state_s state_g; @@ -725,7 +725,7 @@ __global__ void __launch_bounds__(delta_preproc_block_size) gpuComputeDeltaPageS * @param min_rows crop all rows below min_row * @param num_rows Maximum number of rows to read */ -__global__ void __launch_bounds__(delta_length_block_size) gpuComputeDeltaLengthPageStringSizes( +CUDF_KERNEL void __launch_bounds__(delta_length_block_size) gpuComputeDeltaLengthPageStringSizes( PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) { using cudf::detail::warp_size; @@ -820,7 +820,7 @@ 
__global__ void __launch_bounds__(delta_length_block_size) gpuComputeDeltaLength * @param min_rows crop all rows below min_row * @param num_rows Maximum number of rows to read */ -__global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSizes( +CUDF_KERNEL void __launch_bounds__(preprocess_block_size) gpuComputePageStringSizes( PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) { __shared__ __align__(16) page_state_s state_g; @@ -912,7 +912,7 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz * @tparam level_t Type used to store decoded repetition and definition levels */ template -__global__ void __launch_bounds__(decode_block_size) +CUDF_KERNEL void __launch_bounds__(decode_block_size) gpuDecodeStringPageData(PageInfo* pages, device_span chunks, size_t min_row, diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index 9c8b03886b5..f43a8fd24c4 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -189,7 +189,7 @@ struct stats_caster { return cudf::make_strings_column( val.size(), std::make_unique(std::move(d_offsets), rmm::device_buffer{}, 0), - std::make_unique(std::move(d_chars), rmm::device_buffer{}, 0), + d_chars.release(), null_count, rmm::device_buffer{ null_mask.data(), cudf::bitmask_allocation_size_bytes(val.size()), stream, mr}); diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 279a814a4e1..90f52c0ee70 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -609,10 +609,10 @@ std::vector construct_schema_tree( // column that isn't a single-depth list the code will throw. 
if (col_meta.is_enabled_output_as_binary() && is_last_list_child(col)) { CUDF_EXPECTS(col_meta.num_children() == 2 or col_meta.num_children() == 0, - "Binary column's corresponding metadata should have zero or two children!"); + "Binary column's corresponding metadata should have zero or two children"); if (col_meta.num_children() > 0) { CUDF_EXPECTS(col->children[lists_column_view::child_column_index]->children.empty(), - "Binary column must not be nested!"); + "Binary column must not be nested"); } schema_tree_node col_schema{}; @@ -734,8 +734,13 @@ std::vector construct_schema_tree( } else { // if leaf, add current if (col->type().id() == type_id::STRING) { - CUDF_EXPECTS(col_meta.num_children() == 2 or col_meta.num_children() == 0, - "String column's corresponding metadata should have zero or two children"); + if (col_meta.is_enabled_output_as_binary()) { + CUDF_EXPECTS(col_meta.num_children() == 2 or col_meta.num_children() == 0, + "Binary column's corresponding metadata should have zero or two children"); + } else { + CUDF_EXPECTS(col_meta.num_children() == 1 or col_meta.num_children() == 0, + "String column's corresponding metadata should have zero or one children"); + } } else { CUDF_EXPECTS(col_meta.num_children() == 0, "Leaf column's corresponding metadata cannot have children"); diff --git a/cpp/src/io/statistics/column_statistics.cuh b/cpp/src/io/statistics/column_statistics.cuh index f71fb95949f..db0d56ac321 100644 --- a/cpp/src/io/statistics/column_statistics.cuh +++ b/cpp/src/io/statistics/column_statistics.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -289,7 +289,7 @@ __device__ void cooperative_load(T& destination, T const* source = nullptr) * @tparam IO File format for which statistics calculation is being done */ template -__global__ void __launch_bounds__(block_size, 1) +CUDF_KERNEL void __launch_bounds__(block_size, 1) gpu_calculate_group_statistics(statistics_chunk* chunks, statistics_group const* groups, bool const int96_timestamps) @@ -368,7 +368,7 @@ void calculate_group_statistics(statistics_chunk* chunks, * @tparam IO File format for which statistics calculation is being done */ template -__global__ void __launch_bounds__(block_size, 1) +CUDF_KERNEL void __launch_bounds__(block_size, 1) gpu_merge_group_statistics(statistics_chunk* chunks_out, statistics_chunk const* chunks_in, statistics_merge_group const* groups) diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 443ca0f5fe7..34a476974e4 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -138,7 +138,7 @@ using byte_offset = int64_t; // it begins in. From there, each thread can then take deterministic action. In this case, the // deterministic action is counting and outputting delimiter offsets when a delimiter is found. 
-__global__ void multibyte_split_init_kernel( +CUDF_KERNEL void multibyte_split_init_kernel( cudf::size_type base_tile_idx, cudf::size_type num_tiles, cudf::io::text::detail::scan_tile_state_view tile_multistates, @@ -154,7 +154,7 @@ __global__ void multibyte_split_init_kernel( } } -__global__ __launch_bounds__(THREADS_PER_TILE) void multibyte_split_kernel( +CUDF_KERNEL __launch_bounds__(THREADS_PER_TILE) void multibyte_split_kernel( cudf::size_type base_tile_idx, byte_offset base_input_offset, output_offset base_output_offset, @@ -231,7 +231,7 @@ __global__ __launch_bounds__(THREADS_PER_TILE) void multibyte_split_kernel( } } -__global__ __launch_bounds__(THREADS_PER_TILE) void byte_split_kernel( +CUDF_KERNEL __launch_bounds__(THREADS_PER_TILE) void byte_split_kernel( cudf::size_type base_tile_idx, byte_offset base_input_offset, output_offset base_output_offset, @@ -556,7 +556,7 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source return cudf::make_strings_column( string_count, std::make_unique(std::move(offsets), rmm::device_buffer{}, 0), - std::make_unique(std::move(chars), rmm::device_buffer{}, 0), + chars.release(), 0, {}); } diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index 1cbd5929525..36303a60aa9 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -68,26 +68,10 @@ std::unique_ptr cudf::io::detail::inline_column_buffer::make_string_colu rmm::cuda_stream_view stream) { // no need for copies, just transfer ownership of the data_buffers to the columns - auto const state = mask_state::UNALLOCATED; - auto str_col = - _string_data.is_empty() - ? 
make_empty_column(data_type{type_id::INT8}) - : std::make_unique(data_type{type_id::INT8}, - string_size(), - std::move(_string_data), - cudf::detail::create_null_mask(size, state, stream, _mr), - state_null_count(state, size), - std::vector>{}); - auto offsets_col = - std::make_unique(data_type{type_to_id()}, - size + 1, - std::move(_data), - cudf::detail::create_null_mask(size + 1, state, stream, _mr), - state_null_count(state, size + 1), - std::vector>{}); - + auto offsets_col = std::make_unique( + data_type{type_to_id()}, size + 1, std::move(_data), rmm::device_buffer{}, 0); return make_strings_column( - size, std::move(offsets_col), std::move(str_col), null_count(), std::move(_null_mask)); + size, std::move(offsets_col), std::move(_string_data), null_count(), std::move(_null_mask)); } namespace { @@ -161,7 +145,6 @@ std::unique_ptr make_column(column_buffer_base& buffer, if (schema.value_or(reader_column_schema{}).is_enabled_convert_binary_to_strings()) { if (schema_info != nullptr) { schema_info->children.push_back(column_name_info{"offsets"}); - schema_info->children.push_back(column_name_info{"chars"}); } // make_strings_column allocates new memory, it does not simply move @@ -177,12 +160,11 @@ std::unique_ptr make_column(column_buffer_base& buffer, auto col_content = string_col->release(); // convert to uint8 column, strings are currently stored as int8 - auto contents = - col_content.children[strings_column_view::chars_column_index].release()->release(); - auto data = contents.data.release(); + auto data = col_content.data.release(); + auto char_size = data->size(); auto uint8_col = std::make_unique( - data_type{type_id::UINT8}, data->size(), std::move(*data), rmm::device_buffer{}, 0); + data_type{type_id::UINT8}, char_size, std::move(*data), rmm::device_buffer{}, 0); if (schema_info != nullptr) { schema_info->children.push_back(column_name_info{"offsets"}); diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index 
9e5c5c76392..8fd860d9492 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -420,14 +420,14 @@ struct bitfield_block { * @param d_chars Character array to store the characters of strings */ template -__global__ void parse_fn_string_parallel(str_tuple_it str_tuples, - size_type total_out_strings, - size_type* str_counter, - bitmask_type* null_mask, - size_type* null_count_data, - cudf::io::parse_options_view const options, - size_type* d_offsets, - char* d_chars) +CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples, + size_type total_out_strings, + size_type* str_counter, + bitmask_type* null_mask, + size_type* null_count_data, + cudf::io::parse_options_view const options, + size_type* d_offsets, + char* d_chars) { constexpr auto BLOCK_SIZE = is_warp ? 
cudf::detail::warp_size : cudf::detail::warp_size * num_warps; @@ -861,9 +861,8 @@ static std::unique_ptr parse_string(string_view_pair_it str_tuples, std::overflow_error); // CHARS column - std::unique_ptr chars = - strings::detail::create_chars_child_column(static_cast(bytes), stream, mr); - auto d_chars = chars->mutable_view().data(); + rmm::device_uvector chars(bytes, stream, mr); + auto d_chars = chars.data(); single_thread_fn.d_chars = d_chars; thrust::for_each_n(rmm::exec_policy(stream), @@ -902,7 +901,7 @@ static std::unique_ptr parse_string(string_view_pair_it str_tuples, return make_strings_column(col_size, std::move(offsets), - std::move(chars), + chars.release(), d_null_count.value(stream), std::move(null_mask)); } diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index a6a93c41472..3cd70801cdf 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -204,8 +204,10 @@ class hostdevice_vector { template class hostdevice_2dvector { public: + hostdevice_2dvector() : hostdevice_2dvector(0, 0, cudf::get_default_stream()) {} + hostdevice_2dvector(size_t rows, size_t columns, rmm::cuda_stream_view stream) - : _size{rows, columns}, _data{rows * columns, stream} + : _data{rows * columns, stream}, _size{rows, columns} { } diff --git a/cpp/src/io/utilities/parsing_utils.cu b/cpp/src/io/utilities/parsing_utils.cu index 06b86f33c85..d02ce99e6e5 100644 --- a/cpp/src/io/utilities/parsing_utils.cu +++ b/cpp/src/io/utilities/parsing_utils.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -87,12 +87,12 @@ __device__ __forceinline__ void setElement(void*, cudf::size_type, T const&, V c * @param[out] positions Array containing the output positions */ template -__global__ void count_and_set_positions(char const* data, - uint64_t size, - uint64_t offset, - char const key, - cudf::size_type* count, - T* positions) +CUDF_KERNEL void count_and_set_positions(char const* data, + uint64_t size, + uint64_t offset, + char const key, + cudf::size_type* count, + T* positions) { // thread IDs range per block, so also need the block id auto const tid = cudf::detail::grid_1d::global_thread_id(); diff --git a/cpp/src/io/utilities/type_inference.cu b/cpp/src/io/utilities/type_inference.cu index 79a5c8f1c4c..b446ad41946 100644 --- a/cpp/src/io/utilities/type_inference.cu +++ b/cpp/src/io/utilities/type_inference.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -112,11 +112,11 @@ __device__ __inline__ bool is_like_float(std::size_t len, * @param[out] column_info Histogram of column type counters */ template -__global__ void infer_column_type_kernel(OptionsView options, - device_span data, - ColumnStringIter offset_length_begin, - std::size_t size, - cudf::io::column_type_histogram* column_info) +CUDF_KERNEL void infer_column_type_kernel(OptionsView options, + device_span data, + ColumnStringIter offset_length_begin, + std::size_t size, + cudf::io::column_type_histogram* column_info) { auto thread_type_histogram = cudf::io::column_type_histogram{}; diff --git a/cpp/src/join/conditional_join_kernels.cuh b/cpp/src/join/conditional_join_kernels.cuh index f665aba698f..02ce27a36ba 100644 --- a/cpp/src/join/conditional_join_kernels.cuh +++ b/cpp/src/join/conditional_join_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -48,7 +48,7 @@ namespace detail { * @param[out] output_size The resulting output size */ template -__global__ void compute_conditional_join_output_size( +CUDF_KERNEL void compute_conditional_join_output_size( table_device_view left_table, table_device_view right_table, join_kind join_type, @@ -138,15 +138,15 @@ __global__ void compute_conditional_join_output_size( * the kernel needs to internally loop over left rows. Otherwise, loop over right rows. 
*/ template -__global__ void conditional_join(table_device_view left_table, - table_device_view right_table, - join_kind join_type, - cudf::size_type* join_output_l, - cudf::size_type* join_output_r, - cudf::size_type* current_idx, - cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const max_size, - bool const swap_tables) +CUDF_KERNEL void conditional_join(table_device_view left_table, + table_device_view right_table, + join_kind join_type, + cudf::size_type* join_output_l, + cudf::size_type* join_output_r, + cudf::size_type* current_idx, + cudf::ast::detail::expression_device_view device_expression_data, + cudf::size_type const max_size, + bool const swap_tables) { constexpr int num_warps = block_size / detail::warp_size; __shared__ cudf::size_type current_idx_shared[num_warps]; diff --git a/cpp/src/join/mixed_join_kernel.cuh b/cpp/src/join/mixed_join_kernel.cuh index efe575e14de..22bbbff967a 100644 --- a/cpp/src/join/mixed_join_kernel.cuh +++ b/cpp/src/join/mixed_join_kernel.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,8 +36,10 @@ namespace detail { namespace cg = cooperative_groups; +#pragma GCC diagnostic ignored "-Wattributes" + template -__launch_bounds__(block_size) __global__ +__attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__ void mixed_join(table_device_view left_table, table_device_view right_table, table_device_view probe, diff --git a/cpp/src/join/mixed_join_kernels.cuh b/cpp/src/join/mixed_join_kernels.cuh index 2cd4d0c3b38..1d36a246f02 100644 --- a/cpp/src/join/mixed_join_kernels.cuh +++ b/cpp/src/join/mixed_join_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -57,6 +57,7 @@ namespace detail { * left/right tables to determine which is the build table and which is the * probe table has already happened on the host. */ + template __global__ void compute_mixed_join_output_size( table_device_view left_table, diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu index e31e35ff788..bde75395371 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cu +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,8 +31,10 @@ namespace detail { namespace cg = cooperative_groups; +#pragma GCC diagnostic ignored "-Wattributes" + template -__launch_bounds__(block_size) __global__ +__attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__ void mixed_join_semi(table_device_view left_table, table_device_view right_table, table_device_view probe, diff --git a/cpp/src/join/mixed_join_size_kernel.cuh b/cpp/src/join/mixed_join_size_kernel.cuh index ef377dadc4b..3bd7bfd7c9a 100644 --- a/cpp/src/join/mixed_join_size_kernel.cuh +++ b/cpp/src/join/mixed_join_size_kernel.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -33,20 +33,23 @@ namespace cudf { namespace detail { namespace cg = cooperative_groups; +#pragma GCC diagnostic ignored "-Wattributes" + template -__launch_bounds__(block_size) __global__ void compute_mixed_join_output_size( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::mixed_multimap_type::device_view hash_table_view, - ast::detail::expression_device_view device_expression_data, - bool const swap_tables, - std::size_t* output_size, - cudf::device_span matches_per_row) +__attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__ + void compute_mixed_join_output_size( + table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_hash const hash_probe, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::mixed_multimap_type::device_view hash_table_view, + ast::detail::expression_device_view device_expression_data, + bool const swap_tables, + std::size_t* output_size, + cudf::device_span matches_per_row) { // The (required) extern storage of the shared memory array leads to // conflicting declarations between different templates. The easiest diff --git a/cpp/src/join/mixed_join_size_kernels_semi.cu b/cpp/src/join/mixed_join_size_kernels_semi.cu index fd7bf0234e9..31da6677aef 100644 --- a/cpp/src/join/mixed_join_size_kernels_semi.cu +++ b/cpp/src/join/mixed_join_size_kernels_semi.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -31,20 +31,23 @@ namespace detail { namespace cg = cooperative_groups; +#pragma GCC diagnostic ignored "-Wattributes" + template -__launch_bounds__(block_size) __global__ void compute_mixed_join_output_size_semi( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::semi_map_type::device_view hash_table_view, - ast::detail::expression_device_view device_expression_data, - bool const swap_tables, - std::size_t* output_size, - cudf::device_span matches_per_row) +__attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__ + void compute_mixed_join_output_size_semi( + table_device_view left_table, + table_device_view right_table, + table_device_view probe, + table_device_view build, + row_hash const hash_probe, + row_equality const equality_probe, + join_kind const join_type, + cudf::detail::semi_map_type::device_view hash_table_view, + ast::detail::expression_device_view device_expression_data, + bool const swap_tables, + std::size_t* output_size, + cudf::device_span matches_per_row) { // The (required) extern storage of the shared memory array leads to // conflicting declarations between different templates. The easiest diff --git a/cpp/src/json/json_path.cu b/cpp/src/json/json_path.cu index c01357c96ca..146b54c0d87 100644 --- a/cpp/src/json/json_path.cu +++ b/cpp/src/json/json_path.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -900,7 +900,7 @@ __device__ thrust::pair get_json_object_single( * @param options Options controlling behavior */ template -__launch_bounds__(block_size) __global__ +__launch_bounds__(block_size) CUDF_KERNEL void get_json_object_kernel(column_device_view col, path_operator const* const commands, size_type* output_offsets, @@ -1010,7 +1010,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c cudf::detail::get_value(offsets_view, col.size(), stream); // allocate output string column - auto chars = cudf::strings::detail::create_chars_child_column(output_size, stream, mr); + rmm::device_uvector chars(output_size, stream, mr); // potential optimization : if we know that all outputs are valid, we could skip creating // the validity mask altogether @@ -1018,7 +1018,6 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c cudf::detail::create_null_mask(col.size(), mask_state::UNINITIALIZED, stream, mr); // compute results - cudf::mutable_column_view chars_view(*chars); rmm::device_scalar d_valid_count{0, stream}; get_json_object_kernel @@ -1026,14 +1025,14 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c *cdv, std::get<0>(preprocess).value().data(), offsets_view.head(), - chars_view.head(), + chars.data(), static_cast(validity.data()), d_valid_count.data(), options); auto result = make_strings_column(col.size(), std::move(offsets), - std::move(chars), + chars.release(), col.size() - d_valid_count.value(stream), std::move(validity)); // unmatched array query may result in unsanitized '[' value in the result diff --git a/cpp/src/labeling/label_bins.cu b/cpp/src/labeling/label_bins.cu index 1a603785a41..9fecaa1ddb2 100644 --- a/cpp/src/labeling/label_bins.cu +++ b/cpp/src/labeling/label_bins.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -236,15 +236,11 @@ std::unique_ptr label_bins(column_view const& input, inclusive left_inclusive, column_view const& right_edges, inclusive right_inclusive, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::label_bins(input, - left_edges, - left_inclusive, - right_edges, - right_inclusive, - cudf::get_default_stream(), - mr); + return detail::label_bins( + input, left_edges, left_inclusive, right_edges, right_inclusive, stream, mr); } } // namespace cudf diff --git a/cpp/src/lists/interleave_columns.cu b/cpp/src/lists/interleave_columns.cu index cbc99fcdb83..8f05b020a2e 100644 --- a/cpp/src/lists/interleave_columns.cu +++ b/cpp/src/lists/interleave_columns.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -193,8 +193,7 @@ struct compute_string_sizes_and_interleave_lists_fn { auto const start_byte = str_offsets[start_str_idx]; auto const end_byte = str_offsets[end_str_idx]; if (start_byte < end_byte) { - auto const input_ptr = - str_col.child(strings_column_view::chars_column_index).template data() + start_byte; + auto const input_ptr = str_col.template head() + start_byte; auto const output_ptr = d_chars + d_offsets[write_idx]; thrust::copy(thrust::seq, input_ptr, input_ptr + end_byte - start_byte, output_ptr); } @@ -238,7 +237,7 @@ struct interleave_list_entries_implrelease().data.release()[0]), null_count, std::move(null_mask)); } diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu index 0d30230de28..073a2a6b97e 100644 --- a/cpp/src/merge/merge.cu +++ b/cpp/src/merge/merge.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. 
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -122,7 +122,7 @@ using index_type = detail::index_type; * to be copied to the output. Length must be equal to `num_destination_rows` */ template -__global__ void materialize_merged_bitmask_kernel( +CUDF_KERNEL void materialize_merged_bitmask_kernel( column_device_view left_dcol, column_device_view right_dcol, bitmask_type* out_validity, diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index 7b6676346c2..8d8f1a71672 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -122,14 +122,14 @@ class bitwise_partitioner { * @param[out] global_partition_sizes The number of rows in each partition. 
*/ template -__global__ void compute_row_partition_numbers(row_hasher_t the_hasher, - size_type const num_rows, - size_type const num_partitions, - partitioner_type const the_partitioner, - size_type* __restrict__ row_partition_numbers, - size_type* __restrict__ row_partition_offset, - size_type* __restrict__ block_partition_sizes, - size_type* __restrict__ global_partition_sizes) +CUDF_KERNEL void compute_row_partition_numbers(row_hasher_t the_hasher, + size_type const num_rows, + size_type const num_partitions, + partitioner_type const the_partitioner, + size_type* __restrict__ row_partition_numbers, + size_type* __restrict__ row_partition_offset, + size_type* __restrict__ block_partition_sizes, + size_type* __restrict__ global_partition_sizes) { // Accumulate histogram of the size of each partition in shared memory extern __shared__ size_type shared_partition_sizes[]; @@ -197,10 +197,10 @@ __global__ void compute_row_partition_numbers(row_hasher_t the_hasher, {block0 partition(num_partitions-1) offset, block1 partition(num_partitions -1) offset, ...} } */ -__global__ void compute_row_output_locations(size_type* __restrict__ row_partition_numbers, - size_type const num_rows, - size_type const num_partitions, - size_type* __restrict__ block_partition_offsets) +CUDF_KERNEL void compute_row_output_locations(size_type* __restrict__ row_partition_numbers, + size_type const num_rows, + size_type const num_partitions, + size_type* __restrict__ block_partition_offsets) { // Shared array that holds the offset of this blocks partitions in // global memory @@ -255,14 +255,14 @@ __global__ void compute_row_output_locations(size_type* __restrict__ row_partiti * @param[in] scanned_block_partition_sizes The scan of block_partition_sizes */ template -__global__ void copy_block_partitions(InputIter input_iter, - DataType* __restrict__ output_buf, - size_type const num_rows, - size_type const num_partitions, - size_type const* __restrict__ row_partition_numbers, - size_type const* 
__restrict__ row_partition_offset, - size_type const* __restrict__ block_partition_sizes, - size_type const* __restrict__ scanned_block_partition_sizes) +CUDF_KERNEL void copy_block_partitions(InputIter input_iter, + DataType* __restrict__ output_buf, + size_type const num_rows, + size_type const num_partitions, + size_type const* __restrict__ row_partition_numbers, + size_type const* __restrict__ row_partition_offset, + size_type const* __restrict__ block_partition_sizes, + size_type const* __restrict__ scanned_block_partition_sizes) { extern __shared__ char shared_memory[]; auto block_output = reinterpret_cast(shared_memory); diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu index 4764ac4d87a..c8ac19e01cc 100644 --- a/cpp/src/quantiles/tdigest/tdigest.cu +++ b/cpp/src/quantiles/tdigest/tdigest.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -68,13 +68,13 @@ struct make_centroid { // kernel for computing percentiles on input tdigest (mean, weight) centroid data. 
template -__global__ void compute_percentiles_kernel(device_span tdigest_offsets, - column_device_view percentiles, - CentroidIter centroids_, - double const* min_, - double const* max_, - double const* cumulative_weight_, - double* output) +CUDF_KERNEL void compute_percentiles_kernel(device_span tdigest_offsets, + column_device_view percentiles, + CentroidIter centroids_, + double const* min_, + double const* max_, + double const* cumulative_weight_, + double* output) { auto const tid = cudf::detail::grid_1d::global_thread_id(); diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index 450996a43d2..fc56d17d73b 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -370,15 +370,15 @@ std::unique_ptr to_tdigest_scalar(std::unique_ptr&& tdigest, */ template -__global__ void generate_cluster_limits_kernel(int delta, - size_type num_groups, - NearestWeightFunc nearest_weight, - GroupInfo group_info, - CumulativeWeight cumulative_weight, - double* group_cluster_wl, - size_type* group_num_clusters, - size_type const* group_cluster_offsets, - bool has_nulls) +CUDF_KERNEL void generate_cluster_limits_kernel(int delta, + size_type num_groups, + NearestWeightFunc nearest_weight, + GroupInfo group_info, + CumulativeWeight cumulative_weight, + double* group_cluster_wl, + size_type* group_num_clusters, + size_type const* group_cluster_offsets, + bool has_nulls) { int const tid = threadIdx.x + blockIdx.x * blockDim.x; diff --git a/cpp/src/replace/clamp.cu b/cpp/src/replace/clamp.cu index 6852b19af44..23c792ddcae 100644 --- a/cpp/src/replace/clamp.cu +++ b/cpp/src/replace/clamp.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -106,7 +106,7 @@ std::unique_ptr clamp_string_column(strings_column_view const& inp return make_strings_column(input.size(), std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), std::move(cudf::detail::copy_bitmask(input.parent(), stream, mr))); } diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index 2eb624d3f05..8ea229368cc 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -57,12 +57,12 @@ namespace { // anonymous static constexpr int BLOCK_SIZE = 256; template -__global__ void replace_nulls_strings(cudf::column_device_view input, - cudf::column_device_view replacement, - cudf::bitmask_type* output_valid, - cudf::size_type* offsets, - char* chars, - cudf::size_type* valid_counter) +CUDF_KERNEL void replace_nulls_strings(cudf::column_device_view input, + cudf::column_device_view replacement, + cudf::bitmask_type* output_valid, + cudf::size_type* offsets, + char* chars, + cudf::size_type* valid_counter) { cudf::size_type nrows = input.size(); auto i = cudf::detail::grid_1d::global_thread_id(); @@ -112,10 +112,10 @@ __global__ void replace_nulls_strings(cudf::column_device_view input, } template -__global__ void replace_nulls(cudf::column_device_view input, - cudf::column_device_view replacement, - cudf::mutable_column_device_view output, - cudf::size_type* output_valid_count) +CUDF_KERNEL void replace_nulls(cudf::column_device_view input, + cudf::column_device_view replacement, + cudf::mutable_column_device_view output, + cudf::size_type* output_valid_count) { cudf::size_type nrows = input.size(); auto i = cudf::detail::grid_1d::global_thread_id(); @@ -255,22 +255,19 @@ std::unique_ptr replace_nulls_column_kernel_forwarder::operator()< auto offsets_view = offsets->mutable_view(); // Allocate chars array and output null mask - std::unique_ptr output_chars = - cudf::strings::detail::create_chars_child_column(bytes, stream, mr); - - auto output_chars_view = output_chars->mutable_view(); + rmm::device_uvector output_chars(bytes, stream, mr); replace_second<<>>( *device_in, *device_replacement, reinterpret_cast(valid_bits.data()), offsets_view.begin(), - output_chars_view.data(), + output_chars.data(), valid_count); return cudf::make_strings_column(input.size(), std::move(offsets), - std::move(output_chars), + output_chars.release(), input.size() - valid_counter.value(stream), std::move(valid_bits)); } diff --git 
a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index 9341929de44..184c30246c7 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -17,7 +17,7 @@ * limitations under the License. */ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -118,13 +118,13 @@ __device__ int get_new_string_value(cudf::size_type idx, * @param output_valid_count The output valid count */ template -__global__ void replace_strings_first_pass(cudf::column_device_view input, - cudf::column_device_view values_to_replace, - cudf::column_device_view replacement, - cudf::mutable_column_device_view offsets, - cudf::mutable_column_device_view indices, - cudf::bitmask_type* output_valid, - cudf::size_type* __restrict__ output_valid_count) +CUDF_KERNEL void replace_strings_first_pass(cudf::column_device_view input, + cudf::column_device_view values_to_replace, + cudf::column_device_view replacement, + cudf::mutable_column_device_view offsets, + cudf::mutable_column_device_view indices, + cudf::bitmask_type* output_valid, + cudf::size_type* __restrict__ output_valid_count) { cudf::size_type nrows = input.size(); auto tid = cudf::detail::grid_1d::global_thread_id(); @@ -184,11 +184,11 @@ __global__ void replace_strings_first_pass(cudf::column_device_view input, * @param indices Temporary column used to store the replacement indices. 
*/ template -__global__ void replace_strings_second_pass(cudf::column_device_view input, - cudf::column_device_view replacement, - cudf::mutable_column_device_view offsets, - cudf::mutable_column_device_view strings, - cudf::mutable_column_device_view indices) +CUDF_KERNEL void replace_strings_second_pass(cudf::column_device_view input, + cudf::column_device_view replacement, + cudf::mutable_column_device_view offsets, + char* strings, + cudf::mutable_column_device_view indices) { cudf::size_type nrows = input.size(); auto tid = cudf::detail::grid_1d::global_thread_id(); @@ -211,9 +211,8 @@ __global__ void replace_strings_second_pass(cudf::column_device_view input, cudf::string_view output = (replace_idx == -1) ? input.element(idx) : replacement.element(replace_idx); - std::memcpy(strings.data() + offsets.data()[idx], - output.data(), - output.size_bytes()); + std::memcpy( + strings + offsets.data()[idx], output.data(), output.size_bytes()); } tid += stride; @@ -245,12 +244,12 @@ __global__ void replace_strings_second_pass(cudf::column_device_view input, * @param[in] replacement_valid Valid mask associated with d_replacement_values */ template -__global__ void replace_kernel(cudf::column_device_view input, - cudf::mutable_column_device_view output, - cudf::size_type* __restrict__ output_valid_count, - cudf::size_type nrows, - cudf::column_device_view values_to_replace, - cudf::column_device_view replacement) +CUDF_KERNEL void replace_kernel(cudf::column_device_view input, + cudf::mutable_column_device_view output, + cudf::size_type* __restrict__ output_valid_count, + cudf::size_type nrows, + cudf::column_device_view values_to_replace, + cudf::column_device_view replacement) { T* __restrict__ output_data = output.data(); @@ -434,18 +433,15 @@ std::unique_ptr replace_kernel_forwarder::operator() output_chars = - cudf::strings::detail::create_chars_child_column(bytes, stream, mr); - - auto output_chars_view = output_chars->mutable_view(); - auto device_chars = 
cudf::mutable_column_device_view::create(output_chars_view, stream); + rmm::device_uvector output_chars(bytes, stream, mr); + auto d_chars = output_chars.data(); replace_second<<>>( - *device_in, *device_replacement, *device_offsets, *device_chars, *device_indices); + *device_in, *device_replacement, *device_offsets, d_chars, *device_indices); return cudf::make_strings_column(input_col.size(), std::move(offsets), - std::move(output_chars), + output_chars.release(), null_count, std::move(valid_bits)); } diff --git a/cpp/src/reshape/byte_cast.cu b/cpp/src/reshape/byte_cast.cu index 5f89b6d9b3b..6ed28e693fd 100644 --- a/cpp/src/reshape/byte_cast.cu +++ b/cpp/src/reshape/byte_cast.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -135,10 +135,9 @@ struct byte_list_conversion_fn(input, stream, mr)->release(); - auto chars_contents = col_content.children[strings_column_view::chars_column_index]->release(); - auto const num_chars = chars_contents.data->size(); + auto const num_chars = col_content.data->size(); auto uint8_col = std::make_unique( - output_type, num_chars, std::move(*(chars_contents.data)), rmm::device_buffer{}, 0); + output_type, num_chars, std::move(*(col_content.data)), rmm::device_buffer{}, 0); auto result = make_lists_column( input.size(), diff --git a/cpp/src/reshape/interleave_columns.cu b/cpp/src/reshape/interleave_columns.cu index deb0acb4742..22b45fe7a58 100644 --- a/cpp/src/reshape/interleave_columns.cu +++ b/cpp/src/reshape/interleave_columns.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -193,9 +193,8 @@ struct interleave_columns_implview().template data(); // Create the chars column - auto chars_column = strings::detail::create_chars_child_column(bytes, stream, mr); - // Fill the chars column - auto d_results_chars = chars_column->mutable_view().template data(); + rmm::device_uvector chars(bytes, stream, mr); + auto d_results_chars = chars.data(); thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -215,7 +214,7 @@ struct interleave_columns_impl -__launch_bounds__(block_size) __global__ +__launch_bounds__(block_size) CUDF_KERNEL void gpu_rolling(column_device_view input, column_device_view default_outputs, mutable_column_device_view output, diff --git a/cpp/src/rolling/jit/kernel.cu b/cpp/src/rolling/jit/kernel.cu index 06b224c39ad..2c753965c1c 100644 --- a/cpp/src/rolling/jit/kernel.cu +++ b/cpp/src/rolling/jit/kernel.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -41,15 +41,15 @@ template -__global__ void gpu_rolling_new(cudf::size_type nrows, - InType const* const __restrict__ in_col, - cudf::bitmask_type const* const __restrict__ in_col_valid, - OutType* __restrict__ out_col, - cudf::bitmask_type* __restrict__ out_col_valid, - cudf::size_type* __restrict__ output_valid_count, - PrecedingWindowType preceding_window_begin, - FollowingWindowType following_window_begin, - cudf::size_type min_periods) +CUDF_KERNEL void gpu_rolling_new(cudf::size_type nrows, + InType const* const __restrict__ in_col, + cudf::bitmask_type const* const __restrict__ in_col_valid, + OutType* __restrict__ out_col, + cudf::bitmask_type* __restrict__ out_col_valid, + cudf::size_type* __restrict__ output_valid_count, + PrecedingWindowType preceding_window_begin, + FollowingWindowType following_window_begin, + cudf::size_type min_periods) { cudf::thread_index_type i = blockIdx.x * blockDim.x + threadIdx.x; cudf::thread_index_type const stride = blockDim.x * gridDim.x; diff --git a/cpp/src/strings/attributes.cu b/cpp/src/strings/attributes.cu index de51356845c..2856c077fb2 100644 --- a/cpp/src/strings/attributes.cu +++ b/cpp/src/strings/attributes.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -110,8 +110,8 @@ std::unique_ptr counts_fn(strings_column_view const& strings, * @param d_strings Column with strings to count * @param d_lengths Results of the counts per string */ -__global__ void count_characters_parallel_fn(column_device_view const d_strings, - size_type* d_lengths) +CUDF_KERNEL void count_characters_parallel_fn(column_device_view const d_strings, + size_type* d_lengths) { auto const idx = cudf::detail::grid_1d::global_thread_id(); using warp_reduce = cub::WarpReduce; @@ -170,7 +170,8 @@ std::unique_ptr count_characters(strings_column_view const& input, rmm::mr::device_memory_resource* mr) { if ((input.size() == input.null_count()) || - ((input.chars_size() / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD)) { + ((input.chars_size(stream) / (input.size() - input.null_count())) < + AVG_CHAR_BYTES_THRESHOLD)) { auto ufn = cuda::proclaim_return_type( [] __device__(string_view const& d_str) { return d_str.length(); }); return counts_fn(input, ufn, stream, mr); diff --git a/cpp/src/strings/capitalize.cu b/cpp/src/strings/capitalize.cu index c555031b588..3b99093a89f 100644 --- a/cpp/src/strings/capitalize.cu +++ b/cpp/src/strings/capitalize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -229,11 +229,12 @@ std::unique_ptr capitalizer(CapitalFn cfn, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto children = cudf::strings::detail::make_strings_children(cfn, input.size(), stream, mr); + auto [offsets_column, chars_column] = + cudf::strings::detail::make_strings_children(cfn, input.size(), stream, mr); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu index 8f4c2ee574a..b3bf0e2a787 100644 --- a/cpp/src/strings/case.cu +++ b/cpp/src/strings/case.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,6 +33,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -211,12 +212,12 @@ std::unique_ptr convert_case(strings_column_view const& input, upper_lower_fn converter{ccfn, *d_strings}; // For smaller strings, use the regular string-parallel algorithm - if ((input.chars_size() / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) { + if ((input.chars_size(stream) / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) { auto [offsets, chars] = cudf::strings::detail::make_strings_children(converter, input.size(), stream, mr); return make_strings_column(input.size(), std::move(offsets), - std::move(chars), + std::move(chars->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } @@ -227,16 +228,16 @@ std::unique_ptr convert_case(strings_column_view const& input, // but results in a large performance gain when the input contains only single-byte characters. 
// The count_if is faster than any_of or all_of: https://github.com/NVIDIA/thrust/issues/1016 bool const multi_byte_chars = - thrust::count_if( - rmm::exec_policy(stream), input.chars_begin(), input.chars_end(), [] __device__(auto chr) { - return is_utf8_continuation_char(chr); - }) > 0; + thrust::count_if(rmm::exec_policy(stream), + input.chars_begin(stream), + input.chars_end(stream), + cuda::proclaim_return_type( + [] __device__(auto chr) { return is_utf8_continuation_char(chr); })) > 0; if (!multi_byte_chars) { // optimization for ASCII-only case: copy the input column and inplace replace each character - auto result = std::make_unique(input.parent(), stream, mr); - auto d_chars = - result->mutable_view().child(strings_column_view::chars_column_index).data(); - auto const chars_size = strings_column_view(result->view()).chars_size(); + auto result = std::make_unique(input.parent(), stream, mr); + auto d_chars = result->mutable_view().head(); + auto const chars_size = strings_column_view(result->view()).chars_size(stream); thrust::transform( rmm::exec_policy(stream), d_chars, d_chars + chars_size, d_chars, ascii_converter_fn{ccfn}); result->set_null_count(input.null_count()); @@ -263,15 +264,15 @@ std::unique_ptr convert_case(strings_column_view const& input, "Size of output exceeds the column size limit", std::overflow_error); - auto chars = create_chars_child_column(static_cast(bytes), stream, mr); + rmm::device_uvector chars(bytes, stream, mr); // second pass, write output converter.d_offsets = d_offsets; - converter.d_chars = chars->mutable_view().data(); + converter.d_chars = chars.data(); thrust::for_each_n(rmm::exec_policy(stream), count_itr, input.size(), converter); return make_strings_column(input.size(), std::move(offsets), - std::move(chars), + chars.release(), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu index 
35b0c0a2690..9c2a2701227 100644 --- a/cpp/src/strings/char_types/char_types.cu +++ b/cpp/src/strings/char_types/char_types.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -200,12 +200,13 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // this utility calls filterer to build the offsets and chars columns - auto children = cudf::strings::detail::make_strings_children(filterer, strings_count, stream, mr); + auto [offsets_column, chars_column] = + cudf::strings::detail::make_strings_children(filterer, strings_count, stream, mr); // return new strings column return make_strings_column(strings_count, - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), std::move(null_mask)); } diff --git a/cpp/src/strings/combine/concatenate.cu b/cpp/src/strings/combine/concatenate.cu index 0a11b6dc460..a48e84eac0c 100644 --- a/cpp/src/strings/combine/concatenate.cu +++ b/cpp/src/strings/combine/concatenate.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -142,7 +142,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, // Create device views from the strings columns. 
auto d_table = table_device_view::create(strings_columns, stream); concat_strings_fn fn{*d_table, d_separator, d_narep, separate_nulls}; - auto children = make_strings_children(fn, strings_count, stream, mr); + auto [offsets_column, chars_column] = make_strings_children(fn, strings_count, stream, mr); // create resulting null mask auto [null_mask, null_count] = cudf::detail::valid_if( @@ -157,8 +157,8 @@ std::unique_ptr concatenate(table_view const& strings_columns, mr); return make_strings_column(strings_count, - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), null_count, std::move(null_mask)); } @@ -237,7 +237,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, multi_separator_concat_fn mscf{ *d_table, separator_col_view, separator_rep, col_rep, separate_nulls}; - auto children = make_strings_children(mscf, strings_count, stream, mr); + auto [offsets_column, chars_column] = make_strings_children(mscf, strings_count, stream, mr); // Create resulting null mask auto [null_mask, null_count] = cudf::detail::valid_if( @@ -253,8 +253,8 @@ std::unique_ptr concatenate(table_view const& strings_columns, mr); return make_strings_column(strings_count, - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), null_count, std::move(null_mask)); } diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu index 9ab527feaf8..0e0d6e437a7 100644 --- a/cpp/src/strings/combine/join.cu +++ b/cpp/src/strings/combine/join.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -145,15 +145,19 @@ std::unique_ptr join_strings(strings_column_view const& input, auto chars_column = [&] { // build the strings column and commandeer the chars column if ((input.size() == input.null_count()) || - ((input.chars_size() / (input.size() - input.null_count())) <= AVG_CHAR_BYTES_THRESHOLD)) { + ((input.chars_size(stream) / (input.size() - input.null_count())) <= + AVG_CHAR_BYTES_THRESHOLD)) { return std::get<1>( make_strings_children(join_fn{*d_strings, d_separator, d_narep}, input.size(), stream, mr)); } // dynamically feeds index pairs to build the output auto indices = cudf::detail::make_counting_transform_iterator( 0, join_gather_fn{*d_strings, d_separator, d_narep}); - auto joined_col = make_strings_column(indices, indices + (input.size() * 2), stream, mr); - return std::move(joined_col->release().children.back()); + auto joined_col = make_strings_column(indices, indices + (input.size() * 2), stream, mr); + auto chars_data = joined_col->release().data; + auto const chars_size = chars_data->size(); + return std::make_unique( + data_type{type_id::INT8}, chars_size, std::move(*chars_data), rmm::device_buffer{}, 0); }(); // build the offsets: single string output has offsets [0,chars-size] @@ -169,8 +173,11 @@ std::unique_ptr join_strings(strings_column_view const& input, : rmm::device_buffer{0, stream, mr}; // perhaps this return a string_scalar instead of a single-row column - return make_strings_column( - 1, std::move(offsets_column), std::move(chars_column), null_count, std::move(null_mask)); + return make_strings_column(1, + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), + null_count, + std::move(null_mask)); } } // namespace detail diff --git a/cpp/src/strings/combine/join_list_elements.cu b/cpp/src/strings/combine/join_list_elements.cu index 372b49fb0ee..619f5feba15 100644 --- a/cpp/src/strings/combine/join_list_elements.cu +++ b/cpp/src/strings/combine/join_list_elements.cu @@ -1,5 +1,5 @@ /* - * Copyright 
(c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -215,8 +215,11 @@ std::unique_ptr join_list_elements(lists_column_view const& lists_string stream, mr); - return make_strings_column( - num_rows, std::move(offsets_column), std::move(chars_column), null_count, std::move(null_mask)); + return make_strings_column(num_rows, + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), + null_count, + std::move(null_mask)); } namespace { @@ -290,8 +293,11 @@ std::unique_ptr join_list_elements(lists_column_view const& lists_string stream, mr); - return make_strings_column( - num_rows, std::move(offsets_column), std::move(chars_column), null_count, std::move(null_mask)); + return make_strings_column(num_rows, + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), + null_count, + std::move(null_mask)); } } // namespace detail diff --git a/cpp/src/strings/convert/convert_booleans.cu b/cpp/src/strings/convert/convert_booleans.cu index e75f1a6fe0f..4fe0be7883f 100644 --- a/cpp/src/strings/convert/convert_booleans.cu +++ b/cpp/src/strings/convert/convert_booleans.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -147,7 +147,7 @@ std::unique_ptr from_booleans(column_view const& booleans, return make_strings_column(strings_count, std::move(offsets), - std::move(chars), + std::move(chars->release().data.release()[0]), booleans.null_count(), std::move(null_mask)); } diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index d2609441d72..b7a662b0b76 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -1158,7 +1158,7 @@ std::unique_ptr from_timestamps(column_view const& timestamps, return make_strings_column(timestamps.size(), std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), timestamps.null_count(), cudf::detail::copy_bitmask(timestamps, stream, mr)); } diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu index 987087042cb..9a58926539c 100644 --- a/cpp/src/strings/convert/convert_durations.cu +++ b/cpp/src/strings/convert/convert_durations.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -422,7 +422,7 @@ struct dispatch_from_durations_fn { return make_strings_column(strings_count, std::move(offsets), - std::move(chars), + std::move(chars->release().data.release()[0]), durations.null_count(), std::move(null_mask)); } diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu index 2c59f6dcd29..975f03b37d6 100644 --- a/cpp/src/strings/convert/convert_fixed_point.cu +++ b/cpp/src/strings/convert/convert_fixed_point.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -249,7 +249,7 @@ struct dispatch_from_fixed_point_fn { return make_strings_column(input.size(), std::move(offsets), - std::move(chars), + std::move(chars->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input, stream, mr)); } diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu index 9b3ef8f452b..c56e723de8e 100644 --- a/cpp/src/strings/convert/convert_floats.cu +++ b/cpp/src/strings/convert/convert_floats.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -408,7 +408,7 @@ struct dispatch_from_floats_fn { return make_strings_column(strings_count, std::move(offsets), - std::move(chars), + std::move(chars->release().data.release()[0]), floats.null_count(), std::move(null_mask)); } diff --git a/cpp/src/strings/convert/convert_hex.cu b/cpp/src/strings/convert/convert_hex.cu index 8f656b149a5..68cff214507 100644 --- a/cpp/src/strings/convert/convert_hex.cu +++ b/cpp/src/strings/convert/convert_hex.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -180,12 +180,12 @@ struct dispatch_integers_to_hex_fn { { auto const d_column = column_device_view::create(input, stream); - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( integer_to_hex_fn{*d_column}, input.size(), stream, mr); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input, stream, mr)); } diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu index 56637e88e19..364cb534d2f 100644 --- a/cpp/src/strings/convert/convert_integers.cu +++ b/cpp/src/strings/convert/convert_integers.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -367,7 +367,7 @@ struct dispatch_from_integers_fn { return make_strings_column(strings_count, std::move(offsets), - std::move(chars), + std::move(chars->release().data.release()[0]), integers.null_count(), std::move(null_mask)); } diff --git a/cpp/src/strings/convert/convert_ipv4.cu b/cpp/src/strings/convert/convert_ipv4.cu index 75527e24e79..e07be26a23c 100644 --- a/cpp/src/strings/convert/convert_ipv4.cu +++ b/cpp/src/strings/convert/convert_ipv4.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -165,13 +165,13 @@ std::unique_ptr integers_to_ipv4(column_view const& integers, CUDF_EXPECTS(integers.type().id() == type_id::INT64, "Input column must be type_id::INT64 type"); - auto d_column = column_device_view::create(integers, stream); - auto children = cudf::strings::detail::make_strings_children( + auto d_column = column_device_view::create(integers, stream); + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( integers_to_ipv4_fn{*d_column}, integers.size(), stream, mr); return make_strings_column(integers.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), integers.null_count(), cudf::detail::copy_bitmask(integers, stream, mr)); } diff --git a/cpp/src/strings/convert/convert_lists.cu b/cpp/src/strings/convert/convert_lists.cu index f9f2b91eb12..1f22aea284b 100644 --- a/cpp/src/strings/convert/convert_lists.cu +++ b/cpp/src/strings/convert/convert_lists.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -216,14 +216,17 @@ std::unique_ptr format_list_column(lists_column_view const& input, auto const d_separators = column_device_view::create(separators.parent(), stream); auto const d_na_rep = na_rep.value(stream); - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( format_lists_fn{*d_input, *d_separators, d_na_rep, stack_buffer.data(), depth}, input.size(), stream, mr); - return make_strings_column( - input.size(), std::move(children.first), std::move(children.second), 0, rmm::device_buffer{}); + return make_strings_column(input.size(), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), + 0, + rmm::device_buffer{}); } } // namespace detail diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu index 511acc38d75..a9ddcfa12a2 100644 --- a/cpp/src/strings/convert/convert_urls.cu +++ b/cpp/src/strings/convert/convert_urls.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -135,12 +135,12 @@ std::unique_ptr url_encode(strings_column_view const& input, auto d_column = column_device_view::create(input.parent(), stream); - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( url_encoder_fn{*d_column}, input.size(), stream, mr); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } @@ -197,8 +197,8 @@ __forceinline__ __device__ char escaped_sequence_to_byte(char const* const ptr) * @param[out] out_counts Number of characters in each decode URL. */ template -__global__ void url_decode_char_counter(column_device_view const in_strings, - size_type* const out_counts) +CUDF_KERNEL void url_decode_char_counter(column_device_view const in_strings, + size_type* const out_counts) { constexpr int halo_size = 2; __shared__ char temporary_buffer[num_warps_per_threadblock][char_block_size + halo_size]; @@ -280,9 +280,9 @@ __global__ void url_decode_char_counter(column_device_view const in_strings, * @param[in] out_offsets Offset value of each string associated with `out_chars`. 
*/ template -__global__ void url_decode_char_replacer(column_device_view const in_strings, - char* const out_chars, - size_type const* const out_offsets) +CUDF_KERNEL void url_decode_char_replacer(column_device_view const in_strings, + char* const out_chars, + size_type const* const out_offsets) { constexpr int halo_size = 2; __shared__ char temporary_buffer[num_warps_per_threadblock][char_block_size + halo_size * 2]; @@ -409,8 +409,8 @@ std::unique_ptr url_decode(strings_column_view const& strings, auto out_chars_bytes = cudf::detail::get_value(offsets_view, offset_count - 1, stream); // create the chars column - auto chars_column = create_chars_child_column(out_chars_bytes, stream, mr); - auto d_out_chars = chars_column->mutable_view().data(); + rmm::device_uvector chars(out_chars_bytes, stream, mr); + auto d_out_chars = chars.data(); // decode and copy the characters from the input column to the output column url_decode_char_replacer @@ -422,7 +422,7 @@ std::unique_ptr url_decode(strings_column_view const& strings, return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + chars.release(), strings.null_count(), std::move(null_mask)); } diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 027466ef13c..c4564b1105b 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -112,7 +112,7 @@ auto create_strings_device_views(host_span views, rmm::cuda_s } template -__global__ void fused_concatenate_string_offset_kernel( +CUDF_KERNEL void fused_concatenate_string_offset_kernel( column_device_view const* input_views, size_t const* input_offsets, size_t const* partition_offsets, @@ -171,11 +171,11 @@ __global__ void fused_concatenate_string_offset_kernel( } } -__global__ void fused_concatenate_string_chars_kernel(column_device_view const* input_views, - size_t const* partition_offsets, - size_type const num_input_views, - size_type const 
output_size, - char* output_data) +CUDF_KERNEL void fused_concatenate_string_chars_kernel(column_device_view const* input_views, + size_t const* partition_offsets, + size_type const num_input_views, + size_type const output_size, + char* output_data) { cudf::thread_index_type output_index = threadIdx.x + blockIdx.x * blockDim.x; @@ -192,8 +192,7 @@ __global__ void fused_concatenate_string_chars_kernel(column_device_view const* auto const input_offsets_data = cudf::detail::input_offsetalator(offsets_child.head(), offsets_child.type()); - constexpr auto chars_child = strings_column_view::chars_column_index; - auto const* input_chars_data = input_view.child(chars_child).data(); + auto const* input_chars_data = input_view.head(); auto const first_char = input_offsets_data[input_view.offset()]; output_data[output_index] = input_chars_data[offset_index + first_char]; @@ -229,9 +228,8 @@ std::unique_ptr concatenate(host_span columns, std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.has_nulls(); }); // create output chars column - auto chars_column = create_chars_child_column(total_bytes, stream, mr); - auto d_new_chars = chars_column->mutable_view().data(); - chars_column->set_null_count(0); + rmm::device_uvector output_chars(total_bytes, stream, mr); + auto d_new_chars = output_chars.data(); // create output offsets column auto offsets_column = make_numeric_column( @@ -287,12 +285,11 @@ std::unique_ptr concatenate(host_span columns, continue; // empty column may not have children size_type column_offset = column->offset(); column_view offsets_child = column->child(strings_column_view::offsets_column_index); - column_view chars_child = column->child(strings_column_view::chars_column_index); auto const bytes_offset = get_offset_value(offsets_child, column_offset, stream); auto const bytes_end = get_offset_value(offsets_child, column_size + column_offset, stream); // copy the chars column data - auto d_chars = chars_child.data() + bytes_offset; + 
auto d_chars = column->head() + bytes_offset; auto const bytes = bytes_end - bytes_offset; CUDF_CUDA_TRY( @@ -306,7 +303,7 @@ std::unique_ptr concatenate(host_span columns, return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + output_chars.release(), null_count, std::move(null_mask)); } diff --git a/cpp/src/strings/copying/copying.cu b/cpp/src/strings/copying/copying.cu index 2295a80ff5b..4f37d3864ac 100644 --- a/cpp/src/strings/copying/copying.cu +++ b/cpp/src/strings/copying/copying.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -65,12 +66,10 @@ std::unique_ptr copy_slice(strings_column_view const& strings, } // slice the chars child column - auto const data_size = - cudf::detail::get_value(offsets_column->view(), strings_count, stream); - auto chars_column = std::make_unique( - cudf::detail::slice(strings.chars(), {chars_offset, chars_offset + data_size}, stream).front(), - stream, - mr); + auto const data_size = static_cast( + cudf::detail::get_value(offsets_column->view(), strings_count, stream)); + auto chars_buffer = + rmm::device_buffer{strings.chars_begin(stream) + chars_offset, data_size, stream, mr}; // slice the null mask auto null_mask = cudf::detail::copy_bitmask( @@ -81,7 +80,7 @@ std::unique_ptr copy_slice(strings_column_view const& strings, return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + std::move(chars_buffer), null_count, std::move(null_mask)); } diff --git a/cpp/src/strings/copying/shift.cu b/cpp/src/strings/copying/shift.cu index b54c433c23d..331cdecc36f 100644 --- a/cpp/src/strings/copying/shift.cu +++ b/cpp/src/strings/copying/shift.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 
2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -69,8 +69,7 @@ struct shift_chars_fn { auto const first_index = offset + d_column.child(strings_column_view::offsets_column_index) .element(d_column.offset() + d_column.size()); - return d_column.child(strings_column_view::chars_column_index) - .element(idx + first_index); + return d_column.head()[idx + first_index]; } else { auto const char_index = idx - last_index; return d_filler.data()[char_index % d_filler.size_bytes()]; @@ -79,10 +78,9 @@ struct shift_chars_fn { if (idx < offset) { return d_filler.data()[idx % d_filler.size_bytes()]; } else { - return d_column.child(strings_column_view::chars_column_index) - .element(idx - offset + - d_column.child(strings_column_view::offsets_column_index) - .element(d_column.offset())); + return d_column.head()[idx - offset + + d_column.child(strings_column_view::offsets_column_index) + .element(d_column.offset())]; } } } @@ -116,19 +114,19 @@ std::unique_ptr shift(strings_column_view const& input, }(); // create output chars child column - auto chars_column = create_chars_child_column(static_cast(total_bytes), stream, mr); - auto d_chars = mutable_column_device_view::create(chars_column->mutable_view(), stream); + rmm::device_uvector chars(total_bytes, stream, mr); + auto d_chars = chars.data(); // run kernel to shift all the characters thrust::transform(rmm::exec_policy(stream), thrust::counting_iterator(0), thrust::counting_iterator(total_bytes), - d_chars->data(), + d_chars, shift_chars_fn{*d_input, d_fill_str, shift_offset}); // caller sets the null-mask return make_strings_column( - input.size(), std::move(offsets_column), std::move(chars_column), 0, rmm::device_buffer{}); + input.size(), std::move(offsets_column), chars.release(), 0, rmm::device_buffer{}); } } // namespace cudf::strings::detail diff 
--git a/cpp/src/strings/filling/fill.cu b/cpp/src/strings/filling/fill.cu index 49e1b11c1db..d2e3b6f6af3 100644 --- a/cpp/src/strings/filling/fill.cu +++ b/cpp/src/strings/filling/fill.cu @@ -98,7 +98,7 @@ std::unique_ptr fill(strings_column_view const& input, return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), null_count, std::move(null_mask)); } diff --git a/cpp/src/strings/filter_chars.cu b/cpp/src/strings/filter_chars.cu index 9f95fedfe0b..7a26fc45dcb 100644 --- a/cpp/src/strings/filter_chars.cu +++ b/cpp/src/strings/filter_chars.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -139,11 +139,12 @@ std::unique_ptr filter_characters( // this utility calls the strip_fn to build the offsets and chars columns filter_fn ffn{*d_strings, keep_characters, table.begin(), table.end(), d_replacement}; - auto children = cudf::strings::detail::make_strings_children(ffn, strings.size(), stream, mr); + auto [offsets_column, chars_column] = + cudf::strings::detail::make_strings_children(ffn, strings.size(), stream, mr); return make_strings_column(strings_count, - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu index 850ccaa4535..ec77aea6338 100644 --- a/cpp/src/strings/padding.cu +++ b/cpp/src/strings/padding.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -112,7 +112,7 @@ std::unique_ptr pad(strings_column_view const& input, auto d_strings = column_device_view::create(input.parent(), stream); - auto children = [&] { + auto [offsets_column, chars_column] = [&] { if (side == side_type::LEFT) { auto fn = pad_fn{*d_strings, width, fill_char_size, d_fill_char}; return make_strings_children(fn, input.size(), stream, mr); @@ -125,8 +125,8 @@ std::unique_ptr pad(strings_column_view const& input, }(); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } @@ -151,11 +151,12 @@ std::unique_ptr zfill(strings_column_view const& input, if (input.is_empty()) return make_empty_column(type_id::STRING); auto d_strings = column_device_view::create(input.parent(), stream); - auto children = make_strings_children(zfill_fn{*d_strings, width}, input.size(), stream, mr); + auto [offsets_column, chars_column] = + make_strings_children(zfill_fn{*d_strings, width}, input.size(), stream, mr); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/strings/regex/utilities.cuh b/cpp/src/strings/regex/utilities.cuh index 23b53062bf3..bc8f5d68a4b 100644 --- a/cpp/src/strings/regex/utilities.cuh +++ b/cpp/src/strings/regex/utilities.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -37,7 +37,7 @@ namespace detail { constexpr auto regex_launch_kernel_block_size = 256; template -__global__ void for_each_kernel(ForEachFunction fn, reprog_device const d_prog, size_type size) +CUDF_KERNEL void for_each_kernel(ForEachFunction fn, reprog_device const d_prog, size_type size) { extern __shared__ u_char shmem[]; if (threadIdx.x == 0) { d_prog.store(shmem); } @@ -71,10 +71,10 @@ void launch_for_each_kernel(ForEachFunction fn, } template -__global__ void transform_kernel(TransformFunction fn, - reprog_device const d_prog, - OutputType* d_output, - size_type size) +CUDF_KERNEL void transform_kernel(TransformFunction fn, + reprog_device const d_prog, + OutputType* d_output, + size_type size) { extern __shared__ u_char shmem[]; if (threadIdx.x == 0) { d_prog.store(shmem); } diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu index 847a64f5602..b4a770f72bd 100644 --- a/cpp/src/strings/repeat_strings.cu +++ b/cpp/src/strings/repeat_strings.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -81,8 +81,6 @@ auto generate_empty_output(strings_column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto chars_column = create_chars_child_column(0, stream, mr); - auto offsets_column = make_numeric_column( data_type{type_to_id()}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); CUDF_CUDA_TRY(cudaMemsetAsync(offsets_column->mutable_view().template data(), @@ -92,7 +90,7 @@ auto generate_empty_output(strings_column_view const& input, return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + rmm::device_buffer{}, input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } @@ -166,7 +164,7 @@ std::unique_ptr repeat_strings(strings_column_view const& input, make_strings_children(fn, strings_count * repeat_times, strings_count, stream, mr); return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } @@ -252,7 +250,7 @@ std::unique_ptr repeat_strings(strings_column_view const& input, return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), null_count, std::move(null_mask)); } diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu index fc11b7d80b3..edec525a913 100644 --- a/cpp/src/strings/replace/backref_re.cu +++ b/cpp/src/strings/replace/backref_re.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -126,8 +126,8 @@ std::unique_ptr replace_with_backrefs(strings_column_view const& input, auto const d_strings = column_device_view::create(input.parent(), stream); - using BackRefIterator = decltype(backrefs.begin()); - auto children = make_strings_children( + using BackRefIterator = decltype(backrefs.begin()); + auto [offsets_column, chars_column] = make_strings_children( backrefs_fn{*d_strings, d_repl_template, backrefs.begin(), backrefs.end()}, *d_prog, input.size(), @@ -135,8 +135,8 @@ std::unique_ptr replace_with_backrefs(strings_column_view const& input, mr); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index 28736c2ca15..3d0210d61b0 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -74,10 +74,7 @@ using target_pair = thrust::pair; * @brief Helper functions for performing character-parallel replace */ struct replace_multi_parallel_fn { - __device__ char const* get_base_ptr() const - { - return d_strings.child(strings_column_view::chars_column_index).data(); - } + __device__ char const* get_base_ptr() const { return d_strings.head(); } __device__ size_type const* get_offsets_ptr() const { @@ -377,7 +374,8 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in }); // use this utility to gather the string parts into a contiguous chars column - auto chars = make_strings_column(indices.begin(), indices.end(), stream, mr); + auto chars = make_strings_column(indices.begin(), indices.end(), stream, mr); + auto chars_data = chars->release().data; // create offsets from the sizes offsets = @@ -386,7 +384,7 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in // build the strings columns from the chars and offsets return make_strings_column(strings_count, std::move(offsets), - std::move(chars->release().children.back()), + std::move(chars_data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } @@ -456,12 +454,12 @@ std::unique_ptr replace_string_parallel(strings_column_view const& input auto d_targets = column_device_view::create(targets.parent(), stream); auto d_replacements = column_device_view::create(repls.parent(), stream); - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( replace_multi_fn{*d_strings, *d_targets, *d_replacements}, input.size(), stream, mr); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } @@ -483,7 +481,8 @@ std::unique_ptr 
replace(strings_column_view const& input, CUDF_EXPECTS(repls.size() == targets.size(), "Sizes for targets and repls must match"); return (input.size() == input.null_count() || - ((input.chars_size() / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD)) + ((input.chars_size(stream) / (input.size() - input.null_count())) < + AVG_CHAR_BYTES_THRESHOLD)) ? replace_string_parallel(input, targets, repls, stream, mr) : replace_character_parallel(input, targets, repls, stream, mr); } diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index 3375cb7a789..c212d9f44ba 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -185,15 +185,15 @@ std::unique_ptr replace_re(strings_column_view const& input, auto found_ranges = rmm::device_uvector(d_progs.size() * input.size(), stream); - auto children = make_strings_children( + auto [offsets_column, chars_column] = make_strings_children( replace_multi_regex_fn{*d_strings, d_progs, found_ranges.data(), *d_repls}, input.size(), stream, mr); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu index aa955d3086e..936127f254b 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -415,7 +415,7 @@ std::unique_ptr replace_char_parallel(strings_column_view const& strings auto const strings_count = strings.size(); auto const offset_count = strings_count + 1; auto const d_offsets = strings.offsets_begin(); - auto const d_in_chars = strings.chars_begin(); + auto const d_in_chars = strings.chars_begin(stream); auto const chars_bytes = chars_end - chars_start; auto const target_size = d_target.size_bytes(); @@ -486,9 +486,8 @@ std::unique_ptr replace_char_parallel(strings_column_view const& strings offsets_update_fn); // build the characters column - auto chars_column = - create_chars_child_column(chars_bytes + (delta_per_target * target_count), stream, mr); - auto d_out_chars = chars_column->mutable_view().data(); + rmm::device_uvector chars(chars_bytes + (delta_per_target * target_count), stream, mr); + auto d_out_chars = chars.data(); thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(chars_start), @@ -501,7 +500,7 @@ std::unique_ptr replace_char_parallel(strings_column_view const& strings return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + chars.release(), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } @@ -532,12 +531,12 @@ std::unique_ptr replace_row_parallel(strings_column_view const& strings, auto d_strings = column_device_view::create(strings.parent(), stream); // this utility calls the given functor to build the offsets and chars columns - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( replace_row_parallel_fn{*d_strings, d_target, d_repl, maxrepl}, strings.size(), stream, mr); return make_strings_column(strings.size(), - std::move(children.first), - std::move(children.second), + 
std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } @@ -574,7 +573,7 @@ std::unique_ptr replace(strings_column_view con ? 0 : cudf::detail::get_value(strings.offsets(), strings.offset(), stream); size_type const chars_end = (offset_count == strings.offsets().size()) - ? strings.chars_size() + ? strings.chars_size(stream) : cudf::detail::get_value( strings.offsets(), strings.offset() + strings_count, stream); size_type const chars_bytes = chars_end - chars_start; @@ -612,7 +611,7 @@ std::unique_ptr replace( : cudf::detail::get_value( strings.offsets(), strings.offset(), stream); size_type chars_end = (offset_count == strings.offsets().size()) - ? strings.chars_size() + ? strings.chars_size(stream) : cudf::detail::get_value( strings.offsets(), strings.offset() + strings_count, stream); return replace_char_parallel( @@ -697,12 +696,12 @@ std::unique_ptr replace_slice(strings_column_view const& strings, auto d_strings = column_device_view::create(strings.parent(), stream); // this utility calls the given functor to build the offsets and chars columns - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( replace_slice_fn{*d_strings, d_repl, start, stop}, strings.size(), stream, mr); return make_strings_column(strings.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } @@ -733,8 +732,8 @@ std::unique_ptr replace_nulls(strings_column_view const& strings, auto d_offsets = offsets_column->view().data(); // build chars column - auto chars_column = create_chars_child_column(bytes, stream, mr); - auto d_chars = chars_column->mutable_view().data(); + rmm::device_uvector chars(bytes, stream, 
mr); + auto d_chars = chars.data(); thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), strings_count, @@ -745,7 +744,7 @@ std::unique_ptr replace_nulls(strings_column_view const& strings, }); return make_strings_column( - strings_count, std::move(offsets_column), std::move(chars_column), 0, rmm::device_buffer{}); + strings_count, std::move(offsets_column), chars.release(), 0, rmm::device_buffer{}); } } // namespace detail diff --git a/cpp/src/strings/replace/replace_re.cu b/cpp/src/strings/replace/replace_re.cu index 502d5f1a52e..10d83932928 100644 --- a/cpp/src/strings/replace/replace_re.cu +++ b/cpp/src/strings/replace/replace_re.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -116,12 +116,12 @@ std::unique_ptr replace_re(strings_column_view const& input, auto const d_strings = column_device_view::create(input.parent(), stream); - auto children = make_strings_children( + auto [offsets_column, chars_column] = make_strings_children( replace_regex_fn{*d_strings, d_repl, maxrepl}, *d_prog, input.size(), stream, mr); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/strings/reverse.cu b/cpp/src/strings/reverse.cu index 2855bdbb827..aecb029f25f 100644 --- a/cpp/src/strings/reverse.cu +++ b/cpp/src/strings/reverse.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -65,7 +65,7 @@ std::unique_ptr reverse(strings_column_view const& input, auto result = std::make_unique(input.parent(), stream, mr); auto const d_offsets = result->view().child(strings_column_view::offsets_column_index).data(); - auto d_chars = result->mutable_view().child(strings_column_view::chars_column_index).data(); + auto d_chars = result->mutable_view().head(); auto const d_column = column_device_view::create(input.parent(), stream); thrust::for_each_n(rmm::exec_policy(stream), diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu index d35f512e0f7..78343d58626 100644 --- a/cpp/src/strings/search/find.cu +++ b/cpp/src/strings/search/find.cu @@ -115,11 +115,11 @@ struct empty_target_fn { * @brief String per warp function for find/rfind */ template -__global__ void finder_warp_parallel_fn(column_device_view const d_strings, - TargetIterator const d_targets, - size_type const start, - size_type const stop, - size_type* d_results) +CUDF_KERNEL void finder_warp_parallel_fn(column_device_view const d_strings, + TargetIterator const d_targets, + size_type const start, + size_type const stop, + size_type* d_results) { size_type const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); @@ -186,7 +186,7 @@ void find_utility(strings_column_view const& input, { auto d_strings = column_device_view::create(input.parent(), stream); auto d_results = output.mutable_view().data(); - if ((input.chars_size() / (input.size() - input.null_count())) > AVG_CHAR_BYTES_THRESHOLD) { + if ((input.chars_size(stream) / (input.size() - input.null_count())) > AVG_CHAR_BYTES_THRESHOLD) { // warp-per-string runs faster for longer strings (but not shorter ones) constexpr int block_size = 256; cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size}; @@ -346,9 +346,9 @@ namespace { * @param d_target String to search for in each row of `d_strings` * @param d_results Indicates which rows contain `d_target` */ -__global__ void 
contains_warp_parallel_fn(column_device_view const d_strings, - string_view const d_target, - bool* d_results) +CUDF_KERNEL void contains_warp_parallel_fn(column_device_view const d_strings, + string_view const d_target, + bool* d_results) { size_type const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); using warp_reduce = cub::WarpReduce; @@ -538,7 +538,7 @@ std::unique_ptr contains(strings_column_view const& input, { // use warp parallel when the average string width is greater than the threshold if ((input.null_count() < input.size()) && - ((input.chars_size() / input.size()) > AVG_CHAR_BYTES_THRESHOLD)) { + ((input.chars_size(stream) / input.size()) > AVG_CHAR_BYTES_THRESHOLD)) { return contains_warp_parallel(input, target, stream, mr); } diff --git a/cpp/src/strings/slice.cu b/cpp/src/strings/slice.cu index 5a1fee92c7d..1e55986fdb8 100644 --- a/cpp/src/strings/slice.cu +++ b/cpp/src/strings/slice.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -209,7 +209,7 @@ std::unique_ptr slice_strings(strings_column_view const& strings, return make_strings_column(strings.size(), std::move(offsets), - std::move(chars), + std::move(chars->release().data.release()[0]), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh index dc0b04af388..c5fb44fc3dd 100644 --- a/cpp/src/strings/split/split.cuh +++ b/cpp/src/strings/split/split.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -49,10 +49,7 @@ namespace cudf::strings::detail { */ template struct base_split_tokenizer { - __device__ char const* get_base_ptr() const - { - return d_strings.child(strings_column_view::chars_column_index).data(); - } + __device__ char const* get_base_ptr() const { return d_strings.head(); } __device__ string_view const get_string(size_type idx) const { diff --git a/cpp/src/strings/strings_column_factories.cu b/cpp/src/strings/strings_column_factories.cu index 0b55e18b00a..5ba4d8d3132 100644 --- a/cpp/src/strings/strings_column_factories.cu +++ b/cpp/src/strings/strings_column_factories.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -131,10 +131,33 @@ std::unique_ptr make_strings_column(size_type num_strings, std::vector> children; children.emplace_back(std::move(offsets_column)); - children.emplace_back(std::move(chars_column)); return std::make_unique(data_type{type_id::STRING}, num_strings, - rmm::device_buffer{}, + std::move(*(chars_column->release().data.release())), + std::move(null_mask), + null_count, + std::move(children)); +} + +std::unique_ptr make_strings_column(size_type num_strings, + std::unique_ptr offsets_column, + rmm::device_buffer&& chars_buffer, + size_type null_count, + rmm::device_buffer&& null_mask) +{ + CUDF_FUNC_RANGE(); + + if (null_count > 0) { CUDF_EXPECTS(null_mask.size() > 0, "Column with nulls must be nullable."); } + CUDF_EXPECTS(num_strings == offsets_column->size() - 1, + "Invalid offsets column size for strings column."); + CUDF_EXPECTS(offsets_column->null_count() == 0, "Offsets column should not contain nulls"); + + std::vector> children; + children.emplace_back(std::move(offsets_column)); + + return std::make_unique(data_type{type_id::STRING}, + num_strings, + std::move(chars_buffer), std::move(null_mask), 
null_count, std::move(children)); @@ -151,7 +174,6 @@ std::unique_ptr make_strings_column(size_type num_strings, if (num_strings == 0) { return make_empty_column(type_id::STRING); } auto const offsets_size = static_cast(offsets.size()); - auto const chars_size = static_cast(chars.size()); if (null_count > 0) CUDF_EXPECTS(null_mask.size() > 0, "Column with nulls must be nullable."); @@ -164,21 +186,13 @@ std::unique_ptr make_strings_column(size_type num_strings, rmm::device_buffer(), 0); - auto chars_column = std::make_unique( // - data_type{type_id::INT8}, - chars_size, - chars.release(), - rmm::device_buffer(), - 0); - auto children = std::vector>(); children.emplace_back(std::move(offsets_column)); - children.emplace_back(std::move(chars_column)); return std::make_unique(data_type{type_id::STRING}, num_strings, - rmm::device_buffer{}, + chars.release(), std::move(null_mask), null_count, std::move(children)); diff --git a/cpp/src/strings/strings_column_view.cpp b/cpp/src/strings/strings_column_view.cpp index 4b206666d4b..27a8c6fb17f 100644 --- a/cpp/src/strings/strings_column_view.cpp +++ b/cpp/src/strings/strings_column_view.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,9 +14,12 @@ * limitations under the License. 
*/ +#include #include #include +#include + namespace cudf { // strings_column_view::strings_column_view(column_view strings_column) : column_view(strings_column) @@ -42,26 +45,28 @@ strings_column_view::offset_iterator strings_column_view::offsets_end() const return offsets_begin() + size() + 1; } -column_view strings_column_view::chars() const +column_view strings_column_view::chars(rmm::cuda_stream_view stream) const { CUDF_EXPECTS(num_children() > 0, "strings column has no children"); - return child(chars_column_index); + return column_view( + data_type{type_id::INT8}, chars_size(stream), chars_begin(stream), nullptr, 0, 0); } -size_type strings_column_view::chars_size() const noexcept +size_type strings_column_view::chars_size(rmm::cuda_stream_view stream) const noexcept { if (size() == 0) return 0; - return chars().size(); + return detail::get_value(offsets(), offsets().size() - 1, stream); } -strings_column_view::chars_iterator strings_column_view::chars_begin() const +strings_column_view::chars_iterator strings_column_view::chars_begin(rmm::cuda_stream_view) const { - return chars().begin(); + return head(); } -strings_column_view::chars_iterator strings_column_view::chars_end() const +strings_column_view::chars_iterator strings_column_view::chars_end( + rmm::cuda_stream_view stream) const { - return chars_begin() + chars_size(); + return chars_begin(stream) + chars_size(stream); } } // namespace cudf diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu index 0ca5e103d3d..039a8ac8a62 100644 --- a/cpp/src/strings/translate.cu +++ b/cpp/src/strings/translate.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -110,12 +110,12 @@ std::unique_ptr translate(strings_column_view const& strings, auto d_strings = column_device_view::create(strings.parent(), stream); - auto children = make_strings_children( + auto [offsets_column, chars_column] = make_strings_children( translate_fn{*d_strings, table.begin(), table.end()}, strings.size(), stream, mr); return make_strings_column(strings.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } diff --git a/cpp/src/strings/wrap.cu b/cpp/src/strings/wrap.cu index aa87a663964..19f1ac55bb0 100644 --- a/cpp/src/strings/wrap.cu +++ b/cpp/src/strings/wrap.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -111,8 +112,11 @@ std::unique_ptr wrap(strings_column_view const& strings, auto offsets_column = std::make_unique(strings.offsets(), stream, mr); // makes a copy auto d_new_offsets = offsets_column->view().template data(); - auto chars_column = std::make_unique(strings.chars(), stream, mr); // makes a copy - auto d_chars = chars_column->mutable_view().data(); + auto chars_buffer = rmm::device_buffer{strings.chars_begin(stream), + static_cast(strings.chars_size(stream)), + stream, + mr}; // makes a copy + auto d_chars = static_cast(chars_buffer.data()); device_execute_functor d_execute_fctr{d_column, d_new_offsets, d_chars, width}; @@ -123,7 +127,7 @@ std::unique_ptr wrap(strings_column_view const& strings, return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + std::move(chars_buffer), null_count, std::move(null_mask)); } diff --git a/cpp/src/structs/utilities.cpp b/cpp/src/structs/utilities.cpp index acb153f28d6..f47d066852c 100644 --- a/cpp/src/structs/utilities.cpp +++ b/cpp/src/structs/utilities.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -229,6 +230,7 @@ std::unique_ptr superimpose_nulls_no_sanitize(bitmask_type const* null_m rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + CUDF_FUNC_RANGE(); if (input->type().id() == cudf::type_id::EMPTY) { // EMPTY columns should not have a null mask, // so don't superimpose null mask on empty columns. @@ -258,19 +260,11 @@ std::unique_ptr superimpose_nulls_no_sanitize(bitmask_type const* null_m // If the input is also a struct, repeat for all its children. Otherwise just return. 
if (input->type().id() != cudf::type_id::STRUCT) { return std::move(input); } - auto const current_mask = input->view().null_mask(); auto const new_null_count = input->null_count(); // this was just computed in the step above auto content = input->release(); - // Build new children columns. - std::for_each(content.children.begin(), - content.children.end(), - [current_mask, new_null_count, stream, mr](auto& child) { - child = superimpose_nulls_no_sanitize( - current_mask, new_null_count, std::move(child), stream, mr); - }); - // Replace the children columns. + // make_structs_column recursively calls superimpose_nulls return cudf::make_structs_column(num_rows, std::move(content.children), new_null_count, diff --git a/cpp/src/text/bpe/byte_pair_encoding.cu b/cpp/src/text/bpe/byte_pair_encoding.cu index 2d53faf548e..c6d299424d2 100644 --- a/cpp/src/text/bpe/byte_pair_encoding.cu +++ b/cpp/src/text/bpe/byte_pair_encoding.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -122,11 +122,11 @@ struct bpe_unpairable_offsets_fn { * @param d_rerank_data Working memory to hold locations where reranking is required */ template -__global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, - MapRefType const d_map, - int8_t* d_spaces_data, // working memory - cudf::size_type* d_ranks_data, // more working memory - int8_t* d_rerank_data // and one more working memory +CUDF_KERNEL void bpe_parallel_fn(cudf::column_device_view const d_strings, + MapRefType const d_map, + int8_t* d_spaces_data, // working memory + cudf::size_type* d_ranks_data, // more working memory + int8_t* d_rerank_data // and one more working memory ) { // string per block @@ -291,9 +291,9 @@ __global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, * @param d_spaces_data Output the location where separator will be inserted * @param d_sizes Output sizes of each row */ -__global__ void bpe_finalize(cudf::column_device_view const d_strings, - int8_t* d_spaces_data, // where separators are inserted - cudf::size_type* d_sizes // output sizes of encoded strings +CUDF_KERNEL void bpe_finalize(cudf::column_device_view const d_strings, + int8_t* d_spaces_data, // where separators are inserted + cudf::size_type* d_sizes // output sizes of encoded strings ) { // string per block @@ -342,7 +342,7 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (input.is_empty() || input.chars_size() == 0) { + if (input.is_empty() || input.chars_size(stream) == 0) { return cudf::make_empty_column(cudf::type_id::STRING); } @@ -356,11 +356,11 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const : cudf::detail::get_value( input.offsets(), input.offset(), stream); auto const last_offset = (input.offset() == 0 && input.size() == input.offsets().size() - 1) - ? input.chars().size() + ? 
input.chars_size(stream) : cudf::detail::get_value( input.offsets(), input.size() + input.offset(), stream); auto const chars_size = last_offset - first_offset; - auto const d_input_chars = input.chars().data() + first_offset; + auto const d_input_chars = input.chars_begin(stream) + first_offset; auto const offset_data_type = cudf::data_type{cudf::type_to_id()}; auto offsets = cudf::make_numeric_column( @@ -406,7 +406,7 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const cudf::column_view(cudf::device_span(tmp_offsets)); auto const tmp_size = offsets_total - 1; auto const tmp_input = cudf::column_view( - input.parent().type(), tmp_size, nullptr, nullptr, 0, 0, {col_offsets, input.chars()}); + input.parent().type(), tmp_size, input.chars_begin(stream), nullptr, 0, 0, {col_offsets}); auto const d_tmp_strings = cudf::column_device_view::create(tmp_input, stream); // launch the byte-pair-encoding kernel on the temp column @@ -429,8 +429,8 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const std::overflow_error); // build the output: inserting separators to the input character data - auto chars = cudf::strings::detail::create_chars_child_column(bytes, stream, mr); - auto d_chars = chars->mutable_view().data(); + rmm::device_uvector chars(bytes, stream, mr); + auto d_chars = chars.data(); auto const d_inserts = d_working.data(); // stores the insert positions auto offsets_at_non_zero = [d_spaces = d_spaces.data()] __device__(auto idx) { @@ -453,7 +453,7 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const return cudf::make_strings_column(input.size(), std::move(offsets), - std::move(chars), + chars.release(), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/text/detokenize.cu b/cpp/src/text/detokenize.cu index 38cb7dd6753..60625d6383a 100644 --- a/cpp/src/text/detokenize.cu +++ b/cpp/src/text/detokenize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA 
CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -156,15 +156,18 @@ std::unique_ptr detokenize(cudf::strings_column_view const& string cudf::string_view const d_separator(separator.data(), separator.size()); - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( detokenizer_fn{*strings_column, d_row_map, tokens_offsets.data(), d_separator}, output_count, stream, mr); // make the output strings column from the offsets and chars column - return cudf::make_strings_column( - output_count, std::move(children.first), std::move(children.second), 0, rmm::device_buffer{}); + return cudf::make_strings_column(output_count, + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), + 0, + rmm::device_buffer{}); } } // namespace detail diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index 31e2405ce88..882d9a04501 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -128,23 +128,26 @@ std::unique_ptr generate_ngrams(cudf::strings_column_view const& s // create a temporary column view from the non-empty offsets and chars column views cudf::column_view strings_view(cudf::data_type{cudf::type_id::STRING}, strings_count, - nullptr, + strings.chars_begin(stream), nullptr, 0, 0, - {non_empty_offsets_column->view(), strings.chars()}); + {non_empty_offsets_column->view()}); strings_column = cudf::column_device_view::create(strings_view, stream); d_strings = *strings_column; // compute the number of strings of ngrams auto const ngrams_count = strings_count - ngrams + 1; - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( ngram_generator_fn{d_strings, ngrams, d_separator}, ngrams_count, stream, mr); // make the output strings column from the offsets and chars column - return cudf::make_strings_column( - ngrams_count, std::move(children.first), std::move(children.second), 0, rmm::device_buffer{}); + return cudf::make_strings_column(ngrams_count, + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), + 0, + rmm::device_buffer{}); } } // namespace detail @@ -239,8 +242,11 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( generator, strings_count, total_ngrams, stream, mr); - return cudf::make_strings_column( - total_ngrams, std::move(offsets_column), std::move(chars_column), 0, rmm::device_buffer{}); + return cudf::make_strings_column(total_ngrams, + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), + 0, + rmm::device_buffer{}); } namespace { diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 4e0a538ffe9..dcb59166cec 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -62,10 +62,10 @@ template < typename HashFunction, typename hash_value_type = std:: conditional_t, uint32_t, uint64_t>> -__global__ void minhash_kernel(cudf::column_device_view const d_strings, - cudf::device_span seeds, - cudf::size_type width, - hash_value_type* d_hashes) +CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, + cudf::device_span seeds, + cudf::size_type width, + hash_value_type* d_hashes) { auto const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); if (idx >= (static_cast(d_strings.size()) * diff --git a/cpp/src/text/ngrams_tokenize.cu b/cpp/src/text/ngrams_tokenize.cu index bc5cd04eac6..642dca5fc47 100644 --- a/cpp/src/text/ngrams_tokenize.cu +++ b/cpp/src/text/ngrams_tokenize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -232,9 +232,8 @@ std::unique_ptr ngrams_tokenize(cudf::strings_column_view const& s rmm::device_uvector ngram_sizes(total_ngrams, stream); // build output chars column - auto chars_column = cudf::strings::detail::create_chars_child_column( - static_cast(output_chars_size), stream, mr); - auto d_chars = chars_column->mutable_view().data(); + rmm::device_uvector chars(output_chars_size, stream, mr); + auto d_chars = chars.data(); // Generate the ngrams into the chars column data buffer. // The ngram_builder_fn functor also fills the ngram_sizes vector with the // size of each ngram. 
@@ -253,11 +252,10 @@ std::unique_ptr ngrams_tokenize(cudf::strings_column_view const& s // build the offsets column -- converting the ngram sizes into offsets auto offsets_column = std::get<0>( cudf::detail::make_offsets_child_column(ngram_sizes.begin(), ngram_sizes.end(), stream, mr)); - chars_column->set_null_count(0); offsets_column->set_null_count(0); // create the output strings column return make_strings_column( - total_ngrams, std::move(offsets_column), std::move(chars_column), 0, rmm::device_buffer{}); + total_ngrams, std::move(offsets_column), chars.release(), 0, rmm::device_buffer{}); } } // namespace detail diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index 0fc1d221b15..d46ca25835f 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -182,12 +182,12 @@ std::unique_ptr normalize_spaces(cudf::strings_column_view const& auto d_strings = cudf::column_device_view::create(strings.parent(), stream); // build offsets and children using the normalize_space_fn - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( normalize_spaces_fn{*d_strings}, strings.size(), stream, mr); return cudf::make_strings_column(strings.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } @@ -210,7 +210,7 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con auto const offsets = strings.offsets(); auto const d_offsets = offsets.data() + strings.offset(); auto const offset = cudf::detail::get_value(offsets, strings.offset(), stream); - auto const d_chars = strings.chars().data() + offset; + auto const d_chars = strings.chars_begin(stream) + offset; return normalizer.normalize(d_chars, d_offsets, strings.size(), stream); }(); @@ -228,12 +228,12 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con auto d_strings = cudf::column_device_view::create(strings.parent(), stream); // build offsets and children using the codepoint_to_utf8_fn - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( codepoint_to_utf8_fn{*d_strings, cp_chars, cp_offsets}, strings.size(), stream, mr); return cudf::make_strings_column(strings.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } diff --git a/cpp/src/text/replace.cu b/cpp/src/text/replace.cu index 
a4b28fe2dab..50d7bbd077d 100644 --- a/cpp/src/text/replace.cu +++ b/cpp/src/text/replace.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -228,12 +228,13 @@ std::unique_ptr replace_tokens(cudf::strings_column_view const& st rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // this utility calls replacer to build the offsets and chars columns - auto children = cudf::strings::detail::make_strings_children(replacer, strings_count, stream, mr); + auto [offsets_column, chars_column] = + cudf::strings::detail::make_strings_children(replacer, strings_count, stream, mr); // return new strings column return cudf::make_strings_column(strings_count, - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), std::move(null_mask)); } @@ -260,12 +261,13 @@ std::unique_ptr filter_tokens(cudf::strings_column_view const& str rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // this utility calls filterer to build the offsets and chars columns - auto children = cudf::strings::detail::make_strings_children(filterer, strings_count, stream, mr); + auto [offsets_column, chars_column] = + cudf::strings::detail::make_strings_children(filterer, strings_count, stream, mr); // return new strings column return cudf::make_strings_column(strings_count, - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), std::move(null_mask)); } diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu index 34eb95bea5c..c83bc2e318f 100644 --- 
a/cpp/src/text/subword/data_normalizer.cu +++ b/cpp/src/text/subword/data_normalizer.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -204,13 +204,13 @@ extract_code_points_from_utf8(unsigned char const* strings, * @param[out] code_points The resulting code point values from normalization. * @param[out] chars_per_thread Output number of code point values per string. */ -__global__ void kernel_data_normalizer(unsigned char const* strings, - size_t const total_bytes, - uint32_t const* cp_metadata, - uint64_t const* aux_table, - bool const do_lower_case, - uint32_t* code_points, - uint32_t* chars_per_thread) +CUDF_KERNEL void kernel_data_normalizer(unsigned char const* strings, + size_t const total_bytes, + uint32_t const* cp_metadata, + uint64_t const* aux_table, + bool const do_lower_case, + uint32_t* code_points, + uint32_t* chars_per_thread) { constexpr uint32_t init_val = (1 << FILTER_BIT); uint32_t replacement_code_points[MAX_NEW_CHARS] = {init_val, init_val, init_val}; diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu index 1a3084a257f..c9592e5cc48 100644 --- a/cpp/src/text/subword/subword_tokenize.cu +++ b/cpp/src/text/subword/subword_tokenize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -56,7 +56,7 @@ namespace { * @param[out] attn_mask Identifies valid token id entries * @param[out] metadata Additional data per row */ -__global__ void kernel_compute_tensor_metadata( +CUDF_KERNEL void kernel_compute_tensor_metadata( // input uint32_t const* token_ids, cudf::size_type const* offsets, @@ -186,7 +186,7 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings, auto const offsets = strings.offsets(); auto const d_offsets = offsets.data() + strings.offset(); auto const offset = cudf::detail::get_value(offsets, strings.offset(), stream); - auto const d_chars = strings.chars().data() + offset; + auto const d_chars = strings.chars_begin(stream) + offset; // Create tokenizer wordpiece_tokenizer tokenizer( diff --git a/cpp/src/text/subword/wordpiece_tokenizer.cu b/cpp/src/text/subword/wordpiece_tokenizer.cu index 3b912017320..d2804af5f8b 100644 --- a/cpp/src/text/subword/wordpiece_tokenizer.cu +++ b/cpp/src/text/subword/wordpiece_tokenizer.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -75,12 +75,12 @@ namespace { * @param[out] tokens_per_word An array of size `num_code_points` which hold the number of * tokens. This kernel just sets all the values to 0. 
*/ -__global__ void init_data_and_mark_word_start_and_ends(uint32_t const* code_points, - uint32_t* start_word_indices, - uint32_t* end_word_indices, - size_t num_code_points, - uint32_t* token_ids, - uint8_t* tokens_per_word) +CUDF_KERNEL void init_data_and_mark_word_start_and_ends(uint32_t const* code_points, + uint32_t* start_word_indices, + uint32_t* end_word_indices, + size_t num_code_points, + uint32_t* token_ids, + uint8_t* tokens_per_word) { cudf::thread_index_type char_for_thread = static_cast(blockDim.x) * static_cast(blockIdx.x) + @@ -131,11 +131,11 @@ __global__ void init_data_and_mark_word_start_and_ends(uint32_t const* code_poin * written to indicate this. * @param num_strings The total number of strings to be processed. */ -__global__ void mark_string_start_and_ends(uint32_t const* code_points, - cudf::size_type const* strings_offsets, - uint32_t* start_word_indices, - uint32_t* end_word_indices, - uint32_t num_strings) +CUDF_KERNEL void mark_string_start_and_ends(uint32_t const* code_points, + cudf::size_type const* strings_offsets, + uint32_t* start_word_indices, + uint32_t* end_word_indices, + uint32_t num_strings) { cudf::thread_index_type idx = static_cast(blockDim.x) * static_cast(blockIdx.x) + @@ -319,20 +319,20 @@ struct mark_special_tokens { * @param outer_hash_b_param: The b parameter for the outer hash * @param num_outer_bins: The number of bins for the outer hash */ -__global__ void kernel_wordpiece_tokenizer(uint32_t const* code_points, - uint64_t const* hash_table, - uint64_t const* bin_coefficients, - uint16_t const* bin_offsets, - uint16_t unk_token_id, - uint32_t outer_hash_a_param, - uint32_t outer_hash_b_param, - uint16_t num_outer_bins, - uint32_t const* word_starts, - uint32_t const* word_ends, - uint32_t max_word_length, - uint32_t total_words, - uint32_t* token_ids, - uint8_t* tokens_per_word) +CUDF_KERNEL void kernel_wordpiece_tokenizer(uint32_t const* code_points, + uint64_t const* hash_table, + uint64_t const* 
bin_coefficients, + uint16_t const* bin_offsets, + uint16_t unk_token_id, + uint32_t outer_hash_a_param, + uint32_t outer_hash_b_param, + uint16_t num_outer_bins, + uint32_t const* word_starts, + uint32_t const* word_ends, + uint32_t max_word_length, + uint32_t total_words, + uint32_t* token_ids, + uint8_t* tokens_per_word) { cudf::thread_index_type word_to_tokenize = static_cast(blockDim.x) * static_cast(blockIdx.x) + diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu index 87f6a61a533..c256607fb23 100644 --- a/cpp/src/text/tokenize.cu +++ b/cpp/src/text/tokenize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -182,7 +182,8 @@ std::unique_ptr character_tokenize(cudf::strings_column_view const auto chars_bytes = cudf::detail::get_value( offsets, strings_column.offset() + strings_count, stream) - offset; - auto d_chars = strings_column.chars().data(); // unsigned is necessary for checking bits + auto d_chars = + strings_column.parent().data(); // unsigned is necessary for checking bits d_chars += offset; // To minimize memory, count the number of characters so we can @@ -218,14 +219,13 @@ std::unique_ptr character_tokenize(cudf::strings_column_view const return idx < chars_bytes ? 
cudf::strings::detail::is_begin_utf8_char(d_chars[idx]) : true; }); - // create the output chars column -- just a copy of the input's chars column - cudf::column_view chars_view( - cudf::data_type{cudf::type_id::INT8}, chars_bytes, d_chars, nullptr, 0); - auto chars_column = std::make_unique(chars_view, stream, mr); + // create the output chars buffer -- just a copy of the input's chars + rmm::device_uvector output_chars(chars_bytes, stream, mr); + thrust::copy(rmm::exec_policy(stream), d_chars, d_chars + chars_bytes, output_chars.data()); // return new strings column return cudf::make_strings_column( - num_characters, std::move(offsets_column), std::move(chars_column), 0, rmm::device_buffer{}); + num_characters, std::move(offsets_column), output_chars.release(), 0, rmm::device_buffer{}); } } // namespace detail diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu index 511f1995374..a9e8d4d9a24 100644 --- a/cpp/src/text/vocabulary_tokenize.cu +++ b/cpp/src/text/vocabulary_tokenize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -214,10 +214,10 @@ struct mark_delimiters_fn { } }; -__global__ void token_counts_fn(cudf::column_device_view const d_strings, - cudf::string_view const d_delimiter, - cudf::size_type* d_counts, - int8_t* d_results) +CUDF_KERNEL void token_counts_fn(cudf::column_device_view const d_strings, + cudf::string_view const d_delimiter, + cudf::size_type* d_counts, + int8_t* d_results) { // string per warp auto const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); @@ -240,10 +240,8 @@ __global__ void token_counts_fn(cudf::column_device_view const d_strings, auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index).data(); - auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()]; - auto const chars_begin = - d_strings.child(cudf::strings_column_view::chars_column_index).data() + - offsets[d_strings.offset()]; + auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()]; + auto const chars_begin = d_strings.data() + offsets[d_strings.offset()]; auto const begin = d_str.data(); auto const end = begin + d_str.size_bytes(); @@ -372,7 +370,7 @@ std::unique_ptr tokenize_with_vocabulary(cudf::strings_column_view auto map_ref = vocabulary._impl->get_map_ref(); auto const zero_itr = thrust::make_counting_iterator(0); - if ((input.chars_size() / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) { + if ((input.chars_size(stream) / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) { auto const sizes_itr = cudf::detail::make_counting_transform_iterator(0, strings_tokenizer{*d_strings, d_delimiter}); auto [token_offsets, total_count] = @@ -401,11 +399,11 @@ std::unique_ptr tokenize_with_vocabulary(cudf::strings_column_view : cudf::detail::get_value( input.offsets(), input.offset(), stream); auto const last_offset = (input.offset() == 0 && input.size() == input.offsets().size() - 1) - ? input.chars().size() + ? 
input.chars_size(stream) : cudf::detail::get_value( input.offsets(), input.size() + input.offset(), stream); auto const chars_size = last_offset - first_offset; - auto const d_input_chars = input.chars().data() + first_offset; + auto const d_input_chars = input.chars_begin(stream) + first_offset; rmm::device_uvector d_token_counts(input.size(), stream); rmm::device_uvector d_marks(chars_size, stream); @@ -436,9 +434,8 @@ std::unique_ptr tokenize_with_vocabulary(cudf::strings_column_view auto tmp_offsets = std::make_unique(std::move(d_tmp_offsets), rmm::device_buffer{}, 0); - auto tmp_chars = cudf::column_view(input.chars().type(), chars_size, d_input_chars, nullptr, 0); auto const tmp_input = cudf::column_view( - input.parent().type(), total_count, nullptr, nullptr, 0, 0, {tmp_offsets->view(), tmp_chars}); + input.parent().type(), total_count, d_input_chars, nullptr, 0, 0, {tmp_offsets->view()}); auto const d_tmp_strings = cudf::column_device_view::create(tmp_input, stream); diff --git a/cpp/src/transform/compute_column.cu b/cpp/src/transform/compute_column.cu index 224dd93b048..eaf47adec10 100644 --- a/cpp/src/transform/compute_column.cu +++ b/cpp/src/transform/compute_column.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -54,7 +54,7 @@ namespace detail { * @param output_column The destination for the results of evaluating the expression. 
*/ template -__launch_bounds__(max_block_size) __global__ +__launch_bounds__(max_block_size) CUDF_KERNEL void compute_column_kernel(table_device_view const table, ast::detail::expression_device_view device_expression_data, mutable_column_device_view output_column) diff --git a/cpp/src/transform/jit/kernel.cu b/cpp/src/transform/jit/kernel.cu index 0170cc50c6f..1e913ecb5bb 100644 --- a/cpp/src/transform/jit/kernel.cu +++ b/cpp/src/transform/jit/kernel.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,7 +35,7 @@ namespace transformation { namespace jit { template -__global__ void kernel(cudf::size_type size, TypeOut* out_data, TypeIn* in_data) +CUDF_KERNEL void kernel(cudf::size_type size, TypeOut* out_data, TypeIn* in_data) { // cannot use global_thread_id utility due to a JIT build issue by including // the `cudf/detail/utilities/cuda.cuh` header diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu index b151b44565d..a91dc8fbbc6 100644 --- a/cpp/src/transform/row_bit_count.cu +++ b/cpp/src/transform/row_bit_count.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -398,10 +398,10 @@ __device__ size_type row_size_functor::operator()(column_device_vie * @param output Output span of size (# rows) where per-row bit sizes are stored * @param max_branch_depth Maximum depth of the span stack needed per-thread */ -__global__ void compute_row_sizes(device_span cols, - device_span info, - device_span output, - size_type max_branch_depth) +CUDF_KERNEL void compute_row_sizes(device_span cols, + device_span info, + device_span output, + size_type max_branch_depth) { extern __shared__ row_span thread_branch_stacks[]; int const tid = threadIdx.x + blockIdx.x * blockDim.x; diff --git a/cpp/src/transform/row_conversion.cu b/cpp/src/transform/row_conversion.cu index a1c5827e5da..b294369a90e 100644 --- a/cpp/src/transform/row_conversion.cu +++ b/cpp/src/transform/row_conversion.cu @@ -314,14 +314,14 @@ struct fixed_width_row_offset_functor { * @param output_nm array of pointers to the output null masks * @param input_data pointing to the incoming row data */ -__global__ void copy_from_rows_fixed_width_optimized(const size_type num_rows, - const size_type num_columns, - const size_type row_size, - const size_type* input_offset_in_row, - const size_type* num_bytes, - int8_t** output_data, - bitmask_type** output_nm, - const int8_t* input_data) +CUDF_KERNEL void copy_from_rows_fixed_width_optimized(const size_type num_rows, + const size_type num_columns, + const size_type row_size, + const size_type* input_offset_in_row, + const size_type* num_bytes, + int8_t** output_data, + bitmask_type** output_nm, + const int8_t* input_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. 
@@ -433,15 +433,15 @@ __global__ void copy_from_rows_fixed_width_optimized(const size_type num_rows, } } -__global__ void copy_to_rows_fixed_width_optimized(const size_type start_row, - const size_type num_rows, - const size_type num_columns, - const size_type row_size, - const size_type* output_offset_in_row, - const size_type* num_bytes, - const int8_t** input_data, - const bitmask_type** input_nm, - int8_t* output_data) +CUDF_KERNEL void copy_to_rows_fixed_width_optimized(const size_type start_row, + const size_type num_rows, + const size_type num_columns, + const size_type row_size, + const size_type* output_offset_in_row, + const size_type* num_bytes, + const int8_t** input_data, + const bitmask_type** input_nm, + int8_t* output_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. @@ -588,16 +588,16 @@ __global__ void copy_to_rows_fixed_width_optimized(const size_type start_row, * */ template -__global__ void copy_to_rows(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_tile, - device_span tile_infos, - const int8_t** input_data, - const size_type* col_sizes, - const size_type* col_offsets, - RowOffsetFunctor row_offsets, - size_type const* batch_row_boundaries, - int8_t** output_data) +CUDF_KERNEL void copy_to_rows(const size_type num_rows, + const size_type num_columns, + const size_type shmem_used_per_tile, + device_span tile_infos, + const int8_t** input_data, + const size_type* col_sizes, + const size_type* col_offsets, + RowOffsetFunctor row_offsets, + size_type const* batch_row_boundaries, + int8_t** output_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. 
@@ -731,15 +731,15 @@ __global__ void copy_to_rows(const size_type num_rows, * */ template -__global__ void copy_validity_to_rows(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_tile, - RowOffsetFunctor row_offsets, - size_type const* batch_row_boundaries, - int8_t** output_data, - const size_type validity_offset, - device_span tile_infos, - const bitmask_type** input_nm) +CUDF_KERNEL void copy_validity_to_rows(const size_type num_rows, + const size_type num_columns, + const size_type shmem_used_per_tile, + RowOffsetFunctor row_offsets, + size_type const* batch_row_boundaries, + int8_t** output_data, + const size_type validity_offset, + device_span tile_infos, + const bitmask_type** input_nm) { extern __shared__ int8_t shared_data[]; @@ -851,15 +851,15 @@ __global__ void copy_validity_to_rows(const size_type num_rows, * */ template -__global__ void copy_strings_to_rows(size_type const num_rows, - size_type const num_variable_columns, - int8_t const** variable_input_data, - size_type const* variable_col_output_offsets, - size_type const** variable_col_offsets, - size_type fixed_width_row_size, - RowOffsetFunctor row_offsets, - size_type const batch_row_offset, - int8_t* output_data) +CUDF_KERNEL void copy_strings_to_rows(size_type const num_rows, + size_type const num_variable_columns, + int8_t const** variable_input_data, + size_type const* variable_col_output_offsets, + size_type const** variable_col_offsets, + size_type fixed_width_row_size, + RowOffsetFunctor row_offsets, + size_type const batch_row_offset, + int8_t* output_data) { // Each block will take a group of rows controlled by NUM_STRING_ROWS_PER_BLOCK_TO_ROWS. Each warp // will copy a row at a time. 
The base thread will first go through column data and fill out @@ -920,16 +920,16 @@ __global__ void copy_strings_to_rows(size_type const num_rows, * */ template -__global__ void copy_from_rows(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_tile, - RowOffsetFunctor row_offsets, - size_type const* batch_row_boundaries, - int8_t** output_data, - const size_type* col_sizes, - const size_type* col_offsets, - device_span tile_infos, - const int8_t* input_data) +CUDF_KERNEL void copy_from_rows(const size_type num_rows, + const size_type num_columns, + const size_type shmem_used_per_tile, + RowOffsetFunctor row_offsets, + size_type const* batch_row_boundaries, + int8_t** output_data, + const size_type* col_sizes, + const size_type* col_offsets, + device_span tile_infos, + const int8_t* input_data) { // We are going to copy the data in two passes. // The first pass copies a chunk of data into shared memory. @@ -1042,15 +1042,15 @@ __global__ void copy_from_rows(const size_type num_rows, * */ template -__global__ void copy_validity_from_rows(const size_type num_rows, - const size_type num_columns, - const size_type shmem_used_per_tile, - RowOffsetFunctor row_offsets, - size_type const* batch_row_boundaries, - bitmask_type** output_nm, - const size_type validity_offset, - device_span tile_infos, - const int8_t* input_data) +CUDF_KERNEL void copy_validity_from_rows(const size_type num_rows, + const size_type num_columns, + const size_type shmem_used_per_tile, + RowOffsetFunctor row_offsets, + size_type const* batch_row_boundaries, + bitmask_type** output_nm, + const size_type validity_offset, + device_span tile_infos, + const int8_t* input_data) { extern __shared__ int8_t shared[]; @@ -1175,14 +1175,14 @@ __global__ void copy_validity_from_rows(const size_type num_rows, * @param num_string_columns number of string columns in the table */ template -__global__ void copy_strings_from_rows(RowOffsetFunctor row_offsets, - int32_t** 
string_row_offsets, - int32_t** string_lengths, - size_type** string_column_offsets, - char** string_col_data, - int8_t const* row_data, - size_type const num_rows, - size_type const num_string_columns) +CUDF_KERNEL void copy_strings_from_rows(RowOffsetFunctor row_offsets, + int32_t** string_row_offsets, + int32_t** string_lengths, + size_type** string_column_offsets, + char** string_col_data, + int8_t const* row_data, + size_type const num_rows, + size_type const num_string_columns) { // Each warp takes a tile, which is a single column and up to ROWS_PER_BLOCK rows. A tile will not // wrap around the bottom of the table. The warp will copy the strings for each row in the tile. @@ -1994,11 +1994,9 @@ std::vector> convert_to_rows( CUDF_EXPECTS(!variable_width_table.is_empty(), "No variable-width columns when expected!"); CUDF_EXPECTS(variable_width_offsets.has_value(), "No variable width offset data!"); - auto const variable_data_begin = - thrust::make_transform_iterator(variable_width_table.begin(), [](auto const& c) { - strings_column_view const scv{c}; - return is_compound(c.type()) ? scv.chars().template data() : nullptr; - }); + auto const variable_data_begin = thrust::make_transform_iterator( + variable_width_table.begin(), + [](auto const& c) { return is_compound(c.type()) ? c.template data() : nullptr; }); std::vector variable_width_input_data( variable_data_begin, variable_data_begin + variable_width_table.num_columns()); @@ -2511,8 +2509,7 @@ std::unique_ptr
convert_from_rows(lists_column_view const& input, make_strings_column(num_rows, std::make_unique( std::move(string_col_offsets[string_idx]), rmm::device_buffer{}, 0), - std::make_unique( - std::move(string_data_cols[string_idx]), rmm::device_buffer{}, 0), + string_data_cols[string_idx].release(), 0, std::move(*string_data.null_mask.release())); // Null count set to 0, temporarily. Will be fixed up before return. diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index f7b805b68f5..a3b982a6719 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -653,8 +653,10 @@ ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_INTEROP_TEST streams/interop_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_JSONIO_TEST streams/io/json_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_LABELING_BINS_TEST streams/labeling_bins_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_NULL_MASK_TEST streams/null_mask_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_ORCIO_TEST streams/io/orc_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_PARQUETIO_TEST streams/io/parquet_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_POOL_TEST streams/pool_test.cu STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp index 06fb687ac2d..0f7c1053adf 100644 --- a/cpp/tests/copying/concatenate_tests.cpp +++ b/cpp/tests/copying/concatenate_tests.cpp @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -406,9 +407,9 @@ TEST_F(OverflowTest, OverflowTest) // try and concatenate 6 string columns of with 1 billion chars in each auto offsets = cudf::test::fixed_width_column_wrapper{0, size}; - auto many_chars = 
cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, size); + auto many_chars = rmm::device_uvector(size, cudf::get_default_stream()); auto col = cudf::make_strings_column( - 1, offsets.release(), std::move(many_chars), 0, rmm::device_buffer{}); + 1, offsets.release(), many_chars.release(), 0, rmm::device_buffer{}); cudf::table_view tbl({*col}); EXPECT_THROW(cudf::concatenate(std::vector({tbl, tbl, tbl, tbl, tbl, tbl})), @@ -422,7 +423,7 @@ TEST_F(OverflowTest, OverflowTest) // try and concatenate 6 string columns 1 billion rows each auto many_offsets = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32}, size + 1); - auto chars = cudf::test::fixed_width_column_wrapper{0, 1, 2}; + auto chars = rmm::device_uvector(3, cudf::get_default_stream()); auto col = cudf::make_strings_column( size, std::move(many_offsets), chars.release(), 0, rmm::device_buffer{}); @@ -533,10 +534,9 @@ TEST_F(OverflowTest, Presliced) auto offset_gen = cudf::detail::make_counting_transform_iterator( 0, [string_size](cudf::size_type index) { return index * string_size; }); cudf::test::fixed_width_column_wrapper offsets(offset_gen, offset_gen + num_rows + 1); - auto many_chars = - cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, total_chars_size); - auto col = cudf::make_strings_column( - num_rows, offsets.release(), std::move(many_chars), 0, rmm::device_buffer{}); + auto many_chars = rmm::device_uvector(total_chars_size, cudf::get_default_stream()); + auto col = cudf::make_strings_column( + num_rows, offsets.release(), many_chars.release(), 0, rmm::device_buffer{}); auto sliced = cudf::split(*col, {(num_rows / 2) - 1}); @@ -557,13 +557,12 @@ TEST_F(OverflowTest, Presliced) constexpr cudf::size_type num_rows = total_chars_size / string_size; // try and concatenate 4 string columns of with ~1/2 billion chars in each - auto offsets = cudf::sequence(num_rows + 1, + auto offsets = cudf::sequence(num_rows + 1, cudf::numeric_scalar(0), 
cudf::numeric_scalar(string_size)); - auto many_chars = - cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, total_chars_size); - auto col = cudf::make_strings_column( - num_rows, std::move(offsets), std::move(many_chars), 0, rmm::device_buffer{}); + auto many_chars = rmm::device_uvector(total_chars_size, cudf::get_default_stream()); + auto col = cudf::make_strings_column( + num_rows, std::move(offsets), many_chars.release(), 0, rmm::device_buffer{}); // should pass (with 2 rows to spare) // leaving this disabled as it typically runs out of memory on a T4 @@ -636,7 +635,7 @@ TEST_F(OverflowTest, Presliced) cudf::numeric_scalar(0), cudf::numeric_scalar(list_size)); - auto col = cudf::make_strings_column( + auto col = cudf::make_lists_column( num_rows, std::move(offsets), std::move(struct_col), 0, rmm::device_buffer{}); // should pass (with 2 rows to spare) @@ -722,13 +721,12 @@ TEST_F(OverflowTest, BigColumnsSmallSlices) constexpr cudf::size_type num_rows = 1024; constexpr cudf::size_type string_size = inner_size / num_rows; - auto offsets = cudf::sequence(num_rows + 1, + auto offsets = cudf::sequence(num_rows + 1, cudf::numeric_scalar(0), cudf::numeric_scalar(string_size)); - auto many_chars = - cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, inner_size); - auto col = cudf::make_strings_column( - num_rows, std::move(offsets), std::move(many_chars), 0, rmm::device_buffer{}); + auto many_chars = rmm::device_uvector(inner_size, cudf::get_default_stream()); + auto col = cudf::make_strings_column( + num_rows, std::move(offsets), many_chars.release(), 0, rmm::device_buffer{}); auto sliced = cudf::slice(*col, {16, 32}); diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu index f0c69ea6bfb..6e90d4462df 100644 --- a/cpp/tests/device_atomics/device_atomics_test.cu +++ b/cpp/tests/device_atomics/device_atomics_test.cu @@ -31,7 +31,7 @@ #include template -__global__ void 
gpu_atomic_test(T* result, T* data, size_t size) +CUDF_KERNEL void gpu_atomic_test(T* result, T* data, size_t size) { size_t id = blockIdx.x * blockDim.x + threadIdx.x; size_t step = blockDim.x * gridDim.x; @@ -79,7 +79,7 @@ __device__ T atomic_op(T* addr, T const& value, BinaryOp op) } template -__global__ void gpu_atomicCAS_test(T* result, T* data, size_t size) +CUDF_KERNEL void gpu_atomicCAS_test(T* result, T* data, size_t size) { size_t id = blockIdx.x * blockDim.x + threadIdx.x; size_t step = blockDim.x * gridDim.x; diff --git a/cpp/tests/error/error_handling_test.cu b/cpp/tests/error/error_handling_test.cu index 6bb1afda2a8..5cb2d729f3d 100644 --- a/cpp/tests/error/error_handling_test.cu +++ b/cpp/tests/error/error_handling_test.cu @@ -40,7 +40,7 @@ TEST(StreamCheck, success) { EXPECT_NO_THROW(CUDF_CHECK_CUDA(0)); } namespace { // Some silly kernel that will cause an error -void __global__ test_kernel(int* data) { data[threadIdx.x] = threadIdx.x; } +CUDF_KERNEL void test_kernel(int* data) { data[threadIdx.x] = threadIdx.x; } } // namespace // In a release build and without explicit synchronization, CUDF_CHECK_CUDA may @@ -70,7 +70,7 @@ TEST(StreamCheck, CatchFailedKernel) EXPECT_THROW(CUDF_CHECK_CUDA(stream.value()), cudf::cuda_error); } -__global__ void kernel() { asm("trap;"); } +CUDF_KERNEL void kernel() { asm("trap;"); } TEST(DeathTest, CudaFatalError) { @@ -88,9 +88,9 @@ TEST(DeathTest, CudaFatalError) #ifndef NDEBUG -__global__ void assert_false_kernel() { cudf_assert(false && "this kernel should die"); } +CUDF_KERNEL void assert_false_kernel() { cudf_assert(false && "this kernel should die"); } -__global__ void assert_true_kernel() { cudf_assert(true && "this kernel should live"); } +CUDF_KERNEL void assert_true_kernel() { cudf_assert(true && "this kernel should live"); } TEST(DebugAssertDeathTest, cudf_assert_false) { diff --git a/cpp/tests/identify_stream_usage/test_default_stream_identification.cu 
b/cpp/tests/identify_stream_usage/test_default_stream_identification.cu index 28bb47af40d..268c7b37c81 100644 --- a/cpp/tests/identify_stream_usage/test_default_stream_identification.cu +++ b/cpp/tests/identify_stream_usage/test_default_stream_identification.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ #include -__global__ void kernel() { printf("The kernel ran!\n"); } +__global__ static void kernel() { printf("The kernel ran!\n"); } void test_cudaLaunchKernel() { diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu index 036b9170250..6923b7be42d 100644 --- a/cpp/tests/io/json_type_cast_test.cu +++ b/cpp/tests/io/json_type_cast_test.cu @@ -95,7 +95,7 @@ TEST_F(JSONTypeCastTest, String) std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size())); auto str_col = cudf::io::json::detail::parse_data( - column.chars().data(), + column.chars_begin(stream), thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), column.size(), type, @@ -128,7 +128,7 @@ TEST_F(JSONTypeCastTest, Int) std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size())); auto col = cudf::io::json::detail::parse_data( - column.chars().data(), + column.chars_begin(stream), thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), column.size(), type, @@ -168,7 +168,7 @@ TEST_F(JSONTypeCastTest, StringEscapes) std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size())); auto col = cudf::io::json::detail::parse_data( - column.chars().data(), + column.chars_begin(stream), thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), column.size(), type, @@ 
-237,7 +237,7 @@ TEST_F(JSONTypeCastTest, ErrorNulls) std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size())); auto str_col = cudf::io::json::detail::parse_data( - column.chars().data(), + column.chars_begin(stream), thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), column.size(), type, diff --git a/cpp/tests/io/parquet_misc_test.cpp b/cpp/tests/io/parquet_misc_test.cpp index 49b6b8fd259..aa5a1cad96a 100644 --- a/cpp/tests/io/parquet_misc_test.cpp +++ b/cpp/tests/io/parquet_misc_test.cpp @@ -138,9 +138,8 @@ TEST_P(ParquetSizedTest, DictionaryTest) unsigned int const cardinality = (1 << (GetParam() - 1)) + 1; unsigned int const nrows = std::max(cardinality * 3 / 2, 3'000'000U); - auto elements = cudf::detail::make_counting_transform_iterator(0, [cardinality](auto i) { - return "a unique string value suffixed with " + std::to_string(i % cardinality); - }); + auto const elements = cudf::detail::make_counting_transform_iterator( + 0, [cardinality](auto i) { return std::to_string(i % cardinality); }); auto const col0 = cudf::test::strings_column_wrapper(elements, elements + nrows); auto const expected = table_view{{col0}}; diff --git a/cpp/tests/io/parquet_reader_test.cpp b/cpp/tests/io/parquet_reader_test.cpp index 5cb05ac7011..abbd0c97f07 100644 --- a/cpp/tests/io/parquet_reader_test.cpp +++ b/cpp/tests/io/parquet_reader_test.cpp @@ -241,7 +241,7 @@ TEST_F(ParquetReaderTest, UserBoundsWithNullsMixedTypes) TEST_F(ParquetReaderTest, UserBoundsWithNullsLarge) { - constexpr int num_rows = 30 * 1000000; + constexpr int num_rows = 30 * 10000; std::mt19937 gen(6747); std::bernoulli_distribution bn(0.7f); @@ -251,21 +251,23 @@ TEST_F(ParquetReaderTest, UserBoundsWithNullsLarge) cudf::test::fixed_width_column_wrapper col(values, values + num_rows, valids); - // this file will have row groups of 1,000,000 each + // this file will have row groups of 10,000 each cudf::table_view tbl({col}); auto filepath 
= temp_env->get_temp_filepath("UserBoundsWithNullsLarge.parquet"); cudf::io::parquet_writer_options out_args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tbl); + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tbl) + .row_group_size_rows(10000) + .max_page_size_rows(1000); cudf::io::write_parquet(out_args); // skip_rows / num_rows // clang-format off - std::vector> params{ {-1, -1}, {31, -1}, {32, -1}, {33, -1}, {1613470, -1}, {1999999, -1}, + std::vector> params{ {-1, -1}, {31, -1}, {32, -1}, {33, -1}, {16130, -1}, {19999, -1}, {31, 1}, {32, 1}, {33, 1}, // deliberately span some row group boundaries - {999000, 1001}, {999000, 2000}, {2999999, 2}, {13999997, -1}, - {16785678, 3}, {22996176, 31}, - {24001231, 17}, {29000001, 989999}, {29999999, 1} }; + {9900, 1001}, {9900, 2000}, {29999, 2}, {139997, -1}, + {167878, 3}, {229976, 31}, + {240031, 17}, {290001, 9899}, {299999, 1} }; // clang-format on for (auto p : params) { cudf::io::parquet_reader_options read_args = @@ -285,25 +287,27 @@ TEST_F(ParquetReaderTest, UserBoundsWithNullsLarge) TEST_F(ParquetReaderTest, ListUserBoundsWithNullsLarge) { - constexpr int num_rows = 5 * 1000000; + constexpr int num_rows = 5 * 10000; auto colp = make_parquet_list_list_col(0, num_rows, 5, 8, true); cudf::column_view col = *colp; - // this file will have row groups of 1,000,000 each + // this file will have row groups of 10,000 each cudf::table_view tbl({col}); auto filepath = temp_env->get_temp_filepath("ListUserBoundsWithNullsLarge.parquet"); cudf::io::parquet_writer_options out_args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tbl); + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tbl) + .row_group_size_rows(10000) + .max_page_size_rows(1000); cudf::io::write_parquet(out_args); // skip_rows / num_rows // clang-format off - std::vector> params{ {-1, -1}, {31, -1}, {32, -1}, {33, -1}, {161470, -1}, {4499997, -1}, + 
std::vector> params{ {-1, -1}, {31, -1}, {32, -1}, {33, -1}, {1670, -1}, {44997, -1}, {31, 1}, {32, 1}, {33, 1}, // deliberately span some row group boundaries - {999000, 1001}, {999000, 2000}, {2999999, 2}, - {1678567, 3}, {4299676, 31}, - {4001231, 17}, {1900000, 989999}, {4999999, 1} }; + {9900, 1001}, {9900, 2000}, {29999, 2}, + {16567, 3}, {42976, 31}, + {40231, 17}, {19000, 9899}, {49999, 1} }; // clang-format on for (auto p : params) { cudf::io::parquet_reader_options read_args = @@ -1951,7 +1955,7 @@ TEST_F(ParquetReaderTest, RepeatedNoAnnotations) TEST_F(ParquetReaderTest, DeltaSkipRowsWithNulls) { - constexpr int num_rows = 50'000; + constexpr int num_rows = 10'000; constexpr auto seed = 21337; std::mt19937 engine{seed}; @@ -2003,7 +2007,7 @@ TEST_F(ParquetReaderTest, DeltaSkipRowsWithNulls) .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) .compression(cudf::io::compression_type::NONE) .dictionary_policy(cudf::io::dictionary_policy::NEVER) - .max_page_size_rows(20'000) + .max_page_size_rows(5'000) .write_v2_headers(true) .build(); cudf::io::write_parquet(out_opts); @@ -2018,7 +2022,7 @@ TEST_F(ParquetReaderTest, DeltaSkipRowsWithNulls) // skip and truncate {1, 32}, {1, 33}, {32, 32}, {33, 139}, // cross page boundaries - {10'000, 20'000} + {3'000, 5'000} }; // clang-format on @@ -2044,7 +2048,7 @@ TEST_F(ParquetReaderTest, DeltaSkipRowsWithNulls) .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) .compression(cudf::io::compression_type::NONE) .dictionary_policy(cudf::io::dictionary_policy::NEVER) - .max_page_size_rows(20'000) + .max_page_size_rows(5'000) .write_v2_headers(true); cudf::io::write_parquet(out_opts2); diff --git a/cpp/tests/io/parquet_v2_test.cpp b/cpp/tests/io/parquet_v2_test.cpp index f2b50639a4d..1a373ed92ae 100644 --- a/cpp/tests/io/parquet_v2_test.cpp +++ b/cpp/tests/io/parquet_v2_test.cpp @@ -23,6 +23,8 @@ #include +using cudf::test::iterators::no_nulls; + // Base test fixture for V2 header tests class ParquetV2Test : 
public ::cudf::test::BaseFixtureWithParam {}; @@ -33,7 +35,7 @@ INSTANTIATE_TEST_SUITE_P(ParquetV2ReadWriteTest, TEST_P(ParquetV2Test, MultiColumn) { - constexpr auto num_rows = 100000; + constexpr auto num_rows = 50000; auto const is_v2 = GetParam(); // auto col0_data = random_values(num_rows); @@ -45,27 +47,25 @@ TEST_P(ParquetV2Test, MultiColumn) auto col6_vals = random_values(num_rows); auto col7_vals = random_values(num_rows); auto col8_vals = random_values(num_rows); - auto col6_data = cudf::detail::make_counting_transform_iterator(0, [col6_vals](auto i) { + auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&col6_vals](auto i) { return numeric::decimal32{col6_vals[i], numeric::scale_type{5}}; }); - auto col7_data = cudf::detail::make_counting_transform_iterator(0, [col7_vals](auto i) { + auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&col7_vals](auto i) { return numeric::decimal64{col7_vals[i], numeric::scale_type{-5}}; }); - auto col8_data = cudf::detail::make_counting_transform_iterator(0, [col8_vals](auto i) { + auto col8_data = cudf::detail::make_counting_transform_iterator(0, [&col8_vals](auto i) { return numeric::decimal128{col8_vals[i], numeric::scale_type{-6}}; }); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - // column_wrapper col0{ - // col0_data.begin(), col0_data.end(), validity}; - column_wrapper col1{col1_data.begin(), col1_data.end(), validity}; - column_wrapper col2{col2_data.begin(), col2_data.end(), validity}; - column_wrapper col3{col3_data.begin(), col3_data.end(), validity}; - column_wrapper col4{col4_data.begin(), col4_data.end(), validity}; - column_wrapper col5{col5_data.begin(), col5_data.end(), validity}; - column_wrapper col6{col6_data, col6_data + num_rows, validity}; - column_wrapper col7{col7_data, col7_data + num_rows, validity}; - column_wrapper col8{col8_data, col8_data + num_rows, validity}; + // column_wrapper col0{col0_data.begin(), 
col0_data.end(), no_nulls()}; + column_wrapper col1{col1_data.begin(), col1_data.end(), no_nulls()}; + column_wrapper col2{col2_data.begin(), col2_data.end(), no_nulls()}; + column_wrapper col3{col3_data.begin(), col3_data.end(), no_nulls()}; + column_wrapper col4{col4_data.begin(), col4_data.end(), no_nulls()}; + column_wrapper col5{col5_data.begin(), col5_data.end(), no_nulls()}; + column_wrapper col6{col6_data, col6_data + num_rows, no_nulls()}; + column_wrapper col7{col7_data, col7_data + num_rows, no_nulls()}; + column_wrapper col8{col8_data, col8_data + num_rows, no_nulls()}; auto expected = table_view{{col1, col2, col3, col4, col5, col6, col7, col8}}; @@ -108,17 +108,17 @@ TEST_P(ParquetV2Test, MultiColumnWithNulls) auto col5_data = random_values(num_rows); auto col6_vals = random_values(num_rows); auto col7_vals = random_values(num_rows); - auto col6_data = cudf::detail::make_counting_transform_iterator(0, [col6_vals](auto i) { + auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&col6_vals](auto i) { return numeric::decimal32{col6_vals[i], numeric::scale_type{-2}}; }); - auto col7_data = cudf::detail::make_counting_transform_iterator(0, [col7_vals](auto i) { + auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&col7_vals](auto i) { return numeric::decimal64{col7_vals[i], numeric::scale_type{-8}}; }); // auto col0_mask = cudf::detail::make_counting_transform_iterator( // 0, [](auto i) { return (i % 2); }); auto col1_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i < 10); }); - auto col2_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); + auto col2_mask = no_nulls(); auto col3_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i == (num_rows - 1)); }); auto col4_mask = @@ -181,11 +181,10 @@ TEST_P(ParquetV2Test, Strings) auto seq_col0 = random_values(num_rows); auto seq_col2 = random_values(num_rows); - auto validity = 
cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - column_wrapper col0{seq_col0.begin(), seq_col0.end(), validity}; + column_wrapper col0{seq_col0.begin(), seq_col0.end(), no_nulls()}; column_wrapper col1{strings.begin(), strings.end()}; - column_wrapper col2{seq_col2.begin(), seq_col2.end(), validity}; + column_wrapper col2{seq_col2.begin(), seq_col2.end(), no_nulls()}; auto expected = table_view{{col0, col1, col2}}; @@ -688,60 +687,9 @@ TEST_P(ParquetV2Test, PartitionedWriteEmptyColumns) CUDF_TEST_EXPECT_TABLES_EQUAL(expected2, result2.tbl->view()); } -TEST_P(ParquetV2Test, LargeColumnIndex) -{ - // create a file large enough to be written in 2 batches (currently 1GB per batch) - // pick fragment size that num_rows is divisible by, so we'll get equal sized row groups - const std::string s1(1000, 'a'); - const std::string s2(1000, 'b'); - constexpr auto num_rows = 512 * 1024; - constexpr auto frag_size = num_rows / 128; - auto const is_v2 = GetParam(); - - auto col0_elements = cudf::detail::make_counting_transform_iterator( - 0, [&](auto i) { return (i < num_rows) ? 
s1 : s2; }); - auto col0 = cudf::test::strings_column_wrapper(col0_elements, col0_elements + 2 * num_rows); - - auto const expected = table_view{{col0, col0}}; - - auto const filepath = temp_env->get_temp_filepath("LargeColumnIndex.parquet"); - const cudf::io::parquet_writer_options out_opts = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) - .stats_level(cudf::io::statistics_freq::STATISTICS_COLUMN) - .compression(cudf::io::compression_type::NONE) - .dictionary_policy(cudf::io::dictionary_policy::NEVER) - .write_v2_headers(is_v2) - .max_page_fragment_size(frag_size) - .row_group_size_bytes(1024 * 1024 * 1024) - .row_group_size_rows(num_rows); - cudf::io::write_parquet(out_opts); - - auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::detail::FileMetaData fmd; - - read_footer(source, &fmd); - - for (auto const& rg : fmd.row_groups) { - for (size_t c = 0; c < rg.columns.size(); c++) { - auto const& chunk = rg.columns[c]; - - auto const ci = read_column_index(source, chunk); - auto const stats = get_statistics(chunk); - - // check trunc(page.min) <= stats.min && trun(page.max) >= stats.max - auto const ptype = fmd.schema[c + 1].type; - auto const ctype = fmd.schema[c + 1].converted_type; - ASSERT_TRUE(stats.min_value.has_value()); - ASSERT_TRUE(stats.max_value.has_value()); - EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value.value(), ptype, ctype) <= 0); - EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value.value(), ptype, ctype) >= 0); - } - } -} - TEST_P(ParquetV2Test, CheckColumnOffsetIndex) { - constexpr auto num_rows = 100000; + constexpr auto num_rows = 50000; auto const is_v2 = GetParam(); auto const expected_hdr_type = is_v2 ? 
cudf::io::parquet::detail::PageType::DATA_PAGE_V2 : cudf::io::parquet::detail::PageType::DATA_PAGE; diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index 51190b5de9e..9415e018c6a 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -28,6 +28,8 @@ #include +using cudf::test::iterators::no_nulls; + template void test_durations(mask_op_t mask_op) { @@ -100,13 +102,12 @@ TEST_F(ParquetWriterTest, MultiIndex) auto col2_data = random_values(num_rows); auto col3_data = random_values(num_rows); auto col4_data = random_values(num_rows); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - column_wrapper col0{col0_data.begin(), col0_data.end(), validity}; - column_wrapper col1{col1_data.begin(), col1_data.end(), validity}; - column_wrapper col2{col2_data.begin(), col2_data.end(), validity}; - column_wrapper col3{col3_data.begin(), col3_data.end(), validity}; - column_wrapper col4{col4_data.begin(), col4_data.end(), validity}; + column_wrapper col0{col0_data.begin(), col0_data.end(), no_nulls()}; + column_wrapper col1{col1_data.begin(), col1_data.end(), no_nulls()}; + column_wrapper col2{col2_data.begin(), col2_data.end(), no_nulls()}; + column_wrapper col3{col3_data.begin(), col3_data.end(), no_nulls()}; + column_wrapper col4{col4_data.begin(), col4_data.end(), no_nulls()}; auto expected = table_view{{col0, col1, col2, col3, col4}}; @@ -139,9 +140,7 @@ TEST_F(ParquetWriterTest, BufferSource) { constexpr auto num_rows = 100 << 10; auto const seq_col = random_values(num_rows); - auto const validity = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - column_wrapper col{seq_col.begin(), seq_col.end(), validity}; + column_wrapper col{seq_col.begin(), seq_col.end(), no_nulls()}; auto const expected = table_view{{col}}; @@ -185,12 +184,13 @@ TEST_F(ParquetWriterTest, BufferSource) TEST_F(ParquetWriterTest, 
ManyFragments) { srand(31337); - auto const expected = create_random_fixed_table(10, 6'000'000, false); + auto const expected = create_random_fixed_table(1, 700'000, false); auto const filepath = temp_env->get_temp_filepath("ManyFragments.parquet"); cudf::io::parquet_writer_options const args = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, *expected) - .max_page_size_bytes(8 * 1024); + .max_page_size_bytes(8 * 1024) + .max_page_fragment_size(10); cudf::io::write_parquet(args); cudf::io::parquet_reader_options const read_opts = @@ -342,11 +342,12 @@ TEST_F(ParquetWriterTest, DeviceWriteLargeishFile) // exercises multiple rowgroups srand(31337); - auto expected = create_random_fixed_table(4, 4 * 1024 * 1024, false); + auto expected = create_random_fixed_table(4, 1024 * 1024, false); // write out using the custom sink (which uses device writes) cudf::io::parquet_writer_options args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&custom_sink}, *expected); + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&custom_sink}, *expected) + .row_group_size_rows(128 * 1024); cudf::io::write_parquet(args); cudf::io::parquet_reader_options custom_args = @@ -613,11 +614,10 @@ TEST_F(ParquetWriterTest, EmptyListWithStruct) TEST_F(ParquetWriterTest, CheckPageRows) { auto sequence = thrust::make_counting_iterator(0); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); constexpr auto page_rows = 5000; constexpr auto num_rows = 2 * page_rows; - column_wrapper col(sequence, sequence + num_rows, validity); + column_wrapper col(sequence, sequence + num_rows, no_nulls()); auto expected = table_view{{col}}; @@ -1267,8 +1267,7 @@ TEST_F(ParquetWriterTest, CompStatsEmptyTable) TEST_F(ParquetWriterTest, NoNullsAsNonNullable) { - auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - column_wrapper col{{1, 2, 3}, valids}; + column_wrapper col{{1, 2, 
3}, no_nulls()}; table_view expected({col}); cudf::io::table_input_metadata expected_metadata(expected); @@ -1459,10 +1458,9 @@ TYPED_TEST(ParquetWriterNumericTypeTest, SingleColumn) { auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return TypeParam(i % 400); }); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); constexpr auto num_rows = 800; - column_wrapper col(sequence, sequence + num_rows, validity); + column_wrapper col(sequence, sequence + num_rows, no_nulls()); auto expected = table_view{{col}}; @@ -1516,11 +1514,10 @@ TYPED_TEST(ParquetWriterTimestampTypeTest, Timestamps) { auto sequence = cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return ((std::rand() / 10000) * 1000); }); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); constexpr auto num_rows = 100; column_wrapper col( - sequence, sequence + num_rows, validity); + sequence, sequence + num_rows, no_nulls()); auto expected = table_view{{col}}; @@ -1567,11 +1564,10 @@ TYPED_TEST(ParquetWriterTimestampTypeTest, TimestampOverflow) { constexpr int64_t max = std::numeric_limits::max(); auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return max - i; }); - auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); constexpr auto num_rows = 100; column_wrapper col( - sequence, sequence + num_rows, validity); + sequence, sequence + num_rows, no_nulls()); table_view expected({col}); auto filepath = temp_env->get_temp_filepath("ParquetTimestampOverflow.parquet"); diff --git a/cpp/tests/scalar/scalar_device_view_test.cu b/cpp/tests/scalar/scalar_device_view_test.cu index 8d0e54f024f..8444716bccd 100644 --- a/cpp/tests/scalar/scalar_device_view_test.cu +++ b/cpp/tests/scalar/scalar_device_view_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. 
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,14 +35,14 @@ struct TypedScalarDeviceViewTest : public cudf::test::BaseFixture {}; TYPED_TEST_SUITE(TypedScalarDeviceViewTest, cudf::test::FixedWidthTypesWithoutFixedPoint); template -__global__ void test_set_value(ScalarDeviceViewType s, ScalarDeviceViewType s1) +CUDF_KERNEL void test_set_value(ScalarDeviceViewType s, ScalarDeviceViewType s1) { s1.set_value(s.value()); s1.set_valid(true); } template -__global__ void test_value(ScalarDeviceViewType s, ScalarDeviceViewType s1, bool* result) +CUDF_KERNEL void test_value(ScalarDeviceViewType s, ScalarDeviceViewType s1, bool* result) { *result = (s.value() == s1.value()); } @@ -73,7 +73,7 @@ TYPED_TEST(TypedScalarDeviceViewTest, Value) } template -__global__ void test_null(ScalarDeviceViewType s, bool* result) +CUDF_KERNEL void test_null(ScalarDeviceViewType s, bool* result) { *result = s.is_valid(); } @@ -92,7 +92,7 @@ TYPED_TEST(TypedScalarDeviceViewTest, ConstructNull) } template -__global__ void test_setnull(ScalarDeviceViewType s) +CUDF_KERNEL void test_setnull(ScalarDeviceViewType s) { s.set_valid(false); } @@ -113,10 +113,10 @@ TYPED_TEST(TypedScalarDeviceViewTest, SetNull) struct StringScalarDeviceViewTest : public cudf::test::BaseFixture {}; -__global__ void test_string_value(cudf::string_scalar_device_view s, - char const* value, - cudf::size_type size, - bool* result) +CUDF_KERNEL void test_string_value(cudf::string_scalar_device_view s, + char const* value, + cudf::size_type size, + bool* result) { *result = (s.value() == cudf::string_view(value, size)); } diff --git a/cpp/tests/streams/io/orc_test.cpp b/cpp/tests/streams/io/orc_test.cpp new file mode 100644 index 00000000000..929c3697b3b --- /dev/null +++ b/cpp/tests/streams/io/orc_test.cpp @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +auto const temp_env = static_cast( + ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); + +class ORCTest : public cudf::test::BaseFixture {}; + +template +std::vector> make_uniqueptrs_vector(UniqPtrs&&... uniqptrs) +{ + std::vector> ptrsvec; + (ptrsvec.push_back(std::forward(uniqptrs)), ...); + return ptrsvec; +} + +cudf::table construct_table() +{ + constexpr auto num_rows = 10; + + auto const zeros_iterator = thrust::make_constant_iterator(0); + auto const ones_iterator = thrust::make_constant_iterator(1); + + cudf::test::fixed_width_column_wrapper col0(zeros_iterator, zeros_iterator + num_rows); + cudf::test::fixed_width_column_wrapper col1(zeros_iterator, zeros_iterator + num_rows); + cudf::test::fixed_width_column_wrapper col2(zeros_iterator, zeros_iterator + num_rows); + cudf::test::fixed_width_column_wrapper col3(zeros_iterator, zeros_iterator + num_rows); + cudf::test::fixed_width_column_wrapper col4(zeros_iterator, zeros_iterator + num_rows); + cudf::test::fixed_width_column_wrapper col5(zeros_iterator, zeros_iterator + num_rows); + + cudf::test::fixed_width_column_wrapper col6 = [&ones_iterator, num_rows] { + auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return 
numeric::decimal128{ones_iterator[i], numeric::scale_type{12}}; + }); + return cudf::test::fixed_width_column_wrapper(col6_data, + col6_data + num_rows); + }(); + + cudf::test::fixed_width_column_wrapper col7 = [&ones_iterator, num_rows] { + auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return numeric::decimal128{ones_iterator[i], numeric::scale_type{-12}}; + }); + return cudf::test::fixed_width_column_wrapper(col7_data, + col7_data + num_rows); + }(); + + cudf::test::lists_column_wrapper col8 = [] { + auto col8_mask = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i % 2); }); + return cudf::test::lists_column_wrapper( + {{1, 1}, {1, 1, 1}, {}, {1}, {1, 1, 1, 1}, {1, 1, 1, 1, 1}, {}, {1, -1}, {}, {-1, -1}}, + col8_mask); + }(); + + cudf::test::structs_column_wrapper col9 = [&ones_iterator] { + auto child_col_mask = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i % 2); }); + cudf::test::fixed_width_column_wrapper child_col( + ones_iterator, ones_iterator + num_rows, child_col_mask); + return cudf::test::structs_column_wrapper{child_col}; + }(); + + cudf::test::strings_column_wrapper col10 = [] { + std::vector col10_data(num_rows, "rapids"); + return cudf::test::strings_column_wrapper(col10_data.begin(), col10_data.end()); + }(); + + auto colsptr = make_uniqueptrs_vector(col0.release(), + col1.release(), + col2.release(), + col3.release(), + col4.release(), + col5.release(), + col6.release(), + col7.release(), + col8.release(), + col9.release(), + col10.release()); + return cudf::table(std::move(colsptr)); +} + +TEST_F(ORCTest, ORCWriter) +{ + auto tab = construct_table(); + auto filepath = temp_env->get_temp_filepath("OrcMultiColumn.orc"); + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, tab); + cudf::io::write_orc(out_opts, cudf::test::get_default_stream()); +} + +TEST_F(ORCTest, ORCReader) +{ + auto tab = 
construct_table(); + auto filepath = temp_env->get_temp_filepath("OrcMultiColumn.orc"); + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, tab); + cudf::io::write_orc(out_opts, cudf::test::get_default_stream()); + + cudf::io::orc_reader_options read_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{{filepath}}); + auto result = cudf::io::read_orc(read_opts, cudf::test::get_default_stream()); + + auto meta = read_orc_metadata(cudf::io::source_info{filepath}); + auto const stats = cudf::io::read_parsed_orc_statistics(cudf::io::source_info{filepath}); +} diff --git a/cpp/tests/streams/labeling_bins_test.cpp b/cpp/tests/streams/labeling_bins_test.cpp new file mode 100644 index 00000000000..a1d3983aacc --- /dev/null +++ b/cpp/tests/streams/labeling_bins_test.cpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include + +class LabelingBinsStreamTest : public cudf::test::BaseFixture {}; + +TEST_F(LabelingBinsStreamTest, SimpleStringsTest) +{ + cudf::test::strings_column_wrapper left_edges{"a", "b", "c", "d", "e"}; + cudf::test::strings_column_wrapper right_edges{"b", "c", "d", "e", "f"}; + cudf::test::strings_column_wrapper input{"abc", "bcd", "cde", "def", "efg"}; + + cudf::label_bins(input, + left_edges, + cudf::inclusive::YES, + right_edges, + cudf::inclusive::NO, + cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/pool_test.cu b/cpp/tests/streams/pool_test.cu index 0f92e1c0c2b..52debe24fe8 100644 --- a/cpp/tests/streams/pool_test.cu +++ b/cpp/tests/streams/pool_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,7 +22,7 @@ class StreamPoolTest : public cudf::test::BaseFixture {}; -__global__ void do_nothing_kernel() {} +CUDF_KERNEL void do_nothing_kernel() {} TEST_F(StreamPoolTest, ForkStreams) { diff --git a/cpp/tests/strings/array_tests.cpp b/cpp/tests/strings/array_tests.cpp index d1e0dfb1ff1..c6cc8e078bb 100644 --- a/cpp/tests/strings/array_tests.cpp +++ b/cpp/tests/strings/array_tests.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -154,9 +155,14 @@ TEST_F(StringsColumnTest, GatherTooBig) std::vector h_chars(3000000); cudf::test::fixed_width_column_wrapper chars(h_chars.begin(), h_chars.end()); cudf::test::fixed_width_column_wrapper offsets({0, 3000000}); - auto input = cudf::column_view( - cudf::data_type{cudf::type_id::STRING}, 1, nullptr, nullptr, 0, 0, {offsets, chars}); - auto map = thrust::constant_iterator(0); + auto input = cudf::column_view(cudf::data_type{cudf::type_id::STRING}, + 1, + cudf::column_view(chars).begin(), + nullptr, + 0, + 0, + {offsets}); + 
auto map = thrust::constant_iterator(0); cudf::test::fixed_width_column_wrapper gather_map(map, map + 1000); EXPECT_THROW(cudf::gather(cudf::table_view{{input}}, gather_map), std::overflow_error); } @@ -220,7 +226,6 @@ TEST_F(StringsColumnTest, OffsetsBeginEnd) scv = cudf::strings_column_view(cudf::slice(input, {1, 5}).front()); EXPECT_EQ(std::distance(scv.offsets_begin(), scv.offsets_end()), static_cast(scv.size() + 1)); - EXPECT_EQ(std::distance(scv.chars_begin(), scv.chars_end()), 16L); } CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index 13459197aa3..2d9e2035e5e 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -297,17 +297,14 @@ TEST_F(StringsContainsTests, HexTest) std::vector offsets( {thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + count + 1}); - auto d_chars = std::make_unique( - cudf::detail::make_device_uvector_sync( - ascii_chars, cudf::get_default_stream(), rmm::mr::get_current_device_resource()), - rmm::device_buffer{}, - 0); + auto d_chars = cudf::detail::make_device_uvector_sync( + ascii_chars, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_offsets = std::make_unique( cudf::detail::make_device_uvector_sync( offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()), rmm::device_buffer{}, 0); - auto input = cudf::make_strings_column(count, std::move(d_offsets), std::move(d_chars), 0, {}); + auto input = cudf::make_strings_column(count, std::move(d_offsets), d_chars.release(), 0, {}); auto strings_view = cudf::strings_column_view(input->view()); for (auto ch : ascii_chars) { diff --git a/cpp/tests/strings/factories_test.cu 
b/cpp/tests/strings/factories_test.cu index 1066738df72..64123690aea 100644 --- a/cpp/tests/strings/factories_test.cu +++ b/cpp/tests/strings/factories_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -87,16 +87,18 @@ TEST_F(StringsFactoriesTest, CreateColumnFromPair) EXPECT_TRUE(column->nullable()); EXPECT_TRUE(column->has_nulls()); } - EXPECT_EQ(2, column->num_children()); + EXPECT_EQ(1, column->num_children()); + EXPECT_NE(nullptr, column->view().head()); cudf::strings_column_view strings_view(column->view()); EXPECT_EQ(strings_view.size(), count); EXPECT_EQ(strings_view.offsets().size(), count + 1); - EXPECT_EQ(strings_view.chars().size(), memsize); + EXPECT_EQ(strings_view.chars_size(cudf::get_default_stream()), memsize); // check string data auto h_chars_data = cudf::detail::make_std_vector_sync( - cudf::device_span(strings_view.chars().data(), strings_view.chars().size()), + cudf::device_span(strings_view.chars_begin(cudf::get_default_stream()), + strings_view.chars_size(cudf::get_default_stream())), cudf::get_default_stream()); auto h_offsets_data = cudf::detail::make_std_vector_sync( cudf::device_span( @@ -143,11 +145,8 @@ TEST_F(StringsFactoriesTest, CreateColumnFromOffsets) } std::vector h_nulls{h_null_mask}; - auto d_buffer = std::make_unique( - cudf::detail::make_device_uvector_sync( - h_buffer, cudf::get_default_stream(), rmm::mr::get_current_device_resource()), - rmm::device_buffer{}, - 0); + auto d_buffer = cudf::detail::make_device_uvector_sync( + h_buffer, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_offsets = std::make_unique( cudf::detail::make_device_uvector_sync( h_offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()), @@ -156,19 +155,21 @@ TEST_F(StringsFactoriesTest, 
CreateColumnFromOffsets) auto d_nulls = cudf::detail::make_device_uvector_sync( h_nulls, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto column = cudf::make_strings_column( - count, std::move(d_offsets), std::move(d_buffer), null_count, d_nulls.release()); + count, std::move(d_offsets), d_buffer.release(), null_count, d_nulls.release()); EXPECT_EQ(column->type(), cudf::data_type{cudf::type_id::STRING}); EXPECT_EQ(column->null_count(), null_count); - EXPECT_EQ(2, column->num_children()); + EXPECT_EQ(1, column->num_children()); + EXPECT_NE(nullptr, column->view().head()); cudf::strings_column_view strings_view(column->view()); EXPECT_EQ(strings_view.size(), count); EXPECT_EQ(strings_view.offsets().size(), count + 1); - EXPECT_EQ(strings_view.chars().size(), memsize); + EXPECT_EQ(strings_view.chars_size(cudf::get_default_stream()), memsize); // check string data auto h_chars_data = cudf::detail::make_std_vector_sync( - cudf::device_span(strings_view.chars().data(), strings_view.chars().size()), + cudf::device_span(strings_view.chars_begin(cudf::get_default_stream()), + strings_view.chars_size(cudf::get_default_stream())), cudf::get_default_stream()); auto h_offsets_data = cudf::detail::make_std_vector_sync( cudf::device_span( @@ -193,8 +194,7 @@ TEST_F(StringsFactoriesTest, CreateScalar) TEST_F(StringsFactoriesTest, EmptyStringsColumn) { - auto d_chars = std::make_unique( - rmm::device_uvector{0, cudf::get_default_stream()}, rmm::device_buffer{}, 0); + auto d_chars = rmm::device_uvector(0, cudf::get_default_stream()); auto d_offsets = std::make_unique( cudf::detail::make_zeroed_device_uvector_sync( 1, cudf::get_default_stream(), rmm::mr::get_current_device_resource()), @@ -203,7 +203,7 @@ TEST_F(StringsFactoriesTest, EmptyStringsColumn) rmm::device_uvector d_nulls{0, cudf::get_default_stream()}; auto results = - cudf::make_strings_column(0, std::move(d_offsets), std::move(d_chars), 0, d_nulls.release()); + cudf::make_strings_column(0, 
std::move(d_offsets), d_chars.release(), 0, d_nulls.release()); cudf::test::expect_column_empty(results->view()); rmm::device_uvector> d_strings{ diff --git a/cpp/tests/transform/row_conversion.cpp b/cpp/tests/transform/row_conversion.cpp index e54929f1651..542ccc5e2d5 100644 --- a/cpp/tests/transform/row_conversion.cpp +++ b/cpp/tests/transform/row_conversion.cpp @@ -14,26 +14,20 @@ * limitations under the License. */ +#include +#include +#include +#include + #include #include -#include -#include #include #include #include #include -#include -#include -#include -#include -#include -#include - -#include -#include -#include #include +#include struct ColumnToRowTests : public cudf::test::BaseFixture {}; struct RowToColumnTests : public cudf::test::BaseFixture {}; @@ -833,19 +827,7 @@ TEST_F(RowToColumnTests, SimpleString) EXPECT_EQ(new_rows.size(), 1); for (auto& row : new_rows) { auto new_cols = cudf::convert_from_rows(cudf::lists_column_view(*row), schema); - EXPECT_EQ(row->size(), 5); - auto const num_columns = new_cols->num_columns(); - - cudf::strings_column_view str_col = new_cols->get_column(1).view(); - std::vector> col_data; - std::vector> offset_data; - for (int i = 0; i < num_columns; ++i) { - offset_data.emplace_back( - std::get<0>(cudf::test::to_host(str_col.offsets()))); - col_data.emplace_back(std::get<0>(cudf::test::to_host(str_col.chars()))); - } - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(in, *new_cols); } } diff --git a/cpp/tests/types/type_dispatcher_test.cu b/cpp/tests/types/type_dispatcher_test.cu index d7df8f03ec1..0b26330d323 100644 --- a/cpp/tests/types/type_dispatcher_test.cu +++ b/cpp/tests/types/type_dispatcher_test.cu @@ -59,7 +59,7 @@ struct verify_dispatched_type { } }; -__global__ void dispatch_test_kernel(cudf::type_id id, bool* d_result) +CUDF_KERNEL void dispatch_test_kernel(cudf::type_id id, bool* d_result) { if (0 == threadIdx.x + blockIdx.x * blockDim.x) *d_result = cudf::type_dispatcher(cudf::data_type{id}, verify_dispatched_type{}, 
id); @@ -119,7 +119,7 @@ struct verify_double_dispatched_type { } }; -__global__ void double_dispatch_test_kernel(cudf::type_id id1, cudf::type_id id2, bool* d_result) +CUDF_KERNEL void double_dispatch_test_kernel(cudf::type_id id1, cudf::type_id id2, bool* d_result) { if (0 == threadIdx.x + blockIdx.x * blockDim.x) *d_result = cudf::double_type_dispatcher( diff --git a/cpp/tests/utilities_tests/column_wrapper_tests.cpp b/cpp/tests/utilities_tests/column_wrapper_tests.cpp index da17e33e11a..479c6687e75 100644 --- a/cpp/tests/utilities_tests/column_wrapper_tests.cpp +++ b/cpp/tests/utilities_tests/column_wrapper_tests.cpp @@ -255,7 +255,7 @@ TYPED_TEST(StringsColumnWrapperTest, NullablePairListConstructorAllNull) EXPECT_EQ(view.size(), count); EXPECT_EQ(view.offsets().size(), count + 1); // all null entries results in no data allocated to chars - EXPECT_EQ(nullptr, view.chars().head()); + EXPECT_EQ(nullptr, view.parent().head()); EXPECT_NE(nullptr, view.offsets().head()); EXPECT_TRUE(view.has_nulls()); EXPECT_EQ(view.null_count(), 5); diff --git a/cpp/tests/utilities_tests/span_tests.cu b/cpp/tests/utilities_tests/span_tests.cu index 870528d306c..2075c67a18a 100644 --- a/cpp/tests/utilities_tests/span_tests.cu +++ b/cpp/tests/utilities_tests/span_tests.cu @@ -247,7 +247,7 @@ TEST(SpanTest, CanConstructFromDeviceContainers) (void)device_span(d_uvector_c); } -__global__ void simple_device_kernel(device_span result) { result[0] = true; } +CUDF_KERNEL void simple_device_kernel(device_span result) { result[0] = true; } TEST(SpanTest, CanUseDeviceSpan) { @@ -277,7 +277,7 @@ TEST(MdSpanTest, CanDetermineEmptiness) EXPECT_TRUE(device_2dspan{no_columns_vector}.is_empty()); } -__global__ void readwrite_kernel(device_2dspan result) +CUDF_KERNEL void readwrite_kernel(device_2dspan result) { if (result[5][6] == 5) { result[5][6] *= 6; @@ -436,7 +436,7 @@ TEST(HostDeviceSpanTest, CanSendToDevice) EXPECT_EQ(std::string(d_message), hello_world_message); } -__global__ void 
simple_device_char_kernel(device_span result) +CUDF_KERNEL void simple_device_char_kernel(device_span result) { char const* str = "world hello"; for (int offset = 0; offset < result.size(); ++offset) { diff --git a/dependencies.yaml b/dependencies.yaml index 28b3afd7bbc..90a04b2f876 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -457,6 +457,7 @@ dependencies: common: - output_types: [conda] packages: + - breathe>=4.35.0 - dask-cuda==24.2.* - *doxygen - make @@ -533,16 +534,19 @@ dependencies: - {matrix: null, packages: *run_cudf_packages_all_cu11} - output_types: conda matrices: + - matrix: {cuda: "12.*"} + packages: + - pynvjitlink - matrix: {cuda: "11.*"} packages: - cubinlinker - ptxcompiler - - {matrix: null, packages: null} - output_types: [requirements, pyproject] matrices: - matrix: {cuda: "12.*"} packages: - rmm-cu12==24.2.* + - pynvjitlink-cu12 - matrix: {cuda: "11.*"} packages: - rmm-cu11==24.2.* @@ -619,7 +623,7 @@ dependencies: - fastavro>=0.22.9 - hypothesis - pytest-benchmark - - pytest-cases + - pytest-cases>=3.8.2 - python-snappy>=0.6.0 - scipy - output_types: conda diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index d98d1fa5d81..34ffd7f0258 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. # # cudf documentation build configuration file, created by # sphinx-quickstart on Wed May 3 10:59:22 2017. @@ -16,11 +16,33 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
# +import glob import os +import re import sys +import xml.etree.ElementTree as ET from docutils.nodes import Text from sphinx.addnodes import pending_xref +from sphinx.highlighting import lexers +from sphinx.ext import intersphinx +from pygments.lexer import RegexLexer +from pygments.token import Text as PText + + +class PseudoLexer(RegexLexer): + """Trivial lexer for pseudocode.""" + + name = "pseudocode" + aliases = ["pseudo"] + tokens = { + "root": [ + (r".*\n", PText), + ] + } + + +lexers["pseudo"] = PseudoLexer() # -- Custom Extensions ---------------------------------------------------- sys.path.append(os.path.abspath("./_ext")) @@ -35,6 +57,7 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ + "breathe", "sphinx.ext.intersphinx", "sphinx.ext.autodoc", "sphinx.ext.autosummary", @@ -46,7 +69,74 @@ "myst_nb", ] -nb_execution_excludepatterns = ['performance-comparisons.ipynb'] + +# Preprocess doxygen xml for compatibility with latest Breathe +def clean_definitions(root): + # Breathe can't handle SFINAE properly: + # https://github.com/breathe-doc/breathe/issues/624 + seen_ids = set() + for sectiondef in root.findall(".//sectiondef"): + for memberdef in sectiondef.findall("./memberdef"): + id_ = memberdef.get("id") + for tparamlist in memberdef.findall("./templateparamlist"): + for param in tparamlist.findall("./param"): + for type_ in param.findall("./type"): + # CUDF_ENABLE_IF or std::enable_if + if "enable_if" in ET.tostring(type_).decode().lower(): + if id_ not in seen_ids: + # If this is the first time we're seeing this function, + # just remove the template parameter. + seen_ids.add(id_) + tparamlist.remove(param) + else: + # Otherwise, remove the overload altogether and just + # rely on documenting one of the SFINAE overloads. + sectiondef.remove(memberdef) + break + + # In addition to enable_if, check for overloads set up by + # ...*=nullptr. 
+ for type_ in param.findall("./defval"): + if "nullptr" in ET.tostring(type_).decode(): + try: + tparamlist.remove(param) + except ValueError: + # May have already been removed in above, + # so skip. + pass + break + + # All of these in type declarations cause Breathe to choke. + # For friend, see https://github.com/breathe-doc/breathe/issues/916 + strings_to_remove = ( + "__forceinline__", + "CUDF_HOST_DEVICE", + "decltype(auto)", + "friend", + ) + for node in root.iter(): + for string in strings_to_remove: + if node.text is not None: + node.text = node.text.replace(string, "") + if node.tail is not None: + node.tail = node.tail.replace(string, "") + + +def clean_all_xml_files(path): + for fn in glob.glob(os.path.join(path, "*.xml")): + tree = ET.parse(fn) + clean_definitions(tree.getroot()) + tree.write(fn) + + +# Breathe Configuration +breathe_projects = {"libcudf": "../../../cpp/doxygen/xml"} +for project_path in breathe_projects.values(): + clean_all_xml_files(project_path) +breathe_default_project = "libcudf" + + +nb_execution_excludepatterns = ["performance-comparisons.ipynb"] nb_execution_mode = "force" nb_execution_timeout = 300 @@ -79,9 +169,9 @@ # built documents. # # The short X.Y version. -version = '24.02' +version = "24.02" # The full version, including alpha/beta/rc tags. -release = '24.02.00' +release = "24.02.00" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -93,7 +183,10 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This patterns also effect to html_static_path and html_extra_path -exclude_patterns = ['venv', "**/includes/**",] +exclude_patterns = [ + "venv", + "**/includes/**", +] # The name of the Pygments (syntax highlighting) style to use. pygments_style = "sphinx" @@ -195,12 +288,17 @@ # Example configuration for intersphinx: refer to the Python standard library. 
intersphinx_mapping = { - "python": ("https://docs.python.org/3", None), "cupy": ("https://docs.cupy.dev/en/stable/", None), + "dlpack": ("https://dmlc.github.io/dlpack/latest/", None), "numpy": ("https://numpy.org/doc/stable", None), - "pyarrow": ("https://arrow.apache.org/docs/", None), "pandas": ("https://pandas.pydata.org/docs/", None), - "typing_extensions": ("https://typing-extensions.readthedocs.io/en/stable/", None), + "pyarrow": ("https://arrow.apache.org/docs/", None), + "python": ("https://docs.python.org/3", None), + "rmm": ("https://docs.rapids.ai/api/rmm/nightly/", None), + "typing_extensions": ( + "https://typing-extensions.readthedocs.io/en/stable/", + None, + ), } # Config numpydoc @@ -238,14 +336,178 @@ def resolve_aliases(app, doctree): text_node.parent.replace(text_node, Text(text_to_render, "")) -def ignore_internal_references(app, env, node, contnode): - name = node.get("reftarget", None) - if name == "cudf.core.index.GenericIndex": +def _generate_namespaces(namespaces): + all_namespaces = [] + for base_namespace, other_namespaces in namespaces.items(): + all_namespaces.append(base_namespace + "::") + for other_namespace in other_namespaces: + all_namespaces.append(f"{other_namespace}::") + all_namespaces.append(f"{base_namespace}::{other_namespace}::") + return all_namespaces + + +_all_namespaces = _generate_namespaces( + { + # Note that io::datasource is actually a nested class + "cudf": {"io", "io::datasource", "strings", "ast", "ast::expression"}, + "numeric": {}, + "nvtext": {}, + } +) + +_names_to_skip = { + # External names + "thrust", + "cuda", + "arrow", + # Unknown types + "int8_t", + "int16_t", + "int32_t", + "int64_t", + "__int128_t", + "size_t", + "uint8_t", + "uint16_t", + "uint32_t", + "uint64_t", + # Internal objects + "id_to_type_impl", + "type_to_scalar_type_impl", + "type_to_scalar_type_impl", + "detail", + # kafka objects + "python_callable_type", + "kafka_oauth_callback_wrapper_type", + # Template types + "Radix", + # 
Unsupported by Breathe + # https://github.com/breathe-doc/breathe/issues/355 + "deprecated", + # TODO: This is currently in a src file but perhaps should be public + "orc::column_statistics", + # Sphinx doesn't know how to distinguish between the ORC and Parquet + # definitions because Breathe doesn't preserve namespaces for enums. + "TypeKind", +} + +_domain_objects = None +_prefixed_domain_objects = None +_intersphinx_cache = {} + +_intersphinx_extra_prefixes = ("rmm", "rmm::mr", "mr") + + +def _cached_intersphinx_lookup(env, node, contnode): + """Perform an intersphinx lookup and cache the result. + + Have to manually manage the intersphinx cache because lru_cache doesn't + handle the env object properly. + """ + key = (node, contnode) + if key in _intersphinx_cache: + return _intersphinx_cache[key] + if ( + ref := intersphinx.resolve_reference_detect_inventory( + env, node, contnode + ) + ) is not None: + _intersphinx_cache[key] = ref + return ref + + +def on_missing_reference(app, env, node, contnode): + # These variables are defined outside the function to speed up the build. 
+ global _all_namespaces, _names_to_skip, _intersphinx_extra_prefixes, _domain_objects, _prefixed_domain_objects, _intersphinx_cache + + # Precompute and cache domains for faster lookups + if _domain_objects is None: + _domain_objects = {} + _prefixed_domain_objects = {} + for name, _, _, docname, _, _ in env.domains["cpp"].get_objects(): + _domain_objects[name] = docname + for prefix in _all_namespaces: + _prefixed_domain_objects[f"{prefix}{name}"] = name + + reftarget = node.get("reftarget") + if reftarget == "cudf.core.index.GenericIndex": # We don't exposed docs for `cudf.core.index.GenericIndex` # hence we would want the docstring & mypy references to # use `cudf.Index` node["reftarget"] = "cudf.Index" return contnode + if "namespacecudf" in reftarget: + node["reftarget"] = "cudf" + return contnode + if "classcudf_1_1column__device__view_" in reftarget: + node["reftarget"] = "cudf::column_device_view" + return contnode + + if (refid := node.get("refid")) is not None and "hpp" in refid: + # We don't want to link to C++ header files directly from the + # Sphinx docs, those are pages that doxygen automatically + # generates. Adding those would clutter the Sphinx output. + return contnode + + if node["refdomain"] in ("std", "cpp") and reftarget is not None: + if any(toskip in reftarget for toskip in _names_to_skip): + return contnode + + # Strip template parameters and just use the base type. + if match := re.search("(.*)<.*>", reftarget): + reftarget = match.group(1) + + # Try to find the target prefixed with e.g. namespaces in case that's + # all that's missing. + # We need to do this search because the call sites may not have used + # the namespaces and we don't want to force them to, and we have to + # consider both directions because of issues like + # https://github.com/breathe-doc/breathe/issues/860 + # (there may be other related issues, I haven't investigated all + # possible combinations of failures in depth). 
+ if (name := _prefixed_domain_objects.get(reftarget)) is None: + for prefix in _all_namespaces: + if f"{prefix}{reftarget}" in _domain_objects: + name = f"{prefix}{reftarget}" + break + if name is not None: + return env.domains["cpp"].resolve_xref( + env, + _domain_objects[name], + app.builder, + node["reftype"], + name, + node, + contnode, + ) + + # Final possibility is an intersphinx lookup to see if the symbol + # exists in one of the other inventories. First we check the symbol + # itself in case it was originally templated and that caused the lookup + # to fail. + if reftarget != node["reftarget"]: + node["reftarget"] = reftarget + if ( + ref := _cached_intersphinx_lookup(env, node, contnode) + ) is not None: + return ref + + # If the template wasn't the (only) issue, we check the various + # namespace prefixes that may need to be added or removed. + for prefix in _intersphinx_extra_prefixes: + if prefix not in reftarget: + node["reftarget"] = f"{prefix}::{reftarget}" + if ( + ref := _cached_intersphinx_lookup(env, node, contnode) + ) is not None: + return ref + else: + node["reftarget"] = reftarget.replace(f"{prefix}::", "") + if ( + ref := _cached_intersphinx_lookup(env, node, contnode) + ) is not None: + return ref + return None @@ -257,8 +519,11 @@ def ignore_internal_references(app, env, node, contnode): ("py:class", "typing_extensions.Self"), ] + def setup(app): app.add_css_file("https://docs.rapids.ai/assets/css/custom.css") - app.add_js_file("https://docs.rapids.ai/assets/js/custom.js", loading_method="defer") + app.add_js_file( + "https://docs.rapids.ai/assets/js/custom.js", loading_method="defer" + ) app.connect("doctree-read", resolve_aliases) - app.connect("missing-reference", ignore_internal_references) + app.connect("missing-reference", on_missing_reference) diff --git a/docs/cudf/source/cudf_pandas/faq.md b/docs/cudf/source/cudf_pandas/faq.md index bf9c2b98c2d..bbeaf0a5f00 100644 --- a/docs/cudf/source/cudf_pandas/faq.md +++ 
b/docs/cudf/source/cudf_pandas/faq.md @@ -113,6 +113,9 @@ There are a few known limitations that you should be aware of: pandas - `cudf.pandas` isn't compatible with directly using `import cudf` and is intended to be used with pandas-based workflows. +- Unpickling objects that were pickled with "regular" pandas will not + work: you must have pickled an object with `cudf.pandas` enabled for + it to be unpickled when `cudf.pandas` is enabled. - Global variables can be accessed but can't be modified during CPU-fallback ```python diff --git a/docs/cudf/source/developer_guide/library_design.md b/docs/cudf/source/developer_guide/library_design.md index 016c2c1d281..0b37de00f6b 100644 --- a/docs/cudf/source/developer_guide/library_design.md +++ b/docs/cudf/source/developer_guide/library_design.md @@ -325,26 +325,26 @@ This section describes the internal implementation details of the copy-on-write It is recommended that developers familiarize themselves with [the user-facing documentation](copy-on-write-user-doc) of this functionality before reading through the internals below. -The core copy-on-write implementation relies on the factory function `as_exposure_tracked_buffer` and the two classes `ExposureTrackedBuffer` and `BufferSlice`. +The core copy-on-write implementation relies on `ExposureTrackedBuffer` and the tracking features of `BufferOwner`. -An `ExposureTrackedBuffer` is a subclass of the regular `Buffer` that tracks internal and external references to its underlying memory. Internal references are tracked by maintaining [weak references](https://docs.python.org/3/library/weakref.html) to every `BufferSlice` of the underlying memory. External references are tracked through "exposure" status of the underlying memory. A buffer is considered exposed if the device pointer (integer or void*) has been handed out to a library outside of cudf. In this case, we have no way of knowing if the data are being modified by a third party. 
+`BufferOwner` tracks internal and external references to its underlying memory. Internal references are tracked by maintaining [weak references](https://docs.python.org/3/library/weakref.html) to every `ExposureTrackedBuffer` of the underlying memory. External references are tracked through "exposure" status of the underlying memory. A buffer is considered exposed if the device pointer (integer or void*) has been handed out to a library outside of cudf. In this case, we have no way of knowing if the data are being modified by a third party. -`BufferSlice` is a subclass of `ExposureTrackedBuffer` that represents a _slice_ of the memory underlying a exposure tracked buffer. +`ExposureTrackedBuffer` is a subclass of `Buffer` that represents a _slice_ of the memory underlying an exposure tracked buffer. -When the cudf option `"copy_on_write"` is `True`, `as_buffer` calls `as_exposure_tracked_buffer`, which always returns a `BufferSlice`. It is then the slices that determine whether or not to make a copy when a write operation is performed on a `Column` (see below). If multiple slices point to the same underlying memory, then a copy must be made whenever a modification is attempted. +When the cudf option `"copy_on_write"` is `True`, `as_buffer` returns an `ExposureTrackedBuffer`. It is this class that determines whether or not to make a copy when a write operation is performed on a `Column` (see below). If multiple slices point to the same underlying memory, then a copy must be made whenever a modification is attempted. ### Eager copies when exposing to third-party libraries -If a `Column`/`BufferSlice` is exposed to a third-party library via `__cuda_array_interface__`, we are no longer able to track whether or not modification of the buffer has occurred. Hence whenever +If a `Column`/`ExposureTrackedBuffer` is exposed to a third-party library via `__cuda_array_interface__`, we are no longer able to track whether or not modification of the buffer has occurred. 
Hence whenever someone accesses data through the `__cuda_array_interface__`, we eagerly trigger the copy by calling -`.make_single_owner_inplace` which ensures a true copy of underlying data is made and that the slice is the sole owner. Any future copy requests must also trigger a true physical copy (since we cannot track the lifetime of the third-party object). To handle this we also mark the `Column`/`BufferSlice` as exposed thus indicating that any future shallow-copy requests will trigger a true physical copy rather than a copy-on-write shallow copy. +`.make_single_owner_inplace` which ensures a true copy of underlying data is made and that the slice is the sole owner. Any future copy requests must also trigger a true physical copy (since we cannot track the lifetime of the third-party object). To handle this we also mark the `Column`/`ExposureTrackedBuffer` as exposed thus indicating that any future shallow-copy requests will trigger a true physical copy rather than a copy-on-write shallow copy. ### Obtaining a read-only object A read-only object can be quite useful for operations that will not mutate the data. This can be achieved by calling `.get_ptr(mode="read")`, and using `cuda_array_interface_wrapper` to wrap a `__cuda_array_interface__` object around it. -This will not trigger a deep copy even if multiple `BufferSlice` points to the same `ExposureTrackedBuffer`. This API should only be used when the lifetime of the proxy object is restricted to cudf's internal code execution. Handing this out to external libraries or user-facing APIs will lead to untracked references and undefined copy-on-write behavior. We currently use this API for device to host +This will not trigger a deep copy even if multiple `ExposureTrackedBuffer`s point to the same `ExposureTrackedBufferOwner`. This API should only be used when the lifetime of the proxy object is restricted to cudf's internal code execution. 
Handing this out to external libraries or user-facing APIs will lead to untracked references and undefined copy-on-write behavior. We currently use this API for device to host copies like in `ColumnBase.data_array_view(mode="read")` which is used for `Column.values_host`. diff --git a/docs/cudf/source/index.rst b/docs/cudf/source/index.rst index 21badd683af..3765b560a7f 100644 --- a/docs/cudf/source/index.rst +++ b/docs/cudf/source/index.rst @@ -29,4 +29,5 @@ other operations. user_guide/index cudf_pandas/index + libcudf_docs/index developer_guide/index diff --git a/docs/cudf/source/libcudf_docs/api_docs/aggregation_factories.rst b/docs/cudf/source/libcudf_docs/api_docs/aggregation_factories.rst new file mode 100644 index 00000000000..49677acc730 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/aggregation_factories.rst @@ -0,0 +1,5 @@ +Aggregation Factories +===================== + +.. doxygengroup:: aggregation_factories + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/aggregation_groupby.rst b/docs/cudf/source/libcudf_docs/api_docs/aggregation_groupby.rst new file mode 100644 index 00000000000..5af85e5d74a --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/aggregation_groupby.rst @@ -0,0 +1,5 @@ +Aggregation Groupby +=================== + +.. doxygengroup:: aggregation_groupby + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/aggregation_reduction.rst b/docs/cudf/source/libcudf_docs/api_docs/aggregation_reduction.rst new file mode 100644 index 00000000000..e0f5046fe61 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/aggregation_reduction.rst @@ -0,0 +1,5 @@ +Aggregation Reduction +===================== + +.. 
doxygengroup:: aggregation_reduction + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/aggregation_rolling.rst b/docs/cudf/source/libcudf_docs/api_docs/aggregation_rolling.rst new file mode 100644 index 00000000000..962f332adc3 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/aggregation_rolling.rst @@ -0,0 +1,5 @@ +Aggregation Rolling +=================== + +.. doxygengroup:: aggregation_rolling + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/column_aggregation.rst b/docs/cudf/source/libcudf_docs/api_docs/column_aggregation.rst new file mode 100644 index 00000000000..39c0fdd5d3c --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/column_aggregation.rst @@ -0,0 +1,14 @@ +Column Aggregation +================== + +.. doxygengroup:: column_aggregation + :members: + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + aggregation_factories + aggregation_reduction + aggregation_groupby + aggregation_rolling diff --git a/docs/cudf/source/libcudf_docs/api_docs/column_apis.rst b/docs/cudf/source/libcudf_docs/api_docs/column_apis.rst new file mode 100644 index 00000000000..23660576a37 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/column_apis.rst @@ -0,0 +1,23 @@ +Column APIs +=========== + +.. doxygengroup:: column_apis + :members: + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + column_copy + column_nullmask + column_sort + column_search + column_hash + column_merge + column_join + column_quantiles + column_aggregation + column_transformation + column_reshape + column_reorder + column_interop diff --git a/docs/cudf/source/libcudf_docs/api_docs/column_classes.rst b/docs/cudf/source/libcudf_docs/api_docs/column_classes.rst new file mode 100644 index 00000000000..cc3ed8c6626 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/column_classes.rst @@ -0,0 +1,16 @@ +Column Classes +============== + +.. doxygengroup:: column_classes + :members: + +.. 
toctree:: + :maxdepth: 2 + :caption: Contents: + + column_factories + dictionary_classes + lists_classes + strings_classes + structs_classes + timestamp_classes diff --git a/docs/cudf/source/libcudf_docs/api_docs/column_copy.rst b/docs/cudf/source/libcudf_docs/api_docs/column_copy.rst new file mode 100644 index 00000000000..a8bc72fc505 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/column_copy.rst @@ -0,0 +1,16 @@ +Column Copy +=========== + +.. doxygengroup:: column_copy + :members: + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + copy_concatenate + copy_gather + copy_scatter + copy_slice + copy_split + copy_shift diff --git a/docs/cudf/source/libcudf_docs/api_docs/column_factories.rst b/docs/cudf/source/libcudf_docs/api_docs/column_factories.rst new file mode 100644 index 00000000000..938db2a40a5 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/column_factories.rst @@ -0,0 +1,5 @@ +Column Factories +================ + +.. doxygengroup:: column_factories + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/column_hash.rst b/docs/cudf/source/libcudf_docs/api_docs/column_hash.rst new file mode 100644 index 00000000000..cd0c2838474 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/column_hash.rst @@ -0,0 +1,5 @@ +Column Hash +=========== + +.. doxygengroup:: column_hash + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/column_interop.rst b/docs/cudf/source/libcudf_docs/api_docs/column_interop.rst new file mode 100644 index 00000000000..dd6036b0339 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/column_interop.rst @@ -0,0 +1,12 @@ +Column Interop +============== + +.. doxygengroup:: column_interop + :members: + +.. 
toctree:: + :maxdepth: 2 + :caption: Contents: + + interop_dlpack + interop_arrow diff --git a/docs/cudf/source/libcudf_docs/api_docs/column_join.rst b/docs/cudf/source/libcudf_docs/api_docs/column_join.rst new file mode 100644 index 00000000000..903319f4881 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/column_join.rst @@ -0,0 +1,5 @@ +Column Join +=========== + +.. doxygengroup:: column_join + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/column_merge.rst b/docs/cudf/source/libcudf_docs/api_docs/column_merge.rst new file mode 100644 index 00000000000..0f12ad3d169 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/column_merge.rst @@ -0,0 +1,5 @@ +Column Merge +============ + +.. doxygengroup:: column_merge + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/column_nullmask.rst b/docs/cudf/source/libcudf_docs/api_docs/column_nullmask.rst new file mode 100644 index 00000000000..35c94e0b03e --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/column_nullmask.rst @@ -0,0 +1,5 @@ +Column Nullmask +=============== + +.. doxygengroup:: column_nullmask + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/column_quantiles.rst b/docs/cudf/source/libcudf_docs/api_docs/column_quantiles.rst new file mode 100644 index 00000000000..f1e1b3f843e --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/column_quantiles.rst @@ -0,0 +1,5 @@ +Column Quantiles +================ + +.. doxygengroup:: column_quantiles + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/column_reorder.rst b/docs/cudf/source/libcudf_docs/api_docs/column_reorder.rst new file mode 100644 index 00000000000..ccc8a48f7df --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/column_reorder.rst @@ -0,0 +1,12 @@ +Column Reorder +============== + +.. doxygengroup:: column_reorder + :members: + +.. 
toctree:: + :maxdepth: 2 + :caption: Contents: + + reorder_partition + reorder_compact diff --git a/docs/cudf/source/libcudf_docs/api_docs/column_reshape.rst b/docs/cudf/source/libcudf_docs/api_docs/column_reshape.rst new file mode 100644 index 00000000000..59df1eaa5b9 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/column_reshape.rst @@ -0,0 +1,11 @@ +Column Reshape +============== + +.. doxygengroup:: column_reshape + :members: + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + reshape_transpose diff --git a/docs/cudf/source/libcudf_docs/api_docs/column_search.rst b/docs/cudf/source/libcudf_docs/api_docs/column_search.rst new file mode 100644 index 00000000000..1997c5618e3 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/column_search.rst @@ -0,0 +1,5 @@ +Column Search +============= + +.. doxygengroup:: column_search + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/column_sort.rst b/docs/cudf/source/libcudf_docs/api_docs/column_sort.rst new file mode 100644 index 00000000000..4d8fe895109 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/column_sort.rst @@ -0,0 +1,5 @@ +Column Sort +=========== + +.. doxygengroup:: column_sort + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/column_transformation.rst b/docs/cudf/source/libcudf_docs/api_docs/column_transformation.rst new file mode 100644 index 00000000000..e2da7e5ad28 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/column_transformation.rst @@ -0,0 +1,15 @@ +Column Transformation +===================== + +.. doxygengroup:: column_transformation + :members: + +.. 
toctree:: + :maxdepth: 2 + :caption: Contents: + + transformation_unaryops + transformation_binaryops + transformation_transform + transformation_replace + transformation_fill diff --git a/docs/cudf/source/libcudf_docs/api_docs/copy_concatenate.rst b/docs/cudf/source/libcudf_docs/api_docs/copy_concatenate.rst new file mode 100644 index 00000000000..f9400bff9e8 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/copy_concatenate.rst @@ -0,0 +1,5 @@ +Copy Concatenate +================ + +.. doxygengroup:: copy_concatenate + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/copy_gather.rst b/docs/cudf/source/libcudf_docs/api_docs/copy_gather.rst new file mode 100644 index 00000000000..daf306caa6e --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/copy_gather.rst @@ -0,0 +1,5 @@ +Copy Gather +=========== + +.. doxygengroup:: copy_gather + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/copy_scatter.rst b/docs/cudf/source/libcudf_docs/api_docs/copy_scatter.rst new file mode 100644 index 00000000000..d7b9a461901 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/copy_scatter.rst @@ -0,0 +1,5 @@ +Copy Scatter +============ + +.. doxygengroup:: copy_scatter + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/copy_shift.rst b/docs/cudf/source/libcudf_docs/api_docs/copy_shift.rst new file mode 100644 index 00000000000..5187100a8a6 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/copy_shift.rst @@ -0,0 +1,5 @@ +Copy Shift +========== + +.. doxygengroup:: copy_shift + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/copy_slice.rst b/docs/cudf/source/libcudf_docs/api_docs/copy_slice.rst new file mode 100644 index 00000000000..3fb2fbe49d0 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/copy_slice.rst @@ -0,0 +1,5 @@ +Copy Slice +========== + +.. 
doxygengroup:: copy_slice + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/copy_split.rst b/docs/cudf/source/libcudf_docs/api_docs/copy_split.rst new file mode 100644 index 00000000000..ddcdfd31cc4 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/copy_split.rst @@ -0,0 +1,5 @@ +Copy Split +========== + +.. doxygengroup:: copy_split + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/cudf_classes.rst b/docs/cudf/source/libcudf_docs/api_docs/cudf_classes.rst new file mode 100644 index 00000000000..5473dd56ebb --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/cudf_classes.rst @@ -0,0 +1,14 @@ +Cudf Classes +============ + +.. doxygengroup:: cudf_classes + :members: + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + column_classes + table_classes + scalar_classes + fixed_point_classes diff --git a/docs/cudf/source/libcudf_docs/api_docs/cudf_namespace.rst b/docs/cudf/source/libcudf_docs/api_docs/cudf_namespace.rst new file mode 100644 index 00000000000..fd4f3c9b6ab --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/cudf_namespace.rst @@ -0,0 +1,28 @@ +libcudf +======= + +.. TODO: This page really only exists right now for the purpose of resolving namespace links. We may want to just ignore these instead + +.. doxygennamespace:: cudf + :desc-only: + +.. doxygennamespace:: cudf::ast + :desc-only: + +.. doxygennamespace:: cudf::io + :desc-only: + +.. doxygennamespace:: cudf::io::orc + :desc-only: + +.. doxygennamespace:: cudf::io::parquet + :desc-only: + +.. doxygennamespace:: cudf::hashing + :desc-only: + +.. doxygennamespace:: numeric + :desc-only: + +.. doxygennamespace:: cudf::tdigest + :desc-only: diff --git a/docs/cudf/source/libcudf_docs/api_docs/datetime_apis.rst b/docs/cudf/source/libcudf_docs/api_docs/datetime_apis.rst new file mode 100644 index 00000000000..8f7960444fc --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/datetime_apis.rst @@ -0,0 +1,12 @@ +Datetime APIs +============= + +.. 
doxygengroup:: datetime_apis + :members: + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + datetime_extract + datetime_compute diff --git a/docs/cudf/source/libcudf_docs/api_docs/datetime_compute.rst b/docs/cudf/source/libcudf_docs/api_docs/datetime_compute.rst new file mode 100644 index 00000000000..0c7ba0f522f --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/datetime_compute.rst @@ -0,0 +1,5 @@ +Datetime Compute +================ + +.. doxygengroup:: datetime_compute + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/datetime_extract.rst b/docs/cudf/source/libcudf_docs/api_docs/datetime_extract.rst new file mode 100644 index 00000000000..da212480abc --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/datetime_extract.rst @@ -0,0 +1,5 @@ +Datetime Extract +================ + +.. doxygengroup:: datetime_extract + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/default_stream.rst b/docs/cudf/source/libcudf_docs/api_docs/default_stream.rst new file mode 100644 index 00000000000..c50493a8f60 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/default_stream.rst @@ -0,0 +1,5 @@ +Default Stream +============== + +.. doxygengroup:: default_stream + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/dictionary_apis.rst b/docs/cudf/source/libcudf_docs/api_docs/dictionary_apis.rst new file mode 100644 index 00000000000..8d463b0a956 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/dictionary_apis.rst @@ -0,0 +1,13 @@ +Dictionary APIs +=============== + +.. doxygengroup:: dictionary_apis + :members: + +.. 
toctree:: + :maxdepth: 2 + :caption: Contents: + + dictionary_encode + dictionary_search + dictionary_update diff --git a/docs/cudf/source/libcudf_docs/api_docs/dictionary_classes.rst b/docs/cudf/source/libcudf_docs/api_docs/dictionary_classes.rst new file mode 100644 index 00000000000..00dec78c5f5 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/dictionary_classes.rst @@ -0,0 +1,5 @@ +Dictionary Classes +================== + +.. doxygengroup:: dictionary_classes + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/dictionary_encode.rst b/docs/cudf/source/libcudf_docs/api_docs/dictionary_encode.rst new file mode 100644 index 00000000000..ed77380f281 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/dictionary_encode.rst @@ -0,0 +1,5 @@ +Dictionary Encode +================= + +.. doxygengroup:: dictionary_encode + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/dictionary_search.rst b/docs/cudf/source/libcudf_docs/api_docs/dictionary_search.rst new file mode 100644 index 00000000000..b187d29ca0b --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/dictionary_search.rst @@ -0,0 +1,5 @@ +Dictionary Search +================= + +.. doxygengroup:: dictionary_search + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/dictionary_update.rst b/docs/cudf/source/libcudf_docs/api_docs/dictionary_update.rst new file mode 100644 index 00000000000..8b0c12c09d9 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/dictionary_update.rst @@ -0,0 +1,5 @@ +Dictionary Update +================= + +.. doxygengroup:: dictionary_update + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/expressions.rst b/docs/cudf/source/libcudf_docs/api_docs/expressions.rst new file mode 100644 index 00000000000..c65d8a29858 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/expressions.rst @@ -0,0 +1,5 @@ +Expression Evaluation +===================== + +.. 
doxygengroup:: expressions + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/fixed_point_classes.rst b/docs/cudf/source/libcudf_docs/api_docs/fixed_point_classes.rst new file mode 100644 index 00000000000..0a1ef1b4d63 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/fixed_point_classes.rst @@ -0,0 +1,6 @@ + +Fixed Point Classes +=================== + +.. doxygengroup:: fixed_point_classes + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/index.rst b/docs/cudf/source/libcudf_docs/api_docs/index.rst new file mode 100644 index 00000000000..c077a7cd452 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/index.rst @@ -0,0 +1,29 @@ +libcudf documentation +===================== + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + cudf_namespace + default_stream + cudf_classes + column_apis + datetime_apis + strings_apis + dictionary_apis + io_apis + json_apis + lists_apis + nvtext_apis + utility_apis + labeling_apis + expressions + tdigest + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`search` diff --git a/docs/cudf/source/libcudf_docs/api_docs/interop_arrow.rst b/docs/cudf/source/libcudf_docs/api_docs/interop_arrow.rst new file mode 100644 index 00000000000..e0f0edfc9ae --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/interop_arrow.rst @@ -0,0 +1,5 @@ +Interop Arrow +============= + +.. doxygengroup:: interop_arrow + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/interop_dlpack.rst b/docs/cudf/source/libcudf_docs/api_docs/interop_dlpack.rst new file mode 100644 index 00000000000..4be168c5132 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/interop_dlpack.rst @@ -0,0 +1,5 @@ +Interop Dlpack +============== + +.. 
doxygengroup:: interop_dlpack + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/io_apis.rst b/docs/cudf/source/libcudf_docs/api_docs/io_apis.rst new file mode 100644 index 00000000000..a23c0948584 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/io_apis.rst @@ -0,0 +1,15 @@ +Io APIs +======= + +.. doxygengroup:: io_apis + :members: + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + io_types + io_readers + io_writers + io_datasources + io_datasinks diff --git a/docs/cudf/source/libcudf_docs/api_docs/io_datasinks.rst b/docs/cudf/source/libcudf_docs/api_docs/io_datasinks.rst new file mode 100644 index 00000000000..15b0da5f7a7 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/io_datasinks.rst @@ -0,0 +1,5 @@ +Io Datasinks +============ + +.. doxygengroup:: io_datasinks + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/io_datasources.rst b/docs/cudf/source/libcudf_docs/api_docs/io_datasources.rst new file mode 100644 index 00000000000..3d5834892eb --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/io_datasources.rst @@ -0,0 +1,5 @@ +Io Datasources +============== + +.. doxygengroup:: io_datasources + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/io_readers.rst b/docs/cudf/source/libcudf_docs/api_docs/io_readers.rst new file mode 100644 index 00000000000..a835673dee4 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/io_readers.rst @@ -0,0 +1,5 @@ +Io Readers +========== + +.. doxygengroup:: io_readers + :desc-only: diff --git a/docs/cudf/source/libcudf_docs/api_docs/io_types.rst b/docs/cudf/source/libcudf_docs/api_docs/io_types.rst new file mode 100644 index 00000000000..abdc43bc6fa --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/io_types.rst @@ -0,0 +1,5 @@ +Io Types +======== + +.. 
doxygengroup:: io_types + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/io_writers.rst b/docs/cudf/source/libcudf_docs/api_docs/io_writers.rst new file mode 100644 index 00000000000..add57ecfed4 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/io_writers.rst @@ -0,0 +1,5 @@ +Io Writers +========== + +.. doxygengroup:: io_writers + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/json_apis.rst b/docs/cudf/source/libcudf_docs/api_docs/json_apis.rst new file mode 100644 index 00000000000..119dbc36fa1 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/json_apis.rst @@ -0,0 +1,11 @@ +JSON APIs +========= + +.. doxygengroup:: json_apis + :members: + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + json_object diff --git a/docs/cudf/source/libcudf_docs/api_docs/json_object.rst b/docs/cudf/source/libcudf_docs/api_docs/json_object.rst new file mode 100644 index 00000000000..510a2f9eb07 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/json_object.rst @@ -0,0 +1,5 @@ +JSON Object +=========== + +.. doxygengroup:: json_object + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/label_bins.rst b/docs/cudf/source/libcudf_docs/api_docs/label_bins.rst new file mode 100644 index 00000000000..ca80912e527 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/label_bins.rst @@ -0,0 +1,5 @@ +Label Bins +========== + +.. doxygengroup:: label_bins + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/labeling_apis.rst b/docs/cudf/source/libcudf_docs/api_docs/labeling_apis.rst new file mode 100644 index 00000000000..24726ce2e09 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/labeling_apis.rst @@ -0,0 +1,11 @@ +Labeling APIs +============= + +.. doxygengroup:: labeling_apis + :members: + +.. 
toctree:: + :maxdepth: 2 + :caption: Contents: + + label_bins diff --git a/docs/cudf/source/libcudf_docs/api_docs/lists_apis.rst b/docs/cudf/source/libcudf_docs/api_docs/lists_apis.rst new file mode 100644 index 00000000000..d34253d909a --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/lists_apis.rst @@ -0,0 +1,20 @@ +Lists APIs +========== + +.. doxygengroup:: lists_apis + :members: + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + lists_combine + lists_modify + lists_extract + lists_filling + lists_contains + lists_gather + lists_elements + lists_filtering + lists_sort + set_operations diff --git a/docs/cudf/source/libcudf_docs/api_docs/lists_classes.rst b/docs/cudf/source/libcudf_docs/api_docs/lists_classes.rst new file mode 100644 index 00000000000..9b89c164746 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/lists_classes.rst @@ -0,0 +1,5 @@ +Lists Classes +============= + +.. doxygengroup:: lists_classes + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/lists_combine.rst b/docs/cudf/source/libcudf_docs/api_docs/lists_combine.rst new file mode 100644 index 00000000000..e26fd89a3c2 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/lists_combine.rst @@ -0,0 +1,5 @@ +Lists Combine +============= + +.. doxygengroup:: lists_combine + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/lists_contains.rst b/docs/cudf/source/libcudf_docs/api_docs/lists_contains.rst new file mode 100644 index 00000000000..ccb1366a6fb --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/lists_contains.rst @@ -0,0 +1,5 @@ +Lists Contains +============== + +.. doxygengroup:: lists_contains + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/lists_elements.rst b/docs/cudf/source/libcudf_docs/api_docs/lists_elements.rst new file mode 100644 index 00000000000..fb7758a46d0 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/lists_elements.rst @@ -0,0 +1,5 @@ +Lists Elements +============== + +.. 
doxygengroup:: lists_elements + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/lists_extract.rst b/docs/cudf/source/libcudf_docs/api_docs/lists_extract.rst new file mode 100644 index 00000000000..f721d89684e --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/lists_extract.rst @@ -0,0 +1,5 @@ +Lists Extract +============= + +.. doxygengroup:: lists_extract + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/lists_filling.rst b/docs/cudf/source/libcudf_docs/api_docs/lists_filling.rst new file mode 100644 index 00000000000..5ab3c491dbe --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/lists_filling.rst @@ -0,0 +1,5 @@ +Lists Filling +============= + +.. doxygengroup:: lists_filling + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/lists_filtering.rst b/docs/cudf/source/libcudf_docs/api_docs/lists_filtering.rst new file mode 100644 index 00000000000..af9f1ebaa2b --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/lists_filtering.rst @@ -0,0 +1,5 @@ +Lists Filtering +=============== + +.. doxygengroup:: lists_filtering + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/lists_gather.rst b/docs/cudf/source/libcudf_docs/api_docs/lists_gather.rst new file mode 100644 index 00000000000..4cb927b2672 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/lists_gather.rst @@ -0,0 +1,5 @@ +Lists Gather +============ + +.. doxygengroup:: lists_gather + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/lists_modify.rst b/docs/cudf/source/libcudf_docs/api_docs/lists_modify.rst new file mode 100644 index 00000000000..d41020b75cd --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/lists_modify.rst @@ -0,0 +1,5 @@ +Lists Modify +============ + +.. 
doxygengroup:: lists_modify + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/lists_sort.rst b/docs/cudf/source/libcudf_docs/api_docs/lists_sort.rst new file mode 100644 index 00000000000..63de08d1622 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/lists_sort.rst @@ -0,0 +1,5 @@ +Lists Sort +========== + +.. doxygengroup:: lists_sort + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/nvtext_apis.rst b/docs/cudf/source/libcudf_docs/api_docs/nvtext_apis.rst new file mode 100644 index 00000000000..f938f2914ed --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/nvtext_apis.rst @@ -0,0 +1,18 @@ +Nvtext APIs +=========== + +.. doxygengroup:: nvtext_apis + :members: + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + nvtext_ngrams + nvtext_normalize + nvtext_stemmer + nvtext_edit_distance + nvtext_tokenize + nvtext_replace + nvtext_minhash + nvtext_jaccard diff --git a/docs/cudf/source/libcudf_docs/api_docs/nvtext_edit_distance.rst b/docs/cudf/source/libcudf_docs/api_docs/nvtext_edit_distance.rst new file mode 100644 index 00000000000..e5eb2dc8c95 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/nvtext_edit_distance.rst @@ -0,0 +1,5 @@ +Nvtext Edit Distance +==================== + +.. doxygengroup:: nvtext_edit_distance + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/nvtext_jaccard.rst b/docs/cudf/source/libcudf_docs/api_docs/nvtext_jaccard.rst new file mode 100644 index 00000000000..75124c5655a --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/nvtext_jaccard.rst @@ -0,0 +1,5 @@ +Nvtext Jaccard +============== + +.. doxygengroup:: nvtext_jaccard + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/nvtext_minhash.rst b/docs/cudf/source/libcudf_docs/api_docs/nvtext_minhash.rst new file mode 100644 index 00000000000..57d8445a3eb --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/nvtext_minhash.rst @@ -0,0 +1,5 @@ +Nvtext Minhash +============== + +.. 
doxygengroup:: nvtext_minhash + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/nvtext_ngrams.rst b/docs/cudf/source/libcudf_docs/api_docs/nvtext_ngrams.rst new file mode 100644 index 00000000000..27f93211f4c --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/nvtext_ngrams.rst @@ -0,0 +1,5 @@ +Nvtext Ngrams +============= + +.. doxygengroup:: nvtext_ngrams + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/nvtext_normalize.rst b/docs/cudf/source/libcudf_docs/api_docs/nvtext_normalize.rst new file mode 100644 index 00000000000..7654f3c19f6 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/nvtext_normalize.rst @@ -0,0 +1,5 @@ +Nvtext Normalize +================ + +.. doxygengroup:: nvtext_normalize + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/nvtext_replace.rst b/docs/cudf/source/libcudf_docs/api_docs/nvtext_replace.rst new file mode 100644 index 00000000000..cdd2e45f48b --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/nvtext_replace.rst @@ -0,0 +1,5 @@ +Nvtext Replace +============== + +.. doxygengroup:: nvtext_replace + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/nvtext_stemmer.rst b/docs/cudf/source/libcudf_docs/api_docs/nvtext_stemmer.rst new file mode 100644 index 00000000000..fbe5675fe2d --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/nvtext_stemmer.rst @@ -0,0 +1,5 @@ +Nvtext Stemmer +============== + +.. doxygengroup:: nvtext_stemmer + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/nvtext_tokenize.rst b/docs/cudf/source/libcudf_docs/api_docs/nvtext_tokenize.rst new file mode 100644 index 00000000000..58fc422dc88 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/nvtext_tokenize.rst @@ -0,0 +1,5 @@ +Nvtext Tokenize +=============== + +.. 
doxygengroup:: nvtext_tokenize + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/reorder_compact.rst b/docs/cudf/source/libcudf_docs/api_docs/reorder_compact.rst new file mode 100644 index 00000000000..099d08f74ab --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/reorder_compact.rst @@ -0,0 +1,5 @@ +Reorder Compact +=============== + +.. doxygengroup:: reorder_compact + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/reorder_partition.rst b/docs/cudf/source/libcudf_docs/api_docs/reorder_partition.rst new file mode 100644 index 00000000000..a8b88ab244f --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/reorder_partition.rst @@ -0,0 +1,5 @@ +Reorder Partition +================= + +.. doxygengroup:: reorder_partition + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/reshape_transpose.rst b/docs/cudf/source/libcudf_docs/api_docs/reshape_transpose.rst new file mode 100644 index 00000000000..1ab29129c33 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/reshape_transpose.rst @@ -0,0 +1,5 @@ +Reshape Transpose +================= + +.. doxygengroup:: reshape_transpose + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/scalar_classes.rst b/docs/cudf/source/libcudf_docs/api_docs/scalar_classes.rst new file mode 100644 index 00000000000..3022e5d2292 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/scalar_classes.rst @@ -0,0 +1,11 @@ +Scalar Classes +============== + +.. doxygengroup:: scalar_classes + :members: + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + scalar_factories diff --git a/docs/cudf/source/libcudf_docs/api_docs/scalar_factories.rst b/docs/cudf/source/libcudf_docs/api_docs/scalar_factories.rst new file mode 100644 index 00000000000..782ce6cb421 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/scalar_factories.rst @@ -0,0 +1,5 @@ +Scalar Factories +================ + +.. 
doxygengroup:: scalar_factories + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/set_operations.rst b/docs/cudf/source/libcudf_docs/api_docs/set_operations.rst new file mode 100644 index 00000000000..22118daec00 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/set_operations.rst @@ -0,0 +1,5 @@ +Set Operations +============== + +.. doxygengroup:: set_operations + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/strings_apis.rst b/docs/cudf/source/libcudf_docs/api_docs/strings_apis.rst new file mode 100644 index 00000000000..c42d5479954 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/strings_apis.rst @@ -0,0 +1,23 @@ +Strings APIs +============ + +.. doxygengroup:: strings_apis + :members: + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + strings_case + strings_types + strings_combine + strings_contains + strings_convert + strings_copy + strings_slice + strings_find + strings_modify + strings_replace + strings_split + strings_extract + strings_regex diff --git a/docs/cudf/source/libcudf_docs/api_docs/strings_case.rst b/docs/cudf/source/libcudf_docs/api_docs/strings_case.rst new file mode 100644 index 00000000000..ddec78a268e --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/strings_case.rst @@ -0,0 +1,5 @@ +Strings Case +============ + +.. doxygengroup:: strings_case + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/strings_classes.rst b/docs/cudf/source/libcudf_docs/api_docs/strings_classes.rst new file mode 100644 index 00000000000..867b2ac242a --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/strings_classes.rst @@ -0,0 +1,5 @@ +Strings Classes +=============== + +.. 
doxygengroup:: strings_classes + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/strings_combine.rst b/docs/cudf/source/libcudf_docs/api_docs/strings_combine.rst new file mode 100644 index 00000000000..4542308e0c7 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/strings_combine.rst @@ -0,0 +1,5 @@ +Strings Combine +=============== + +.. doxygengroup:: strings_combine + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/strings_contains.rst b/docs/cudf/source/libcudf_docs/api_docs/strings_contains.rst new file mode 100644 index 00000000000..250eb96e541 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/strings_contains.rst @@ -0,0 +1,5 @@ +Strings Contains +================ + +.. doxygengroup:: strings_contains + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/strings_convert.rst b/docs/cudf/source/libcudf_docs/api_docs/strings_convert.rst new file mode 100644 index 00000000000..ae5d78fb1a1 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/strings_convert.rst @@ -0,0 +1,5 @@ +Strings Convert +=============== + +.. doxygengroup:: strings_convert + :desc-only: diff --git a/docs/cudf/source/libcudf_docs/api_docs/strings_copy.rst b/docs/cudf/source/libcudf_docs/api_docs/strings_copy.rst new file mode 100644 index 00000000000..0c2884361bf --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/strings_copy.rst @@ -0,0 +1,5 @@ +Strings Copy +============ + +.. doxygengroup:: strings_copy + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/strings_extract.rst b/docs/cudf/source/libcudf_docs/api_docs/strings_extract.rst new file mode 100644 index 00000000000..8018bbc627a --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/strings_extract.rst @@ -0,0 +1,5 @@ +Strings Extract +=============== + +.. 
doxygengroup:: strings_extract + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/strings_find.rst b/docs/cudf/source/libcudf_docs/api_docs/strings_find.rst new file mode 100644 index 00000000000..709bd138f54 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/strings_find.rst @@ -0,0 +1,5 @@ +Strings Find +============ + +.. doxygengroup:: strings_find + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/strings_modify.rst b/docs/cudf/source/libcudf_docs/api_docs/strings_modify.rst new file mode 100644 index 00000000000..4bf54e5e835 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/strings_modify.rst @@ -0,0 +1,5 @@ +Strings Modify +============== + +.. doxygengroup:: strings_modify + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/strings_regex.rst b/docs/cudf/source/libcudf_docs/api_docs/strings_regex.rst new file mode 100644 index 00000000000..719b09be24e --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/strings_regex.rst @@ -0,0 +1,5 @@ +Strings Regex +============= + +.. doxygengroup:: strings_regex + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/strings_replace.rst b/docs/cudf/source/libcudf_docs/api_docs/strings_replace.rst new file mode 100644 index 00000000000..884295e6d78 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/strings_replace.rst @@ -0,0 +1,5 @@ +Strings Replace +=============== + +.. doxygengroup:: strings_replace + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/strings_slice.rst b/docs/cudf/source/libcudf_docs/api_docs/strings_slice.rst new file mode 100644 index 00000000000..bc1831a9dd4 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/strings_slice.rst @@ -0,0 +1,5 @@ +Strings Slice +============= + +.. 
doxygengroup:: strings_slice + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/strings_split.rst b/docs/cudf/source/libcudf_docs/api_docs/strings_split.rst new file mode 100644 index 00000000000..ddf7cf2f1d6 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/strings_split.rst @@ -0,0 +1,5 @@ +Strings Split +============= + +.. doxygengroup:: strings_split + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/strings_types.rst b/docs/cudf/source/libcudf_docs/api_docs/strings_types.rst new file mode 100644 index 00000000000..aff356efb71 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/strings_types.rst @@ -0,0 +1,5 @@ +Strings Types +============= + +.. doxygengroup:: strings_types + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/structs_classes.rst b/docs/cudf/source/libcudf_docs/api_docs/structs_classes.rst new file mode 100644 index 00000000000..2669c2884d6 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/structs_classes.rst @@ -0,0 +1,5 @@ +Structs Classes +=============== + +.. doxygengroup:: structs_classes + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/table_classes.rst b/docs/cudf/source/libcudf_docs/api_docs/table_classes.rst new file mode 100644 index 00000000000..f00e315e597 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/table_classes.rst @@ -0,0 +1,5 @@ +Table Classes +============= + +.. doxygengroup:: table_classes + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/tdigest.rst b/docs/cudf/source/libcudf_docs/api_docs/tdigest.rst new file mode 100644 index 00000000000..9eb97df8337 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/tdigest.rst @@ -0,0 +1,5 @@ +tdigest +======= + +.. 
doxygengroup:: tdigest + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/timestamp_classes.rst b/docs/cudf/source/libcudf_docs/api_docs/timestamp_classes.rst new file mode 100644 index 00000000000..4651d8dbd32 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/timestamp_classes.rst @@ -0,0 +1,5 @@ +Timestamp Classes +================= + +.. doxygengroup:: timestamp_classes + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/transformation_binaryops.rst b/docs/cudf/source/libcudf_docs/api_docs/transformation_binaryops.rst new file mode 100644 index 00000000000..59be3a7b45e --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/transformation_binaryops.rst @@ -0,0 +1,5 @@ +Transformation Binaryops +======================== + +.. doxygengroup:: transformation_binaryops + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/transformation_fill.rst b/docs/cudf/source/libcudf_docs/api_docs/transformation_fill.rst new file mode 100644 index 00000000000..939fba261df --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/transformation_fill.rst @@ -0,0 +1,5 @@ +Transformation Fill +=================== + +.. doxygengroup:: transformation_fill + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/transformation_replace.rst b/docs/cudf/source/libcudf_docs/api_docs/transformation_replace.rst new file mode 100644 index 00000000000..6800b7bba76 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/transformation_replace.rst @@ -0,0 +1,5 @@ +Transformation Replace +====================== + +.. doxygengroup:: transformation_replace + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/transformation_transform.rst b/docs/cudf/source/libcudf_docs/api_docs/transformation_transform.rst new file mode 100644 index 00000000000..108a680f5a1 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/transformation_transform.rst @@ -0,0 +1,5 @@ +Transformation Transform +======================== + +.. 
doxygengroup:: transformation_transform + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/transformation_unaryops.rst b/docs/cudf/source/libcudf_docs/api_docs/transformation_unaryops.rst new file mode 100644 index 00000000000..07b5ba6388b --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/transformation_unaryops.rst @@ -0,0 +1,5 @@ +Transformation Unaryops +======================= + +.. doxygengroup:: transformation_unaryops + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/utility_apis.rst b/docs/cudf/source/libcudf_docs/api_docs/utility_apis.rst new file mode 100644 index 00000000000..9f0cda74582 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/utility_apis.rst @@ -0,0 +1,15 @@ +Utility APIs +============ + +.. doxygengroup:: utility_apis + :members: + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + utility_types + utility_dispatcher + utility_bitmask + utility_error + utility_span diff --git a/docs/cudf/source/libcudf_docs/api_docs/utility_bitmask.rst b/docs/cudf/source/libcudf_docs/api_docs/utility_bitmask.rst new file mode 100644 index 00000000000..5dba1928baf --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/utility_bitmask.rst @@ -0,0 +1,5 @@ +Utility Bitmask +=============== + +.. doxygengroup:: utility_bitmask + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/utility_dispatcher.rst b/docs/cudf/source/libcudf_docs/api_docs/utility_dispatcher.rst new file mode 100644 index 00000000000..539505e4551 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/utility_dispatcher.rst @@ -0,0 +1,5 @@ +Utility Dispatcher +================== + +.. 
doxygengroup:: utility_dispatcher + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/utility_error.rst b/docs/cudf/source/libcudf_docs/api_docs/utility_error.rst new file mode 100644 index 00000000000..acb575636c9 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/utility_error.rst @@ -0,0 +1,5 @@ +Utility Error +============= + +.. doxygengroup:: utility_error + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/utility_span.rst b/docs/cudf/source/libcudf_docs/api_docs/utility_span.rst new file mode 100644 index 00000000000..fdb1d254c0e --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/utility_span.rst @@ -0,0 +1,5 @@ +Utility Span +============ + +.. doxygengroup:: utility_span + :members: diff --git a/docs/cudf/source/libcudf_docs/api_docs/utility_types.rst b/docs/cudf/source/libcudf_docs/api_docs/utility_types.rst new file mode 100644 index 00000000000..f021535d589 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/api_docs/utility_types.rst @@ -0,0 +1,5 @@ +Utility Types +============= + +.. doxygengroup:: utility_types + :members: diff --git a/docs/cudf/source/libcudf_docs/index.rst b/docs/cudf/source/libcudf_docs/index.rst new file mode 100644 index 00000000000..39ed46d8578 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/index.rst @@ -0,0 +1,17 @@ +libcudf documentation +===================== + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + api_docs/index.rst + md_regex + unicode_limitations + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`search` diff --git a/docs/cudf/source/libcudf_docs/md_regex.rst b/docs/cudf/source/libcudf_docs/md_regex.rst new file mode 100644 index 00000000000..0eb0f464063 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/md_regex.rst @@ -0,0 +1,4 @@ +.. _md_regex: + +.. 
include:: ../../../../cpp/doxygen/regex.md + :parser: myst_parser.sphinx_ diff --git a/docs/cudf/source/libcudf_docs/unicode_limitations.rst b/docs/cudf/source/libcudf_docs/unicode_limitations.rst new file mode 100644 index 00000000000..1f069088160 --- /dev/null +++ b/docs/cudf/source/libcudf_docs/unicode_limitations.rst @@ -0,0 +1,4 @@ +.. _unicode_limitations: + +.. include:: ../../../../cpp/doxygen/unicode.md + :parser: myst_parser.sphinx_ diff --git a/docs/cudf/source/user_guide/api_docs/index_objects.rst b/docs/cudf/source/user_guide/api_docs/index_objects.rst index 013eaf29a56..b6da9af9b3e 100644 --- a/docs/cudf/source/user_guide/api_docs/index_objects.rst +++ b/docs/cudf/source/user_guide/api_docs/index_objects.rst @@ -228,6 +228,7 @@ MultiIndex constructors .. autosummary:: :toctree: api/ + MultiIndex.from_arrays MultiIndex.from_tuples MultiIndex.from_product MultiIndex.from_frame diff --git a/java/src/main/java/ai/rapids/cudf/DecimalUtils.java b/java/src/main/java/ai/rapids/cudf/DecimalUtils.java index 1979bd1bd5b..7a5be9b08b9 100644 --- a/java/src/main/java/ai/rapids/cudf/DecimalUtils.java +++ b/java/src/main/java/ai/rapids/cudf/DecimalUtils.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -82,13 +82,13 @@ public static ColumnVector lessThan(ColumnView lhs, BigDecimal rhs) { int leftScale = lhs.getType().getScale(); int leftPrecision = lhs.getType().getDecimalMaxPrecision(); - // First we have to round the scalar (rhs) to the same scale as lhs. Because this is a - // less than and it is rhs that we are rounding, we will round away from 0 (UP) - // to make sure we always return the correct value. 
- // For example: - // 100.1 < 100.19 - // If we rounded down the rhs 100.19 would become 100.1, and now 100.1 is not < 100.1 - BigDecimal roundedRhs = rhs.setScale(-leftScale, BigDecimal.ROUND_UP); + // First we have to round the scalar (rhs) to the same scale as lhs. + // For comparing the two values they should be the same scale, we round the value to positive infinity to maintain + // the relation. Ex: + // 10.2 < 10.29 = true, after rounding rhs to ceiling ===> 10.2 < 10.3 = true, relation is maintained + // 10.3 < 10.29 = false, after rounding rhs to ceiling ===> 10.3 < 10.3 = false, relation is maintained + // 10.1 < 10.10 = false, after rounding rhs to ceiling ===> 10.1 < 10.1 = false, relation is maintained + BigDecimal roundedRhs = rhs.setScale(-leftScale, BigDecimal.ROUND_CEILING); if (roundedRhs.precision() > leftPrecision) { // converting rhs to the same precision as lhs would result in an overflow/error, but @@ -136,13 +136,13 @@ public static ColumnVector greaterThan(ColumnView lhs, BigDecimal rhs) { int cvScale = lhs.getType().getScale(); int maxPrecision = lhs.getType().getDecimalMaxPrecision(); - // First we have to round the scalar (rhs) to the same scale as lhs. Because this is a - // greater than and it is rhs that we are rounding, we will round towards 0 (DOWN) - // to make sure we always return the correct value. - // For example: - // 100.2 > 100.19 - // If we rounded up the rhs 100.19 would become 100.2, and now 100.2 is not > 100.2 - BigDecimal roundedRhs = rhs.setScale(-cvScale, BigDecimal.ROUND_DOWN); + // First we have to round the scalar (rhs) to the same scale as lhs. + // For comparing the two values they should be the same scale, we round the value to negative infinity to maintain + // the relation. 
Ex: + // 10.3 > 10.29 = true, after rounding rhs to floor ===> 10.3 > 10.2 = true, relation is maintained + // 10.2 > 10.29 = false, after rounding rhs to floor ===> 10.2 > 10.2 = false, relation is maintained + // 10.1 > 10.10 = false, after rounding rhs to floor ===> 10.1 > 10.1 = false, relation is maintained + BigDecimal roundedRhs = rhs.setScale(-cvScale, BigDecimal.ROUND_FLOOR); if (roundedRhs.precision() > maxPrecision) { // converting rhs to the same precision as lhs would result in an overflow/error, but diff --git a/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java b/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java index 7deb5bae541..666a8864003 100644 --- a/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java +++ b/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java @@ -1742,7 +1742,7 @@ private static long buildColumnView(SerializedColumnHeader column, } DType dtype = column.getType(); long bufferAddress = combinedBuffer.getAddress(); - long dataAddress = dtype.isNestedType() ? 0 : bufferAddress + offsetsInfo.data; + long dataAddress = offsetsInfo.dataLen == 0 ? 0 : bufferAddress + offsetsInfo.data; long validityAddress = needsValidityBuffer(column.getNullCount()) ? bufferAddress + offsetsInfo.validity : 0; long offsetsAddress = dtype.hasOffsets() ? bufferAddress + offsetsInfo.offsets : 0; diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 675996df71c..47dc802cd49 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -111,6 +111,9 @@ std::size_t calc_device_memory_size(cudf::column_view const &view, bool const pa auto dtype = view.type(); if (cudf::is_fixed_width(dtype)) { total += pad_size(cudf::size_of(dtype) * view.size(), pad_for_cpu); + } else if (dtype.id() == cudf::type_id::STRING) { + auto scv = cudf::strings_column_view(view); + total += pad_size(scv.chars_size(cudf::get_default_stream()), pad_for_cpu); } return std::accumulate(view.child_begin(), view.child_end(), total, @@ -1974,18 +1977,11 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_makeCudfColumnView( new cudf::column_view(cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0)); } else { JNI_NULL_CHECK(env, j_offset, "offset is null", 0); - // This must be kept in sync with how string columns are created - // offsets are always the first child - // data is the second child - cudf::size_type *offsets = reinterpret_cast(j_offset); cudf::column_view offsets_column(cudf::data_type{cudf::type_id::INT32}, size + 1, offsets, nullptr, 0); - cudf::column_view data_column(cudf::data_type{cudf::type_id::INT8}, j_data_size, data, - nullptr, 0); return ptr_as_jlong(new cudf::column_view(cudf::data_type{cudf::type_id::STRING}, size, - nullptr, valid, j_null_count, 0, - {offsets_column, data_column})); + data, valid, j_null_count, 0, {offsets_column})); } } else if (n_type == cudf::type_id::LIST) { JNI_NULL_CHECK(env, j_children, "children of a list are null", 0); @@ -2082,8 +2078,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getNativeDataAddress(JNIE if (column->type().id() == cudf::type_id::STRING) { if (column->size() > 0) { cudf::strings_column_view view = cudf::strings_column_view(*column); - cudf::column_view data_view = view.chars(); - result = reinterpret_cast(data_view.data()); + result = reinterpret_cast(view.chars_begin(cudf::get_default_stream())); } } else if (column->type().id() != cudf::type_id::LIST && column->type().id() != cudf::type_id::STRUCT) { @@ -2104,8 +2099,7 @@ 
JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getNativeDataLength(JNIEn if (column->type().id() == cudf::type_id::STRING) { if (column->size() > 0) { cudf::strings_column_view view = cudf::strings_column_view(*column); - cudf::column_view data_view = view.chars(); - result = data_view.size(); + result = view.chars_size(cudf::get_default_stream()); } } else if (column->type().id() != cudf::type_id::LIST && column->type().id() != cudf::type_id::STRUCT) { diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index d7d0279174d..295574858da 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -905,12 +905,12 @@ cudf::column_view remove_validity_from_col(cudf::column_view column_view) { children.push_back(remove_validity_from_col(*it)); } if (!column_view.nullable() || column_view.null_count() != 0) { - return cudf::column_view(column_view.type(), column_view.size(), nullptr, + return cudf::column_view(column_view.type(), column_view.size(), column_view.head(), column_view.null_mask(), column_view.null_count(), column_view.offset(), children); } else { - return cudf::column_view(column_view.type(), column_view.size(), nullptr, nullptr, 0, - column_view.offset(), children); + return cudf::column_view(column_view.type(), column_view.size(), column_view.head(), nullptr, + 0, column_view.offset(), children); } } } diff --git a/java/src/test/java/ai/rapids/cudf/DecimalUtilsTest.java b/java/src/test/java/ai/rapids/cudf/DecimalUtilsTest.java new file mode 100644 index 00000000000..a96eeda5dd7 --- /dev/null +++ b/java/src/test/java/ai/rapids/cudf/DecimalUtilsTest.java @@ -0,0 +1,40 @@ +/* + * + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +import org.junit.jupiter.api.Test; + +import java.math.BigDecimal; +import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; + +public class DecimalUtilsTest extends CudfTestBase { + @Test + public void testOutOfBounds() { + try (ColumnView cv = ColumnVector.fromDecimals( + new BigDecimal("-1E+3"), + new BigDecimal("1E+3"), + new BigDecimal("9E+1"), + new BigDecimal("-9E+1"), + new BigDecimal("-91")); + ColumnView expected = ColumnVector.fromBooleans(true, true, false, false, true); + ColumnView result = DecimalUtils.outOfBounds(cv, 1, -1)) { + assertColumnsAreEqual(expected, result); + } + } +} diff --git a/python/cudf/benchmarks/API/bench_dataframe.py b/python/cudf/benchmarks/API/bench_dataframe.py index f908a995c2a..59d73015962 100644 --- a/python/cudf/benchmarks/API/bench_dataframe.py +++ b/python/cudf/benchmarks/API/bench_dataframe.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. 
"""Benchmarks of DataFrame methods.""" @@ -178,6 +178,8 @@ def bench_nsmallest(benchmark, dataframe, num_cols_to_sort, n): benchmark(dataframe.nsmallest, n, by) -@pytest_cases.parametrize_with_cases("dataframe, cond, other", prefix="where") +@pytest_cases.parametrize_with_cases( + "dataframe, cond, other", prefix="where", cases="cases_dataframe" +) def bench_where(benchmark, dataframe, cond, other): benchmark(dataframe.where, cond, other) diff --git a/python/cudf/benchmarks/API/bench_functions.py b/python/cudf/benchmarks/API/bench_functions.py index ec4be221d9f..93109838900 100644 --- a/python/cudf/benchmarks/API/bench_functions.py +++ b/python/cudf/benchmarks/API/bench_functions.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. """Benchmarks of free functions that accept cudf objects.""" @@ -9,7 +9,9 @@ from utils import benchmark_with_object -@pytest_cases.parametrize_with_cases("objs", prefix="concat") +@pytest_cases.parametrize_with_cases( + "objs", prefix="concat", cases="cases_functions" +) @pytest.mark.parametrize( "axis", [ diff --git a/python/cudf/benchmarks/API/bench_dataframe_cases.py b/python/cudf/benchmarks/API/cases_dataframe.py similarity index 88% rename from python/cudf/benchmarks/API/bench_dataframe_cases.py rename to python/cudf/benchmarks/API/cases_dataframe.py index fc41d141c8a..d12b9776f1b 100644 --- a/python/cudf/benchmarks/API/bench_dataframe_cases.py +++ b/python/cudf/benchmarks/API/cases_dataframe.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. 
from utils import benchmark_with_object diff --git a/python/cudf/benchmarks/API/bench_functions_cases.py b/python/cudf/benchmarks/API/cases_functions.py similarity index 99% rename from python/cudf/benchmarks/API/bench_functions_cases.py rename to python/cudf/benchmarks/API/cases_functions.py index c81f8f20f80..6bc66aa4a9b 100644 --- a/python/cudf/benchmarks/API/bench_functions_cases.py +++ b/python/cudf/benchmarks/API/cases_functions.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. """Test cases for benchmarks in bench_functions.py.""" diff --git a/python/cudf/benchmarks/internal/bench_column.py b/python/cudf/benchmarks/internal/bench_column.py index d4969b39f7f..8da769b7858 100644 --- a/python/cudf/benchmarks/internal/bench_column.py +++ b/python/cudf/benchmarks/internal/bench_column.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. """Benchmarks of Column methods.""" @@ -18,9 +18,8 @@ def bench_apply_boolean_mask(benchmark, column): @benchmark_with_object(cls="column", dtype="float") -@pytest.mark.parametrize("dropnan", [True, False]) -def bench_dropna(benchmark, column, dropnan): - benchmark(column.dropna, drop_nan=dropnan) +def bench_dropna(benchmark, column): + benchmark(column.dropna) @benchmark_with_object(cls="column", dtype="float") diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 0edf9f8aa95..45aa1081b8d 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -1,17 +1,17 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from typing import Literal import cupy as cp import numpy as np +import pandas as pd import rmm import cudf import cudf._lib as libcudf from cudf._lib import pylibcudf -from cudf.api.types import is_categorical_dtype, is_datetime64tz_dtype from cudf.core.buffer import ( Buffer, ExposureTrackedBuffer, @@ -39,6 +39,7 @@ from cudf._lib.types cimport ( from cudf._lib.null_mask import bitmask_allocation_size_bytes from cudf._lib.types import dtype_from_pylibcudf_column +cimport cudf._lib.cpp.copying as cpp_copying cimport cudf._lib.cpp.types as libcudf_types cimport cudf._lib.cpp.unary as libcudf_unary from cudf._lib.cpp.column.column cimport column, column_contents @@ -52,6 +53,19 @@ from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.scalar cimport DeviceScalar +cdef get_element(column_view col_view, size_type index): + + cdef unique_ptr[scalar] c_output + with nogil: + c_output = move( + cpp_copying.get_element(col_view, index) + ) + + return DeviceScalar.from_unique_ptr( + move(c_output), dtype=dtype_from_column_view(col_view) + ) + + cdef class Column: """ A Column stores columnar data in device memory. 
@@ -330,10 +344,10 @@ cdef class Column: ) cdef mutable_column_view mutable_view(self) except *: - if is_categorical_dtype(self.dtype): + if isinstance(self.dtype, cudf.CategoricalDtype): col = self.base_children[0] data_dtype = col.dtype - elif is_datetime64tz_dtype(self.dtype): + elif isinstance(self.dtype, pd.DatetimeTZDtype): col = self data_dtype = _get_base_dtype(col.dtype) else: @@ -393,10 +407,10 @@ cdef class Column: return self._view(c_null_count) cdef column_view _view(self, libcudf_types.size_type null_count) except *: - if is_categorical_dtype(self.dtype): + if isinstance(self.dtype, cudf.CategoricalDtype): col = self.base_children[0] data_dtype = col.dtype - elif is_datetime64tz_dtype(self.dtype): + elif isinstance(self.dtype, pd.DatetimeTZDtype): col = self data_dtype = _get_base_dtype(col.dtype) else: @@ -468,7 +482,7 @@ cdef class Column: # categoricals because cudf supports ordered and unordered categoricals # while libcudf supports only unordered categoricals (see # https://github.com/rapidsai/cudf/pull/8567). 
- if is_categorical_dtype(self.dtype): + if isinstance(self.dtype, cudf.CategoricalDtype): col = self.base_children[0] else: col = self @@ -634,7 +648,7 @@ cdef class Column: """ column_owner = isinstance(owner, Column) mask_owner = owner - if column_owner and is_categorical_dtype(owner.dtype): + if column_owner and isinstance(owner.dtype, cudf.CategoricalDtype): owner = owner.base_children[0] size = cv.size() @@ -652,11 +666,29 @@ cdef class Column: mask_owner = mask_owner.base_mask base_size = owner.base_size base_nbytes = base_size * dtype_itemsize + # special case for string column + is_string_column = (cv.type().id() == libcudf_types.type_id.STRING) + if is_string_column: + # get the size from offset child column (device to host copy) + offsets_column_index = 0 + offset_child_column = cv.child(offsets_column_index) + if offset_child_column.size() == 0: + base_nbytes = 0 + else: + chars_size = get_element( + offset_child_column, offset_child_column.size()-1).value + base_nbytes = chars_size + if data_ptr: if data_owner is None: + buffer_size = ( + base_nbytes + if is_string_column + else ((size + offset) * dtype_itemsize) + ) data = as_buffer( rmm.DeviceBuffer(ptr=data_ptr, - size=(size+offset) * dtype_itemsize) + size=buffer_size) ) elif ( column_owner and diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index f332fead8d1..8848649736b 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -1,16 +1,17 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
+from functools import singledispatch from pandas.core.groupby.groupby import DataError -from cudf.api.types import ( - is_categorical_dtype, - is_decimal_dtype, - is_interval_dtype, - is_list_dtype, - is_string_dtype, - is_struct_dtype, -) +from cudf.api.types import is_string_dtype from cudf.core.buffer import acquire_spill_lock +from cudf.core.dtypes import ( + CategoricalDtype, + DecimalDtype, + IntervalDtype, + ListDtype, + StructDtype, +) from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -73,6 +74,43 @@ _DECIMAL_AGGS = { ctypedef const scalar constscalar +@singledispatch +def get_valid_aggregation(dtype): + if is_string_dtype(dtype): + return _STRING_AGGS + return "ALL" + + +@get_valid_aggregation.register +def _(dtype: ListDtype): + return _LIST_AGGS + + +@get_valid_aggregation.register +def _(dtype: CategoricalDtype): + return _CATEGORICAL_AGGS + + +@get_valid_aggregation.register +def _(dtype: ListDtype): + return _LIST_AGGS + + +@get_valid_aggregation.register +def _(dtype: StructDtype): + return _STRUCT_AGGS + + +@get_valid_aggregation.register +def _(dtype: IntervalDtype): + return _INTERVAL_AGGS + + +@get_valid_aggregation.register +def _(dtype: DecimalDtype): + return _DECIMAL_AGGS + + cdef _agg_result_from_columns( vector[libcudf_groupby.aggregation_result]& c_result_columns, set column_included, @@ -187,15 +225,7 @@ cdef class GroupBy: for i, (col, aggs) in enumerate(zip(values, aggregations)): dtype = col.dtype - valid_aggregations = ( - _LIST_AGGS if is_list_dtype(dtype) - else _STRING_AGGS if is_string_dtype(dtype) - else _CATEGORICAL_AGGS if is_categorical_dtype(dtype) - else _STRUCT_AGGS if is_struct_dtype(dtype) - else _INTERVAL_AGGS if is_interval_dtype(dtype) - else _DECIMAL_AGGS if is_decimal_dtype(dtype) - else "ALL" - ) + valid_aggregations = get_valid_aggregation(dtype) included_aggregations_i = [] c_agg_request = move(libcudf_groupby.aggregation_request()) @@ -258,15 +288,7 @@ cdef class GroupBy: for i, (col, aggs) in 
enumerate(zip(values, aggregations)): dtype = col.dtype - valid_aggregations = ( - _LIST_AGGS if is_list_dtype(dtype) - else _STRING_AGGS if is_string_dtype(dtype) - else _CATEGORICAL_AGGS if is_categorical_dtype(dtype) - else _STRUCT_AGGS if is_struct_dtype(dtype) - else _INTERVAL_AGGS if is_interval_dtype(dtype) - else _DECIMAL_AGGS if is_decimal_dtype(dtype) - else "ALL" - ) + valid_aggregations = get_valid_aggregation(dtype) included_aggregations_i = [] c_agg_request = move(libcudf_groupby.scan_request()) diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx index 8fd2a409d90..13c8ce43ea3 100644 --- a/python/cudf/cudf/_lib/interop.pyx +++ b/python/cudf/cudf/_lib/interop.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from cpython cimport pycapsule from libcpp.memory cimport shared_ptr, unique_ptr @@ -18,8 +18,8 @@ from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns -from cudf.api.types import is_list_dtype, is_struct_dtype from cudf.core.buffer import acquire_spill_lock +from cudf.core.dtypes import ListDtype, StructDtype def from_dlpack(dlpack_capsule): @@ -98,7 +98,7 @@ cdef vector[column_metadata] gather_metadata(object cols_dtypes) except *: if cols_dtypes is not None: for idx, (col_name, col_dtype) in enumerate(cols_dtypes): cpp_metadata.push_back(column_metadata(col_name.encode())) - if is_struct_dtype(col_dtype) or is_list_dtype(col_dtype): + if isinstance(col_dtype, (ListDtype, StructDtype)): _set_col_children_metadata(col_dtype, cpp_metadata[idx]) else: raise TypeError( @@ -113,14 +113,14 @@ cdef _set_col_children_metadata(dtype, cdef column_metadata element_metadata - if is_struct_dtype(dtype): + if isinstance(dtype, StructDtype): for name, value in dtype.fields.items(): element_metadata = column_metadata(name.encode()) 
_set_col_children_metadata( value, element_metadata ) col_meta.children_meta.push_back(element_metadata) - elif is_list_dtype(dtype): + elif isinstance(dtype, ListDtype): col_meta.children_meta.reserve(2) # Offsets - child 0 col_meta.children_meta.push_back(column_metadata()) diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx index 9b027a4d275..ae978d18813 100644 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ b/python/cudf/cudf/_lib/io/utils.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from cpython.buffer cimport PyBUF_READ from cpython.memoryview cimport PyMemoryView_FromMemory @@ -23,7 +23,7 @@ import errno import io import os -from cudf.api.types import is_struct_dtype +from cudf.core.dtypes import StructDtype # Converts the Python source input to libcudf IO source_info @@ -172,7 +172,7 @@ cdef Column update_column_struct_field_names( ) col.set_base_children(tuple(children)) - if is_struct_dtype(col): + if isinstance(col.dtype, StructDtype): field_names.reserve(len(col.base_children)) for i in range(info.children.size()): field_names.push_back(info.children[i].name) diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 437c3ef6ec4..c361a3f00c4 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. 
# cython: boundscheck = False @@ -17,6 +17,7 @@ from libcpp.utility cimport move from libcpp.vector cimport vector cimport cudf._lib.cpp.io.types as cudf_io_types +from cudf._lib.column cimport Column from cudf._lib.cpp.io.data_sink cimport data_sink from cudf._lib.cpp.io.json cimport ( json_reader_options, @@ -42,10 +43,6 @@ from cudf._lib.io.utils cimport ( from cudf._lib.types cimport dtype_to_data_type from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table -from cudf.api.types import is_list_dtype, is_struct_dtype - -from cudf._lib.column cimport Column - cpdef read_json(object filepaths_or_buffers, object dtype, @@ -214,13 +211,12 @@ def write_json( cdef schema_element _get_cudf_schema_element_from_dtype(object dtype) except *: cdef schema_element s_element cdef data_type lib_type - if cudf.api.types.is_categorical_dtype(dtype): + dtype = cudf.dtype(dtype) + if isinstance(dtype, cudf.CategoricalDtype): raise NotImplementedError( "CategoricalDtype as dtype is not yet " "supported in JSON reader" ) - - dtype = cudf.dtype(dtype) lib_type = dtype_to_data_type(dtype) s_element.type = lib_type if isinstance(dtype, cudf.StructDtype): @@ -237,19 +233,18 @@ cdef schema_element _get_cudf_schema_element_from_dtype(object dtype) except *: cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *: - if cudf.api.types.is_categorical_dtype(dtype): + dtype = cudf.dtype(dtype) + if isinstance(dtype, cudf.CategoricalDtype): raise NotImplementedError( "CategoricalDtype as dtype is not yet " "supported in JSON reader" ) - - dtype = cudf.dtype(dtype) return dtype_to_data_type(dtype) cdef _set_col_children_metadata(Column col, column_name_info& col_meta): cdef column_name_info child_info - if is_struct_dtype(col): + if isinstance(col.dtype, cudf.StructDtype): for i, (child_col, name) in enumerate( zip(col.children, list(col.dtype.fields)) ): @@ -258,7 +253,7 @@ cdef _set_col_children_metadata(Column col, _set_col_children_metadata( child_col, 
col_meta.children[i] ) - elif is_list_dtype(col): + elif isinstance(col.dtype, cudf.ListDtype): for i, child_col in enumerate(col.children): col_meta.children.push_back(child_info) _set_col_children_metadata( diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index 0ae039b14d2..c64296eb7da 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import cudf from cudf.core.buffer import acquire_spill_lock @@ -59,7 +59,6 @@ from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table from pyarrow.lib import NativeFile from cudf._lib.utils import _index_level_name, generate_pandas_metadata -from cudf.api.types import is_list_dtype, is_struct_dtype cpdef read_raw_orc_statistics(filepath_or_buffer): @@ -474,7 +473,7 @@ cdef class ORCWriter: cdef _set_col_children_metadata(Column col, column_in_metadata& col_meta, list_column_as_map=False): - if is_struct_dtype(col): + if isinstance(col.dtype, cudf.StructDtype): for i, (child_col, name) in enumerate( zip(col.children, list(col.dtype.fields)) ): @@ -482,7 +481,7 @@ cdef _set_col_children_metadata(Column col, _set_col_children_metadata( child_col, col_meta.child(i), list_column_as_map ) - elif is_list_dtype(col): + elif isinstance(col.dtype, cudf.ListDtype): if list_column_as_map: col_meta.set_list_column_as_map() _set_col_children_metadata( diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 4acb1ce10b1..27efc5e1ecd 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. 
# cython: boundscheck = False @@ -18,12 +18,7 @@ import numpy as np from cython.operator cimport dereference -from cudf.api.types import ( - is_decimal_dtype, - is_list_dtype, - is_list_like, - is_struct_dtype, -) +from cudf.api.types import is_list_like from cudf._lib.utils cimport data_from_unique_ptr @@ -220,7 +215,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, # update the decimal precision of each column for col in names: - if is_decimal_dtype(df._data[col].dtype): + if isinstance(df._data[col].dtype, cudf.core.dtypes.DecimalDtype): df._data[col].dtype.precision = ( meta_data_per_column[col]["metadata"]["precision"] ) @@ -703,7 +698,7 @@ cdef _set_col_metadata( # is true. col_meta.set_nullability(True) - if is_struct_dtype(col): + if isinstance(col.dtype, cudf.StructDtype): for i, (child_col, name) in enumerate( zip(col.children, list(col.dtype.fields)) ): @@ -713,13 +708,11 @@ cdef _set_col_metadata( col_meta.child(i), force_nullable_schema ) - elif is_list_dtype(col): + elif isinstance(col.dtype, cudf.ListDtype): _set_col_metadata( col.children[1], col_meta.child(1), force_nullable_schema ) - else: - if is_decimal_dtype(col): - col_meta.set_decimal_precision(col.dtype.precision) - return + elif isinstance(col.dtype, cudf.core.dtypes.DecimalDtype): + col_meta.set_decimal_precision(col.dtype.precision) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 27fb9e994f0..37708a4e3ba 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
import copy @@ -14,12 +14,7 @@ from libcpp.utility cimport move import cudf from cudf._lib import pylibcudf from cudf._lib.types import LIBCUDF_TO_SUPPORTED_NUMPY_TYPES -from cudf.core.dtypes import ( - ListDtype, - StructDtype, - is_list_dtype, - is_struct_dtype, -) +from cudf.core.dtypes import ListDtype, StructDtype from cudf.core.missing import NA, NaT cimport cudf._lib.cpp.types as libcudf_types @@ -79,9 +74,9 @@ def gather_metadata(dtypes): out = [] for name, dtype in dtypes.items(): v = pylibcudf.interop.ColumnMetadata(name) - if is_struct_dtype(dtype): + if isinstance(dtype, cudf.StructDtype): v.children_meta = gather_metadata(dtype.fields) - elif is_list_dtype(dtype): + elif isinstance(dtype, cudf.ListDtype): # Offsets column is unnamed and has no children v.children_meta.append(pylibcudf.interop.ColumnMetadata("")) v.children_meta.extend( diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index d87104bf168..1b4f4617e97 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from enum import IntEnum @@ -238,15 +238,15 @@ cdef dtype_from_column_view(column_view cv): cdef libcudf_types.data_type dtype_to_data_type(dtype) except *: cdef libcudf_types.type_id tid - if cudf.api.types.is_list_dtype(dtype): + if isinstance(dtype, cudf.ListDtype): tid = libcudf_types.type_id.LIST - elif cudf.api.types.is_struct_dtype(dtype): + elif isinstance(dtype, cudf.StructDtype): tid = libcudf_types.type_id.STRUCT - elif cudf.api.types.is_decimal128_dtype(dtype): + elif isinstance(dtype, cudf.Decimal128Dtype): tid = libcudf_types.type_id.DECIMAL128 - elif cudf.api.types.is_decimal64_dtype(dtype): + elif isinstance(dtype, cudf.Decimal64Dtype): tid = libcudf_types.type_id.DECIMAL64 - elif cudf.api.types.is_decimal32_dtype(dtype): + elif isinstance(dtype, cudf.Decimal32Dtype): tid = libcudf_types.type_id.DECIMAL32 else: tid = ( @@ -259,21 +259,21 @@ cdef libcudf_types.data_type dtype_to_data_type(dtype) except *: return libcudf_types.data_type(tid) cpdef dtype_to_pylibcudf_type(dtype): - if cudf.api.types.is_list_dtype(dtype): + if isinstance(dtype, cudf.ListDtype): return pylibcudf.DataType(pylibcudf.TypeId.LIST) - elif cudf.api.types.is_struct_dtype(dtype): + elif isinstance(dtype, cudf.StructDtype): return pylibcudf.DataType(pylibcudf.TypeId.STRUCT) - elif cudf.api.types.is_decimal_dtype(dtype): - if cudf.api.types.is_decimal128_dtype(dtype): - tid = pylibcudf.TypeId.DECIMAL128 - elif cudf.api.types.is_decimal64_dtype(dtype): - tid = pylibcudf.TypeId.DECIMAL64 - else: - tid = pylibcudf.TypeId.DECIMAL32 + elif isinstance(dtype, cudf.Decimal128Dtype): + tid = pylibcudf.TypeId.DECIMAL128 + return pylibcudf.DataType(tid, -dtype.scale) + elif isinstance(dtype, cudf.Decimal64Dtype): + tid = pylibcudf.TypeId.DECIMAL64 + return pylibcudf.DataType(tid, -dtype.scale) + elif isinstance(dtype, cudf.Decimal32Dtype): + tid = pylibcudf.TypeId.DECIMAL32 return pylibcudf.DataType(tid, -dtype.scale) - # libcudf types don't support localization so convert to the base type - 
if isinstance(dtype, pd.DatetimeTZDtype): + elif isinstance(dtype, pd.DatetimeTZDtype): dtype = np.dtype(f" Self: - """Create a Buffer from an object exposing `__cuda_array_interface__`. + def _from_device_memory(cls, data: Any, exposed: bool) -> Self: + """Create from an object providing a `__cuda_array_interface__`. No data is being copied. @@ -116,16 +133,29 @@ def _from_device_memory(cls, data: Any) -> Self: ---------- data : device-buffer-like An object implementing the CUDA Array Interface. + exposed : bool + Mark the buffer as permanently exposed. This is used by + ExposureTrackedBuffer to determine when a deep copy is required + and by SpillableBuffer to mark the buffer unspillable. Returns ------- - Buffer - Buffer representing the same device memory as `data` + BufferOwner + BufferOwner wrapping `data` + + Raises + ------ + AttributeError + If data does not support the cuda array interface + ValueError + If the resulting buffer has negative size """ # Bypass `__init__` and initialize attributes manually ret = cls.__new__(cls) ret._owner = data + ret._exposed = exposed + ret._slices = weakref.WeakSet() if isinstance(data, rmm.DeviceBuffer): # Common case shortcut ret._ptr = data.ptr ret._size = data.size @@ -139,7 +169,7 @@ def _from_device_memory(cls, data: Any) -> Self: @classmethod def _from_host_memory(cls, data: Any) -> Self: - """Create a Buffer from a buffer or array like object + """Create an owner from a buffer or array like object Data must implement `__array_interface__`, the buffer protocol, and/or be convertible to a buffer object using `numpy.array()` @@ -155,8 +185,8 @@ def _from_host_memory(cls, data: Any) -> Self: Returns ------- - Buffer - Buffer representing a copy of `data`. + BufferOwner + BufferOwner wrapping a device copy of `data`. """ # Convert to numpy array, this will not copy data in most cases. 
@@ -166,54 +196,7 @@ def _from_host_memory(cls, data: Any) -> Self: # Copy to device memory buf = rmm.DeviceBuffer(ptr=ptr, size=size) # Create from device memory - return cls._from_device_memory(buf) - - def _getitem(self, offset: int, size: int) -> Self: - """ - Sub-classes can overwrite this to implement __getitem__ - without having to handle non-slice inputs. - """ - return self._from_device_memory( - cuda_array_interface_wrapper( - ptr=self.get_ptr(mode="read") + offset, - size=size, - owner=self.owner, - ) - ) - - def __getitem__(self, key: slice) -> Self: - """Create a new slice of the buffer.""" - if not isinstance(key, slice): - raise TypeError( - "Argument 'key' has incorrect type " - f"(expected slice, got {key.__class__.__name__})" - ) - start, stop, step = key.indices(self.size) - if step != 1: - raise ValueError("slice must be C-contiguous") - return self._getitem(offset=start, size=stop - start) - - def copy(self, deep: bool = True) -> Self: - """ - Return a copy of Buffer. - - Parameters - ---------- - deep : bool, default True - If True, returns a deep copy of the underlying Buffer data. - If False, returns a shallow copy of the Buffer pointing to - the same underlying data. 
- - Returns - ------- - Buffer - """ - if deep: - return self._from_device_memory( - rmm.DeviceBuffer(ptr=self.get_ptr(mode="read"), size=self.size) - ) - else: - return self[:] + return cls._from_device_memory(buf, exposed=False) @property def size(self) -> int: @@ -226,20 +209,29 @@ def nbytes(self) -> int: return self._size @property - def owner(self) -> Any: + def owner(self) -> object: """Object owning the memory of the buffer.""" return self._owner @property - def __cuda_array_interface__(self) -> Mapping: - """Implementation of the CUDA Array Interface.""" - return { - "data": (self.get_ptr(mode="write"), False), - "shape": (self.size,), - "strides": None, - "typestr": "|u1", - "version": 0, - } + def exposed(self) -> bool: + """The current exposure status of the buffer + + This is used by ExposureTrackedBuffer to determine when a deep copy + is required and by SpillableBuffer to mark the buffer unspillable. + """ + return self._exposed + + def mark_exposed(self) -> None: + """Mark the buffer as "exposed" permanently + + This is used by ExposureTrackedBuffer to determine when a deep copy + is required and by SpillableBuffer to mark the buffer unspillable. + + Notice, once the exposure status becomes True, it will never change + back. + """ + self._exposed = True def get_ptr(self, *, mode: Literal["read", "write"]) -> int: """Device pointer to the start of the buffer. @@ -277,20 +269,148 @@ def memoryview( ) return memoryview(host_buf).toreadonly() + def __str__(self) -> str: + return ( + f"<{self.__class__.__name__} size={format_bytes(self._size)} " + f"ptr={hex(self._ptr)} owner={self._owner!r}>" + ) + + +class Buffer(Serializable): + """A buffer that represents a slice or view of a `BufferOwner`. + + Use the factory function `as_buffer` to create a Buffer instance. + + Note + ---- + This buffer is untyped, so all indexing and sizes are in bytes. + + Parameters + ---------- + owner + The owning exposure buffer this refers to. 
+ offset + The offset relative to the start memory of owner (in bytes). + size + The size of the buffer (in bytes). If None, use the size of owner. + """ + + def __init__( + self, + *, + owner: BufferOwner, + offset: int = 0, + size: Optional[int] = None, + ) -> None: + size = owner.size if size is None else size + if size < 0: + raise ValueError("size cannot be negative") + if offset < 0: + raise ValueError("offset cannot be negative") + if offset + size > owner.size: + raise ValueError( + "offset+size cannot be greater than the size of owner" + ) + self._owner = owner + self._offset = offset + self._size = size + + @property + def size(self) -> int: + """Size of the buffer in bytes.""" + return self._size + + @property + def nbytes(self) -> int: + """Size of the buffer in bytes.""" + return self._size + + @property + def owner(self) -> BufferOwner: + """Object owning the memory of the buffer.""" + return self._owner + + def __getitem__(self, key: slice) -> Self: + """Create a new slice of the buffer.""" + if not isinstance(key, slice): + raise TypeError( + "Argument 'key' has incorrect type " + f"(expected slice, got {key.__class__.__name__})" + ) + start, stop, step = key.indices(self.size) + if step != 1: + raise ValueError("slice must be C-contiguous") + return self.__class__( + owner=self._owner, offset=self._offset + start, size=stop - start + ) + + def get_ptr(self, *, mode: Literal["read", "write"]) -> int: + return self._owner.get_ptr(mode=mode) + self._offset + + def memoryview(self) -> memoryview: + return self._owner.memoryview(offset=self._offset, size=self._size) + + def copy(self, deep: bool = True) -> Self: + """Return a copy of Buffer. + + Parameters + ---------- + deep : bool, default True + - If deep=True, returns a deep copy of the underlying data. + - If deep=False, returns a new `Buffer` instance that refers + to the same `BufferOwner` as this one. Thus, no device + data are being copied. 
+ + Returns + ------- + Buffer + A new buffer that either refers to either a new or an existing + `BufferOwner` depending on the `deep` argument (see above). + """ + + # When doing a shallow copy, we just return a new slice + if not deep: + return self.__class__( + owner=self._owner, offset=self._offset, size=self._size + ) + + # Otherwise, we create a new copy of the memory + owner = self._owner._from_device_memory( + rmm.DeviceBuffer( + ptr=self._owner.get_ptr(mode="read") + self._offset, + size=self.size, + ), + exposed=False, + ) + return self.__class__(owner=owner, offset=0, size=owner.size) + + @property + def __cuda_array_interface__(self) -> Mapping: + """Implementation of the CUDA Array Interface.""" + return { + "data": (self.get_ptr(mode="write"), False), + "shape": (self.size,), + "strides": None, + "typestr": "|u1", + "version": 0, + } + def serialize(self) -> Tuple[dict, list]: """Serialize the buffer into header and frames. - The frames can be a mixture of memoryview and Buffer objects. + The frames can be a mixture of memoryview, Buffer, and BufferOwner + objects. Returns ------- Tuple[dict, List] The first element of the returned tuple is a dict containing any serializable metadata required to reconstruct the object. The - second element is a list containing Buffers and memoryviews. + second element is a list containing single frame. 
""" header: Dict[str, Any] = {} header["type-serialized"] = pickle.dumps(type(self)) + header["owner-type-serialized"] = pickle.dumps(type(self._owner)) header["frame_count"] = 1 frames = [self] return header, frames @@ -317,16 +437,27 @@ def deserialize(cls, header: dict, frames: list) -> Self: if isinstance(frame, cls): return frame # The frame is already deserialized + owner_type: BufferOwner = pickle.loads(header["owner-type-serialized"]) if hasattr(frame, "__cuda_array_interface__"): - return cls._from_device_memory(frame) - return cls._from_host_memory(frame) + owner = owner_type._from_device_memory(frame, exposed=False) + else: + owner = owner_type._from_host_memory(frame) + return cls( + owner=owner, + offset=0, + size=owner.size, + ) def __repr__(self) -> str: - klass = self.__class__ - name = f"{klass.__module__}.{klass.__qualname__}" return ( - f"<{name} size={format_bytes(self._size)} " - f"ptr={hex(self._ptr)} owner={repr(self._owner)}>" + f"{self.__class__.__name__}(owner={self._owner!r}, " + f"offset={self._offset!r}, size={self._size!r})" + ) + + def __str__(self) -> str: + return ( + f"<{self.__class__.__name__} size={format_bytes(self._size)} " + f"offset={format_bytes(self._offset)} of {self._owner}>" ) diff --git a/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py b/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py index f2ac6301944..4c08016adbb 100644 --- a/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py +++ b/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py @@ -1,241 +1,47 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from __future__ import annotations -import weakref -from typing import ( - Any, - Container, - Literal, - Mapping, - Optional, - Type, - TypeVar, - cast, -) +from typing import Literal, Mapping, Optional from typing_extensions import Self import cudf -from cudf.core.buffer.buffer import Buffer, get_ptr_and_size -from cudf.utils.string import format_bytes - -T = TypeVar("T", bound="ExposureTrackedBuffer") - - -def get_owner(data, klass: Type[T]) -> Optional[T]: - """Get the owner of `data`, if any exist - - Search through the stack of data owners in order to find an - owner of type `klass` (not subclasses). - - Parameters - ---------- - data - The data object - - Return - ------ - klass or None - The owner of `data` if `klass` or None. - """ - - if type(data) is klass: - return data - if hasattr(data, "owner"): - return get_owner(data.owner, klass) - return None - - -def as_exposure_tracked_buffer( - data, exposed: bool, subclass: Optional[Type[T]] = None -) -> BufferSlice: - """Factory function to wrap `data` in a slice of an exposure tracked buffer - - If `subclass` is None, a new ExposureTrackedBuffer that points to the - memory of `data` is created and a BufferSlice that points to all of the - new ExposureTrackedBuffer is returned. - - If `subclass` is not None, a new `subclass` is created instead. Still, - a BufferSlice that points to all of the new `subclass` is returned - - It is illegal for an exposure tracked buffer to own another exposure - tracked buffer. When representing the same memory, we should have a single - exposure tracked buffer and multiple buffer slices. - - Developer Notes - --------------- - This function always returns slices thus all buffers in cudf will use - `BufferSlice` when copy-on-write is enabled. The slices implement - copy-on-write by trigging deep copies when write access is detected - and multiple slices points to the same exposure tracked buffer. 
- - Parameters - ---------- - data : buffer-like or array-like - A buffer-like or array-like object that represents C-contiguous memory. - exposed - Mark the buffer as permanently exposed. - subclass - If not None, a subclass of ExposureTrackedBuffer to wrap `data`. - - Return - ------ - BufferSlice - A buffer slice that points to a ExposureTrackedBuffer (or `subclass`), - which in turn wraps `data`. - """ - - if not hasattr(data, "__cuda_array_interface__"): - if exposed: - raise ValueError("cannot created exposed host memory") - return cast( - BufferSlice, ExposureTrackedBuffer._from_host_memory(data)[:] - ) - - owner = get_owner(data, subclass or ExposureTrackedBuffer) - if owner is None: - return cast( - BufferSlice, - ExposureTrackedBuffer._from_device_memory(data, exposed=exposed)[ - : - ], - ) - - # At this point, we know that `data` is owned by a exposure tracked buffer - ptr, size = get_ptr_and_size(data.__cuda_array_interface__) - if size > 0 and owner._ptr == 0: - raise ValueError("Cannot create a non-empty slice of a null buffer") - return BufferSlice(base=owner, offset=ptr - owner._ptr, size=size) +from cudf.core.buffer.buffer import Buffer, BufferOwner class ExposureTrackedBuffer(Buffer): - """A Buffer that tracks its "expose" status. - - In order to implement copy-on-write and spillable buffers, we need the - ability to detect external access to the underlying memory. We say that - the buffer has been exposed if the device pointer (integer or void*) has - been accessed outside of ExposureTrackedBuffer. In this case, we have no - control over knowing if the data is being modified by a third-party. - - Attributes - ---------- - _exposed - The current exposure status of the buffer. Notice, once the exposure - status becomes True, it should never change back. - _slices - The set of BufferSlice instances that point to this buffer. 
- """ - - _exposed: bool - _slices: weakref.WeakSet[BufferSlice] - - @property - def exposed(self) -> bool: - return self._exposed - - def mark_exposed(self) -> None: - """Mark the buffer as "exposed" permanently""" - self._exposed = True - - @classmethod - def _from_device_memory(cls, data: Any, *, exposed: bool = False) -> Self: - """Create an exposure tracked buffer from device memory. - - No data is being copied. - - Parameters - ---------- - data : device-buffer-like - An object implementing the CUDA Array Interface. - exposed : bool, optional - Mark the buffer as permanently exposed. - - Returns - ------- - ExposureTrackedBuffer - Buffer representing the same device memory as `data` - """ - ret = super()._from_device_memory(data) - ret._exposed = exposed - ret._slices = weakref.WeakSet() - return ret - - def _getitem(self, offset: int, size: int) -> BufferSlice: - return BufferSlice(base=self, offset=offset, size=size) - - @property - def __cuda_array_interface__(self) -> Mapping: - self.mark_exposed() - return super().__cuda_array_interface__ - - def __repr__(self) -> str: - return ( - f"" - ) - - -class BufferSlice(ExposureTrackedBuffer): - """A slice (aka. a view) of a exposure tracked buffer. + """An exposure tracked buffer. Parameters ---------- - base - The exposure tracked buffer this slice refers to. + owner + The owning exposure tracked buffer this refers to. offset - The offset relative to the start memory of base (in bytes). + The offset relative to the start memory of owner (in bytes). size The size of the slice (in bytes) - passthrough_attributes - Name of attributes that are passed through to the base as-is. 
""" + _owner: BufferOwner + def __init__( self, - base: ExposureTrackedBuffer, - offset: int, - size: int, - *, - passthrough_attributes: Container[str] = ("exposed",), + owner: BufferOwner, + offset: int = 0, + size: Optional[int] = None, ) -> None: - if size < 0: - raise ValueError("size cannot be negative") - if offset < 0: - raise ValueError("offset cannot be negative") - if offset + size > base.size: - raise ValueError( - "offset+size cannot be greater than the size of base" - ) - self._base = base - self._offset = offset - self._size = size - self._owner = base - self._passthrough_attributes = passthrough_attributes - base._slices.add(self) - - def __getattr__(self, name): - if name in self._passthrough_attributes: - return getattr(self._base, name) - raise AttributeError( - f"{self.__class__.__name__} object has no attribute {name}" - ) + super().__init__(owner=owner, offset=offset, size=size) + self._owner._slices.add(self) - def _getitem(self, offset: int, size: int) -> BufferSlice: - return BufferSlice( - base=self._base, offset=offset + self._offset, size=size - ) + @property + def exposed(self) -> bool: + return self._owner.exposed def get_ptr(self, *, mode: Literal["read", "write"]) -> int: if mode == "write" and cudf.get_option("copy_on_write"): self.make_single_owner_inplace() - return self._base.get_ptr(mode=mode) + self._offset - - def memoryview( - self, *, offset: int = 0, size: Optional[int] = None - ) -> memoryview: - return self._base.memoryview(offset=self._offset + offset, size=size) + return super().get_ptr(mode=mode) def copy(self, deep: bool = True) -> Self: """Return a copy of Buffer. 
@@ -260,16 +66,14 @@ def copy(self, deep: bool = True) -> Self: Returns ------- - BufferSlice - A slice pointing to either a new or the existing base buffer - depending on the expose status of the base buffer and the + ExposureTrackedBuffer + A slice pointing to either a new or the existing owner + depending on the expose status of the owner and the copy-on-write option (see above). """ if cudf.get_option("copy_on_write"): - base_copy = self._base.copy(deep=deep or self.exposed) - else: - base_copy = self._base.copy(deep=deep) - return cast(Self, base_copy[self._offset : self._offset + self._size]) + return super().copy(deep=deep or self.exposed) + return super().copy(deep=deep) @property def __cuda_array_interface__(self) -> Mapping: @@ -278,7 +82,7 @@ def __cuda_array_interface__(self) -> Mapping: return super().__cuda_array_interface__ def make_single_owner_inplace(self) -> None: - """Make sure this slice is the only one pointing to the base. + """Make sure this slice is the only one pointing to the owner. This is used by copy-on-write to trigger a deep copy when write access is detected. @@ -294,18 +98,11 @@ def make_single_owner_inplace(self) -> None: Buffer representing the same device memory as `data` """ - if len(self._base._slices) > 1: - # If this is not the only slice pointing to `self._base`, we - # point to a new deep copy of the base. + if len(self._owner._slices) > 1: + # If this is not the only slice pointing to `self._owner`, we + # point to a new deep copy of the owner. 
t = self.copy(deep=True) - self._base = t._base + self._owner = t._owner self._offset = t._offset self._size = t._size - self._owner = t._base - self._base._slices.add(self) - - def __repr__(self) -> str: - return ( - f"" - ) + self._owner._slices.add(self) diff --git a/python/cudf/cudf/core/buffer/spill_manager.py b/python/cudf/cudf/core/buffer/spill_manager.py index 91f3b2cd544..3e654e01401 100644 --- a/python/cudf/cudf/core/buffer/spill_manager.py +++ b/python/cudf/cudf/core/buffer/spill_manager.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -16,7 +16,7 @@ import rmm.mr -from cudf.core.buffer.spillable_buffer import SpillableBuffer +from cudf.core.buffer.spillable_buffer import SpillableBufferOwner from cudf.options import get_option from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.string import format_bytes @@ -128,7 +128,7 @@ def log_spill(self, src: str, dst: str, nbytes: int, time: float) -> None: total_time + time, ) - def log_expose(self, buf: SpillableBuffer) -> None: + def log_expose(self, buf: SpillableBufferOwner) -> None: """Log an expose event We track logged exposes by grouping them by their traceback such @@ -224,7 +224,7 @@ class SpillManager: SpillStatistics for the different levels. 
""" - _buffers: weakref.WeakValueDictionary[int, SpillableBuffer] + _buffers: weakref.WeakValueDictionary[int, SpillableBufferOwner] statistics: SpillStatistics def __init__( @@ -298,14 +298,14 @@ def _out_of_memory_handle(self, nbytes: int, *, retry_once=True) -> bool: ) return False # Since we didn't find anything to spill, we give up - def add(self, buffer: SpillableBuffer) -> None: + def add(self, buffer: SpillableBufferOwner) -> None: """Add buffer to the set of managed buffers The manager keeps a weak reference to the buffer Parameters ---------- - buffer : SpillableBuffer + buffer : SpillableBufferOwner The buffer to manage """ if buffer.size > 0 and not buffer.exposed: @@ -316,7 +316,7 @@ def add(self, buffer: SpillableBuffer) -> None: def buffers( self, order_by_access_time: bool = False - ) -> Tuple[SpillableBuffer, ...]: + ) -> Tuple[SpillableBufferOwner, ...]: """Get all managed buffers Parameters diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index 1856bec1876..aeac4b76e58 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -16,8 +16,8 @@ from cudf.core.buffer.buffer import ( Buffer, + BufferOwner, cuda_array_interface_wrapper, - get_ptr_and_size, host_memory_allocation, ) from cudf.utils.nvtx_annotation import _get_color_for_nvtx, annotate @@ -27,86 +27,6 @@ from cudf.core.buffer.spill_manager import SpillManager -def get_spillable_owner(data) -> Optional[SpillableBuffer]: - """Get the spillable owner of `data`, if any exist - - Search through the stack of data owners in order to find an - owner of type `SpillableBuffer` (not subclasses). - - Parameters - ---------- - data : buffer-like or array-like - A buffer-like or array-like object that represent C-contiguous memory. 
- - Return - ------ - SpillableBuffer or None - The owner of `data` if spillable or None. - """ - - if type(data) is SpillableBuffer: - return data - if hasattr(data, "owner"): - return get_spillable_owner(data.owner) - return None - - -def as_spillable_buffer(data, exposed: bool) -> SpillableBuffer: - """Factory function to wrap `data` in a SpillableBuffer object. - - If `data` isn't a buffer already, a new buffer that points to the memory of - `data` is created. If `data` represents host memory, it is copied to a new - `rmm.DeviceBuffer` device allocation. Otherwise, the memory of `data` is - **not** copied, instead the new buffer keeps a reference to `data` in order - to retain its lifetime. - - If `data` is owned by a spillable buffer, a "slice" of the buffer is - returned. In this case, the spillable buffer must either be "exposed" or - spilled locked (called within an acquire_spill_lock context). This is to - guarantee that the memory of `data` isn't spilled before this function gets - to calculate the offset of the new slice. - - It is illegal for a spillable buffer to own another spillable buffer. - - Parameters - ---------- - data : buffer-like or array-like - A buffer-like or array-like object that represent C-contiguous memory. - exposed : bool, optional - Mark the buffer as permanently exposed (unspillable). - - Return - ------ - SpillableBuffer - A spillabe buffer instance that represents the device memory of `data`. - """ - - from cudf.core.buffer.utils import get_spill_lock - - if not hasattr(data, "__cuda_array_interface__"): - if exposed: - raise ValueError("cannot created exposed host memory") - return SpillableBuffer._from_host_memory(data) - - spillable_owner = get_spillable_owner(data) - if spillable_owner is None: - return SpillableBuffer._from_device_memory(data, exposed=exposed) - - if not spillable_owner.exposed and get_spill_lock() is None: - raise ValueError( - "A owning spillable buffer must " - "either be exposed or spilled locked." 
- ) - - # At this point, we know that `data` is owned by a spillable buffer, - # which is exposed or spilled locked. - ptr, size = get_ptr_and_size(data.__cuda_array_interface__) - base_ptr = spillable_owner.memory_info()[0] - return SpillableBufferSlice( - spillable_owner, offset=ptr - base_ptr, size=size - ) - - class SpillLock: pass @@ -141,7 +61,7 @@ def __getitem__(self, i): raise IndexError("tuple index out of range") -class SpillableBuffer(Buffer): +class SpillableBufferOwner(BufferOwner): """A Buffer that supports spilling memory off the GPU to avoid OOMs. This buffer supports spilling the represented data to host memory. @@ -150,9 +70,9 @@ class SpillableBuffer(Buffer): device memory usage see `cudf.core.buffer.spill_manager.SpillManager`. Unspill is triggered automatically when accessing the data of the buffer. - The buffer might not be spillable, which is based on the "expose" status - of the buffer. We say that the buffer has been exposed if the device - pointer (integer or void*) has been accessed outside of SpillableBuffer. + The buffer might not be spillable, which is based on the "expose" status of + the buffer. We say that the buffer has been exposed if the device pointer + (integer or void*) has been accessed outside of SpillableBufferOwner. In this case, we cannot invalidate the device pointer by moving the data to host. @@ -160,17 +80,17 @@ class SpillableBuffer(Buffer): property. To avoid this, one can use `.get_ptr()` instead, which support exposing the buffer temporarily. - Use the factory function `as_buffer` to create a SpillableBuffer instance. + Use the factory function `as_buffer` to create a SpillableBufferOwner + instance. 
""" lock: RLock _spill_locks: weakref.WeakSet _last_accessed: float _ptr_desc: Dict[str, Any] - _exposed: bool _manager: SpillManager - def _finalize_init(self, ptr_desc: Dict[str, Any], exposed: bool) -> None: + def _finalize_init(self, ptr_desc: Dict[str, Any]) -> None: """Finish initialization of the spillable buffer This implements the common initialization that `_from_device_memory` @@ -180,8 +100,6 @@ def _finalize_init(self, ptr_desc: Dict[str, Any], exposed: bool) -> None: ---------- ptr_desc : dict Description of the memory. - exposed : bool, optional - Mark the buffer as permanently exposed (unspillable). """ from cudf.core.buffer.spill_manager import get_global_manager @@ -190,7 +108,6 @@ def _finalize_init(self, ptr_desc: Dict[str, Any], exposed: bool) -> None: self._spill_locks = weakref.WeakSet() self._last_accessed = time.monotonic() self._ptr_desc = ptr_desc - self._exposed = exposed manager = get_global_manager() if manager is None: raise ValueError( @@ -202,7 +119,7 @@ def _finalize_init(self, ptr_desc: Dict[str, Any], exposed: bool) -> None: self._manager.add(self) @classmethod - def _from_device_memory(cls, data: Any, *, exposed: bool = False) -> Self: + def _from_device_memory(cls, data: Any, exposed: bool) -> Self: """Create a spillabe buffer from device memory. No data is being copied. @@ -211,16 +128,16 @@ def _from_device_memory(cls, data: Any, *, exposed: bool = False) -> Self: ---------- data : device-buffer-like An object implementing the CUDA Array Interface. - exposed : bool, optional + exposed : bool Mark the buffer as permanently exposed (unspillable). 
Returns ------- - SpillableBuffer + SpillableBufferOwner Buffer representing the same device memory as `data` """ - ret = super()._from_device_memory(data) - ret._finalize_init(ptr_desc={"type": "gpu"}, exposed=exposed) + ret = super()._from_device_memory(data, exposed=exposed) + ret._finalize_init(ptr_desc={"type": "gpu"}) return ret @classmethod @@ -241,7 +158,7 @@ def _from_host_memory(cls, data: Any) -> Self: Returns ------- - SpillableBuffer + SpillableBufferOwner Buffer representing a copy of `data`. """ @@ -257,20 +174,14 @@ def _from_host_memory(cls, data: Any) -> Self: ret._owner = None ret._ptr = 0 ret._size = data.nbytes - ret._finalize_init( - ptr_desc={"type": "cpu", "memoryview": data}, exposed=False - ) + ret._exposed = False + ret._finalize_init(ptr_desc={"type": "cpu", "memoryview": data}) return ret @property def is_spilled(self) -> bool: return self._ptr_desc["type"] != "gpu" - def copy(self, deep: bool = True) -> Self: - spill_lock = SpillLock() - self.spill_lock(spill_lock=spill_lock) - return super().copy(deep=deep) - def spill(self, target: str = "cpu") -> None: """Spill or un-spill this buffer in-place @@ -343,10 +254,10 @@ def mark_exposed(self) -> None: self._manager.spill_to_device_limit() with self.lock: - if not self._exposed: + if not self.exposed: self._manager.statistics.log_expose(self) self.spill(target="gpu") - self._exposed = True + super().mark_exposed() self._last_accessed = time.monotonic() def spill_lock(self, spill_lock: SpillLock) -> None: @@ -415,25 +326,9 @@ def memory_info(self) -> Tuple[int, int, str]: ).__array_interface__["data"][0] return (ptr, self.nbytes, self._ptr_desc["type"]) - @property - def owner(self) -> Any: - return self._owner - - @property - def exposed(self) -> bool: - return self._exposed - @property def spillable(self) -> bool: - return not self._exposed and len(self._spill_locks) == 0 - - @property - def size(self) -> int: - return self._size - - @property - def nbytes(self) -> int: - return 
self._size + return not self.exposed and len(self._spill_locks) == 0 @property def last_accessed(self) -> float: @@ -465,148 +360,114 @@ def memoryview( ) return ret - def _getitem(self, offset: int, size: int) -> SpillableBufferSlice: - return SpillableBufferSlice(base=self, offset=offset, size=size) - - def serialize(self) -> Tuple[dict, list]: - """Serialize the Buffer - - Normally, we would use `[self]` as the frames. This would work but - also mean that `self` becomes exposed permanently if the frames are - later accessed through `__cuda_array_interface__`, which is exactly - what libraries like Dask+UCX would do when communicating! - - The sound solution is to modify Dask et al. so that they access the - frames through `.get_ptr()` and holds on to the `spill_lock` until - the frame has been transferred. However, until this adaptation we - use a hack where the frame is a `Buffer` with a `spill_lock` as the - owner, which makes `self` unspillable while the frame is alive but - doesn't expose `self` when `__cuda_array_interface__` is accessed. - - Warning, this hack means that the returned frame must be copied before - given to `.deserialize()`, otherwise we would have a `Buffer` pointing - to memory already owned by an existing `SpillableBuffer`. 
- """ - header: Dict[Any, Any] - frames: List[Buffer | memoryview] - with self.lock: - header = {} - header["type-serialized"] = pickle.dumps(self.__class__) - header["frame_count"] = 1 - if self.is_spilled: - frames = [self.memoryview()] - else: - # TODO: Use `frames=[self]` instead of this hack, see doc above - spill_lock = SpillLock() - self.spill_lock(spill_lock) - ptr, size, _ = self.memory_info() - frames = [ - Buffer._from_device_memory( - cuda_array_interface_wrapper( - ptr=ptr, - size=size, - owner=(self._owner, spill_lock), - ) - ) - ] - return header, frames - - def __repr__(self) -> str: + def __str__(self) -> str: if self._ptr_desc["type"] != "gpu": ptr_info = str(self._ptr_desc) else: ptr_info = str(hex(self._ptr)) return ( - f"" ) -class SpillableBufferSlice(SpillableBuffer): +class SpillableBuffer(Buffer): """A slice of a spillable buffer This buffer applies the slicing and then delegates all - operations to its base buffer. + operations to its owning buffer. Parameters ---------- - base : SpillableBuffer - The base of the view + owner : SpillableBufferOwner + The owner of the view offset : int - Memory offset into the base buffer + Memory offset into the owning buffer size : int Size of the view (in bytes) """ - def __init__(self, base: SpillableBuffer, offset: int, size: int) -> None: - if size < 0: - raise ValueError("size cannot be negative") - if offset < 0: - raise ValueError("offset cannot be negative") - if offset + size > base.size: - raise ValueError( - "offset+size cannot be greater than the size of base" - ) - self._base = base - self._offset = offset - self._size = size - self._owner = base - self.lock = base.lock - - def get_ptr(self, *, mode: Literal["read", "write"]) -> int: - """ - A passthrough method to `SpillableBuffer.get_ptr` - with factoring in the `offset`. 
- """ - return self._base.get_ptr(mode=mode) + self._offset - - def _getitem(self, offset: int, size: int) -> SpillableBufferSlice: - return SpillableBufferSlice( - base=self._base, offset=offset + self._offset, size=size - ) + _owner: SpillableBufferOwner - @classmethod - def deserialize(cls, header: dict, frames: list): - # TODO: because of the hack in `SpillableBuffer.serialize()` where - # frames are of type `Buffer`, we always deserialize as if they are - # `SpillableBuffer`. In the future, we should be able to - # deserialize into `SpillableBufferSlice` when the frames hasn't been - # copied. - return SpillableBuffer.deserialize(header, frames) - - def memoryview( - self, *, offset: int = 0, size: Optional[int] = None - ) -> memoryview: - size = self._size if size is None else size - return self._base.memoryview(offset=self._offset + offset, size=size) - - def __repr__(self) -> str: - return ( - f" None: - return self._base.spill(target=target) + return self._owner.spill(target=target) @property def is_spilled(self) -> bool: - return self._base.is_spilled + return self._owner.is_spilled @property def exposed(self) -> bool: - return self._base.exposed + return self._owner.exposed @property def spillable(self) -> bool: - return self._base.spillable + return self._owner.spillable def spill_lock(self, spill_lock: SpillLock) -> None: - self._base.spill_lock(spill_lock=spill_lock) + self._owner.spill_lock(spill_lock=spill_lock) def memory_info(self) -> Tuple[int, int, str]: - (ptr, _, device_type) = self._base.memory_info() + (ptr, _, device_type) = self._owner.memory_info() return (ptr + self._offset, self.nbytes, device_type) + + def mark_exposed(self) -> None: + self._owner.mark_exposed() + + def serialize(self) -> Tuple[dict, list]: + """Serialize the Buffer + + Normally, we would use `[self]` as the frames. 
This would work but + also mean that `self` becomes exposed permanently if the frames are + later accessed through `__cuda_array_interface__`, which is exactly + what libraries like Dask+UCX would do when communicating! + + The sound solution is to modify Dask et al. so that they access the + frames through `.get_ptr()` and holds on to the `spill_lock` until + the frame has been transferred. However, until this adaptation we + use a hack where the frame is a `Buffer` with a `spill_lock` as the + owner, which makes `self` unspillable while the frame is alive but + doesn't expose `self` when `__cuda_array_interface__` is accessed. + + Warning, this hack means that the returned frame must be copied before + given to `.deserialize()`, otherwise we would have a `Buffer` pointing + to memory already owned by an existing `SpillableBufferOwner`. + """ + header: Dict[str, Any] = {} + frames: List[Buffer | memoryview] + with self._owner.lock: + header["type-serialized"] = pickle.dumps(self.__class__) + header["owner-type-serialized"] = pickle.dumps(type(self._owner)) + header["frame_count"] = 1 + if self.is_spilled: + frames = [self.memoryview()] + else: + # TODO: Use `frames=[self]` instead of this hack, see doc above + spill_lock = SpillLock() + self.spill_lock(spill_lock) + ptr, size, _ = self.memory_info() + frames = [ + Buffer( + owner=BufferOwner._from_device_memory( + cuda_array_interface_wrapper( + ptr=ptr, + size=size, + owner=(self._owner, spill_lock), + ), + exposed=False, + ) + ) + ] + return header, frames + + @property + def __cuda_array_interface__(self) -> dict: + return { + "data": DelayedPointerTuple(self), + "shape": (self.size,), + "strides": None, + "typestr": "|u1", + "version": 0, + } diff --git a/python/cudf/cudf/core/buffer/utils.py b/python/cudf/cudf/core/buffer/utils.py index 373be99ec96..c2ec7effd13 100644 --- a/python/cudf/cudf/core/buffer/utils.py +++ b/python/cudf/cudf/core/buffer/utils.py @@ -1,18 +1,51 @@ -# Copyright (c) 2022-2023, NVIDIA 
CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. from __future__ import annotations import threading from contextlib import ContextDecorator -from typing import Any, Dict, Optional, Tuple, Union - -from cudf.core.buffer.buffer import Buffer, cuda_array_interface_wrapper -from cudf.core.buffer.exposure_tracked_buffer import as_exposure_tracked_buffer +from typing import Any, Dict, Optional, Tuple, Type, Union + +from cudf.core.buffer.buffer import ( + Buffer, + BufferOwner, + cuda_array_interface_wrapper, + get_ptr_and_size, +) +from cudf.core.buffer.exposure_tracked_buffer import ExposureTrackedBuffer from cudf.core.buffer.spill_manager import get_global_manager -from cudf.core.buffer.spillable_buffer import SpillLock, as_spillable_buffer +from cudf.core.buffer.spillable_buffer import ( + SpillableBuffer, + SpillableBufferOwner, + SpillLock, +) from cudf.options import get_option +def get_buffer_owner(data: Any) -> Optional[BufferOwner]: + """Get the owner of `data`, if one exists + + Search through the stack of data owners in order to find an + owner BufferOwner (incl. subclasses). + + Parameters + ---------- + data + The data object to search for a BufferOwner instance + + Return + ------ + BufferOwner or None + The owner of `data` if found otherwise None. + """ + + if isinstance(data, BufferOwner): + return data + if hasattr(data, "owner"): + return get_buffer_owner(data.owner) + return None + + def as_buffer( data: Union[int, Any], *, @@ -30,7 +63,17 @@ def as_buffer( If `data` is an integer, it is assumed to point to device memory. - Raises ValueError if data isn't C-contiguous. + Raises ValueError if `data` isn't C-contiguous. + + If copy-on-write is enabled, an ExposureTrackedBuffer is returned. + + If spilling is enabled, a SpillableBuffer that refers to a + SpillableBufferOwner is returned. If `data` is owned by a spillable buffer, + it must either be "exposed" or spill locked (called within an + acquire_spill_lock context). 
This is to guarantee that the memory of `data` + isn't spilled before this function gets to calculate the offset of the new + SpillableBuffer. + Parameters ---------- @@ -73,13 +116,49 @@ def as_buffer( "`data` is a buffer-like or array-like object" ) - if get_option("copy_on_write"): - return as_exposure_tracked_buffer(data, exposed=exposed) + # Find the buffer types to return based on the current config + owner_class: Type[BufferOwner] + buffer_class: Type[Buffer] if get_global_manager() is not None: - return as_spillable_buffer(data, exposed=exposed) - if hasattr(data, "__cuda_array_interface__"): - return Buffer._from_device_memory(data) - return Buffer._from_host_memory(data) + owner_class = SpillableBufferOwner + buffer_class = SpillableBuffer + elif get_option("copy_on_write"): + owner_class = BufferOwner + buffer_class = ExposureTrackedBuffer + else: + owner_class = BufferOwner + buffer_class = Buffer + + # Handle host memory, + if not hasattr(data, "__cuda_array_interface__"): + if exposed: + raise ValueError("cannot created exposed host memory") + return buffer_class(owner=owner_class._from_host_memory(data)) + + # Check if `data` is owned by a known class + owner = get_buffer_owner(data) + if owner is None: # `data` is new device memory + return buffer_class( + owner=owner_class._from_device_memory(data, exposed=exposed) + ) + + # At this point, we know that `data` is owned by a known class, which + # should be the same class as specified by the current config (see above) + assert owner.__class__ is owner_class + if ( + isinstance(owner, SpillableBufferOwner) + and not owner.exposed + and get_spill_lock() is None + ): + raise ValueError( + "An owning spillable buffer must " + "either be exposed or spill locked." 
+ ) + ptr, size = get_ptr_and_size(data.__cuda_array_interface__) + base_ptr = owner.get_ptr(mode="read") + if size > 0 and base_ptr == 0: + raise ValueError("Cannot create a non-empty slice of a null buffer") + return buffer_class(owner=owner, offset=ptr - base_ptr, size=size) _thread_spill_locks: Dict[int, Tuple[Optional[SpillLock], int]] = {} diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index aba4ded4f9d..3dddcae85dc 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. """ isort: skip_file @@ -8,7 +8,6 @@ from cudf.core.column.categorical import CategoricalColumn from cudf.core.column.column import ( ColumnBase, - arange, as_column, build_categorical_column, build_column, diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 71143fa7a95..6b3ee0ba852 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -987,15 +987,16 @@ def to_pandas( .fillna(_DEFAULT_CATEGORICAL_VALUE) .values_host ) - if isinstance(col.categories.dtype, IntervalDtype): + cats = col.categories + if cats.dtype.kind in "biuf": + cats = cats.nans_to_nulls().dropna() # type: ignore[attr-defined] + elif not isinstance(cats.dtype, IntervalDtype): # leaving out dropna because it temporarily changes an interval # index into a struct and throws off results. 
# TODO: work on interval index dropna - categories = col.categories.to_pandas() - else: - categories = col.categories.dropna(drop_nan=True).to_pandas() + cats = cats.dropna() data = pd.Categorical.from_codes( - codes, categories=categories, ordered=col.ordered + codes, categories=cats.to_pandas(), ordered=col.ordered ) return pd.Series(data, index=index) @@ -1158,7 +1159,7 @@ def find_and_replace( new_cats_col = new_cats_col.apply_boolean_mask(bmask) new_cats = cudf.DataFrame._from_data( { - "index": cudf.core.column.arange(len(new_cats_col)), + "index": column.as_column(range(len(new_cats_col))), "cats": new_cats_col, } ) @@ -1379,7 +1380,7 @@ def _concat( # Find the first non-null column: head = next( - (obj for obj in objs if not obj.null_count != len(obj)), objs[0] + (obj for obj in objs if obj.null_count != len(obj)), objs[0] ) # Combine and de-dupe the categories @@ -1530,9 +1531,13 @@ def _set_categories( ) out_code_dtype = min_unsigned_type(max_cat_size) - cur_order = column.arange(len(cur_codes)) - old_codes = column.arange(len(cur_cats), dtype=out_code_dtype) - new_codes = column.arange(len(new_cats), dtype=out_code_dtype) + cur_order = column.as_column(range(len(cur_codes))) + old_codes = column.as_column( + range(len(cur_cats)), dtype=out_code_dtype + ) + new_codes = column.as_column( + range(len(new_cats)), dtype=out_code_dtype + ) new_df = cudf.DataFrame._from_data( data={"new_codes": new_codes, "cats": new_cats} diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 81579b53bb7..705862c502a 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -109,16 +109,8 @@ class ColumnBase(Column, Serializable, BinaryOperand, Reducible): "min", } - def as_frame(self) -> "cudf.core.frame.Frame": - """ - Converts a Column to Frame - """ - return cudf.core.single_column_frame.SingleColumnFrame( - {None: self.copy(deep=False)} - ) - def data_array_view( - self, *, mode="write" 
+ self, *, mode: Literal["write", "read"] = "write" ) -> "cuda.devicearray.DeviceNDArray": """ View the data as a device array object @@ -155,7 +147,7 @@ def data_array_view( return cuda.as_cuda_array(obj).view(self.dtype) def mask_array_view( - self, *, mode="write" + self, *, mode: Literal["write", "read"] = "write" ) -> "cuda.devicearray.DeviceNDArray": """ View the mask as a device array @@ -291,8 +283,7 @@ def any(self, skipna: bool = True) -> bool: return libcudf.reduce.reduce("any", self, dtype=np.bool_) - def dropna(self, drop_nan: bool = False) -> ColumnBase: - # The drop_nan argument is only used for numerical columns. + def dropna(self) -> ColumnBase: return drop_nulls([self])[0]._with_type_metadata(self.dtype) def to_arrow(self) -> pa.Array: @@ -437,14 +428,6 @@ def nullmask(self) -> Buffer: raise ValueError("Column has no null mask") return self.mask_array_view(mode="read") - def force_deep_copy(self) -> Self: - """ - A method to create deep copy irrespective of whether - `copy-on-write` is enabled. - """ - result = libcudf.copying.copy_column(self) - return result._with_type_metadata(self.dtype) - def copy(self, deep: bool = True) -> Self: """ Makes a copy of the Column. @@ -464,7 +447,8 @@ def copy(self, deep: bool = True) -> Self: them. 
""" if deep: - return self.force_deep_copy() + result = libcudf.copying.copy_column(self) + return result._with_type_metadata(self.dtype) else: return cast( Self, @@ -570,10 +554,8 @@ def slice( ]._with_type_metadata(self.dtype) else: # Need to create a gather map for given slice with stride - gather_map = arange( - start=start, - stop=stop, - step=stride, + gather_map = as_column( + range(start, stop, stride), dtype=cudf.dtype(np.int32), ) return self.take(gather_map) @@ -642,10 +624,8 @@ def _scatter_by_slice( ) # step != 1, create a scatter map with arange - scatter_map = arange( - start=start, - stop=stop, - step=step, + scatter_map = as_column( + range(start, stop, step), dtype=cudf.dtype(np.int32), ) @@ -761,7 +741,7 @@ def indices_of( assert len(value) == 1 mask = libcudf.search.contains(value, self) return apply_boolean_mask( - [arange(0, len(self), dtype=size_type_dtype)], mask + [as_column(range(0, len(self)), dtype=size_type_dtype)], mask )[0] def _find_first_and_last(self, value: ScalarLike) -> Tuple[int, int]: @@ -1069,7 +1049,7 @@ def as_categorical_column(self, dtype) -> ColumnBase: ) # columns include null index in factorization; remove: if self.has_nulls(): - cats = cats.dropna(drop_nan=False) + cats = cats.dropna() min_type = min_unsigned_type(len(cats), 8) if cudf.dtype(min_type).itemsize < labels.dtype.itemsize: labels = labels.astype(min_type) @@ -1395,7 +1375,9 @@ def _return_sentinel_column(): [self], [cats], how="left" ) codes = libcudf.copying.gather( - [arange(len(cats), dtype=dtype)], right_gather_map, nullify=True + [as_column(range(len(cats)), dtype=dtype)], + right_gather_map, + nullify=True, ) del right_gather_map # reorder `codes` so that its values correspond to the @@ -1486,17 +1468,9 @@ def column_empty( ), ) elif dtype.kind in "OU" and not isinstance(dtype, DecimalDtype): - data = None + data = as_buffer(rmm.DeviceBuffer(size=0)) children = ( full(row_count + 1, 0, dtype=libcudf.types.size_type_dtype), - build_column( - 
data=as_buffer( - rmm.DeviceBuffer( - size=row_count * cudf.dtype("int8").itemsize - ) - ), - dtype="int8", - ), ) else: data = as_buffer(rmm.DeviceBuffer(size=row_count * dtype.itemsize)) @@ -1601,6 +1575,7 @@ def build_column( ) elif dtype.type in (np.object_, np.str_): return cudf.core.column.StringColumn( + data=data, mask=mask, size=size, offset=offset, @@ -1921,13 +1896,26 @@ def as_column( * Objects exposing ``__array_interface__``(e.g., numpy arrays) * pyarrow array * pandas.Categorical objects + * range objects """ - if isinstance(arbitrary, ColumnBase): + if isinstance(arbitrary, (range, pd.RangeIndex, cudf.RangeIndex)): + column = libcudf.filling.sequence( + len(arbitrary), + as_device_scalar(arbitrary.start, dtype=cudf.dtype("int64")), + as_device_scalar(arbitrary.step, dtype=cudf.dtype("int64")), + ) + if cudf.get_option("default_integer_bitwidth") and dtype is None: + dtype = cudf.dtype( + f'i{cudf.get_option("default_integer_bitwidth")//8}' + ) + if dtype is not None: + column = column.astype(dtype) + return column + elif isinstance(arbitrary, ColumnBase): if dtype is not None: return arbitrary.astype(dtype) else: return arbitrary - elif isinstance(arbitrary, cudf.Series): data = arbitrary._column if dtype is not None: @@ -2630,70 +2618,6 @@ def deserialize_columns(headers: List[dict], frames: List) -> List[ColumnBase]: return columns -def arange( - start: Union[int, float], - stop: Optional[Union[int, float]] = None, - step: Union[int, float] = 1, - dtype=None, -) -> cudf.core.column.NumericalColumn: - """ - Returns a column with evenly spaced values within a given interval. - - Values are generated within the half-open interval [start, stop). - The first three arguments are mapped like the range built-in function, - i.e. start and step are optional. - - Parameters - ---------- - start : int/float - Start of the interval. - stop : int/float, default is None - Stop of the interval. 
- step : int/float, default 1 - Step width between each pair of consecutive values. - dtype : default None - Data type specifier. It is inferred from other arguments by default. - - Returns - ------- - cudf.core.column.NumericalColumn - - Examples - -------- - >>> import cudf - >>> col = cudf.core.column.arange(2, 7, 1, dtype='int16') - >>> col - - >>> cudf.Series(col) - 0 2 - 1 3 - 2 4 - 3 5 - 4 6 - dtype: int16 - """ - if stop is None: - stop = start - start = 0 - - if step is None: - step = 1 - - size = len(range(int(start), int(stop), int(step))) - if size == 0: - if dtype is None: - dtype = cudf.dtype("int64") - return cast( - cudf.core.column.NumericalColumn, column_empty(0, dtype=dtype) - ) - - return libcudf.filling.sequence( - size, - as_device_scalar(start, dtype=dtype), - as_device_scalar(step, dtype=dtype), - ) - - def full( size: int, fill_value: ScalarLike, dtype: Optional[Dtype] = None ) -> ColumnBase: diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index 81059717b20..6a7e7729123 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -142,7 +142,4 @@ def element_indexing(self, index: int): result = super().element_indexing(index) if cudf.get_option("mode.pandas_compatible"): return pd.Interval(**result, closed=self._closed) - return { - field: value - for field, value in zip(self.dtype.fields, result.values()) - } + return result diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 5461d1b13b5..0577e0f37ed 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -20,7 +20,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.stream_compaction import drop_nulls from cudf._lib.types import size_type_dtype from cudf._typing import ( ColumnBinaryOperand, @@ -421,10 +420,6 @@ def nan_count(self) -> int: self._nan_count = nan_col.sum() return 
self._nan_count - def dropna(self, drop_nan: bool = False) -> NumericalColumn: - col = self.nans_to_nulls() if drop_nan else self - return drop_nulls([col])[0] - def _process_values_for_isin( self, values: Sequence ) -> Tuple[ColumnBase, ColumnBase]: diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 84333fc205a..c47088caebc 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5465,6 +5465,7 @@ class StringColumn(column.ColumnBase): def __init__( self, + data: Optional[Buffer] = None, mask: Optional[Buffer] = None, size: Optional[int] = None, # TODO: make non-optional offset: int = 0, @@ -5491,11 +5492,10 @@ def __init__( # all nulls-column: offsets = column.full(size + 1, 0, dtype=size_type_dtype) - chars = cudf.core.column.column_empty(0, dtype="int8") - children = (offsets, chars) + children = (offsets,) super().__init__( - data=None, + data=data, size=size, dtype=dtype, mask=mask, @@ -5516,7 +5516,7 @@ def copy(self, deep: bool = True): def start_offset(self) -> int: if self._start_offset is None: if ( - len(self.base_children) == 2 + len(self.base_children) == 1 and self.offset < self.base_children[0].size ): self._start_offset = int( @@ -5531,7 +5531,7 @@ def start_offset(self) -> int: def end_offset(self) -> int: if self._end_offset is None: if ( - len(self.base_children) == 2 + len(self.base_children) == 1 and (self.offset + self.size) < self.base_children[0].size ): self._end_offset = int( @@ -5547,16 +5547,14 @@ def end_offset(self) -> int: @cached_property def memory_usage(self) -> int: n = 0 - if len(self.base_children) == 2: + if self.data is not None: + n += self.data.size + if len(self.base_children) == 1: child0_size = (self.size + 1) * self.base_children[ 0 ].dtype.itemsize - child1_size = ( - self.end_offset - self.start_offset - ) * self.base_children[1].dtype.itemsize - - n += child0_size + child1_size + n += child0_size if self.nullable: n += 
cudf._lib.null_mask.bitmask_allocation_size_bytes(self.size) return n @@ -5568,6 +5566,24 @@ def base_size(self) -> int: else: return self.base_children[0].size - 1 + # override for string column + @property + def data(self): + if self.base_data is None: + return None + if self._data is None: + if ( + self.offset == 0 + and len(self.base_children) > 0 + and self.size == self.base_children[0].size - 1 + ): + self._data = self.base_data + else: + self._data = self.base_data[ + self.start_offset : self.end_offset + ] + return self._data + def data_array_view( self, *, mode="write" ) -> cuda.devicearray.DeviceNDArray: @@ -5614,14 +5630,6 @@ def sum( else: return result_col - def set_base_data(self, value): - if value is not None: - raise RuntimeError( - "StringColumns do not use data attribute of Column, use " - "`set_base_children` instead" - ) - super().set_base_data(value) - def __contains__(self, item: ScalarLike) -> bool: if is_scalar(item): return True in libcudf.search.contains( @@ -5938,15 +5946,12 @@ def view(self, dtype) -> "cudf.core.column.ColumnBase": str_end_byte_offset = self.base_children[0].element_indexing( self.offset + self.size ) - char_dtype_size = self.base_children[1].dtype.itemsize - n_bytes_to_view = ( - str_end_byte_offset - str_byte_offset - ) * char_dtype_size + n_bytes_to_view = str_end_byte_offset - str_byte_offset to_view = column.build_column( - self.base_children[1].data, - dtype=self.base_children[1].dtype, + self.base_data, + dtype=cudf.api.types.dtype("int8"), offset=str_byte_offset, size=n_bytes_to_view, ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 51b661593fc..f9cf180ff44 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -342,10 +342,16 @@ def _getitem_tuple_arg(self, arg): tmp_col_name = (tmp_col_name, *extra) cantor_name = (cantor_name, *extra) other_df = DataFrame( - {tmp_col_name: column.arange(len(tmp_arg[0]))}, + { + tmp_col_name: 
column.as_column( + range(len(tmp_arg[0])) + ) + }, index=as_index(tmp_arg[0]), ) - columns_df[cantor_name] = column.arange(len(columns_df)) + columns_df[cantor_name] = column.as_column( + range(len(columns_df)) + ) df = other_df.join(columns_df, how="inner") # as join is not assigning any names to index, # update it over here diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 6e1c5f6fd00..c97d6dcdd2d 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. import enum from collections import abc @@ -482,7 +482,9 @@ def _get_data_buffer( dtype = self._dtype_from_cudfdtype(col_data.dtype) elif self.dtype[0] == _DtypeKind.STRING: - col_data = self._col.children[1] + col_data = build_column( + data=self._col.data, dtype=np.dtype("int8") + ) dtype = self._dtype_from_cudfdtype(col_data.dtype) else: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 5f7a86e86d8..05104a3ef05 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -144,17 +144,6 @@ def _from_data(cls, data: MutableMapping): def _from_data_like_self(self, data: MutableMapping): return self._from_data(data) - @classmethod - @_cudf_nvtx_annotate - def _from_columns( - cls, - columns: List[ColumnBase], - column_names: abc.Iterable[str], - ): - """Construct a `Frame` object from a list of columns.""" - data = {name: columns[i] for i, name in enumerate(column_names)} - return cls._from_data(data) - @_cudf_nvtx_annotate def _from_columns_like_self( self, @@ -169,7 +158,8 @@ def _from_columns_like_self( """ if column_names is None: column_names = self._column_names - frame = self.__class__._from_columns(columns, column_names) + data = dict(zip(column_names, columns)) + frame = self.__class__._from_data(data) return frame._copy_type_metadata(self, 
override_dtypes=override_dtypes) @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 73e6774f5ce..6c83bcd9efb 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import copy import itertools @@ -23,7 +23,7 @@ from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType from cudf.api.types import is_bool_dtype, is_float_dtype, is_list_like from cudf.core.abc import Serializable -from cudf.core.column.column import ColumnBase, arange, as_column +from cudf.core.column.column import ColumnBase, as_column from cudf.core.column_accessor import ColumnAccessor from cudf.core.join._join_helpers import _match_join_keys from cudf.core.mixins import Reducible, Scannable @@ -377,7 +377,7 @@ def get_group(self, name, obj=None): if obj is None: obj = self.obj - return obj.loc[self.groups[name]] + return obj.loc[self.groups[name].drop_duplicates()] @_cudf_nvtx_annotate def size(self): @@ -761,7 +761,7 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool): # subsample the gather map from the full input ordering, # rather than permuting the gather map of the output. _, (ordering,), _ = self._groupby.groups( - [arange(0, len(self.obj))] + [as_column(range(0, len(self.obj)))] ) # Invert permutation from original order to groups on the # subset of entries we want. 
@@ -2111,9 +2111,13 @@ def diff(self, periods=1, axis=0): def _scan_fill(self, method: str, limit: int) -> DataFrameOrSeries: """Internal implementation for `ffill` and `bfill`""" values = self.grouping.values - result = self.obj._from_columns( - self._groupby.replace_nulls([*values._columns], method), - values._column_names, + result = self.obj._from_data( + dict( + zip( + values._column_names, + self._groupby.replace_nulls([*values._columns], method), + ) + ) ) result = self._mimic_pandas_order(result) return result._copy_type_metadata(values) @@ -2305,9 +2309,15 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): else: fill_value = [fill_value] * len(values._data) - result = self.obj.__class__._from_columns( - self._groupby.shift([*values._columns], periods, fill_value)[0], - values._column_names, + result = self.obj.__class__._from_data( + dict( + zip( + values._column_names, + self._groupby.shift( + [*values._columns], periods, fill_value + )[0], + ) + ) ) result = self._mimic_pandas_order(result) return result._copy_type_metadata(values) @@ -2543,7 +2553,9 @@ def _mimic_pandas_order( # result coming back from libcudf has null_count few rows than # the input, so we must produce an ordering from the full # input range. 
- _, (ordering,), _ = self._groupby.groups([arange(0, len(self.obj))]) + _, (ordering,), _ = self._groupby.groups( + [as_column(range(0, len(self.obj)))] + ) if self._dropna and any( c.has_nulls(include_nan=True) > 0 for c in self.grouping._key_columns diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 5c33cd09ad1..fa7173f1d0f 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -286,9 +286,7 @@ def _num_rows(self): @_cudf_nvtx_annotate def _values(self): if len(self) > 0: - return column.arange( - self._start, self._stop, self._step, dtype=self.dtype - ) + return column.as_column(self._range, dtype=self.dtype) else: return column.column_empty(0, masked=False, dtype=self.dtype) @@ -802,22 +800,22 @@ def sort_values( @_cudf_nvtx_annotate def _gather(self, gather_map, nullify=False, check_bounds=True): gather_map = cudf.core.column.as_column(gather_map) - return _dtype_to_index[self.dtype.type]._from_columns( - [self._values.take(gather_map, nullify, check_bounds)], [self.name] + return _dtype_to_index[self.dtype.type]._from_data( + {self.name: self._values.take(gather_map, nullify, check_bounds)} ) @_cudf_nvtx_annotate def _apply_boolean_mask(self, boolean_mask): - return _dtype_to_index[self.dtype.type]._from_columns( - [self._values.apply_boolean_mask(boolean_mask)], [self.name] + return _dtype_to_index[self.dtype.type]._from_data( + {self.name: self._values.apply_boolean_mask(boolean_mask)} ) def repeat(self, repeats, axis=None): return self._as_int_index().repeat(repeats, axis) def _split(self, splits): - return _dtype_to_index[self.dtype.type]._from_columns( - [self._as_int_index()._split(splits)], [self.name] + return _dtype_to_index[self.dtype.type]._from_data( + {self.name: self._as_int_index()._split(splits)} ) def _binaryop(self, other, op: str): @@ -2120,13 +2118,13 @@ def __init__( data=None, freq=None, tz=None, - normalize=False, + normalize: bool = False, closed=None, - ambiguous="raise", - 
dayfirst=False, - yearfirst=False, + ambiguous: Literal["raise"] = "raise", + dayfirst: bool = False, + yearfirst: bool = False, dtype=None, - copy=False, + copy: bool = False, name=None, ): # we should be more strict on what we accept here but @@ -2149,22 +2147,20 @@ def __init__( self._freq = _validate_freq(freq) - valid_dtypes = tuple( - f"datetime64[{res}]" for res in ("s", "ms", "us", "ns") - ) if dtype is None: # nanosecond default matches pandas dtype = "datetime64[ns]" - elif dtype not in valid_dtypes: - raise TypeError("Invalid dtype") + dtype = cudf.dtype(dtype) + if dtype.kind != "M": + raise TypeError("dtype must be a datetime type") - kwargs = _setdefault_name(data, name=name) + name = _setdefault_name(data, name=name)["name"] data = column.as_column(data, dtype=dtype) if copy: data = data.copy() - super().__init__(data, **kwargs) + super().__init__(data, name=name) if self._freq is not None: unique_vals = self.to_series().diff().unique() @@ -2842,8 +2838,8 @@ def __init__( unit=None, freq=None, closed=None, - dtype="timedelta64[ns]", - copy=False, + dtype=None, + copy: bool = False, name=None, ): if freq is not None: @@ -2855,19 +2851,19 @@ def __init__( "dtype parameter is supported" ) - valid_dtypes = tuple( - f"timedelta64[{res}]" for res in ("s", "ms", "us", "ns") - ) - if dtype not in valid_dtypes: - raise TypeError("Invalid dtype") + if dtype is None: + dtype = "timedelta64[ns]" + dtype = cudf.dtype(dtype) + if dtype.kind != "m": + raise TypeError("dtype must be a timedelta type") - kwargs = _setdefault_name(data, name=name) + name = _setdefault_name(data, name=name)["name"] data = column.as_column(data, dtype=dtype) if copy: data = data.copy() - super().__init__(data, **kwargs) + super().__init__(data, name=name) def __getitem__(self, index): value = super().__getitem__(index) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 5955e21fea0..70be5c3ad0f 100644 --- 
a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -182,12 +182,8 @@ def _indices_from_labels(obj, labels): # join is not guaranteed to maintain the index ordering # so we will sort it with its initial ordering which is stored # in column "__" - lhs = cudf.DataFrame( - {"__": cudf.core.column.arange(len(labels))}, index=labels - ) - rhs = cudf.DataFrame( - {"_": cudf.core.column.arange(len(obj))}, index=obj.index - ) + lhs = cudf.DataFrame({"__": as_column(range(len(labels)))}, index=labels) + rhs = cudf.DataFrame({"_": as_column(range(len(obj)))}, index=obj.index) return lhs.join(rhs).sort_values(by=["__", "_"])["_"] @@ -295,19 +291,27 @@ def _from_data_like_self(self, data: MutableMapping): out._data._level_names = self._data._level_names return out - @classmethod @_cudf_nvtx_annotate - def _from_columns( - cls, + def _from_columns_like_self( + self, columns: List[ColumnBase], - column_names: List[str], + column_names: Optional[abc.Iterable[str]] = None, index_names: Optional[List[str]] = None, - ): - """Construct a `Frame` object from a list of columns. + *, + override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None, + ) -> Self: + """Construct a `Frame` from a list of columns with metadata from self. If `index_names` is set, the first `len(index_names)` columns are used to construct the index of the frame. + + If override_dtypes is provided then any non-None entry will be + used for the dtype of the matching column in preference to the + dtype of the column in self. 
""" + if column_names is None: + column_names = self._column_names + data_columns = columns index = None @@ -320,36 +324,11 @@ def _from_columns( else: index.name = index_names[0] - out = super()._from_columns(data_columns, column_names) + data = dict(zip(column_names, data_columns)) + frame = self.__class__._from_data(data) if index is not None: - out._index = index - - return out - - @_cudf_nvtx_annotate - def _from_columns_like_self( - self, - columns: List[ColumnBase], - column_names: Optional[abc.Iterable[str]] = None, - index_names: Optional[List[str]] = None, - *, - override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None, - ) -> Self: - """Construct a `Frame` from a list of columns with metadata from self. - - If `index_names` is set, the first `len(index_names)` columns are - used to construct the index of the frame. - - If override_dtypes is provided then any non-None entry will be - used for the dtype of the matching column in preference to the - dtype of the column in self. - """ - if column_names is None: - column_names = self._column_names - frame = self.__class__._from_columns( - columns, column_names, index_names - ) + frame._index = index return frame._copy_type_metadata( self, include_index=bool(index_names), @@ -1897,10 +1876,8 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: if stride != 1: return self._gather( GatherMap.from_column_unchecked( - cudf.core.column.arange( - start, - stop=stop, - step=stride, + as_column( + range(start, stop, stride), dtype=libcudf.types.size_type_dtype, ), len(self), @@ -2541,9 +2518,9 @@ def _align_to_index( # to recover ordering after index alignment. 
sort_col_id = str(uuid4()) if how == "left": - lhs[sort_col_id] = cudf.core.column.arange(len(lhs)) + lhs[sort_col_id] = as_column(range(len(lhs))) elif how == "right": - rhs[sort_col_id] = cudf.core.column.arange(len(rhs)) + rhs[sort_col_id] = as_column(range(len(rhs))) result = lhs.join(rhs, how=how, sort=sort) if how in ("left", "right"): diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 20f5b7989eb..86f0c8465ba 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from __future__ import annotations import itertools @@ -232,7 +232,11 @@ def _gather_maps(self, left_cols, right_cols): key_order = list( itertools.chain.from_iterable( libcudf.copying.gather( - [cudf.core.column.arange(n, dtype=size_type_dtype)], + [ + cudf.core.column.as_column( + range(n), dtype=size_type_dtype + ) + ], map_, nullify=null, ) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 489f0e74dd6..8ba47795437 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -27,6 +27,7 @@ from cudf.core._compat import PANDAS_GE_150 from cudf.core.frame import Frame from cudf.core.index import BaseIndex, _lexsorted_equal_range, as_index +from cudf.utils.dtypes import is_column_like from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name @@ -501,9 +502,9 @@ def __repr__(self): # TODO: Update the following two arange calls to # a single arange call once arange has support for # a vector start/end points. 
- indices = column.arange(start=0, stop=n, step=1) + indices = column.as_column(range(n)) indices = indices.append( - column.arange(start=len(self) - n, stop=len(self), step=1) + column.as_column(range(len(self) - n, len(self), 1)) ) preprocess = self.take(indices) else: @@ -795,7 +796,7 @@ def _compute_validity_mask(self, index, row_tuple, max_length): [ frame, cudf.DataFrame( - {"idx": cudf.Series(column.arange(len(frame)))} + {"idx": cudf.Series(column.as_column(range(len(frame))))} ), ], axis=1, @@ -807,7 +808,7 @@ def _compute_validity_mask(self, index, row_tuple, max_length): # obtain deterministic ordering. if cudf.get_option("mode.pandas_compatible"): lookup_order = "_" + "_".join(map(str, lookup._data.names)) - lookup[lookup_order] = column.arange(len(lookup)) + lookup[lookup_order] = column.as_column(range(len(lookup))) postprocess = operator.methodcaller( "sort_values", by=[lookup_order, "idx"] ) @@ -840,14 +841,16 @@ def _get_valid_indices_by_tuple(self, index, row_tuple, max_length): ): stop = row_tuple.stop or max_length start, stop, step = row_tuple.indices(stop) - return column.arange(start, stop, step) + return column.as_column(range(start, stop, step)) start_values = self._compute_validity_mask( index, row_tuple.start, max_length ) stop_values = self._compute_validity_mask( index, row_tuple.stop, max_length ) - return column.arange(start_values.min(), stop_values.max() + 1) + return column.as_column( + range(start_values.min(), stop_values.max() + 1) + ) elif isinstance(row_tuple, numbers.Number): return row_tuple return self._compute_validity_mask(index, row_tuple, max_length) @@ -1024,7 +1027,7 @@ def __getitem__(self, index): index = np.array(index) elif isinstance(index, slice): start, stop, step = index.indices(len(self)) - index = column.arange(start, stop, step) + index = column.as_column(range(start, stop, step)) result = MultiIndex.from_frame( self.to_frame(index=False, name=range(0, self.nlevels)).take( index @@ -1224,6 +1227,7 @@ def 
from_tuples(cls, tuples, names=None): See Also -------- + MultiIndex.from_arrays : Convert list of arrays to MultiIndex. MultiIndex.from_product : Make a MultiIndex from cartesian product of iterables. MultiIndex.from_frame : Make a MultiIndex from a DataFrame. @@ -1333,6 +1337,7 @@ def from_frame(cls, df, names=None): See Also -------- + MultiIndex.from_arrays : Convert list of arrays to MultiIndex. MultiIndex.from_tuples : Convert list of tuples to MultiIndex. MultiIndex.from_product : Make a MultiIndex from cartesian product of iterables. @@ -1427,6 +1432,66 @@ def from_product(cls, arrays, names=None): pdi = pd.MultiIndex.from_product(arrays, names=names) return cls.from_pandas(pdi) + @classmethod + @_cudf_nvtx_annotate + def from_arrays( + cls, + arrays, + sortorder=None, + names=None, + ) -> MultiIndex: + """ + Convert arrays to MultiIndex. + + Parameters + ---------- + arrays : list / sequence of array-likes + Each array-like gives one level's value for each data point. + len(arrays) is the number of levels. + sortorder : optional int + Not yet supported + names : list / sequence of str, optional + Names for the levels in the index. + + Returns + ------- + MultiIndex + + See Also + -------- + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. + MultiIndex.from_product : Make a MultiIndex from cartesian product + of iterables. + MultiIndex.from_frame : Make a MultiIndex from a DataFrame. + + Examples + -------- + >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] + >>> cudf.MultiIndex.from_arrays(arrays, names=('number', 'color')) + MultiIndex([(1, 'red'), + (1, 'blue'), + (2, 'red'), + (2, 'blue')], + names=['number', 'color']) + """ + # Imported here due to circular import + from cudf.core.algorithms import factorize + + error_msg = "Input must be a list / sequence of array-likes." 
+ if not is_list_like(arrays): + raise TypeError(error_msg) + codes = [] + levels = [] + for array in arrays: + if not (is_list_like(array) or is_column_like(array)): + raise TypeError(error_msg) + code, level = factorize(array, sort=True) + codes.append(code) + levels.append(level) + return cls( + codes=codes, levels=levels, sortorder=sortorder, names=names + ) + @_cudf_nvtx_annotate def _poplevels(self, level): """ diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index df5a62b384e..bc1eaef86db 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -55,7 +55,6 @@ DatetimeColumn, IntervalColumn, TimeDeltaColumn, - arange, as_column, full, ) @@ -1366,7 +1365,9 @@ def map(self, arg, na_action=None) -> "Series": raise NotImplementedError( "default values in dicts are currently not supported." ) - lhs = cudf.DataFrame({"x": self, "orig_order": arange(len(self))}) + lhs = cudf.DataFrame( + {"x": self, "orig_order": as_column(range(len(self)))} + ) rhs = cudf.DataFrame( { "x": arg.keys(), @@ -1386,7 +1387,9 @@ def map(self, arg, na_action=None) -> "Series": "Reindexing only valid with" " uniquely valued Index objects" ) - lhs = cudf.DataFrame({"x": self, "orig_order": arange(len(self))}) + lhs = cudf.DataFrame( + {"x": self, "orig_order": as_column(range(len(self)))} + ) rhs = cudf.DataFrame( { "x": arg.keys(), diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index bfe5f5007fe..12baf1ea6d1 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -1,5 +1,6 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
+import functools import os from typing import Any, Callable, Dict @@ -17,10 +18,7 @@ import rmm -from cudf._lib.strings_udf import ( - column_from_udf_string_array, - column_to_string_view_array, -) +from cudf._lib import strings_udf from cudf.api.types import is_scalar from cudf.core.column.column import as_column from cudf.core.dtypes import dtype @@ -63,7 +61,15 @@ precompiled: cachetools.LRUCache = cachetools.LRUCache(maxsize=32) launch_arg_getters: Dict[Any, Any] = {} -_PTX_FILE = _get_ptx_file(os.path.dirname(__file__), "shim_") + +@functools.cache +def _ptx_file(): + return _get_ptx_file( + os.path.join( + os.path.dirname(strings_udf.__file__), "..", "core", "udf" + ), + "shim_", + ) @_cudf_nvtx_annotate @@ -286,7 +292,7 @@ def _get_kernel(kernel_string, globals_, sig, func): exec(kernel_string, globals_) _kernel = globals_["_kernel"] kernel = cuda.jit( - sig, link=[_PTX_FILE], extensions=[str_view_arg_handler] + sig, link=[_ptx_file()], extensions=[str_view_arg_handler] )(_kernel) return kernel @@ -319,7 +325,7 @@ def _return_arr_from_dtype(dtype, size): def _post_process_output_col(col, retty): if retty == _cudf_str_dtype: - return column_from_udf_string_array(col) + return strings_udf.column_from_udf_string_array(col) return as_column(col, retty) @@ -361,7 +367,7 @@ def set_malloc_heap_size(size=None): def column_to_string_view_array_init_heap(col): # lazily allocate heap only when a string needs to be returned - return column_to_string_view_array(col) + return strings_udf.column_to_string_view_array(col) class UDFError(RuntimeError): diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 8a92ea86d57..207fb469990 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION +# Copyright (c) 2020-2024, NVIDIA CORPORATION import itertools @@ -235,7 +235,7 @@ def _apply_agg_column(self, source_column, agg_name): 
start = as_column(start, dtype="int32") end = as_column(end, dtype="int32") - idx = cudf.core.column.arange(len(start)) + idx = as_column(range(len(start))) preceding_window = (idx - start + cudf.Scalar(1, "int32")).astype( "int32" ) @@ -531,7 +531,7 @@ def __init__(self, groupby, window, min_periods=None, center=False): def _window_to_window_sizes(self, window): if is_integer(window): return cudautils.grouped_window_sizes_from_offset( - column.arange(len(self.obj)).data_array_view(mode="read"), + as_column(range(len(self.obj))).data_array_view(mode="read"), self._group_starts, window, ) diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py index e1950c9f250..d3d99aab0cd 100644 --- a/python/cudf/cudf/io/dlpack.py +++ b/python/cudf/cudf/io/dlpack.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. import cudf @@ -35,12 +35,12 @@ def from_dlpack(pycapsule_obj): """ columns = libdlpack.from_dlpack(pycapsule_obj) - column_names = range(len(columns)) + data = dict(enumerate(columns)) if len(columns) == 1: - return cudf.Series._from_columns(columns, column_names=column_names) + return cudf.Series._from_data(data) else: - return cudf.DataFrame._from_columns(columns, column_names=column_names) + return cudf.DataFrame._from_data(data) @ioutils.doc_to_dlpack() @@ -71,7 +71,7 @@ def to_dlpack(cudf_obj): if isinstance(cudf_obj, (cudf.DataFrame, cudf.Series, cudf.BaseIndex)): gdf = cudf_obj elif isinstance(cudf_obj, ColumnBase): - gdf = cudf_obj.as_frame() + gdf = cudf.Series._from_data({None: cudf_obj}) else: raise TypeError( f"Input of type {type(cudf_obj)} cannot be converted " diff --git a/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_0.16.pkl b/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_0.16.pkl deleted file mode 100644 index 97c745c1dd0..00000000000 Binary files a/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_0.16.pkl and /dev/null 
differ diff --git a/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_23.12.pkl b/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_23.12.pkl new file mode 100644 index 00000000000..1ec077d10f7 Binary files /dev/null and b/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_23.12.pkl differ diff --git a/python/cudf/cudf/tests/groupby/test_indexing.py b/python/cudf/cudf/tests/groupby/test_indexing.py index 06777c8e6af..57e8bc1c2d8 100644 --- a/python/cudf/cudf/tests/groupby/test_indexing.py +++ b/python/cudf/cudf/tests/groupby/test_indexing.py @@ -1 +1,12 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +import cudf +from cudf.testing._utils import assert_eq + + +def test_rank_return_type_compatible_mode(): + # in compatible mode, rank() always returns floats + df = cudf.DataFrame({"a": range(10), "b": [0] * 10}, index=[0] * 10) + pdf = df.to_pandas() + expect = pdf.groupby("b").get_group(0) + result = df.groupby("b").get_group(0) + assert_eq(expect, result) diff --git a/python/cudf/cudf/tests/test_buffer.py b/python/cudf/cudf/tests/test_buffer.py index 1c9e7475080..03637e05eae 100644 --- a/python/cudf/cudf/tests/test_buffer.py +++ b/python/cudf/cudf/tests/test_buffer.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
import cupy as cp import pytest @@ -64,7 +64,14 @@ def test_buffer_creation_from_any(): assert isinstance(b, Buffer) assert ary.data.ptr == b.get_ptr(mode="read") assert ary.nbytes == b.size - assert b.owner.owner is ary + assert b.owner.owner.owner is ary + + +@pytest.mark.parametrize("size", [10, 2**10 + 500, 2**20]) +def test_buffer_str(size): + ary = cp.arange(size, dtype="uint8") + buf = as_buffer(ary) + assert f"size={size}" in repr(buf) @pytest.mark.parametrize( @@ -73,7 +80,7 @@ def test_buffer_creation_from_any(): def test_buffer_repr(size, expect): ary = cp.arange(size, dtype="uint8") buf = as_buffer(ary) - assert f"size={expect}" in repr(buf) + assert f"size={expect}" in str(buf) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index a4b27ae19ac..3d21994a8d5 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import cupy as cp import numpy as np @@ -8,7 +8,7 @@ import cudf from cudf._lib.transform import mask_to_bools -from cudf.core.column.column import arange, as_column +from cudf.core.column.column import as_column from cudf.testing._utils import assert_eq, assert_exceptions_equal from cudf.utils import dtypes as dtypeutils @@ -552,9 +552,3 @@ def test_astype_with_aliases(alias, expect_dtype, data): gd_data = cudf.Series.from_pandas(pd_data) assert_eq(pd_data.astype(expect_dtype), gd_data.astype(alias)) - - -def test_arange_empty(): - result = arange(0) - assert len(result) == 0 - assert result.dtype == np.dtype(np.int64) diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py index 085774e9dbc..e737a73e86b 100644 --- a/python/cudf/cudf/tests/test_copying.py +++ b/python/cudf/cudf/tests/test_copying.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
+# Copyright (c) 2020-2024, NVIDIA CORPORATION. import cupy as cp import numpy as np @@ -113,11 +113,8 @@ def test_series_setitem_partial_slice_cow_on(): assert_eq(new_copy, cudf.Series([1, 2, 300, 300, 5])) new_slice = actual[2:] - # TODO: when COW and spilling has been unified, find a clean way to - # test this without accessing the internal attributes _base and _ptr assert ( - new_slice._column.base_data._base._ptr - == actual._column.base_data._base._ptr + new_slice._column.base_data.owner == actual._column.base_data.owner ) new_slice[0:2] = 10 assert_eq(new_slice, cudf.Series([10, 10, 5], index=[2, 3, 4])) @@ -134,9 +131,11 @@ def test_series_setitem_partial_slice_cow_off(): assert_eq(new_copy, cudf.Series([1, 2, 300, 300, 5])) new_slice = actual[2:] - assert ( - new_slice._column.base_data._ptr == actual._column.base_data._ptr - ) + # Since COW is off, a slice should point to the same memory + ptr1 = new_slice._column.base_data.get_ptr(mode="read") + ptr2 = actual._column.base_data.get_ptr(mode="read") + assert ptr1 == ptr2 + new_slice[0:2] = 10 assert_eq(new_slice, cudf.Series([10, 10, 5], index=[2, 3, 4])) assert_eq(actual, cudf.Series([1, 2, 10, 10, 5])) diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index df53a174660..deddedbe3e8 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -2464,3 +2464,11 @@ def test_to_datetime_dataframe_utc_true(): result = cudf.to_datetime(data, utc=True) expected = pd.Series([datetime.datetime(2020, 1, 1)]).dt.tz_localize("UTC") assert_eq(result, expected) + + +def test_datetimeindex_dtype_np_dtype(): + dtype = np.dtype("datetime64[ns]") + data = [1] + gdti = cudf.DatetimeIndex(data, dtype=dtype) + pdti = pd.DatetimeIndex(data, dtype=dtype) + assert_eq(gdti, pdti) diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index d6134c7bb01..bffbade14d8 100644 --- 
a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. from typing import Any, Tuple @@ -112,7 +112,8 @@ def assert_column_equal(col: _CuDFColumn, cudfcol): assert col.get_buffers()["offsets"] is None elif col.dtype[0] == _DtypeKind.STRING: - assert_buffer_equal(col.get_buffers()["data"], cudfcol.children[1]) + chars_col = build_column(data=cudfcol.data, dtype="int8") + assert_buffer_equal(col.get_buffers()["data"], chars_col) assert_buffer_equal(col.get_buffers()["offsets"], cudfcol.children[0]) else: diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 2d5a4d1d782..78bce89f2a8 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. 
""" Test related to MultiIndex @@ -2085,12 +2085,7 @@ def test_multiindex_eq_other_multiindex(): params=[ "from_product", "from_tuples", - pytest.param( - "from_arrays", - marks=pytest.mark.xfail( - reason="TODO: from_arrays is not implemented" - ), - ), + "from_arrays", "init", ] ) @@ -2100,7 +2095,7 @@ def midx(request): elif request.param == "from_tuples": return cudf.MultiIndex.from_tuples([(0, 1), (0, 0), (1, 1), (1, 0)]) elif request.param == "from_arrays": - return cudf.MultiIndex.from_arrays([0, 0, 1, 1], [1, 0, 1, 0]) + return cudf.MultiIndex.from_arrays([[0, 0, 1, 1], [1, 0, 1, 0]]) elif request.param == "init": return cudf.MultiIndex( levels=[[0, 1], [0, 1]], codes=[[0, 0, 1, 1], [1, 0, 1, 0]] @@ -2112,3 +2107,30 @@ def midx(request): def test_multindex_constructor_levels_always_indexes(midx): assert_eq(midx.levels[0], cudf.Index([0, 1])) assert_eq(midx.levels[1], cudf.Index([0, 1])) + + +@pytest.mark.parametrize( + "array", + [ + list, + tuple, + np.array, + cp.array, + pd.Index, + cudf.Index, + pd.Series, + cudf.Series, + ], +) +def test_multiindex_from_arrays(array): + pd_data = [[0, 0, 1, 1], [1, 0, 1, 0]] + cudf_data = [array(lst) for lst in pd_data] + result = pd.MultiIndex.from_arrays(pd_data) + expected = cudf.MultiIndex.from_arrays(cudf_data) + assert_eq(result, expected) + + +@pytest.mark.parametrize("arg", ["foo", ["foo"]]) +def test_multiindex_from_arrays_wrong_arg(arg): + with pytest.raises(TypeError): + cudf.MultiIndex.from_arrays(arg) diff --git a/python/cudf/cudf/tests/test_no_device.py b/python/cudf/cudf/tests/test_no_device.py new file mode 100644 index 00000000000..722762b2d0c --- /dev/null +++ b/python/cudf/cudf/tests/test_no_device.py @@ -0,0 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+import os +import subprocess + + +def test_cudf_import_no_device(): + env = os.environ.copy() + env["CUDA_VISIBLE_DEVICES"] = "-1" + output = subprocess.run( + ["python", "-c", "import cudf"], + env=env, + capture_output=True, + text=True, + cwd="/", + ) + assert output.returncode == 0 diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py index cac170cce55..87efe6bbbcc 100644 --- a/python/cudf/cudf/tests/test_serialize.py +++ b/python/cudf/cudf/tests/test_serialize.py @@ -1,5 +1,6 @@ # Copyright (c) 2018-2024, NVIDIA CORPORATION. +import itertools import pickle import msgpack @@ -115,6 +116,7 @@ ] ), ], + ids=itertools.count(), ) @pytest.mark.parametrize("to_host", [True, False]) def test_serialize(df, to_host): @@ -368,8 +370,8 @@ def test_serialize_string_check_buffer_sizes(): assert expect == got -def test_deserialize_cudf_0_16(datadir): - fname = datadir / "pkl" / "stringColumnWithRangeIndex_cudf_0.16.pkl" +def test_deserialize_cudf_23_12(datadir): + fname = datadir / "pkl" / "stringColumnWithRangeIndex_cudf_23.12.pkl" expected = cudf.DataFrame({"a": ["hi", "hello", "world", None]}) with open(fname, "rb") as f: diff --git a/python/cudf/cudf/tests/test_spilling.py b/python/cudf/cudf/tests/test_spilling.py index 88ce908aa5f..7e66a7ab4ba 100644 --- a/python/cudf/cudf/tests/test_spilling.py +++ b/python/cudf/cudf/tests/test_spilling.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. 
import importlib import random @@ -34,7 +34,7 @@ ) from cudf.core.buffer.spillable_buffer import ( SpillableBuffer, - SpillableBufferSlice, + SpillableBufferOwner, SpillLock, ) from cudf.testing._utils import assert_eq @@ -196,10 +196,10 @@ def test_creations(manager: SpillManager): def test_spillable_df_groupby(manager: SpillManager): df = cudf.DataFrame({"a": [1, 1, 1]}) gb = df.groupby("a") - assert len(single_column_df_base_data(df)._spill_locks) == 0 + assert len(single_column_df_base_data(df).owner._spill_locks) == 0 gb._groupby # `gb._groupby`, which is cached on `gb`, holds a spill lock - assert len(single_column_df_base_data(df)._spill_locks) == 1 + assert len(single_column_df_base_data(df).owner._spill_locks) == 1 assert not single_column_df_data(df).spillable del gb assert single_column_df_data(df).spillable @@ -375,7 +375,7 @@ def test_get_ptr(manager: SpillManager, target): mem = np.empty(10, dtype="u1") buf = as_buffer(data=mem, exposed=False) assert buf.spillable - assert len(buf._spill_locks) == 0 + assert len(buf.owner._spill_locks) == 0 with acquire_spill_lock(): buf.get_ptr(mode="read") assert not buf.spillable @@ -496,8 +496,8 @@ def test_serialize_cuda_dataframe(manager: SpillManager): header, frames = protocol.serialize( df1, serializers=("cuda",), on_error="raise" ) - buf: SpillableBufferSlice = single_column_df_data(df1) - assert len(buf._base._spill_locks) == 1 + buf: SpillableBuffer = single_column_df_data(df1) + assert len(buf.owner._spill_locks) == 1 assert len(frames) == 1 assert isinstance(frames[0], Buffer) assert frames[0].get_ptr(mode="read") == buf.get_ptr(mode="read") @@ -543,13 +543,14 @@ def test_as_buffer_of_spillable_buffer(manager: SpillManager): data = cupy.arange(10, dtype="u1") b1 = as_buffer(data, exposed=False) assert isinstance(b1, SpillableBuffer) - assert b1.owner is data + assert isinstance(b1.owner, SpillableBufferOwner) + assert b1.owner.owner is data b2 = as_buffer(b1) assert b1 is b2 with pytest.raises( 
ValueError, - match="buffer must either be exposed or spilled locked", + match="owning spillable buffer must either be exposed or spill locked", ): # Use `memory_info` to access device point _without_ making # the buffer unspillable. @@ -557,21 +558,21 @@ def test_as_buffer_of_spillable_buffer(manager: SpillManager): with acquire_spill_lock(): b3 = as_buffer(b1.get_ptr(mode="read"), size=b1.size, owner=b1) - assert isinstance(b3, SpillableBufferSlice) - assert b3.owner is b1 + assert isinstance(b3, SpillableBuffer) + assert b3.owner is b1.owner b4 = as_buffer( b1.get_ptr(mode="write") + data.itemsize, size=b1.size - data.itemsize, owner=b3, ) - assert isinstance(b4, SpillableBufferSlice) - assert b4.owner is b1 + assert isinstance(b4, SpillableBuffer) + assert b4.owner is b1.owner assert all(cupy.array(b4.memoryview()) == data[1:]) b5 = as_buffer(b4.get_ptr(mode="write"), size=b4.size - 1, owner=b4) - assert isinstance(b5, SpillableBufferSlice) - assert b5.owner is b1 + assert isinstance(b5, SpillableBuffer) + assert b5.owner is b1.owner assert all(cupy.array(b5.memoryview()) == data[1:-1]) diff --git a/python/cudf/cudf/tests/test_string_udfs.py b/python/cudf/cudf/tests/test_string_udfs.py index 88c73ccf964..5dbb86fe27d 100644 --- a/python/cudf/cudf/tests/test_string_udfs.py +++ b/python/cudf/cudf/tests/test_string_udfs.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. 
import numba import numpy as np @@ -20,10 +20,12 @@ string_view, udf_string, ) -from cudf.core.udf.utils import _PTX_FILE, _get_extensionty_size +from cudf.core.udf.utils import _get_extensionty_size, _ptx_file from cudf.testing._utils import assert_eq, sv_to_udf_str from cudf.utils._numba import _CUDFNumbaConfig +_PTX_FILE = _ptx_file() + def get_kernels(func, dtype, size): """ diff --git a/python/cudf/cudf/tests/test_testing.py b/python/cudf/cudf/tests/test_testing.py index e6658040663..3024c8e2e7b 100644 --- a/python/cudf/cudf/tests/test_testing.py +++ b/python/cudf/cudf/tests/test_testing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import numpy as np import pandas as pd @@ -431,8 +431,8 @@ def test_assert_column_memory_basic_same(arrow_arrays): data = cudf.core.column.ColumnBase.from_arrow(arrow_arrays) buf = cudf.core.buffer.as_buffer(data.base_data) - left = cudf.core.column.build_column(buf, dtype=np.int32) - right = cudf.core.column.build_column(buf, dtype=np.int32) + left = cudf.core.column.build_column(buf, dtype=np.int8) + right = cudf.core.column.build_column(buf, dtype=np.int8) assert_column_memory_eq(left, right) with pytest.raises(AssertionError): diff --git a/python/cudf/cudf/utils/_numba.py b/python/cudf/cudf/utils/_numba.py index fc45f60cdaf..6d00fd397df 100644 --- a/python/cudf/cudf/utils/_numba.py +++ b/python/cudf/cudf/utils/_numba.py @@ -1,28 +1,27 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. import glob import os import sys -import warnings +from functools import lru_cache from numba import config as numba_config -try: - from pynvjitlink.patch import ( - patch_numba_linker as patch_numba_linker_pynvjitlink, - ) -except ImportError: - - def patch_numba_linker_pynvjitlink(): - warnings.warn( - "CUDA Toolkit is newer than CUDA driver. " - "Numba features will not work in this configuration. 
" - ) +# Use an lru_cache with a single value to allow a delayed import of +# strings_udf. This is the easiest way to break an otherwise circular import +# loop of _lib.*->cudautils->_numba->_lib.strings_udf +@lru_cache +def _get_cc_60_ptx_file(): + from cudf._lib import strings_udf -CC_60_PTX_FILE = os.path.join( - os.path.dirname(__file__), "../core/udf/shim_60.ptx" -) + return os.path.join( + os.path.dirname(strings_udf.__file__), + "..", + "core", + "udf", + "shim_60.ptx", + ) def _get_best_ptx_file(archs, max_compute_capability): @@ -105,11 +104,13 @@ def _setup_numba(): version of the CUDA Toolkit used to build the PTX files shipped with the user cuDF package. """ - # ptxcompiler is a requirement for cuda 11.x packages but not - # cuda 12.x packages. However its version checking machinery - # is still necessary. If a user happens to have ptxcompiler - # in a cuda 12 environment, it's use for the purposes of - # checking the driver and runtime versions is harmless + + # Either ptxcompiler, or our vendored version (_ptxcompiler.py) + # is needed to determine the driver and runtime CUDA versions in + # the environment. In a CUDA 11.x environment, ptxcompiler is used + # to provide MVC directly, whereas for CUDA 12.x this is provided + # through pynvjitlink. The presence of either package does not + # perturb cuDF's operation in situations where they aren't used. 
try: from ptxcompiler.patch import NO_DRIVER, safe_get_versions except ModuleNotFoundError: @@ -119,7 +120,9 @@ def _setup_numba(): versions = safe_get_versions() if versions != NO_DRIVER: driver_version, runtime_version = versions - ptx_toolkit_version = _get_cuda_version_from_ptx_file(CC_60_PTX_FILE) + ptx_toolkit_version = _get_cuda_version_from_ptx_file( + _get_cc_60_ptx_file() + ) # MVC is required whenever any PTX is newer than the driver # This could be the shipped PTX file or the PTX emitted by @@ -131,7 +134,9 @@ def _setup_numba(): if driver_version < (12, 0): patch_numba_linker_cuda_11() else: - patch_numba_linker_pynvjitlink() + from pynvjitlink.patch import patch_numba_linker + + patch_numba_linker() def _get_cuda_version_from_ptx_file(path): diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 7c3f4a97a5e..c7b66abea27 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -59,7 +59,7 @@ test = [ "msgpack", "pytest", "pytest-benchmark", - "pytest-cases", + "pytest-cases>=3.8.2", "pytest-cov", "pytest-xdist", "python-snappy>=0.6.0", diff --git a/python/cudf/udf_cpp/CMakeLists.txt b/python/cudf/udf_cpp/CMakeLists.txt index 7d6dc84b322..57b52559f00 100644 --- a/python/cudf/udf_cpp/CMakeLists.txt +++ b/python/cudf/udf_cpp/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -55,30 +55,6 @@ target_compile_options( target_link_libraries(cudf_strings_udf PUBLIC cudf::cudf) install(TARGETS cudf_strings_udf DESTINATION ./cudf/_lib/) -# This function will copy the generated PTX file from its generator-specific location in the build -# tree into a specified location in the build tree from which we can install it. -function(copy_ptx_to_location target destination new_name) - set(cmake_generated_file - "${CMAKE_CURRENT_BINARY_DIR}/cmake/cp_${target}_$>_ptx.cmake" - ) - file( - GENERATE - OUTPUT "${cmake_generated_file}" - CONTENT - " -set(ptx_path \"$\") -file(MAKE_DIRECTORY \"${destination}\") -file(COPY_FILE \${ptx_path} \"${destination}/${new_name}\")" - ) - - add_custom_target( - ${target}_cp_ptx ALL - COMMAND ${CMAKE_COMMAND} -P "${cmake_generated_file}" - DEPENDS $ - COMMENT "Copying PTX files to '${destination}'" - ) -endfunction() - # Create the shim library for each architecture. set(SHIM_CUDA_FLAGS --expt-relaxed-constexpr -rdc=true) @@ -104,10 +80,9 @@ foreach(arch IN LISTS CMAKE_CUDA_ARCHITECTURES) target_compile_options(${tgt} PRIVATE "$<$:${SHIM_CUDA_FLAGS}>") target_link_libraries(${tgt} PUBLIC cudf::cudf) - copy_ptx_to_location(${tgt} "${CMAKE_CURRENT_BINARY_DIR}/../udf" ${tgt}.ptx) install( FILES $ - DESTINATION ./cudf/core/udf/ + DESTINATION cudf/core/udf/ RENAME ${tgt}.ptx ) endforeach() diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index cd764381b3a..08c03235484 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -26,7 +26,7 @@ from dask_cudf import sorting from dask_cudf.accessors import ListMethods, StructMethods -from dask_cudf.sorting import _get_shuffle_method +from dask_cudf.sorting import _deprecate_shuffle_kwarg, _get_shuffle_method class _Frame(dd.core._Frame, OperatorMethodMixin): @@ -111,6 +111,7 @@ def do_apply_rows(df, func, incols, outcols, kwargs): do_apply_rows, func, incols, outcols, 
kwargs, meta=meta ) + @_deprecate_shuffle_kwarg @_dask_cudf_nvtx_annotate def merge(self, other, shuffle_method=None, **kwargs): on = kwargs.pop("on", None) @@ -123,6 +124,7 @@ def merge(self, other, shuffle_method=None, **kwargs): **kwargs, ) + @_deprecate_shuffle_kwarg @_dask_cudf_nvtx_annotate def join(self, other, shuffle_method=None, **kwargs): # CuDF doesn't support "right" join yet @@ -141,6 +143,7 @@ def join(self, other, shuffle_method=None, **kwargs): **kwargs, ) + @_deprecate_shuffle_kwarg @_dask_cudf_nvtx_annotate def set_index( self, @@ -216,6 +219,7 @@ def set_index( **kwargs, ) + @_deprecate_shuffle_kwarg @_dask_cudf_nvtx_annotate def sort_values( self, @@ -298,6 +302,7 @@ def var( else: return _parallel_var(self, meta, skipna, split_every, out) + @_deprecate_shuffle_kwarg @_dask_cudf_nvtx_annotate def shuffle(self, *args, shuffle_method=None, **kwargs): """Wraps dask.dataframe DataFrame.shuffle method""" diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index fde78cd4450..43ad4f0fee3 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -17,6 +17,8 @@ import cudf from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate +from dask_cudf.sorting import _deprecate_shuffle_kwarg + # aggregations that are dask-cudf optimized OPTIMIZED_AGGS = ( "count", @@ -189,8 +191,11 @@ def last(self, split_every=None, split_out=1): split_out, ) + @_deprecate_shuffle_kwarg @_dask_cudf_nvtx_annotate - def aggregate(self, arg, split_every=None, split_out=1, shuffle=None): + def aggregate( + self, arg, split_every=None, split_out=1, shuffle_method=None + ): if arg == "size": return self.size() @@ -211,7 +216,7 @@ def aggregate(self, arg, split_every=None, split_out=1, shuffle=None): sep=self.sep, sort=self.sort, as_index=self.as_index, - shuffle_method=shuffle, + shuffle_method=shuffle_method, **self.dropna, ) @@ -219,7 +224,7 @@ def aggregate(self, arg, split_every=None, 
split_out=1, shuffle=None): arg, split_every=split_every, split_out=split_out, - shuffle=shuffle, + shuffle_method=shuffle_method, ) @@ -330,8 +335,11 @@ def last(self, split_every=None, split_out=1): split_out, )[self._slice] + @_deprecate_shuffle_kwarg @_dask_cudf_nvtx_annotate - def aggregate(self, arg, split_every=None, split_out=1, shuffle=None): + def aggregate( + self, arg, split_every=None, split_out=1, shuffle_method=None + ): if arg == "size": return self.size() @@ -342,14 +350,14 @@ def aggregate(self, arg, split_every=None, split_out=1, shuffle=None): if _groupby_optimized(self) and _aggs_optimized(arg, OPTIMIZED_AGGS): return _make_groupby_agg_call( - self, arg, split_every, split_out, shuffle + self, arg, split_every, split_out, shuffle_method )[self._slice] return super().aggregate( arg, split_every=split_every, split_out=split_out, - shuffle=shuffle, + shuffle_method=shuffle_method, ) diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py index f89682c092a..2e71202f151 100644 --- a/python/dask_cudf/dask_cudf/sorting.py +++ b/python/dask_cudf/dask_cudf/sorting.py @@ -1,6 +1,8 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +import warnings from collections.abc import Iterator +from functools import wraps import cupy import numpy as np @@ -21,6 +23,31 @@ _SHUFFLE_SUPPORT = ("tasks", "p2p") # "disk" not supported +def _deprecate_shuffle_kwarg(func): + @wraps(func) + def wrapper(*args, **kwargs): + old_arg_value = kwargs.pop("shuffle", None) + + if old_arg_value is not None: + new_arg_value = old_arg_value + msg = ( + "the 'shuffle' keyword is deprecated, " + "use 'shuffle_method' instead." + ) + + warnings.warn(msg, FutureWarning) + if kwargs.get("shuffle_method") is not None: + msg = ( + "Can only specify 'shuffle' " + "or 'shuffle_method', not both." 
+ ) + raise TypeError(msg) + kwargs["shuffle_method"] = new_arg_value + return func(*args, **kwargs) + + return wrapper + + @_dask_cudf_nvtx_annotate def set_index_post(df, index_name, drop, column_dtype): df2 = df.set_index(index_name, drop=drop) @@ -229,6 +256,7 @@ def quantile_divisions(df, by, npartitions): return divisions +@_deprecate_shuffle_kwarg @_dask_cudf_nvtx_annotate def sort_values( df, diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index 996457e4861..0dc57d8df55 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -834,26 +834,38 @@ def test_groupby_shuffle(): # Sorted aggregation, single-partition output # (sort=True, split_out=1) - got = gddf.groupby("a", sort=True).agg(spec, shuffle=True, split_out=1) + got = gddf.groupby("a", sort=True).agg( + spec, shuffle_method=True, split_out=1 + ) dd.assert_eq(expect, got) # Sorted aggregation, multi-partition output # (sort=True, split_out=2) - got = gddf.groupby("a", sort=True).agg(spec, shuffle=True, split_out=2) + got = gddf.groupby("a", sort=True).agg( + spec, shuffle_method=True, split_out=2 + ) dd.assert_eq(expect, got) # Un-sorted aggregation, single-partition output # (sort=False, split_out=1) - got = gddf.groupby("a", sort=False).agg(spec, shuffle=True, split_out=1) + got = gddf.groupby("a", sort=False).agg( + spec, shuffle_method=True, split_out=1 + ) dd.assert_eq(expect.sort_index(), got.compute().sort_index()) # Un-sorted aggregation, multi-partition output # (sort=False, split_out=2) - # NOTE: `shuffle=True` should be default + # NOTE: `shuffle_method=True` should be default got = gddf.groupby("a", sort=False).agg(spec, split_out=2) dd.assert_eq(expect, got.compute().sort_index()) # Sorted aggregation fails with split_out>1 when shuffle is False - # (sort=True, split_out=2, shuffle=False) + # (sort=True, split_out=2, shuffle_method=False) with 
pytest.raises(ValueError): - gddf.groupby("a", sort=True).agg(spec, shuffle=False, split_out=2) + gddf.groupby("a", sort=True).agg( + spec, shuffle_method=False, split_out=2 + ) + + # Check shuffle kwarg deprecation + with pytest.warns(match="'shuffle' keyword is deprecated"): + gddf.groupby("a", sort=True).agg(spec, shuffle=False)