From a8ae8cc88bd46ab0229361395d942833cd188435 Mon Sep 17 00:00:00 2001
From: Alessandro Bellina
Date: Fri, 10 Mar 2023 14:48:26 -0600
Subject: [PATCH 01/31] Chunked pack implementation

Co-authored-by: Dave Baranec
Signed-off-by: Alessandro Bellina
---
 cpp/benchmarks/copying/contiguous_split.cu   |   92 +-
 cpp/include/cudf/contiguous_split.hpp        |  135 ++
 cpp/include/cudf/detail/contiguous_split.hpp |    2 +-
 cpp/src/copying/contiguous_split.cu          | 1435 +++++++++++++-----
 cpp/src/copying/pack.cpp                     |    2 +
 cpp/tests/copying/split_tests.cpp            |  953 +++++++++---
 6 files changed, 2033 insertions(+), 586 deletions(-)

diff --git a/cpp/benchmarks/copying/contiguous_split.cu b/cpp/benchmarks/copying/contiguous_split.cu
index aff90039cb9..5b95370faf0 100644
--- a/cpp/benchmarks/copying/contiguous_split.cu
+++ b/cpp/benchmarks/copying/contiguous_split.cu
@@ -25,12 +25,32 @@

 #include

-template <typename T>
+void contiguous_split(cudf::table_view const& src_table, std::vector<cudf::size_type> const& splits)
+{
+  auto result = cudf::contiguous_split(src_table, splits);
+}
+
+void chunked_pack(cudf::table_view const& src_table, std::vector<cudf::size_type> const&)
+{
+  auto mr     = rmm::mr::get_current_device_resource();
+  auto stream = cudf::get_default_stream();
+  rmm::device_buffer user_buffer(100L * 1024 * 1024, stream, mr);
+  auto chunked_pack = cudf::make_chunked_pack(src_table, user_buffer.size(), mr);
+  auto user_buffer_span =
+    cudf::device_span<uint8_t>(static_cast<uint8_t*>(user_buffer.data()), user_buffer.size());
+  while (chunked_pack->has_next()) {
+    auto iter_size = chunked_pack->next(user_buffer_span);
+  }
+  stream.synchronize();
+}
+
+template <typename T, typename ContigSplitImpl>
 void BM_contiguous_split_common(benchmark::State& state,
                                 std::vector<T>& src_cols,
                                 int64_t num_rows,
                                 int64_t num_splits,
-                                int64_t bytes_total)
+                                int64_t bytes_total,
+                                ContigSplitImpl& impl)
 {
   // generate splits
   std::vector<cudf::size_type> splits;
@@ -57,7 +77,7 @@ void BM_contiguous_split_common(benchmark::State& state,

   for (auto _ : state) {
     cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
-    auto result = cudf::contiguous_split(src_table, splits);
+    impl(src_table, splits);
   }

   // it's 2x bytes_total because we're both reading and writing.
@@ -65,8 +85,10 @@ void BM_contiguous_split_common(benchmark::State& state,
 }

 class ContiguousSplit : public cudf::benchmark {};
+class ChunkedPack : public cudf::benchmark {};

-void BM_contiguous_split(benchmark::State& state)
+template <typename ContiguousSplitImpl>
+void BM_contiguous_split(benchmark::State& state, ContiguousSplitImpl& impl)
 {
   int64_t const total_desired_bytes = state.range(0);
   cudf::size_type const num_cols    = state.range(1);
@@ -91,12 +113,14 @@ void BM_contiguous_split(benchmark::State& state)
     (include_validity ? (max(int64_t{1}, (num_rows / 32)) * sizeof(cudf::bitmask_type) * num_cols)
                       : 0);

-  BM_contiguous_split_common(state, src_cols, num_rows, num_splits, total_bytes);
+  BM_contiguous_split_common(state, src_cols, num_rows, num_splits, total_bytes, impl);
 }

 class ContiguousSplitStrings : public cudf::benchmark {};
+class ChunkedPackStrings : public cudf::benchmark {};

-void BM_contiguous_split_strings(benchmark::State& state)
+template <typename ContiguousSplitImpl>
+void BM_contiguous_split_strings(benchmark::State& state, ContiguousSplitImpl& impl)
 {
   int64_t const total_desired_bytes = state.range(0);
   cudf::size_type const num_cols    = state.range(1);
@@ -133,13 +157,13 @@ void BM_contiguous_split_strings(benchmark::State& state)
     (include_validity ?
(max(int64_t{1}, (num_rows / 32)) * sizeof(cudf::bitmask_type) * num_cols)
                       : 0);

-  BM_contiguous_split_common(state, src_cols, num_rows, num_splits, total_bytes);
+  BM_contiguous_split_common(state, src_cols, num_rows, num_splits, total_bytes, impl);
 }

 #define CSBM_BENCHMARK_DEFINE(name, size, num_columns, num_splits, validity) \
   BENCHMARK_DEFINE_F(ContiguousSplit, name)(::benchmark::State & state)      \
   {                                                                          \
-    BM_contiguous_split(state);                                              \
+    BM_contiguous_split(state, contiguous_split);                            \
   }                                                                          \
   BENCHMARK_REGISTER_F(ContiguousSplit, name)                                \
     ->Args({size, num_columns, num_splits, validity})                        \
@@ -168,7 +192,7 @@ CSBM_BENCHMARK_DEFINE(1Gb1ColValidityNoSplits, (int64_t)1 * 1024 * 1024 * 1024,
 #define CSBM_STRINGS_BENCHMARK_DEFINE(name, size, num_columns, num_splits, validity) \
   BENCHMARK_DEFINE_F(ContiguousSplitStrings, name)(::benchmark::State & state)       \
   {                                                                                  \
-    BM_contiguous_split_strings(state);                                              \
+    BM_contiguous_split_strings(state, contiguous_split);                            \
   }                                                                                  \
   BENCHMARK_REGISTER_F(ContiguousSplitStrings, name)                                 \
     ->Args({size, num_columns, num_splits, validity})                                \
@@ -189,3 +213,53 @@ CSBM_STRINGS_BENCHMARK_DEFINE(1Gb10ColsNoValidity, (int64_t)1 * 1024 * 1024 * 10
 CSBM_STRINGS_BENCHMARK_DEFINE(1Gb10ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 10, 256, 1);
 CSBM_STRINGS_BENCHMARK_DEFINE(1Gb1ColNoSplits, (int64_t)1 * 1024 * 1024 * 1024, 1, 0, 0);
 CSBM_STRINGS_BENCHMARK_DEFINE(1Gb1ColValidityNoSplits, (int64_t)1 * 1024 * 1024 * 1024, 1, 0, 1);
+
+#define CCSBM_BENCHMARK_DEFINE(name, size, num_columns, num_splits, validity) \
+  BENCHMARK_DEFINE_F(ChunkedPack, name)(::benchmark::State & state)           \
+  {                                                                           \
+    BM_contiguous_split(state, chunked_pack);                                 \
+  }                                                                           \
+  BENCHMARK_REGISTER_F(ChunkedPack, name)                                     \
+    ->Args({size, num_columns, num_splits, validity})                         \
+    ->Unit(benchmark::kMillisecond)                                           \
+    ->UseManualTime()                                                         \
+    ->Iterations(8)
+CCSBM_BENCHMARK_DEFINE(6Gb512ColsNoValidity, (int64_t)6 * 1024 * 1024 * 1024, 512, 0, 0);
+CCSBM_BENCHMARK_DEFINE(6Gb512ColsValidity, (int64_t)6 * 1024 * 1024 * 1024, 512, 0, 1);
+CCSBM_BENCHMARK_DEFINE(6Gb10ColsNoValidity, (int64_t)6 * 1024 * 1024 * 1024, 10, 0, 0);
+CCSBM_BENCHMARK_DEFINE(6Gb10ColsValidity, (int64_t)6 * 1024 * 1024 * 1024, 10, 0, 1);
+
+CCSBM_BENCHMARK_DEFINE(4Gb512ColsNoValidity, (int64_t)4 * 1024 * 1024 * 1024, 512, 0, 0);
+CCSBM_BENCHMARK_DEFINE(4Gb512ColsValidity, (int64_t)4 * 1024 * 1024 * 1024, 512, 0, 1);
+CCSBM_BENCHMARK_DEFINE(4Gb10ColsNoValidity, (int64_t)4 * 1024 * 1024 * 1024, 10, 0, 0);
+CCSBM_BENCHMARK_DEFINE(4Gb10ColsValidity, (int64_t)4 * 1024 * 1024 * 1024, 10, 0, 1);
+CCSBM_BENCHMARK_DEFINE(4Gb4ColsValidity, (int64_t)4 * 1024 * 1024 * 1024, 4, 0, 1);
+
+CCSBM_BENCHMARK_DEFINE(1Gb512ColsNoValidity, (int64_t)1 * 1024 * 1024 * 1024, 512, 0, 0);
+CCSBM_BENCHMARK_DEFINE(1Gb512ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 512, 0, 1);
+CCSBM_BENCHMARK_DEFINE(1Gb10ColsNoValidity, (int64_t)1 * 1024 * 1024 * 1024, 10, 0, 0);
+CCSBM_BENCHMARK_DEFINE(1Gb10ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 10, 0, 1);
+CCSBM_BENCHMARK_DEFINE(1Gb1ColValidity, (int64_t)1 * 1024 * 1024 * 1024, 1, 0, 1);
+
+#define CCSBM_STRINGS_BENCHMARK_DEFINE(name, size, num_columns, num_splits, validity) \
+  BENCHMARK_DEFINE_F(ChunkedPackStrings, name)(::benchmark::State & state)            \
+  {                                                                                   \
+    BM_contiguous_split_strings(state, chunked_pack);                                 \
+  }                                                                                   \
+  BENCHMARK_REGISTER_F(ChunkedPackStrings, name)                                      \
+    ->Args({size, num_columns, num_splits, validity})                                 \
+    ->Unit(benchmark::kMillisecond)                                                   \
+    ->UseManualTime()                                                                 \
+    ->Iterations(8)
+
+CCSBM_STRINGS_BENCHMARK_DEFINE(4Gb512ColsNoValidity, (int64_t)4 * 1024 * 1024 * 1024, 512, 0, 0);
+CCSBM_STRINGS_BENCHMARK_DEFINE(4Gb512ColsValidity, (int64_t)4 * 1024 * 1024 * 1024, 512, 0, 1);
+CCSBM_STRINGS_BENCHMARK_DEFINE(4Gb10ColsNoValidity, (int64_t)4 * 1024 * 1024 * 1024, 10, 0, 0);
+CCSBM_STRINGS_BENCHMARK_DEFINE(4Gb10ColsValidity, (int64_t)4 * 1024 * 1024 * 1024, 10, 0, 1);
+CCSBM_STRINGS_BENCHMARK_DEFINE(4Gb4ColsValidity, (int64_t)4 * 1024 * 1024 * 1024, 4, 0, 1);
+
+CCSBM_STRINGS_BENCHMARK_DEFINE(1Gb512ColsNoValidity, (int64_t)1 * 1024 * 1024 * 1024, 512, 0, 0);
+CCSBM_STRINGS_BENCHMARK_DEFINE(1Gb512ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 512, 0, 1);
+CCSBM_STRINGS_BENCHMARK_DEFINE(1Gb10ColsNoValidity, (int64_t)1 * 1024 * 1024 * 1024, 10, 0, 0);
+CCSBM_STRINGS_BENCHMARK_DEFINE(1Gb10ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 10, 0, 1);
+CCSBM_STRINGS_BENCHMARK_DEFINE(1Gb1ColValidity, (int64_t)1 * 1024 * 1024 * 1024, 1, 0, 1);
diff --git a/cpp/include/cudf/contiguous_split.hpp b/cpp/include/cudf/contiguous_split.hpp
index 62d668a98cb..eacfcfcb803 100644
--- a/cpp/include/cudf/contiguous_split.hpp
+++ b/cpp/include/cudf/contiguous_split.hpp
@@ -127,6 +127,141 @@ std::vector<packed_table> contiguous_split(
   std::vector<size_type> const& splits,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

+namespace detail {
+struct contiguous_split_state;
+};
+
+/**
+ * @brief Perform a chunked "pack" operation of the input `table_view` using a user-provided
+ * buffer of size `user_buffer_size`.
+ *
+ * The intent of this operation is to be used in a streamed fashion at times of GPU
+ * out-of-memory, where we want to minimize the number of small cudaMemcpy calls and
+ * tracking of all the metadata associated with cudf tables. Because of the memory constraints,
+ * all thrust and scratch memory allocations use the passed-in memory resource exclusively,
+ * not a per-device memory resource.
+ *
+ * The caller uses two methods to carry out the chunked pack: `has_next` and `next`.
+ * Here is an example:
+ *
+ * // Create a table_view
+ * cudf::table_view tv = ...;
+ *
+ * // Choose a memory resource. This memory resource is used for scratch/thrust temporary
+ * // data. In memory-constrained cases, this can be used to set aside scratch memory
+ * // for `chunked_pack` at the beginning of a program.
+ * auto mr = rmm::mr::get_current_device_resource();
+ *
+ * auto stream = cudf::get_default_stream();
+ *
+ * // Define a bounce buffer size: the larger the bounce buffer is, the more SMs can be
+ * // occupied by this algorithm.
+ * std::size_t user_buffer_size = 128*1024*1024;
+ *
+ * auto chunked_packer = make_chunked_pack(tv, user_buffer_size, mr);
+ *
+ * std::size_t host_offset = 0;
+ * auto host_buffer = ...; // obtain a host buffer you would like to copy to
+ *
+ * while (chunked_packer->has_next()) {
+ *   // get a user buffer of size `user_buffer_size`
+ *   cudf::device_span<uint8_t> user_buffer = ...;
+ *   std::size_t bytes_copied = chunked_packer->next(user_buffer);
+ *
+ *   // buffer will hold the contents of at most `user_buffer_size` bytes
+ *   // of the contiguously packed input `table_view`. You are now free to copy
+ *   // this memory somewhere else, for example, to host.
+ *   cudaMemcpyAsync(
+ *     host_buffer.data() + host_offset,
+ *     user_buffer.data(),
+ *     bytes_copied,
+ *     cudaMemcpyDefault,
+ *     stream);
+ *
+ *   host_offset += bytes_copied;
+ * }
+ */
+class chunked_pack {
+ public:
+  /**
+   * @brief Construct a `chunked_pack` class.
+   *
+   * @param input source `table_view` to pack
+   * @param user_buffer_size bounce buffer size (in bytes) that will be passed on `next`. Must be
+   * at least 1MB
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   * @param mr RMM memory resource to be used for temporary and scratch allocations only
+   */
+  explicit chunked_pack(cudf::table_view const& input,
+                        std::size_t user_buffer_size,
+                        rmm::cuda_stream_view stream,
+                        rmm::mr::device_memory_resource* mr);
+
+  /**
+   * @brief Destructor that will be implemented as default, required because
+   * contiguous_split_state is incomplete at this stage.
+   */
+  ~chunked_pack();
+
+  /**
+   * @brief Obtain the total size of the contiguously packed `table_view`.
+   *
+   * @return total size (in bytes) of all the chunks
+   */
+  [[nodiscard]] std::size_t get_total_contiguous_size() const;
+
+  /**
+   * @brief Function to check if there are chunks left to be copied.
+   *
+   * @return true if there are chunks left to be copied, and false otherwise
+   */
+  [[nodiscard]] bool has_next() const;
+
+  /**
+   * @brief Packs the next chunk into `user_buffer`. This should be called as long as
+   * `has_next` returns true. If `next` is called when `has_next` is false, an exception
+   * is thrown.
+   *
+   * @throws cudf::logic_error If the size of `user_buffer` is different than `user_buffer_size`
+   * @throws cudf::logic_error If called after all chunks have been copied
+   *
+   * @param user_buffer device span representing a bounce buffer. The size of this span
+   * must equal the `user_buffer_size` parameter passed at construction
+   * @return The number of bytes that were written to `user_buffer` (at most
+   * `user_buffer_size`)
+   */
+  [[nodiscard]] std::size_t next(cudf::device_span<uint8_t> const& user_buffer);
+
+  /**
+   * @brief Build the opaque metadata for all added columns.
+   *
+   * @return A vector containing the serialized column metadata
+   */
+  [[nodiscard]] std::unique_ptr<std::vector<uint8_t>> build_metadata() const;
+
+ private:
+  // internal state of contiguous split
+  std::unique_ptr<detail::contiguous_split_state> state;
+};
+
+/**
+ * @brief Creates a `chunked_pack` instance to perform a "pack" of the `table_view`
+ * `input`, where a bounce buffer of `user_buffer_size` is filled with chunks of the
+ * overall operation. This operation can be used in cases where GPU memory is constrained.
+ *
+ * @throws cudf::logic_error When user_buffer_size is less than 1MB
+ *
+ * @param input source `table_view` to pack
+ * @param user_buffer_size bounce buffer size (in bytes) that will be passed on `next`. Must be
+ * at least 1MB
+ * @param mr RMM memory resource to be used for temporary and scratch allocations only
+ * @return a unique_ptr of chunked_pack
+ */
+std::unique_ptr<chunked_pack> make_chunked_pack(cudf::table_view const& input,
+                                                std::size_t user_buffer_size,
+                                                rmm::mr::device_memory_resource* mr);
+
 /**
  * @brief Deep-copy a `table_view` into a serialized contiguous memory format.
  *
diff --git a/cpp/include/cudf/detail/contiguous_split.hpp b/cpp/include/cudf/detail/contiguous_split.hpp
index 4c6d19739cf..8ba76a5e8af 100644
--- a/cpp/include/cudf/detail/contiguous_split.hpp
+++ b/cpp/include/cudf/detail/contiguous_split.hpp
@@ -67,7 +67,7 @@ class metadata_builder {
    * @brief Destructor that will be implemented as default, required because metadata_builder_impl
    * is incomplete at this stage.
    */
-  ~metadata_builder() = default;
+  ~metadata_builder();

   /**
    * @brief Add a column to this metadata builder.
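With the declarations above in hand, a consumer drives the whole pack through the bounce
buffer in a loop. The following is a minimal sketch, not part of this patch: `pack_to_host`
is a hypothetical helper, and it assumes only the calls declared in this header
(`make_chunked_pack`, `has_next`, `next`, `get_total_contiguous_size`, `build_metadata`)
plus standard CUDA and RMM facilities:

    #include <cudf/contiguous_split.hpp>
    #include <cudf/utilities/default_stream.hpp>

    #include <rmm/device_buffer.hpp>
    #include <rmm/mr/device/per_device_resource.hpp>

    #include <cuda_runtime.h>

    #include <vector>

    // Hypothetical helper: stream `tv` through a device bounce buffer into host memory.
    std::vector<uint8_t> pack_to_host(cudf::table_view const& tv)
    {
      auto stream = cudf::get_default_stream();
      auto mr     = rmm::mr::get_current_device_resource();

      std::size_t const bounce_size = 1024 * 1024;  // the documented 1MB minimum
      rmm::device_buffer bounce(bounce_size, stream, mr);
      auto packer = cudf::make_chunked_pack(tv, bounce_size, mr);

      std::vector<uint8_t> host_data(packer->get_total_contiguous_size());
      std::size_t offset = 0;
      while (packer->has_next()) {
        // fill the bounce buffer with the next chunk, then drain it to host
        auto const copied = packer->next(
          cudf::device_span<uint8_t>(static_cast<uint8_t*>(bounce.data()), bounce_size));
        cudaMemcpyAsync(
          host_data.data() + offset, bounce.data(), copied, cudaMemcpyDeviceToHost, stream.value());
        offset += copied;
      }
      stream.synchronize();
      // packer->build_metadata() returns the serialized metadata needed to
      // reassemble a table over this contiguous buffer later.
      return host_data;
    }

Note that the bounce buffer is reused across iterations, so the host copy must drain it
before the next call to `next`; enqueuing both on the same stream guarantees that ordering.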
diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu
index 4c3b4eddb8d..e94ce1d8156 100644
--- a/cpp/src/copying/contiguous_split.cu
+++ b/cpp/src/copying/contiguous_split.cu
@@ -21,7 +21,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -43,6 +42,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -270,7 +270,7 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst,
  *
  * @param src_bufs Input source buffers
  * @param dst_bufs Destination buffers
- * @param buf_info Information on the range of values to be copied for each destination buffer.
+ * @param buf_info Information on the range of values to be copied for each destination buffer
  */
 template <int block_size>
 __global__ void copy_partitions(uint8_t const** src_bufs,
@@ -296,6 +296,43 @@ __global__ void copy_partitions(uint8_t const** src_bufs,
     buf_info[buf_index].valid_count > 0 ? &buf_info[buf_index].valid_count : nullptr);
 }

+/**
+ * @brief Kernel which copies data from multiple source buffers into a single
+ * user-provided destination (bounce) buffer.
+ *
+ * When doing a contiguous_split on X columns comprising N total internal buffers
+ * with M splits, we end up having to copy N*M source/destination buffer pairs.
+ * These copies are further subdivided into batches to distribute the amount of work
+ * to be done as evenly as possible across the multiprocessors on the device.
+ * This kernel is arranged such that each block copies 1 batch.
+ *
+ * @param src_bufs Input source buffers
+ * @param user_buffer Destination buffer; each batch is written at its precomputed
+ * `dst_offset` within this buffer
+ * @param buf_info Information on the range of values to be copied for each batch
+ */
+template <int block_size>
+__global__ void copy_partitions(uint8_t const** src_bufs,
+                                uint8_t* user_buffer,
+                                dst_buf_info* buf_info)
+{
+  auto const buf_index     = blockIdx.x;
+  auto const src_buf_index = buf_info[buf_index].src_buf_index;
+
+  // copy, shifting offsets and validity bits as needed
+  copy_buffer(
+    user_buffer + buf_info[buf_index].dst_offset,
+    src_bufs[src_buf_index],
+    threadIdx.x,
+    buf_info[buf_index].num_elements,
+    buf_info[buf_index].element_size,
+    buf_info[buf_index].src_element_index,
+    blockDim.x,
+    buf_info[buf_index].value_shift,
+    buf_info[buf_index].bit_shift,
+    buf_info[buf_index].num_rows,
+    buf_info[buf_index].valid_count > 0 ? &buf_info[buf_index].valid_count : nullptr);
+}
+
 // The block of functions below are all related:
 //
 // compute_offset_stack_size()
@@ -336,7 +373,7 @@ bool is_offset_type(type_id id) { return (id == type_id::STRING or id == type_id
  * @param end End of input columns
  * @param offset_depth Current offset nesting depth
  *
- * @returns Total offset stack size needed for this range of columns.
+ * @returns Total offset stack size needed for this range of columns
  */
 template <typename InputIter>
 std::size_t compute_offset_stack_size(InputIter begin, InputIter end, int offset_depth = 0)
@@ -400,7 +437,8 @@ template <typename InputIter>
 size_type count_src_bufs(InputIter begin, InputIter end)
 {
   auto buf_iter = thrust::make_transform_iterator(begin, [](column_view const& col) {
-    return 1 + (col.nullable() ? 1 : 0) + count_src_bufs(col.child_begin(), col.child_end());
+    auto children_counts = count_src_bufs(col.child_begin(), col.child_end());
+    return 1 + (col.nullable() ?
1 : 0) + children_counts; }); return std::accumulate(buf_iter, buf_iter + std::distance(begin, end), 0); } @@ -661,6 +699,66 @@ std::pair setup_source_buf_info(InputIter begin, return {current, offset_stack_pos}; } +/** + * @brief Given a column, processed split buffers, and a metadata builder, populate + * the metadata for this column in the builder, and return a tuple of: + * column size, data offset, bitmask offset and null count. + * + * @param src column_view to create metadata from + * @param current_info dst_buf_info pointer reference, pointing to this column's buffer info + * This is a pointer reference because it is updated by this function as the + * columns's validity and data buffers are visited + * @param mb A metadata_builder instance to update with the column's packed metadata + * @param use_src_null_count True for the chunked_pack case where current_info has invalid null + * count information. The null count should be taken + * from `src` because this case is restricted to a single partition + * (no splits) + * @returns a std::tuple containing: + * column size, data offset, bitmask offset, and null count + */ +template +std::tuple build_output_column_metadata( + column_view const& src, + BufInfo& current_info, + detail::metadata_builder& mb, + bool use_src_null_count) +{ + auto [bitmask_offset, null_count] = [&]() { + if (src.nullable()) { + // offsets in the existing serialized_column metadata are int64_t + // that's the reason for the casting in this code. + int64_t const bitmask_offset = + current_info->num_elements == 0 + ? -1 // this means that the bitmask buffer pointer should be nullptr + : static_cast(current_info->dst_offset); + + // use_src_null_count is used for the chunked contig split case, where we have + // no splits: the null_count is just the source column's null_count + size_type const null_count = use_src_null_count + ? src.null_count() + : (current_info->num_elements == 0 + ? 0 + : (current_info->num_rows - current_info->valid_count)); + + ++current_info; + return std::pair(bitmask_offset, null_count); + } + return std::pair(static_cast(-1), 0); + }(); + + // size/data pointer for the column + auto const col_size = static_cast(current_info->num_elements); + int64_t const data_offset = src.num_children() > 0 || col_size == 0 || src.head() == nullptr + ? -1 + : static_cast(current_info->dst_offset); + + mb.add_column_info_to_meta( + src.type(), col_size, null_count, data_offset, bitmask_offset, src.num_children()); + + ++current_info; + return std::make_tuple(col_size, data_offset, bitmask_offset, null_count); +} + /** * @brief Given a set of input columns and processed split buffers, produce * output columns. @@ -678,6 +776,7 @@ std::pair setup_source_buf_info(InputIter begin, * copied buffer * @param out_begin Output iterator of column views * @param base_ptr Pointer to the base address of copied data for the working partition + * @param mb packed column metadata builder * * @returns new dst_buf_info iterator after processing this range of input columns */ @@ -686,39 +785,72 @@ BufInfo build_output_columns(InputIter begin, InputIter end, BufInfo info_begin, Output out_begin, - uint8_t const* const base_ptr) + uint8_t const* const base_ptr, + detail::metadata_builder& mb) { auto current_info = info_begin; - std::transform(begin, end, out_begin, [¤t_info, base_ptr](column_view const& src) { - auto [bitmask_ptr, null_count] = [&]() { - if (src.nullable()) { - auto const ptr = - current_info->num_elements == 0 - ? 
nullptr - : reinterpret_cast(base_ptr + current_info->dst_offset); - auto const null_count = current_info->num_elements == 0 - ? 0 - : (current_info->num_rows - current_info->valid_count); - ++current_info; - return std::pair(ptr, null_count); - } - return std::pair(static_cast(nullptr), 0); - }(); + std::transform(begin, end, out_begin, [¤t_info, base_ptr, &mb](column_view const& src) { + size_type col_size, null_count; + int64_t bitmask_offset; + int64_t data_offset; + std::tie(col_size, data_offset, bitmask_offset, null_count) = + build_output_column_metadata(src, current_info, mb, false); + + auto bitmask_ptr = + base_ptr != nullptr && bitmask_offset != -1 + ? reinterpret_cast(base_ptr + static_cast(bitmask_offset)) + : nullptr; // size/data pointer for the column - auto const size = current_info->num_elements; - uint8_t const* data_ptr = - size == 0 || src.head() == nullptr ? nullptr : base_ptr + current_info->dst_offset; - ++current_info; + uint8_t const* data_ptr = base_ptr != nullptr && data_offset != -1 + ? base_ptr + static_cast(data_offset) + : nullptr; // children auto children = std::vector{}; children.reserve(src.num_children()); current_info = build_output_columns( - src.child_begin(), src.child_end(), current_info, std::back_inserter(children), base_ptr); + src.child_begin(), src.child_end(), current_info, std::back_inserter(children), base_ptr, mb); + + return column_view{ + src.type(), col_size, data_ptr, bitmask_ptr, null_count, 0, std::move(children)}; + }); + + return current_info; +} + +/** + * @brief Given a set of input columns, processed split buffers, and a metadata_builder, + * append column metadata using the builder. + * + * After performing the split we are left with 1 large buffer per incoming split + * partition. We need to traverse this buffer and distribute the individual + * subpieces that represent individual columns and children to produce the final + * output columns. + * + * This function is called recursively in the case of nested types. + * + * @param begin Beginning of input columns + * @param end End of input columns + * @param info_begin Iterator of dst_buf_info structs containing information about each + * copied buffer + * @param mb packed column metadata builder + * + * @returns new dst_buf_info iterator after processing this range of input columns + */ +template +BufInfo populate_metadata(InputIter begin, + InputIter end, + BufInfo info_begin, + detail::metadata_builder& mb) +{ + auto current_info = info_begin; + std::for_each(begin, end, [¤t_info, &mb](column_view const& src) { + build_output_column_metadata(src, current_info, mb, true); - return column_view{src.type(), size, data_ptr, bitmask_ptr, null_count, 0, std::move(children)}; + // children + current_info = populate_metadata(src.child_begin(), src.child_end(), current_info, mb); }); return current_info; @@ -739,8 +871,8 @@ struct buf_size_functor { * The key is simply the partition index. */ struct split_key_functor { - int num_columns; - int operator() __device__(int buf_index) { return buf_index / num_columns; } + int num_src_bufs; + int operator() __device__(int buf_index) { return buf_index / num_src_bufs; } }; /** @@ -813,282 +945,278 @@ struct size_of_helper { }; /** - * @brief Functor for returning the number of chunks an input buffer is being + * @brief Functor for returning the number of batches an input buffer is being * subdivided into during the repartitioning step. * * Note: columns types which themselves inherently have no data (strings, lists, * structs) return 0. 
*/ -struct num_chunks_func { - thrust::pair const* chunks; - __device__ std::size_t operator()(size_type i) const { return thrust::get<0>(chunks[i]); } +struct num_batches_func { + thrust::pair const* batches; + __device__ std::size_t operator()(size_type i) const { return thrust::get<0>(batches[i]); } }; -void copy_data(int num_bufs, - int num_src_bufs, - uint8_t const** d_src_bufs, - uint8_t** d_dst_bufs, - dst_buf_info* _d_dst_buf_info, - rmm::cuda_stream_view stream) -{ - // Since we parallelize at one block per copy, we are vulnerable to situations where we - // have small numbers of copies to do (a combination of small numbers of splits and/or columns), - // so we will take the actual set of outgoing source/destination buffers and further partition - // them into much smaller chunks in order to drive up the number of blocks and overall occupancy. - auto const desired_chunk_size = std::size_t{1 * 1024 * 1024}; - rmm::device_uvector> chunks(num_bufs, stream); - thrust::transform( - rmm::exec_policy(stream), - _d_dst_buf_info, - _d_dst_buf_info + num_bufs, - chunks.begin(), - [desired_chunk_size] __device__( - dst_buf_info const& buf) -> thrust::pair { - // Total bytes for this incoming partition +/** + * @brief Get the size in bytes of a batch described by `dst_buf_info`. + */ +struct batch_byte_size_function { + size_type num_batches; + dst_buf_info const* infos; + __device__ std::size_t operator()(size_type i) const + { + if (i == num_batches) { + return 0; + } else { + auto& buf = *(infos + i); std::size_t const bytes = static_cast(buf.num_elements) * static_cast(buf.element_size); + return util::round_up_unsafe(bytes, split_align); + } + } +}; - // This clause handles nested data types (e.g. list or string) that store no data in the row - // columns, only in their children. - if (bytes == 0) { return {1, 0}; } +/** + * @brief Get the input buffer index given the output buffer index. + */ +struct out_to_in_index_function { + offset_type const* batch_offsets; + int num_bufs; + __device__ int operator()(size_type i) const + { + return static_cast( + thrust::upper_bound(thrust::seq, batch_offsets, batch_offsets + num_bufs + 1, i) - + batch_offsets) - + 1; + } +}; - // The number of chunks we want to subdivide this buffer into - std::size_t const num_chunks = - max(std::size_t{1}, util::round_up_unsafe(bytes, desired_chunk_size) / desired_chunk_size); +}; // anonymous namespace - // NOTE: leaving chunk size as a separate parameter for future tuning - // possibilities, even though in the current implementation it will be a - // constant. - return {num_chunks, desired_chunk_size}; - }); +namespace detail { - rmm::device_uvector chunk_offsets(num_bufs + 1, stream); - auto buf_count_iter = cudf::detail::make_counting_transform_iterator( - 0, [num_bufs, num_chunks = num_chunks_func{chunks.begin()}] __device__(size_type i) { - return i == num_bufs ? 
0 : num_chunks(i); - }); - thrust::exclusive_scan(rmm::exec_policy(stream), - buf_count_iter, - buf_count_iter + num_bufs + 1, - chunk_offsets.begin(), - 0); +// packed block of memory 1: split indices and src_buf_info structs +struct packed_split_indices_and_src_buf_info { + explicit packed_split_indices_and_src_buf_info(cudf::table_view const& input, + std::vector const& splits, + std::size_t num_partitions, + cudf::size_type num_src_bufs, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + indices_size = cudf::util::round_up_safe((num_partitions + 1) * sizeof(size_type), split_align); + src_buf_info_size = cudf::util::round_up_safe(num_src_bufs * sizeof(src_buf_info), split_align); + + // host-side + h_indices_and_source_info = std::vector(indices_size + src_buf_info_size); + h_indices = reinterpret_cast(h_indices_and_source_info.data()); + h_src_buf_info = + reinterpret_cast(h_indices_and_source_info.data() + indices_size); + + // compute splits -> indices. + // these are row numbers per split + h_indices[0] = 0; + h_indices[num_partitions] = input.column(0).size(); + std::copy(splits.begin(), splits.end(), std::next(h_indices)); + + // setup source buf info + setup_source_buf_info(input.begin(), input.end(), h_src_buf_info, h_src_buf_info, stream); + + offset_stack_partition_size = compute_offset_stack_size(input.begin(), input.end()); + offset_stack_size = offset_stack_partition_size * num_partitions * sizeof(size_type); + // device-side + // gpu-only : stack space needed for nested list offset calculation + d_indices_and_source_info = + rmm::device_buffer(indices_size + src_buf_info_size + offset_stack_size, stream, mr); + d_indices = reinterpret_cast(d_indices_and_source_info.data()); + d_src_buf_info = reinterpret_cast( + reinterpret_cast(d_indices_and_source_info.data()) + indices_size); + d_offset_stack = + reinterpret_cast(reinterpret_cast(d_indices_and_source_info.data()) + + indices_size + src_buf_info_size); + + CUDF_CUDA_TRY(cudaMemcpyAsync( + d_indices, h_indices, indices_size + src_buf_info_size, cudaMemcpyDefault, stream.value())); + } - auto out_to_in_index = [chunk_offsets = chunk_offsets.begin(), num_bufs] __device__(size_type i) { - return static_cast( - thrust::upper_bound(thrust::seq, chunk_offsets, chunk_offsets + num_bufs + 1, i) - - chunk_offsets) - - 1; - }; - - // apply the chunking. 
- auto const num_chunks = - cudf::detail::make_counting_transform_iterator(0, num_chunks_func{chunks.begin()}); - size_type const new_buf_count = - thrust::reduce(rmm::exec_policy(stream), num_chunks, num_chunks + chunks.size()); - rmm::device_uvector d_dst_buf_info(new_buf_count, stream); - auto iter = thrust::make_counting_iterator(0); - thrust::for_each( - rmm::exec_policy(stream), - iter, - iter + new_buf_count, - [_d_dst_buf_info, - d_dst_buf_info = d_dst_buf_info.begin(), - chunks = chunks.begin(), - chunk_offsets = chunk_offsets.begin(), - num_bufs, - num_src_bufs, - out_to_in_index] __device__(size_type i) { - size_type const in_buf_index = out_to_in_index(i); - size_type const chunk_index = i - chunk_offsets[in_buf_index]; - auto const chunk_size = thrust::get<1>(chunks[in_buf_index]); - dst_buf_info const& in = _d_dst_buf_info[in_buf_index]; + size_type indices_size; + std::size_t src_buf_info_size; + std::size_t offset_stack_size; - // adjust info - dst_buf_info& out = d_dst_buf_info[i]; - out.element_size = in.element_size; - out.value_shift = in.value_shift; - out.bit_shift = in.bit_shift; - out.valid_count = - in.valid_count; // valid count will be set to 1 if this is a validity buffer - out.src_buf_index = in.src_buf_index; - out.dst_buf_index = in.dst_buf_index; + std::vector h_indices_and_source_info; + rmm::device_buffer d_indices_and_source_info; - size_type const elements_per_chunk = - out.element_size == 0 ? 0 : chunk_size / out.element_size; - out.num_elements = ((chunk_index + 1) * elements_per_chunk) > in.num_elements - ? in.num_elements - (chunk_index * elements_per_chunk) - : elements_per_chunk; + size_type* h_indices; + src_buf_info* h_src_buf_info; - size_type const rows_per_chunk = - // if this is a validity buffer, each element is a bitmask_type, which - // corresponds to 32 rows. - out.valid_count > 0 - ? elements_per_chunk * static_cast(detail::size_in_bits()) - : elements_per_chunk; - out.num_rows = ((chunk_index + 1) * rows_per_chunk) > in.num_rows - ? 
in.num_rows - (chunk_index * rows_per_chunk) - : rows_per_chunk; + int offset_stack_partition_size; + size_type* d_indices; + src_buf_info* d_src_buf_info; + size_type* d_offset_stack; +}; - out.src_element_index = in.src_element_index + (chunk_index * elements_per_chunk); - out.dst_offset = in.dst_offset + (chunk_index * chunk_size); +// packed block of memory 2: partition buffer sizes and dst_buf_info structs +struct packed_partition_buf_size_and_dst_buf_info { + packed_partition_buf_size_and_dst_buf_info(cudf::table_view const& input, + std::vector const& splits, + std::size_t num_partitions, + cudf::size_type num_src_bufs, + std::size_t num_bufs, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : stream(stream) + { + buf_sizes_size = cudf::util::round_up_safe(num_partitions * sizeof(std::size_t), split_align); + dst_buf_info_size = cudf::util::round_up_safe(num_bufs * sizeof(dst_buf_info), split_align); + // host-side + h_buf_sizes_and_dst_info = std::vector(buf_sizes_size + dst_buf_info_size); + h_buf_sizes = reinterpret_cast(h_buf_sizes_and_dst_info.data()); + h_dst_buf_info = + reinterpret_cast(h_buf_sizes_and_dst_info.data() + buf_sizes_size); + + // device-side + d_buf_sizes_and_dst_info = rmm::device_buffer(buf_sizes_size + dst_buf_info_size, stream, mr); + d_buf_sizes = reinterpret_cast(d_buf_sizes_and_dst_info.data()); + + //// destination buffer info + d_dst_buf_info = reinterpret_cast( + static_cast(d_buf_sizes_and_dst_info.data()) + buf_sizes_size); + } - // out.bytes and out.buf_size are unneeded here because they are only used to - // calculate real output buffer sizes. the data we are generating here is - // purely intermediate for the purposes of doing more uniform copying of data - // underneath the final structure of the output - }); + void copy_to_host() + { + // DtoH buf sizes and col info back to the host + CUDF_CUDA_TRY(cudaMemcpyAsync(h_buf_sizes, + d_buf_sizes, + buf_sizes_size + dst_buf_info_size, + cudaMemcpyDefault, + stream.value())); + } - // perform the copy - constexpr size_type block_size = 256; - copy_partitions<<>>( - d_src_bufs, d_dst_bufs, d_dst_buf_info.data()); - - // postprocess valid_counts - auto keys = cudf::detail::make_counting_transform_iterator( - 0, [out_to_in_index] __device__(size_type i) { return out_to_in_index(i); }); - auto values = thrust::make_transform_iterator( - d_dst_buf_info.begin(), [] __device__(dst_buf_info const& info) { return info.valid_count; }); - thrust::reduce_by_key(rmm::exec_policy(stream), - keys, - keys + new_buf_count, - values, - thrust::make_discard_iterator(), - dst_valid_count_output_iterator{_d_dst_buf_info}); -} + rmm::cuda_stream_view stream; -}; // anonymous namespace + // buffer sizes and destination info (used in batched copies) + std::size_t buf_sizes_size; + std::size_t dst_buf_info_size; -namespace detail { + std::vector h_buf_sizes_and_dst_info; + std::size_t* h_buf_sizes; + dst_buf_info* h_dst_buf_info; -std::vector contiguous_split(cudf::table_view const& input, - std::vector const& splits, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - if (input.num_columns() == 0) { return {}; } - if (splits.size() > 0) { - CUDF_EXPECTS(splits.back() <= input.column(0).size(), - "splits can't exceed size of input columns"); + rmm::device_buffer d_buf_sizes_and_dst_info; + std::size_t* d_buf_sizes; + dst_buf_info* d_dst_buf_info; +}; + +// Packed block of memory 3: +// Pointers to source and destination buffers (and stack space on the +// gpu for offset 
computation) +struct packed_src_and_dst_pointers { + packed_src_and_dst_pointers(cudf::table_view const& input, + std::size_t num_partitions, + cudf::size_type num_src_bufs, + int num_iterations, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : stream(stream) + { + src_bufs_size = cudf::util::round_up_safe(num_src_bufs * sizeof(uint8_t*), split_align); + + dst_bufs_size = cudf::util::round_up_safe(num_partitions * sizeof(uint8_t*), split_align); + + // host-side + h_src_and_dst_buffers = std::vector(src_bufs_size + dst_bufs_size); + h_src_bufs = reinterpret_cast(h_src_and_dst_buffers.data()); + h_dst_bufs = reinterpret_cast(h_src_and_dst_buffers.data() + src_bufs_size); + + // device-side + d_src_and_dst_buffers = rmm::device_buffer(src_bufs_size + dst_bufs_size, stream, mr); + d_src_bufs = reinterpret_cast(d_src_and_dst_buffers.data()); + d_dst_bufs = reinterpret_cast( + reinterpret_cast(d_src_and_dst_buffers.data()) + src_bufs_size); + + // setup src buffers + setup_src_buf_data(input.begin(), input.end(), h_src_bufs); } + + void copy_to_device() { - size_type begin = 0; - for (std::size_t i = 0; i < splits.size(); i++) { - size_type end = splits[i]; - CUDF_EXPECTS(begin >= 0, "Starting index cannot be negative."); - CUDF_EXPECTS(end >= begin, "End index cannot be smaller than the starting index."); - CUDF_EXPECTS(end <= input.column(0).size(), "Slice range out of bounds."); - begin = end; - } + CUDF_CUDA_TRY(cudaMemcpyAsync(d_src_and_dst_buffers.data(), + h_src_and_dst_buffers.data(), + src_bufs_size + dst_bufs_size, + cudaMemcpyDefault, + stream.value())); } - std::size_t const num_partitions = splits.size() + 1; - std::size_t const num_root_columns = input.num_columns(); + const rmm::cuda_stream_view stream; - // if inputs are empty, just return num_partitions empty tables - if (input.column(0).size() == 0) { - // sanitize the inputs (to handle corner cases like sliced tables) - std::vector> empty_columns; - empty_columns.reserve(input.num_columns()); - std::transform( - input.begin(), input.end(), std::back_inserter(empty_columns), [](column_view const& col) { - return cudf::empty_like(col); - }); - std::vector empty_column_views; - empty_column_views.reserve(input.num_columns()); - std::transform(empty_columns.begin(), - empty_columns.end(), - std::back_inserter(empty_column_views), - [](std::unique_ptr const& col) { return col->view(); }); - table_view empty_inputs(empty_column_views); + std::vector h_src_and_dst_buffers; + rmm::device_buffer d_src_and_dst_buffers; + std::size_t src_bufs_size; + std::size_t dst_bufs_size; + const uint8_t** h_src_bufs; + const uint8_t** d_src_bufs; + uint8_t** h_dst_bufs; + uint8_t** d_dst_bufs; +}; - // build the empty results - std::vector result; - result.reserve(num_partitions); - auto iter = thrust::make_counting_iterator(0); - std::transform(iter, - iter + num_partitions, - std::back_inserter(result), - [&empty_inputs](int partition_index) { - return packed_table{ - empty_inputs, - packed_columns{std::make_unique>(pack_metadata( - empty_inputs, static_cast(nullptr), 0)), - std::make_unique()}}; - }); +std::unique_ptr setup_src_and_dst_pointers( + cudf::table_view const& input, + std::size_t num_partitions, + cudf::size_type num_src_bufs, + int num_iterations, + std::vector& out_buffers, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto src_and_dst_pointers = std::make_unique( + input, num_partitions, num_src_bufs, num_iterations, stream, mr); - return result; - } + std::transform( + 
out_buffers.begin(), out_buffers.end(), src_and_dst_pointers->h_dst_bufs, [](auto& buf) { + return static_cast(buf.data()); + }); + + src_and_dst_pointers->copy_to_device(); - // compute # of source buffers (column data, validity, children), # of partitions - // and total # of buffers - size_type const num_src_bufs = count_src_bufs(input.begin(), input.end()); - std::size_t const num_bufs = num_src_bufs * num_partitions; - - // packed block of memory 1. split indices and src_buf_info structs - std::size_t const indices_size = - cudf::util::round_up_safe((num_partitions + 1) * sizeof(size_type), split_align); - std::size_t const src_buf_info_size = - cudf::util::round_up_safe(num_src_bufs * sizeof(src_buf_info), split_align); - // host-side - std::vector h_indices_and_source_info(indices_size + src_buf_info_size); - size_type* h_indices = reinterpret_cast(h_indices_and_source_info.data()); - src_buf_info* h_src_buf_info = - reinterpret_cast(h_indices_and_source_info.data() + indices_size); - // device-side - // gpu-only : stack space needed for nested list offset calculation - int const offset_stack_partition_size = compute_offset_stack_size(input.begin(), input.end()); - std::size_t const offset_stack_size = - offset_stack_partition_size * num_partitions * sizeof(size_type); - rmm::device_buffer d_indices_and_source_info(indices_size + src_buf_info_size + offset_stack_size, - stream, - rmm::mr::get_current_device_resource()); - auto* d_indices = reinterpret_cast(d_indices_and_source_info.data()); - src_buf_info* d_src_buf_info = reinterpret_cast( - reinterpret_cast(d_indices_and_source_info.data()) + indices_size); - size_type* d_offset_stack = - reinterpret_cast(reinterpret_cast(d_indices_and_source_info.data()) + - indices_size + src_buf_info_size); - - // compute splits -> indices. - h_indices[0] = 0; - h_indices[num_partitions] = input.column(0).size(); - std::copy(splits.begin(), splits.end(), std::next(h_indices)); - - // setup source buf info - setup_source_buf_info(input.begin(), input.end(), h_src_buf_info, h_src_buf_info, stream); - - // HtoD indices and source buf info to device - CUDF_CUDA_TRY(cudaMemcpyAsync( - d_indices, h_indices, indices_size + src_buf_info_size, cudaMemcpyDefault, stream.value())); - - // packed block of memory 2. 
partition buffer sizes and dst_buf_info structs - std::size_t const buf_sizes_size = - cudf::util::round_up_safe(num_partitions * sizeof(std::size_t), split_align); - std::size_t const dst_buf_info_size = - cudf::util::round_up_safe(num_bufs * sizeof(dst_buf_info), split_align); - // host-side - std::vector h_buf_sizes_and_dst_info(buf_sizes_size + dst_buf_info_size); - std::size_t* h_buf_sizes = reinterpret_cast(h_buf_sizes_and_dst_info.data()); - dst_buf_info* h_dst_buf_info = - reinterpret_cast(h_buf_sizes_and_dst_info.data() + buf_sizes_size); - // device-side - rmm::device_buffer d_buf_sizes_and_dst_info( - buf_sizes_size + dst_buf_info_size, stream, rmm::mr::get_current_device_resource()); - std::size_t* d_buf_sizes = reinterpret_cast(d_buf_sizes_and_dst_info.data()); - dst_buf_info* d_dst_buf_info = reinterpret_cast( - static_cast(d_buf_sizes_and_dst_info.data()) + buf_sizes_size); + return src_and_dst_pointers; +} + +std::unique_ptr compute_splits( + cudf::table_view const& input, + std::vector const& splits, + std::size_t num_partitions, + cudf::size_type num_src_bufs, + std::size_t num_bufs, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto partition_buf_size_and_dst_buf_info = + std::make_unique( + input, splits, num_partitions, num_src_bufs, num_bufs, stream, mr); + + auto d_dst_buf_info = partition_buf_size_and_dst_buf_info->d_dst_buf_info; + auto h_buf_sizes = partition_buf_size_and_dst_buf_info->h_buf_sizes; + auto d_buf_sizes = partition_buf_size_and_dst_buf_info->d_buf_sizes; + + auto split_indices_and_src_buf_info = + packed_split_indices_and_src_buf_info(input, splits, num_partitions, num_src_bufs, stream, mr); + + auto const d_src_buf_info = split_indices_and_src_buf_info.d_src_buf_info; + auto offset_stack_partition_size = split_indices_and_src_buf_info.offset_stack_partition_size; + auto d_offset_stack = split_indices_and_src_buf_info.d_offset_stack; + auto d_indices = split_indices_and_src_buf_info.d_indices; // compute sizes of each column in each partition, including alignment. 
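  // (illustrative) e.g. for a non-nullable INT32 column of 1,000 rows with splits = {500},
  // each of the two partitions gets num_elements = 500, so its buffer contributes
  // 500 * sizeof(int32_t) bytes, rounded up to split_align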
thrust::transform( - rmm::exec_policy(stream), + rmm::exec_policy(stream, mr), thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_bufs), d_dst_buf_info, - [num_src_bufs, - d_indices, - d_src_buf_info, + [d_src_buf_info, + offset_stack_partition_size, d_offset_stack, - offset_stack_partition_size] __device__(std::size_t t) { + d_indices, + num_src_bufs] __device__(std::size_t t) { int const split_index = t / num_src_bufs; int const src_buf_index = t % num_src_bufs; auto const& src_info = d_src_buf_info[src_buf_index]; @@ -1158,14 +1286,14 @@ std::vector contiguous_split(cudf::table_view const& input, }); // compute total size of each partition + // key is the split index { - // key is split index auto keys = cudf::detail::make_counting_transform_iterator( 0, split_key_functor{static_cast(num_src_bufs)}); auto values = cudf::detail::make_counting_transform_iterator(0, buf_size_functor{d_dst_buf_info}); - thrust::reduce_by_key(rmm::exec_policy(stream), + thrust::reduce_by_key(rmm::exec_policy(stream, mr), keys, keys + num_bufs, values, @@ -1173,14 +1301,14 @@ std::vector contiguous_split(cudf::table_view const& input, d_buf_sizes); } - // compute start offset for each output buffer + // compute start offset for each output buffer for each split { auto keys = cudf::detail::make_counting_transform_iterator( 0, split_key_functor{static_cast(num_src_bufs)}); auto values = cudf::detail::make_counting_transform_iterator(0, buf_size_functor{d_dst_buf_info}); - thrust::exclusive_scan_by_key(rmm::exec_policy(stream), + thrust::exclusive_scan_by_key(rmm::exec_policy(stream, mr), keys, keys + num_bufs, values, @@ -1188,86 +1316,672 @@ std::vector contiguous_split(cudf::table_view const& input, std::size_t{0}); } - // DtoH buf sizes and col info back to the host - CUDF_CUDA_TRY(cudaMemcpyAsync(h_buf_sizes, - d_buf_sizes, - buf_sizes_size + dst_buf_info_size, - cudaMemcpyDefault, - stream.value())); + partition_buf_size_and_dst_buf_info->copy_to_host(); + stream.synchronize(); - // allocate output partition buffers - std::vector out_buffers; - out_buffers.reserve(num_partitions); - std::transform(h_buf_sizes, - h_buf_sizes + num_partitions, - std::back_inserter(out_buffers), - [stream, mr](std::size_t bytes) { - return rmm::device_buffer{bytes, stream, mr}; - }); - - // packed block of memory 3. 
pointers to source and destination buffers (and stack space on the - // gpu for offset computation) - std::size_t const src_bufs_size = - cudf::util::round_up_safe(num_src_bufs * sizeof(uint8_t*), split_align); - std::size_t const dst_bufs_size = - cudf::util::round_up_safe(num_partitions * sizeof(uint8_t*), split_align); - // host-side - std::vector h_src_and_dst_buffers(src_bufs_size + dst_bufs_size); - uint8_t const** h_src_bufs = reinterpret_cast(h_src_and_dst_buffers.data()); - uint8_t** h_dst_bufs = reinterpret_cast(h_src_and_dst_buffers.data() + src_bufs_size); - // device-side - rmm::device_buffer d_src_and_dst_buffers(src_bufs_size + dst_bufs_size + offset_stack_size, - stream, - rmm::mr::get_current_device_resource()); - auto const** d_src_bufs = reinterpret_cast(d_src_and_dst_buffers.data()); - uint8_t** d_dst_bufs = reinterpret_cast( - reinterpret_cast(d_src_and_dst_buffers.data()) + src_bufs_size); - - // setup src buffers - setup_src_buf_data(input.begin(), input.end(), h_src_bufs); - - // setup dst buffers - std::transform(out_buffers.begin(), out_buffers.end(), h_dst_bufs, [](auto& buf) { - return static_cast(buf.data()); - }); + return partition_buf_size_and_dst_buf_info; +} +/** + * @brief Struct containing information about the actual batches we will send to the + * `copy_partitions` kernel and the number of iterations we need to carry out this copy. + * + * For the non-chunked contiguous_split case, this contains the batched dst_buf_infos and the + * number of iterations are going to be 1, since the non-chunked case is single pass. + * + * For the chunked_pack case, this also contains the batched dst_buf_infos for all + * iterations in addition to helping keep the state about what batches have been copied so far + * and what are the sizes (in bytes) of each iteration. + */ +struct chunk_iteration_state { + chunk_iteration_state(rmm::device_uvector _d_batched_dst_buf_info, + rmm::device_uvector _d_batch_offsets, + std::vector _h_num_buffs_per_iteration, + std::vector _h_size_of_buffs_per_iteration, + std::size_t total_size) + : num_iterations(_h_num_buffs_per_iteration.size()), + current_iteration(0), + starting_buff(0), + d_batched_dst_buf_info(std::move(_d_batched_dst_buf_info)), + d_batch_offsets(std::move(_d_batch_offsets)), + h_num_buffs_per_iteration(std::move(_h_num_buffs_per_iteration)), + h_size_of_buffs_per_iteration(std::move(_h_size_of_buffs_per_iteration)), + total_size(total_size) + { + } + + std::pair get_current_starting_index_and_buff_count() const + { + CUDF_EXPECTS(current_iteration < num_iterations, + "current_iteration cannot exceed num_iterations"); + auto count_for_current = h_num_buffs_per_iteration[current_iteration]; + return std::make_pair(starting_buff, count_for_current); + } + + std::size_t advance_iteration() + { + CUDF_EXPECTS(current_iteration < num_iterations, + "current_iteration cannot exceed num_iterations"); + std::size_t bytes_copied = h_size_of_buffs_per_iteration[current_iteration]; + starting_buff += h_num_buffs_per_iteration[current_iteration]; + ++current_iteration; + return bytes_copied; + } - // HtoD src and dest buffers - CUDF_CUDA_TRY(cudaMemcpyAsync( - d_src_bufs, h_src_bufs, src_bufs_size + dst_bufs_size, cudaMemcpyDefault, stream.value())); + bool has_more_copies() const { return current_iteration < num_iterations; } - // perform the copy. 
- copy_data(num_bufs, num_src_bufs, d_src_bufs, d_dst_bufs, d_dst_buf_info, stream); + rmm::device_uvector d_batched_dst_buf_info; + rmm::device_uvector d_batch_offsets; + std::size_t total_size; + int num_iterations; + int current_iteration; - // DtoH dst info (to retrieve null counts) - CUDF_CUDA_TRY(cudaMemcpyAsync( - h_dst_buf_info, d_dst_buf_info, dst_buf_info_size, cudaMemcpyDefault, stream.value())); + private: + std::size_t starting_buff; + std::vector h_num_buffs_per_iteration; + std::vector h_size_of_buffs_per_iteration; +}; - stream.synchronize(); +std::unique_ptr make_chunk_iteration_state( + rmm::device_uvector> const& batches, + int num_bufs, + dst_buf_info* d_orig_dst_buf_info, + std::size_t const* const h_buf_sizes, + std::size_t num_partitions, + std::size_t user_buffer_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + rmm::device_uvector d_batch_offsets(num_bufs + 1, stream, mr); + + auto buf_count_iter = cudf::detail::make_counting_transform_iterator( + 0, [num_bufs, num_batches = num_batches_func{batches.begin()}] __device__(size_type i) { + return i == num_bufs ? 0 : num_batches(i); + }); + + thrust::exclusive_scan(rmm::exec_policy(stream, mr), + buf_count_iter, + buf_count_iter + num_bufs + 1, + d_batch_offsets.begin(), + 0); + + auto const num_batches_iter = + cudf::detail::make_counting_transform_iterator(0, num_batches_func{batches.begin()}); + size_type const num_batches = thrust::reduce( + rmm::exec_policy(stream, mr), num_batches_iter, num_batches_iter + batches.size()); + + auto out_to_in_index = out_to_in_index_function{d_batch_offsets.begin(), num_bufs}; + + auto iter = thrust::make_counting_iterator(0); + + // load up the batches as d_dst_buf_info + rmm::device_uvector d_batched_dst_buf_info(num_batches, stream, mr); + + thrust::for_each( + rmm::exec_policy(stream, mr), + iter, + iter + num_batches, + [d_orig_dst_buf_info, + d_batched_dst_buf_info = d_batched_dst_buf_info.begin(), + batches = batches.begin(), + d_batch_offsets = d_batch_offsets.begin(), + out_to_in_index] __device__(size_type i) { + size_type const in_buf_index = out_to_in_index(i); + size_type const batch_index = i - d_batch_offsets[in_buf_index]; + auto const batch_size = thrust::get<1>(batches[in_buf_index]); + dst_buf_info const& in = d_orig_dst_buf_info[in_buf_index]; + + // adjust info + dst_buf_info& out = d_batched_dst_buf_info[i]; + out.element_size = in.element_size; + out.value_shift = in.value_shift; + out.bit_shift = in.bit_shift; + out.valid_count = + in.valid_count; // valid count will be set to 1 if this is a validity buffer + out.src_buf_index = in.src_buf_index; + out.dst_buf_index = in.dst_buf_index; + + size_type const elements_per_batch = + out.element_size == 0 ? 0 : batch_size / out.element_size; + out.num_elements = ((batch_index + 1) * elements_per_batch) > in.num_elements + ? in.num_elements - (batch_index * elements_per_batch) + : elements_per_batch; + + size_type const rows_per_batch = + // if this is a validity buffer, each element is a bitmask_type, which + // corresponds to 32 rows. + out.valid_count > 0 + ? elements_per_batch * static_cast(cudf::detail::size_in_bits()) + : elements_per_batch; + out.num_rows = ((batch_index + 1) * rows_per_batch) > in.num_rows + ? 
in.num_rows - (batch_index * rows_per_batch) + : rows_per_batch; + + out.src_element_index = in.src_element_index + (batch_index * elements_per_batch); + out.dst_offset = in.dst_offset + (batch_index * batch_size); + + // out.bytes and out.buf_size are unneeded here because they are only used to + // calculate real output buffer sizes. the data we are generating here is + // purely intermediate for the purposes of doing more uniform copying of data + // underneath the final structure of the output + }); + + if (user_buffer_size != 0) { + // copy the batch offsets back to host + std::vector h_offsets(num_batches + 1); + { + rmm::device_uvector offsets(h_offsets.size(), stream, mr); + auto batch_byte_size_iter = cudf::detail::make_counting_transform_iterator( + 0, batch_byte_size_function{num_batches, d_batched_dst_buf_info.begin()}); + + thrust::exclusive_scan(rmm::exec_policy(stream, mr), + batch_byte_size_iter, + batch_byte_size_iter + num_batches + 1, + offsets.begin()); + + CUDF_CUDA_TRY(cudaMemcpyAsync(h_offsets.data(), + offsets.data(), + sizeof(std::size_t) * offsets.size(), + cudaMemcpyDefault, + stream.value())); + + // the next part is working on the CPU, so we want to synchronize here + stream.synchronize(); + } + + std::vector num_batches_per_iteration; + std::vector size_of_batches_per_iteration; + std::vector accum_size_per_iteration; + std::size_t accum_size = 0; + { + auto current_offset_it = h_offsets.begin(); + // figure out how many iterations we need, while fitting batches to iterations + // with no more than user_buffer_size bytes worth of batches + while (current_offset_it != h_offsets.end()) { + // next_iteration_it points to the batch right above the boundary (the batch + // that didn't fit). + auto next_iteration_it = + std::lower_bound(current_offset_it, + h_offsets.end(), + // We add the cumulative size + 1 because we want to find what would fit + // within a bounce buffer of user_buffer_size (up to user_buffer_size). + // Since h_offsets is a prefix scan, we add the size we accumulated so + // far so we are looking for the next user_buffer_sized boundary. + user_buffer_size + accum_size + 1); + + // we subtract 1 from the number of batch here because next_iteration_it points + // to the batch that didn't fit, so it's one off. 
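+        // (illustrative) with h_offsets = {0, 4MB, 9MB, 13MB} and user_buffer_size = 10MB,
+        // the first search is for 10MB + 0 + 1: lower_bound lands on 13MB, so two batches
+        // (4MB and 5MB) fit in this iteration and the batch ending at 13MB starts the next one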
+ auto batches_in_iter = std::distance(current_offset_it, next_iteration_it) - 1; + + // to get the amount of bytes in this iteration we get the prefix scan size + // and subtract the cumulative size so far, leaving the bytes belonging to this + // iteration + auto iter_size_bytes = *(current_offset_it + batches_in_iter) - accum_size; + accum_size += iter_size_bytes; + + num_batches_per_iteration.push_back(batches_in_iter); + size_of_batches_per_iteration.push_back(iter_size_bytes); + accum_size_per_iteration.push_back(accum_size); + + if (next_iteration_it == h_offsets.end()) { + break; + } + + current_offset_it += batches_in_iter; + } + } + + // apply changed offset + { + rmm::device_uvector d_accum_size_per_iteration( + accum_size_per_iteration.size(), stream, mr); + + CUDF_CUDA_TRY(cudaMemcpyAsync(d_accum_size_per_iteration.data(), + accum_size_per_iteration.data(), + accum_size_per_iteration.size() * sizeof(std::size_t), + cudaMemcpyDefault, + stream.value())); + + // we want to update the offset of batches for every iteration, except the first one (because + // offsets in the first iteration are all 0 based) + auto num_batches_in_first_iteration = num_batches_per_iteration[0]; + auto iter = thrust::make_counting_iterator(num_batches_in_first_iteration); + auto num_iterations = accum_size_per_iteration.size(); + thrust::for_each( + rmm::exec_policy(stream, mr), + iter, + iter + num_batches - num_batches_in_first_iteration, + [num_iterations, + d_batched_dst_buf_info = d_batched_dst_buf_info.begin(), + d_accum_size_per_iteration = d_accum_size_per_iteration.begin()] __device__(size_type i) { + auto prior_iteration_size = + thrust::upper_bound(thrust::seq, + d_accum_size_per_iteration, + d_accum_size_per_iteration + num_iterations, + d_batched_dst_buf_info[i].dst_offset) - + 1; + d_batched_dst_buf_info[i].dst_offset -= *prior_iteration_size; + }); + } + return std::make_unique(std::move(d_batched_dst_buf_info), + std::move(d_batch_offsets), + std::move(num_batches_per_iteration), + std::move(size_of_batches_per_iteration), + accum_size); + + } else { + // we instantiate an "iteration state" for the regular single pass contiguous_split + // consisting of 1 iteration with all of the batches and totalling `total_size` bytes. + auto total_size = std::reduce(h_buf_sizes, h_buf_sizes + num_partitions); + + // 1 iteration with the whole size + return std::make_unique( + std::move(d_batched_dst_buf_info), + std::move(d_batch_offsets), + std::move(std::vector{static_cast(num_batches)}), + std::move(std::vector{total_size}), + total_size); + } +} - // build the output. - std::vector result; - result.reserve(num_partitions); - std::vector cols; - cols.reserve(num_root_columns); - auto cur_dst_buf_info = h_dst_buf_info; - for (std::size_t idx = 0; idx < num_partitions; idx++) { - // traverse the buffers and build the columns. 
-    cur_dst_buf_info = build_output_columns(
-      input.begin(), input.end(), cur_dst_buf_info, std::back_inserter(cols), h_dst_bufs[idx]);
-
-    // pack the columns
-    cudf::table_view t{cols};
-    result.push_back(packed_table{
-      t,
-      packed_columns{
-        std::make_unique<std::vector<uint8_t>>(cudf::pack_metadata(
-          t, reinterpret_cast<uint8_t const*>(out_buffers[idx].data()), out_buffers[idx].size())),
-        std::make_unique<rmm::device_buffer>(std::move(out_buffers[idx]))}});
-
-    cols.clear();
+void copy_data(int num_batches_to_copy,
+               int starting_batch,
+               uint8_t const** d_src_bufs,
+               uint8_t** d_dst_bufs,
+               rmm::device_uvector<dst_buf_info>& d_dst_buf_info,
+               uint8_t* user_buffer,
+               rmm::cuda_stream_view stream)
+{
+  constexpr size_type block_size = 256;
+  if (user_buffer != nullptr) {
+    copy_partitions<block_size><<<num_batches_to_copy, block_size, 0, stream.value()>>>(
+      d_src_bufs, user_buffer, d_dst_buf_info.data() + starting_batch);
+  } else {
+    copy_partitions<block_size><<<num_batches_to_copy, block_size, 0, stream.value()>>>(
+      d_src_bufs, d_dst_bufs, d_dst_buf_info.data() + starting_batch);
   }
-  return result;
+}
+
+/**
+ * @brief Function that checks an input table_view and splits for specific edge cases.
+ *
+ * It will return true if the input is "empty" (no rows or columns), which means
+ * special handling has to happen in the calling code.
+ *
+ * @param input table_view of source table to be split
+ * @param splits the splits specified by the user, or an empty vector if no splits
+ * @returns true if the input is empty, false otherwise
+ */
+bool check_inputs(cudf::table_view const& input, std::vector<size_type> const& splits)
+{
+  if (input.num_columns() == 0) { return true; }
+  if (splits.size() > 0) {
+    CUDF_EXPECTS(splits.back() <= input.column(0).size(),
+                 "splits can't exceed size of input columns");
+  }
+  {
+    size_type begin = 0;
+    for (std::size_t i = 0; i < splits.size(); i++) {
+      size_type end = splits[i];
+      CUDF_EXPECTS(begin >= 0, "Starting index cannot be negative.");
+      CUDF_EXPECTS(end >= begin, "End index cannot be smaller than the starting index.");
+      CUDF_EXPECTS(end <= input.column(0).size(), "Slice range out of bounds.");
+      begin = end;
+    }
+  }
+  return input.column(0).size() == 0;
+}
+
+/**
+ * @brief A helper struct containing the state of contiguous_split, used whether the
+ * caller performs a single-pass contiguous_split or a chunked_pack.
+ *
+ * It exposes an iterator-like pattern where contiguous_split_state::has_next()
+ * returns true when there is work to be done, and false otherwise.
+ *
+ * contiguous_split_state::contiguous_split() performs a single-pass contiguous_split
+ * and is only valid if contiguous_split_state is instantiated with 0 for the user_buffer_size.
+ *
+ * contiguous_split_state::contiguous_split_chunk(device_span) is only valid when
+ * user_buffer_size > 0. It should be called as long as has_next() returns true. The
+ * device_span passed to contiguous_split_chunk must be allocated in stream `stream` by
+ * the user.
+ *
+ * None of the methods are thread safe.
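+ *
+ * A minimal usage sketch of the chunked path (illustrative only; `user_buffer_span` is
+ * assumed to be a caller-allocated device_span of exactly user_buffer_size bytes):
+ *
+ *   contiguous_split_state state(input, user_buffer_size, stream, mr);
+ *   while (state.has_next()) {
+ *     auto bytes_copied = state.contiguous_split_chunk(user_buffer_span);
+ *     // consume the first bytes_copied bytes of user_buffer_span before the next call
+ *   }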
+ */ +struct contiguous_split_state { + static const std::size_t desired_batch_size = 1 * 1024 * 1024; + + contiguous_split_state(cudf::table_view const& input, + std::size_t user_buffer_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : contiguous_split_state(input, {}, user_buffer_size, stream, mr) + { + } + + contiguous_split_state(cudf::table_view const& input, + std::vector const& splits, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : contiguous_split_state(input, splits, 0, stream, mr) + { + } + + contiguous_split_state(cudf::table_view const& input, + std::vector const& splits, + std::size_t user_buffer_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : input(input), user_buffer_size(user_buffer_size), stream(stream), mr(mr) + { + is_empty = check_inputs(input, splits); + num_partitions = splits.size() + 1; + num_src_bufs = count_src_bufs(input.begin(), input.end()); + num_bufs = num_src_bufs * num_partitions; + + // if the table we are about to contig split is empty, we have special + // handling where metadata is produced and a 0-byte contiguous buffer + // is the result. + if (is_empty) { return; } + + partition_buf_size_and_dst_buf_info = + std::move(compute_splits(input, splits, num_partitions, num_src_bufs, num_bufs, stream, mr)); + + compute_batches(); + + // allocate output partition buffers, in the non-chunked case + if (user_buffer_size == 0) { + out_buffers.reserve(num_partitions); + auto h_buf_sizes = partition_buf_size_and_dst_buf_info->h_buf_sizes; + std::transform(h_buf_sizes, + h_buf_sizes + num_partitions, + std::back_inserter(out_buffers), + [stream = stream, mr = mr](std::size_t bytes) { + return rmm::device_buffer{bytes, stream, mr}; + }); + } + + src_and_dst_pointers = std::move(setup_src_and_dst_pointers(input, + num_partitions, + num_src_bufs, + chunk_iter_state->num_iterations, + out_buffers, + stream, + mr)); + } + + bool has_next() const { return !is_empty && chunk_iter_state->has_more_copies(); } + + std::size_t get_total_contiguous_size() const + { + return is_empty ? 0 : chunk_iter_state->total_size; + } + + void compute_batches() + { + // Since we parallelize at one block per copy, we are vulnerable to situations where we + // have small numbers of copies to do (a combination of small numbers of splits and/or columns), + // so we will take the actual set of outgoing source/destination buffers and further partition + // them into much smaller batches in order to drive up the number of blocks and overall + // occupancy. + rmm::device_uvector> batches(num_bufs, stream, mr); + auto d_dst_buf_info = partition_buf_size_and_dst_buf_info->d_dst_buf_info; + auto h_buf_sizes = partition_buf_size_and_dst_buf_info->h_buf_sizes; + auto desired_batch_size = contiguous_split_state::desired_batch_size; + thrust::transform( + rmm::exec_policy(stream, mr), + d_dst_buf_info, + d_dst_buf_info + num_bufs, + batches.begin(), + [desired_batch_size] __device__( + dst_buf_info const& buf) -> thrust::pair { + // Total bytes for this incoming partition + std::size_t const bytes = + static_cast(buf.num_elements) * static_cast(buf.element_size); + + // This clause handles nested data types (e.g. list or string) that store no data in the row + // columns, only in their children. 
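+          // (returning a single zero-sized batch keeps every source buffer represented
+          // in the batched structure)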
+ if (bytes == 0) { return {1, 0}; } + + // The number of batches we want to subdivide this buffer into + std::size_t const num_batches = std::max( + std::size_t{1}, util::round_up_unsafe(bytes, desired_batch_size) / desired_batch_size); + + // NOTE: leaving batch size as a separate parameter for future tuning + // possibilities, even though in the current implementation it will be a + // constant. + return {num_batches, desired_batch_size}; + }); + + chunk_iter_state = make_chunk_iteration_state( + batches, num_bufs, d_dst_buf_info, h_buf_sizes, num_partitions, user_buffer_size, stream, mr); + } + + std::vector contiguous_split() + { + CUDF_EXPECTS(user_buffer_size == 0, "Cannot contiguous split with a user buffer"); + if (is_empty || input.num_columns() == 0) { return make_packed_tables(); } + + std::size_t num_batches_total; + std::tie(std::ignore, num_batches_total) = + chunk_iter_state->get_current_starting_index_and_buff_count(); + + // perform the copy. + copy_data(num_batches_total, + 0 /* starting at buffer for single-shot 0*/, + src_and_dst_pointers->d_src_bufs, + src_and_dst_pointers->d_dst_bufs, + chunk_iter_state->d_batched_dst_buf_info, + nullptr, + stream); + + // these "orig" dst_buf_info pointers describe the prior-to-batching destination + // buffers per partition + auto d_orig_dst_buf_info = partition_buf_size_and_dst_buf_info->d_dst_buf_info; + auto h_orig_dst_buf_info = partition_buf_size_and_dst_buf_info->h_dst_buf_info; + + // postprocess valid_counts: apply the valid counts computed by copy_data for each + // batch back to the original dst_buf_infos + auto keys = cudf::detail::make_counting_transform_iterator( + 0, out_to_in_index_function{chunk_iter_state->d_batch_offsets.begin(), (int)num_bufs}); + + auto values = thrust::make_transform_iterator( + chunk_iter_state->d_batched_dst_buf_info.begin(), + [] __device__(dst_buf_info const& info) { return info.valid_count; }); + + thrust::reduce_by_key(rmm::exec_policy(stream, mr), + keys, + keys + num_batches_total, + values, + thrust::make_discard_iterator(), + dst_valid_count_output_iterator{d_orig_dst_buf_info}); + + CUDF_CUDA_TRY(cudaMemcpyAsync(h_orig_dst_buf_info, + d_orig_dst_buf_info, + partition_buf_size_and_dst_buf_info->dst_buf_info_size, + cudaMemcpyDefault, + stream.value())); + + stream.synchronize(); + + // not necessary for the non-chunked case, but it makes it so further calls to has_next + // return false, just in case + chunk_iter_state->advance_iteration(); + + return make_packed_tables(); + } + + cudf::size_type contiguous_split_chunk(cudf::device_span const& user_buffer) + { + CUDF_FUNC_RANGE() + CUDF_EXPECTS( + user_buffer.size() == user_buffer_size, + "Cannot use a device span smaller than the output buffer size configured at instantiation!"); + CUDF_EXPECTS(has_next(), "Cannot call contiguous_split_chunk with has_next() == false!"); + + std::size_t starting_batch, num_batches_to_copy; + std::tie(starting_batch, num_batches_to_copy) = + chunk_iter_state->get_current_starting_index_and_buff_count(); + + // perform the copy. 
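+    // note that every iteration writes at the beginning of user_buffer: the per-batch
+    // dst_offsets were rebased to be iteration-relative when the chunk iteration state
+    // was built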
+ copy_data(num_batches_to_copy, + starting_batch, + src_and_dst_pointers->d_src_bufs, + src_and_dst_pointers->d_dst_bufs, + chunk_iter_state->d_batched_dst_buf_info, + user_buffer.data(), + stream); + + // We do not need to post-process null counts since the null count info is + // taken from the source table in the contiguous_split_chunk case (no splits) + return chunk_iter_state->advance_iteration(); + } + + std::vector make_empty_packed_table() + { + // sanitize the inputs (to handle corner cases like sliced tables) + std::vector> empty_columns; + empty_columns.reserve(input.num_columns()); + std::transform( + input.begin(), input.end(), std::back_inserter(empty_columns), [](column_view const& col) { + return cudf::empty_like(col); + }); + std::vector empty_column_views; + empty_column_views.reserve(input.num_columns()); + std::transform(empty_columns.begin(), + empty_columns.end(), + std::back_inserter(empty_column_views), + [](std::unique_ptr const& col) { return col->view(); }); + table_view empty_inputs(empty_column_views); + + // build the empty results + std::vector result; + result.reserve(num_partitions); + auto iter = thrust::make_counting_iterator(0); + std::transform(iter, + iter + num_partitions, + std::back_inserter(result), + [&empty_inputs](int partition_index) { + return packed_table{ + empty_inputs, + packed_columns{std::make_unique>(pack_metadata( + empty_inputs, static_cast(nullptr), 0)), + std::make_unique()}}; + }); + + return result; + } + + std::unique_ptr> build_packed_column_metadata() + { + CUDF_EXPECTS(num_partitions == 1, "build_packed_column_metadata supported only without splits"); + + if (input.num_columns() == 0) { return std::unique_ptr>(); } + + if (is_empty) { + // this is a bit ugly, but it was done to re-use make_empty_packed_table between the + // regular contiguous_split and chunked_pack cases. + auto empty_packed_tables = std::move(make_empty_packed_table()[0]); + return std::move(empty_packed_tables.data.metadata); + } + + auto& h_dst_buf_info = partition_buf_size_and_dst_buf_info->h_dst_buf_info; + auto cur_dst_buf_info = h_dst_buf_info; + metadata_builder mb{input.num_columns()}; + + populate_metadata(input.begin(), input.end(), cur_dst_buf_info, mb); + + return std::make_unique>(std::move(mb.build())); + } + + std::vector make_packed_tables() + { + if (input.num_columns() == 0) { return std::vector(); } + if (is_empty) { return make_empty_packed_table(); } + std::vector result; + result.reserve(num_partitions); + std::vector cols; + cols.reserve(input.num_columns()); + + auto& h_dst_buf_info = partition_buf_size_and_dst_buf_info->h_dst_buf_info; + auto& h_dst_bufs = src_and_dst_pointers->h_dst_bufs; + + auto cur_dst_buf_info = h_dst_buf_info; + for (std::size_t idx = 0; idx < num_partitions; idx++) { + // traverse the buffers and build the columns. 
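+      // a fresh metadata_builder per partition keeps each partition's metadata blob
+      // self-contained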
+      metadata_builder mb(input.num_columns());
+      cur_dst_buf_info = cudf::build_output_columns(input.begin(),
+                                                    input.end(),
+                                                    cur_dst_buf_info,
+                                                    std::back_inserter(cols),
+                                                    h_dst_bufs[idx],
+                                                    mb);
+
+      // pack the columns
+      cudf::table_view t{cols};
+      result.push_back(packed_table{
+        t,
+        packed_columns{std::make_unique<std::vector<uint8_t>>(mb.build()),
+                       std::make_unique<rmm::device_buffer>(std::move(out_buffers[idx]))}});
+
+      cols.clear();
+    }
+
+    return result;
+  }
+
+  cudf::table_view const input;
+  rmm::cuda_stream_view stream;
+  rmm::mr::device_memory_resource* mr;
+
+  std::size_t num_partitions;
+
+  // number of source buffers including children * number of splits
+  std::size_t num_bufs;
+
+  // number of source buffers including children
+  size_type num_src_bufs;
+
+  std::unique_ptr<packed_partition_buf_size_and_dst_buf_info> partition_buf_size_and_dst_buf_info;
+
+  std::unique_ptr<packed_src_and_dst_pointers> src_and_dst_pointers;
+
+  // whether the table was empty to begin with (0 rows or 0 columns) and should be metadata-only
+  bool is_empty;
+
+  //
+  // State around the chunked pattern
+  //
+
+  // chunked_pack will have 1 or more "chunks" to iterate on, defined in chunk_iter_state.
+  // contiguous_split will have a single "chunk" in chunk_iter_state, so no iteration.
+  std::unique_ptr<chunk_iteration_state> chunk_iter_state;
+
+  // Two modes are allowed:
+  //  - user provided buffer: as the name implies, the user has provided a buffer that must be at
+  //    least 1MB.
+  //    contiguous_split will behave in a "chunked" mode in this scenario, as it will contiguously
+  //    copy up until the user's buffer size limit, exposing a next() call for the user to invoke.
+  //    Note that in this mode, contig split is not partitioning the original table, it is instead
+  //    only placing cuDF buffers contiguously in the user's bounce buffer.
+  //
+  //  - single shot contiguous_split (default): when the user doesn't provide their own buffer,
+  //    contiguous_split will allocate a buffer per partition and will place contiguous results in
+  //    each buffer.
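+  //
+  //    As a rough illustration (hypothetical sizes): packing a 1GB table through a 128MB
+  //    user buffer takes 8 next() calls, while a single shot contiguous_split of the same
+  //    table with 3 splits allocates 4 output buffers, one per partition.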
+ // + std::vector out_buffers; + + std::size_t user_buffer_size; +}; + +std::vector contiguous_split(cudf::table_view const& input, + std::vector const& splits, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto state = contiguous_split_state(input, splits, stream, mr); + return state.contiguous_split(); } }; // namespace detail @@ -1280,4 +1994,41 @@ std::vector contiguous_split(cudf::table_view const& input, return detail::contiguous_split(input, splits, cudf::get_default_stream(), mr); } +chunked_pack::chunked_pack(cudf::table_view const& input, + std::size_t user_buffer_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + state = std::make_unique(input, user_buffer_size, stream, mr); +} + +// required for the unique_ptr to work with a non-complete type (contiguous_split_state) +chunked_pack::~chunked_pack() = default; + +std::size_t chunked_pack::get_total_contiguous_size() const +{ + return state->get_total_contiguous_size(); +} + +bool chunked_pack::has_next() const { return state->has_next(); } + +std::size_t chunked_pack::next(cudf::device_span const& user_buffer) +{ + return state->contiguous_split_chunk(user_buffer); +} + +std::unique_ptr> chunked_pack::build_metadata() const +{ + return state->build_packed_column_metadata(); +} + +std::unique_ptr make_chunked_pack(cudf::table_view const& input, + std::size_t user_buffer_size, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(user_buffer_size >= detail::contiguous_split_state::desired_batch_size, + "The output buffer size must be at least 1MB in size"); + return std::make_unique(input, user_buffer_size, cudf::get_default_stream(), mr); +} + }; // namespace cudf diff --git a/cpp/src/copying/pack.cpp b/cpp/src/copying/pack.cpp index bac9aac1886..02d96b62639 100644 --- a/cpp/src/copying/pack.cpp +++ b/cpp/src/copying/pack.cpp @@ -235,6 +235,8 @@ metadata_builder::metadata_builder(size_type const num_root_columns) impl->add_column_info_to_meta(data_type{type_id::EMPTY}, num_root_columns, 0, -1, -1, 0); } +metadata_builder::~metadata_builder() = default; + void metadata_builder::add_column_info_to_meta(data_type const col_type, size_type const col_size, size_type const col_null_count, diff --git a/cpp/tests/copying/split_tests.cpp b/cpp/tests/copying/split_tests.cpp index 5d47a123c58..51c42bd7cf0 100644 --- a/cpp/tests/copying/split_tests.cpp +++ b/cpp/tests/copying/split_tests.cpp @@ -31,6 +31,7 @@ #include #include +#include #include #include @@ -450,10 +451,8 @@ void split_end_to_size(SplitFunc Split, CompareFunc Compare) } template -void split_empty_table(SplitFunc Split) +void split_empty_table(SplitFunc Split, std::vector const& splits = {2, 5, 6}) { - std::vector splits{2, 5, 9}; - cudf::table src_table{}; auto result = Split(src_table, splits); @@ -530,7 +529,9 @@ void split_negative_value(SplitFunc Split) } template -void split_empty_output_column_value(SplitFunc Split, CompareFunc Compare) +void split_empty_output_column_value(SplitFunc Split, + CompareFunc Compare, + std::vector const& splits = {0, 2, 2}) { cudf::size_type start = 0; cudf::size_type col_size = 10; @@ -540,8 +541,6 @@ void split_empty_output_column_value(SplitFunc Split, CompareFunc Compare) cudf::size_type num_cols = 5; cudf::table src_table = create_fixed_table(num_cols, start, col_size, valids); - std::vector splits{0, 2, 2}; - EXPECT_NO_THROW(Split(src_table, splits)); auto result = Split(src_table, splits); @@ -622,7 +621,9 @@ TEST_F(SplitTableCornerCases, EmptyOutputColumn) } template 
-void split_string_with_invalids(SplitFunc Split, CompareFunc Compare) +void split_string_with_invalids(SplitFunc Split, + CompareFunc Compare, + std::vector splits = {2, 5, 9}) { auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); @@ -638,8 +639,6 @@ void split_string_with_invalids(SplitFunc Split, CompareFunc Compare) scols.push_back(sw[1].release()); cudf::table src_table(std::move(scols)); - std::vector splits{2, 5, 9}; - std::vector expected = create_expected_string_tables_for_splits(strings, splits, true); @@ -653,7 +652,9 @@ void split_string_with_invalids(SplitFunc Split, CompareFunc Compare) } template -void split_empty_output_strings_column_value(SplitFunc Split, CompareFunc Compare) +void split_empty_output_strings_column_value(SplitFunc Split, + CompareFunc Compare, + std::vector const& splits = {0, 2, 2}) { auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); @@ -671,8 +672,6 @@ void split_empty_output_strings_column_value(SplitFunc Split, CompareFunc Compar cudf::size_type num_cols = 2; - std::vector splits{0, 2, 2}; - EXPECT_NO_THROW(Split(src_table, splits)); auto result = Split(src_table, splits); @@ -759,7 +758,7 @@ struct SplitNestedTypesTest : public cudf::test::BaseFixture {}; // common functions for testing split/contiguous_split template -void split_lists(SplitFunc Split, CompareFunc Compare) +void split_lists(SplitFunc Split, CompareFunc Compare, bool split = true) { using LCW = cudf::test::lists_column_wrapper; @@ -775,22 +774,28 @@ void split_lists(SplitFunc Split, CompareFunc Compare) {-10}, {-100, -200}}; - std::vector splits{0, 1, 4, 5, 6, 9}; - - std::vector> expected; - expected.push_back(LCW{}); - expected.push_back(LCW{{1, 2, 3}}); - expected.push_back(LCW{{4, 5}, {6}, {7, 8}}); - expected.push_back(LCW{{9, 10, 11}}); - expected.push_back(LCW{LCW{}}); - expected.push_back(LCW{LCW{}, {-1, -2, -3, -4, -5}, {-10}}); - expected.push_back(LCW{{-100, -200}}); - - auto result = Split(list, splits); - EXPECT_EQ(expected.size(), result.size()); - - for (unsigned long index = 0; index < result.size(); index++) { - Compare(expected[index], result[index]); + if (split) { + std::vector splits{0, 1, 4, 5, 6, 9}; + + std::vector> expected; + expected.push_back(LCW{}); + expected.push_back(LCW{{1, 2, 3}}); + expected.push_back(LCW{{4, 5}, {6}, {7, 8}}); + expected.push_back(LCW{{9, 10, 11}}); + expected.push_back(LCW{LCW{}}); + expected.push_back(LCW{LCW{}, {-1, -2, -3, -4, -5}, {-10}}); + expected.push_back(LCW{{-100, -200}}); + + auto result = Split(list, splits); + EXPECT_EQ(expected.size(), result.size()); + + for (unsigned long index = 0; index < result.size(); index++) { + Compare(expected[index], result[index]); + } + } else { + auto result = Split(list, {}); + EXPECT_EQ(1, result.size()); + Compare(list, result[0]); } } @@ -803,25 +808,31 @@ void split_lists(SplitFunc Split, CompareFunc Compare) {LCW{}}, {{-10}, {-100, -200}}}; - std::vector splits{1, 3, 4}; - - std::vector> expected; - expected.push_back(LCW{{{1, 2, 3}, {4, 5}}}); - expected.push_back(LCW{{LCW{}, LCW{}, {7, 8}, LCW{}}, {LCW{6}}}); - expected.push_back(LCW{{{7, 8}, {9, 10, 11}, LCW{}}}); - expected.push_back(LCW{{LCW{}, {-1, -2, -3, -4, -5}}, {LCW{}}, {{-10}, {-100, -200}}}); - - auto result = Split(list, splits); - EXPECT_EQ(expected.size(), result.size()); - - for (unsigned long index = 0; index < result.size(); index++) { - Compare(expected[index], result[index]); + if (split) { + std::vector splits{1, 
3, 4}; + + std::vector> expected; + expected.push_back(LCW{{{1, 2, 3}, {4, 5}}}); + expected.push_back(LCW{{LCW{}, LCW{}, {7, 8}, LCW{}}, {LCW{6}}}); + expected.push_back(LCW{{{7, 8}, {9, 10, 11}, LCW{}}}); + expected.push_back(LCW{{LCW{}, {-1, -2, -3, -4, -5}}, {LCW{}}, {{-10}, {-100, -200}}}); + + auto result = Split(list, splits); + EXPECT_EQ(expected.size(), result.size()); + + for (unsigned long index = 0; index < result.size(); index++) { + Compare(expected[index], result[index]); + } + } else { + auto result = Split(list, {}); + EXPECT_EQ(1, result.size()); + Compare(list, result[0]); } } } template -void split_lists_with_nulls(SplitFunc Split, CompareFunc Compare) +void split_lists_with_nulls(SplitFunc Split, CompareFunc Compare, bool split = true) { using LCW = cudf::test::lists_column_wrapper; @@ -840,22 +851,28 @@ void split_lists_with_nulls(SplitFunc Split, CompareFunc Compare) {-10}, {{-100, -200}, valids}}; - std::vector splits{0, 1, 4, 5, 6, 9}; - - std::vector> expected; - expected.push_back(LCW{}); - expected.push_back(LCW{{1, 2, 3}}); - expected.push_back(LCW{{4, 5}, {6}, {{7, 8}, valids}}); - expected.push_back(LCW{{9, 10, 11}}); - expected.push_back(LCW{LCW{}}); - expected.push_back(LCW{LCW{}, {{-1, -2, -3, -4, -5}, valids}, {-10}}); - expected.push_back(LCW{{{-100, -200}, valids}}); - - auto result = Split(list, splits); - EXPECT_EQ(expected.size(), result.size()); - - for (unsigned long index = 0; index < result.size(); index++) { - Compare(expected[index], result[index]); + if (split) { + std::vector splits{0, 1, 4, 5, 6, 9}; + + std::vector> expected; + expected.push_back(LCW{}); + expected.push_back(LCW{{1, 2, 3}}); + expected.push_back(LCW{{4, 5}, {6}, {{7, 8}, valids}}); + expected.push_back(LCW{{9, 10, 11}}); + expected.push_back(LCW{LCW{}}); + expected.push_back(LCW{LCW{}, {{-1, -2, -3, -4, -5}, valids}, {-10}}); + expected.push_back(LCW{{{-100, -200}, valids}}); + + auto result = Split(list, splits); + EXPECT_EQ(expected.size(), result.size()); + + for (unsigned long index = 0; index < result.size(); index++) { + Compare(expected[index], result[index]); + } + } else { + auto result = Split(list, {}); + EXPECT_EQ(1, result.size()); + Compare(list, result[0]); } } @@ -868,26 +885,32 @@ void split_lists_with_nulls(SplitFunc Split, CompareFunc Compare) {LCW{}}, {{-10}, {-100, -200}}}; - std::vector splits{1, 3, 4}; - - std::vector> expected; - expected.push_back(LCW{{{{1, 2, 3}, valids}, {4, 5}}}); - expected.push_back(LCW{{{LCW{}, LCW{}, {7, 8}, LCW{}}, valids}, {{{6}}}}); - expected.push_back(LCW{{{{7, 8}, {{9, 10, 11}, valids}, LCW{}}, valids}}); - expected.push_back( - LCW{{{LCW{}, {-1, -2, -3, -4, -5}}, valids}, {LCW{}}, {{-10}, {-100, -200}}}); - - auto result = Split(list, splits); - EXPECT_EQ(expected.size(), result.size()); - - for (unsigned long index = 0; index < result.size(); index++) { - Compare(expected[index], result[index]); + if (split) { + std::vector splits{1, 3, 4}; + + std::vector> expected; + expected.push_back(LCW{{{{1, 2, 3}, valids}, {4, 5}}}); + expected.push_back(LCW{{{LCW{}, LCW{}, {7, 8}, LCW{}}, valids}, {{{6}}}}); + expected.push_back(LCW{{{{7, 8}, {{9, 10, 11}, valids}, LCW{}}, valids}}); + expected.push_back( + LCW{{{LCW{}, {-1, -2, -3, -4, -5}}, valids}, {LCW{}}, {{-10}, {-100, -200}}}); + + auto result = Split(list, splits); + EXPECT_EQ(expected.size(), result.size()); + + for (unsigned long index = 0; index < result.size(); index++) { + Compare(expected[index], result[index]); + } + } else { + auto result = Split(list, {}); + 
EXPECT_EQ(1, result.size()); + Compare(list, result[0]); } } } template -void split_structs(bool include_validity, SplitFunc Split, CompareFunc Compare) +void split_structs(bool include_validity, SplitFunc Split, CompareFunc Compare, bool split = true) { // 1. String "names" column. std::vector names{ @@ -921,7 +944,8 @@ void split_structs(bool include_validity, SplitFunc Split, CompareFunc Compare) : cudf::test::structs_column_wrapper({names_column, ages_column, is_human_col}); // split - std::vector splits{0, 1, 3, 8}; + std::vector splits; + if (split) { splits = std::vector({0, 1, 3, 8}); } auto result = Split(struct_column, splits); // expected outputs @@ -952,20 +976,26 @@ void split_structs(bool include_validity, SplitFunc Split, CompareFunc Compare) } template -void split_structs_no_children(SplitFunc Split, CompareFunc Compare) +void split_structs_no_children(SplitFunc Split, CompareFunc Compare, bool split = true) { // no nulls { auto struct_column = cudf::make_structs_column(4, {}, 0, rmm::device_buffer{}); - auto expected = cudf::make_structs_column(2, {}, 0, rmm::device_buffer{}); - - // split - std::vector splits{2}; - auto result = Split(*struct_column, splits); - - EXPECT_EQ(result.size(), 2ul); - Compare(*expected, result[0]); - Compare(*expected, result[1]); + if (split) { + auto expected = cudf::make_structs_column(2, {}, 0, rmm::device_buffer{}); + + // split + std::vector splits{2}; + auto result = Split(*struct_column, splits); + + EXPECT_EQ(result.size(), 2ul); + Compare(*expected, result[0]); + Compare(*expected, result[1]); + } else { + auto result = Split(*struct_column, {}); + EXPECT_EQ(1, result.size()); + Compare(*struct_column, result[0]); + } } // all nulls @@ -975,33 +1005,45 @@ void split_structs_no_children(SplitFunc Split, CompareFunc Compare) cudf::test::detail::make_null_mask(struct_validity.begin(), struct_validity.end()); auto struct_column = cudf::make_structs_column(4, {}, null_count, std::move(null_mask)); - std::vector expected_validity{false, false}; - std::tie(null_mask, null_count) = - cudf::test::detail::make_null_mask(expected_validity.begin(), expected_validity.end()); - auto expected = cudf::make_structs_column(2, {}, null_count, std::move(null_mask)); - - // split - std::vector splits{2}; - auto result = Split(*struct_column, splits); - - EXPECT_EQ(result.size(), 2ul); - Compare(*expected, result[0]); - Compare(*expected, result[1]); + if (split) { + std::vector expected_validity{false, false}; + std::tie(null_mask, null_count) = + cudf::test::detail::make_null_mask(expected_validity.begin(), expected_validity.end()); + auto expected = cudf::make_structs_column(2, {}, null_count, std::move(null_mask)); + + // split + std::vector splits{2}; + auto result = Split(*struct_column, splits); + + EXPECT_EQ(result.size(), 2ul); + Compare(*expected, result[0]); + Compare(*expected, result[1]); + } else { + auto result = Split(*struct_column, {}); + EXPECT_EQ(1, result.size()); + Compare(*struct_column, result[0]); + } } // no nulls, empty output column { auto struct_column = cudf::make_structs_column(4, {}, 0, rmm::device_buffer{}); - auto expected0 = cudf::make_structs_column(4, {}, 0, rmm::device_buffer{}); - auto expected1 = cudf::make_structs_column(0, {}, 0, rmm::device_buffer{}); - - // split - std::vector splits{4}; - auto result = Split(*struct_column, splits); - - EXPECT_EQ(result.size(), 2ul); - Compare(*expected0, result[0]); - Compare(*expected1, result[1]); + if (split) { + auto expected0 = cudf::make_structs_column(4, {}, 0, 
rmm::device_buffer{}); + auto expected1 = cudf::make_structs_column(0, {}, 0, rmm::device_buffer{}); + + // split + std::vector splits{4}; + auto result = Split(*struct_column, splits); + + EXPECT_EQ(result.size(), 2ul); + Compare(*expected0, result[0]); + Compare(*expected1, result[1]); + } else { + auto result = Split(*struct_column, {}); + EXPECT_EQ(1, result.size()); + Compare(*struct_column, result[0]); + } } // all nulls, empty output column @@ -1011,25 +1053,31 @@ void split_structs_no_children(SplitFunc Split, CompareFunc Compare) cudf::test::detail::make_null_mask(struct_validity.begin(), struct_validity.end()); auto struct_column = cudf::make_structs_column(4, {}, null_count, std::move(null_mask)); - std::vector expected_validity0{false, false, false, false}; - std::tie(null_mask, null_count) = - cudf::test::detail::make_null_mask(expected_validity0.begin(), expected_validity0.end()); - auto expected0 = cudf::make_structs_column(4, {}, null_count, std::move(null_mask)); - - auto expected1 = cudf::make_structs_column(0, {}, 0, rmm::device_buffer{}); - - // split - std::vector splits{4}; - auto result = Split(*struct_column, splits); - - EXPECT_EQ(result.size(), 2ul); - Compare(*expected0, result[0]); - Compare(*expected1, result[1]); + if (split) { + std::vector expected_validity0{false, false, false, false}; + std::tie(null_mask, null_count) = + cudf::test::detail::make_null_mask(expected_validity0.begin(), expected_validity0.end()); + auto expected0 = cudf::make_structs_column(4, {}, null_count, std::move(null_mask)); + + auto expected1 = cudf::make_structs_column(0, {}, 0, rmm::device_buffer{}); + + // split + std::vector splits{4}; + auto result = Split(*struct_column, splits); + + EXPECT_EQ(result.size(), 2ul); + Compare(*expected0, result[0]); + Compare(*expected1, result[1]); + } else { + auto result = Split(*struct_column, {}); + EXPECT_EQ(1, result.size()); + Compare(*struct_column, result[0]); + } } } template -void split_nested_struct_of_list(SplitFunc Split, CompareFunc Compare) +void split_nested_struct_of_list(SplitFunc Split, CompareFunc Compare, bool split = true) { // Struct> using LCW = cudf::test::lists_column_wrapper; @@ -1064,33 +1112,134 @@ void split_nested_struct_of_list(SplitFunc Split, CompareFunc Compare) auto struct_column = cudf::test::structs_column_wrapper({names_column, ages_column, list}, struct_validity.begin()); - // split - std::vector splits{1, 3, 8}; - auto result = Split(struct_column, splits); + if (split) { + std::vector splits{1, 3, 8}; + auto result = Split(struct_column, splits); + // expected results + auto expected_names = create_expected_string_columns_for_splits(names, splits, names_validity); + auto expected_ages = create_expected_columns_for_splits(splits, ages, ages_validity); + std::vector> expected_lists; + expected_lists.push_back(LCW({{{1, 2, 3}, {4}}})); + expected_lists.push_back(LCW({{{-1, -2}, LCW{}}, LCW{}})); + std::vector ex_v{1, 1, 0, 1, 0}; + expected_lists.push_back(LCW({{{10}, {20, 30, 40}, {100, -100}}, + {LCW{}, LCW{}, {8, 9}}, + LCW{}, + {{8}, {10, 9, 8, 7, 6, 5}}, + {{5, 6}, LCW{}, {8}}}, + ex_v.begin())); + expected_lists.push_back(LCW({{LCW{-3, 4, -5}}})); + + auto expected_struct_validity = create_expected_validity(splits, struct_validity); + EXPECT_EQ(expected_names.size(), result.size()); + + for (std::size_t index = 0; index < result.size(); index++) { + auto expected = cudf::test::structs_column_wrapper( + {expected_names[index], expected_ages[index], expected_lists[index]}, + 
expected_struct_validity[index]); + Compare(expected, result[index]); + } + } else { + auto result = Split(struct_column, {}); + Compare(struct_column, result[0]); + } +} - // expected results - auto expected_names = create_expected_string_columns_for_splits(names, splits, names_validity); - auto expected_ages = create_expected_columns_for_splits(splits, ages, ages_validity); - std::vector> expected_lists; - expected_lists.push_back(LCW({{{1, 2, 3}, {4}}})); - expected_lists.push_back(LCW({{{-1, -2}, LCW{}}, LCW{}})); - std::vector ex_v{1, 1, 0, 1, 0}; - expected_lists.push_back(LCW({{{10}, {20, 30, 40}, {100, -100}}, - {LCW{}, LCW{}, {8, 9}}, - LCW{}, - {{8}, {10, 9, 8, 7, 6, 5}}, - {{5, 6}, LCW{}, {8}}}, - ex_v.begin())); - expected_lists.push_back(LCW({{LCW{-3, 4, -5}}})); +template +void split_nested_list_of_structs(SplitFunc Split, CompareFunc Compare, bool split = true) +{ + // List> + using LCW = cudf::test::lists_column_wrapper; - auto expected_struct_validity = create_expected_validity(splits, struct_validity); - EXPECT_EQ(expected_names.size(), result.size()); + // 1. String "names" column. + std::vector names{"Vimes", + "Carrot", + "Angua", + "Cheery", + "Detritus", + "Slant", + "Fred", + "Todd", + "Kevin", + "Jason", + "Clark", + "Bob", + "Mithun", + "Sameer", + "Tim", + "Mark", + "Herman", + "Will"}; + std::vector names_validity{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + cudf::test::strings_column_wrapper names_column(names.begin(), names.end()); - for (std::size_t index = 0; index < result.size(); index++) { - auto expected = cudf::test::structs_column_wrapper( - {expected_names[index], expected_ages[index], expected_lists[index]}, - expected_struct_validity[index]); - Compare(expected, result[index]); + // 2. Numeric "ages" column. + std::vector ages{5, 10, 15, 20, 25, 30, 100, 101, 102, 26, 64, 12, 17, 16, 120, 44, 23, 50}; + std::vector ages_validity = {1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0}; + auto ages_column = + cudf::test::fixed_width_column_wrapper(ages.begin(), ages.end(), ages_validity.begin()); + + // 3. 
List column + std::vector list_validity{1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1}; + cudf::test::lists_column_wrapper list( + {{"ab", "cd", "ef"}, + LCW{"gh"}, + {"ijk", "lmn"}, + LCW{}, + LCW{"o"}, + {"pqr", "stu", "vwx"}, + {"yz", "aaaa"}, + LCW{"bbbb"}, + {"cccc", "ddd", "eee", "fff", "ggg", "hh"}, + {"b", "cdr", "efh", "um"}, + LCW{"gh", "iu"}, + {"lmn"}, + LCW{"org"}, + LCW{}, + {"stu", "vwx"}, + {"yz", "aaaa", "kem"}, + LCW{"bbbb"}, + {"cccc", "eee", "faff", "jiea", "fff", "ggg", "hh"}}, + list_validity.begin()); + + // Assembly struct column + auto const struct_validity = + std::vector{1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1}; + auto struct_column = + cudf::test::structs_column_wrapper({names_column, ages_column, list}, struct_validity.begin()); + + // wrap in a list + std::vector outer_offsets{0, 3, 4, 8, 13, 16, 17, 18}; + cudf::test::fixed_width_column_wrapper outer_offsets_col(outer_offsets.begin(), + outer_offsets.end()); + std::vector outer_validity{1, 1, 1, 0, 1, 1, 0}; + auto [outer_null_mask, outer_null_count] = + cudf::test::detail::make_null_mask(outer_validity.begin(), outer_validity.end()); + auto outer_list = make_lists_column(static_cast(outer_validity.size()), + outer_offsets_col.release(), + struct_column.release(), + outer_null_count, + std::move(outer_null_mask)); + if (split) { + std::vector splits{1, 3, 7}; + cudf::table_view tbl({static_cast(*outer_list)}); + + // we are testing the results of contiguous_split against regular cudf::split, which may seem + // weird. however, cudf::split() is a simple operation that just sets offsets at the topmost + // output column, whereas contiguous_split is a deep copy of the data to contiguous output + // buffers. so as long as we believe the comparison code (expect_columns_equivalent) can compare + // these outputs correctly, this should be safe. 
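+    // (cudf::split returns lightweight column_view slices over the original data, while
+    // the Split function under test returns packed_table results backed by newly copied
+    // contiguous buffers; the Compare callback checks logical equality between the two)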
+ auto result = Split(*outer_list, splits); + auto expected = cudf::split(static_cast(*outer_list), splits); + CUDF_EXPECTS(result.size() == expected.size(), "Split result size mismatch"); + + for (std::size_t index = 0; index < result.size(); index++) { + Compare(expected[index], result[index]); + } + } else { + auto result = Split(*outer_list, {}); + EXPECT_EQ(1, result.size()); + Compare(*outer_list, result[0]); } } @@ -1165,6 +1314,47 @@ TEST_F(SplitNestedTypesTest, StructsOfList) template struct ContiguousSplitTest : public cudf::test::BaseFixture {}; +std::vector do_chunked_pack(cudf::table_view const& input) +{ + auto mr = rmm::mr::get_current_device_resource(); + + rmm::device_buffer bounce_buff(1 * 1024 * 1024, cudf::get_default_stream(), mr); + auto bounce_buff_span = + cudf::device_span(static_cast(bounce_buff.data()), bounce_buff.size()); + + auto chunked_pack = cudf::make_chunked_pack(input, bounce_buff_span.size(), mr); + + // right size the final buffer + rmm::device_buffer final_buff( + chunked_pack->get_total_contiguous_size(), cudf::get_default_stream(), mr); + + std::size_t final_buff_offset = 0; + while (chunked_pack->has_next()) { + auto bytes_copied = chunked_pack->next(bounce_buff_span); + cudaMemcpyAsync((uint8_t*)final_buff.data() + final_buff_offset, + bounce_buff.data(), + bytes_copied, + cudaMemcpyDefault, + cudf::get_default_stream()); + final_buff_offset += bytes_copied; + } + + auto packed_column_metas = chunked_pack->build_metadata(); + // for chunked contig split, this is going to be a size 1 vector if we have + // results, or a size 0 if the original table was empty (no columns) + std::vector result; + if (packed_column_metas) { + result = std::vector(1); + auto pc = cudf::packed_columns(std::move(packed_column_metas), + std::make_unique(std::move(final_buff))); + + auto unpacked = cudf::unpack(pc); + cudf::packed_table pt{std::move(unpacked), std::move(pc)}; + result[0] = std::move(pt); + } + return result; +} + // the various utility functions in slice_tests.cuh don't like the chrono types using FixedWidthTypesWithoutChrono = cudf::test::Concat; @@ -1208,6 +1398,41 @@ TYPED_TEST(ContiguousSplitTest, LongColumn) false); } +TYPED_TEST(ContiguousSplitTest, LongColumnChunked) +{ + split_custom_column( + [](cudf::table_view const& t, std::vector const&) { + return do_chunked_pack(t); + }, + [](cudf::table_view const& expected, cudf::packed_table const& result) { + std::for_each(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(expected.num_columns()), + [&expected, &result](cudf::size_type i) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected.column(i), + result.table.column(i)); + }); + }, + 100002, + {}, + true); + + split_custom_column( + [](cudf::table_view const& t, std::vector const&) { + return do_chunked_pack(t); + }, + [](cudf::table_view const& expected, cudf::packed_table const& result) { + std::for_each(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(expected.num_columns()), + [&expected, &result](cudf::size_type i) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected.column(i), + result.table.column(i)); + }); + }, + 100002, + {}, + false); +} + TYPED_TEST(ContiguousSplitTest, LongColumnBigSplits) { split_custom_column( @@ -1309,6 +1534,46 @@ TEST_F(ContiguousSplitUntypedTest, ProgressiveSizes) } } +TEST_F(ContiguousSplitUntypedTest, ProgressiveSizesChunked) +{ + constexpr int col_size = 4096; + + // stress test copying a wide amount of bytes. 
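+  // each pass below packs the same 4096-row table through do_chunked_pack's fixed 1MB
+  // bounce buffer; the loop mirrors the non-chunked ProgressiveSizes test above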
+ for (int idx = 2048; idx < col_size; idx += 128) { + split_custom_column( + [](cudf::table_view const& t, std::vector const&) { + return do_chunked_pack(t); + }, + [](cudf::table_view const& expected, cudf::packed_table const& result) { + std::for_each(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(expected.num_columns()), + [&expected, &result](cudf::size_type i) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected.column(i), + result.table.column(i)); + }); + }, + col_size, + {}, + true); + + split_custom_column( + [](cudf::table_view const& t, std::vector const&) { + return do_chunked_pack(t); + }, + [](cudf::table_view const& expected, cudf::packed_table const& result) { + std::for_each(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(expected.num_columns()), + [&expected, &result](cudf::size_type i) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected.column(i), + result.table.column(i)); + }); + }, + col_size, + {}, + false); + } +} + TEST_F(ContiguousSplitUntypedTest, ValidityRepartition) { // it is tricky to actually get the internal repartitioning/load-balancing code to add new splits @@ -1336,6 +1601,25 @@ TEST_F(ContiguousSplitUntypedTest, ValidityRepartition) } } +TEST_F(ContiguousSplitUntypedTest, ValidityRepartitionChunked) +{ + srand(0); + auto rvalids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { + return static_cast(rand()) / static_cast(RAND_MAX) < 0.5f ? 0 : 1; + }); + cudf::size_type const num_rows = 2000000; + auto col = cudf::sequence(num_rows, cudf::numeric_scalar{0}); + auto [null_mask, null_count] = cudf::test::detail::make_null_mask(rvalids, rvalids + num_rows); + col->set_null_mask(std::move(null_mask), null_count); + + cudf::table_view t({*col}); + auto result = do_chunked_pack(t); + auto& expected = t; + EXPECT_EQ(1, result.size()); + + CUDF_TEST_EXPECT_TABLES_EQUAL(result[0].table, expected); +} + TEST_F(ContiguousSplitUntypedTest, ValidityEdgeCase) { // tests an edge case where the splits cause the final validity data to be copied @@ -1361,6 +1645,16 @@ TEST_F(ContiguousSplitUntypedTest, DISABLED_VeryLargeColumnTest) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*col, result[0].table.column(0)); } +// This test requires about 25GB of device memory when used with the arena allocator +TEST_F(ContiguousSplitUntypedTest, DISABLED_VeryLargeColumnTestChunked) +{ + // tests an edge case where buf.elements * buf.element_size overflows an INT32. 
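+  // 400 * 1024 * 1024 INT64 rows is 8 * 419,430,400 = 3,355,443,200 bytes of data,
+  // which exceeds INT32_MAX (2,147,483,647)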
+ auto col = cudf::make_fixed_width_column( + cudf::data_type{cudf::type_id::INT64}, 400 * 1024 * 1024, cudf::mask_state::UNALLOCATED); + auto result = do_chunked_pack(cudf::table_view{{*col}}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*col, result[0].table.column(0)); +} + // contiguous split with strings struct ContiguousSplitStringTableTest : public SplitTest {}; @@ -1375,6 +1669,18 @@ TEST_F(ContiguousSplitStringTableTest, StringWithInvalids) }); } +TEST_F(ContiguousSplitStringTableTest, StringWithInvalidsChunked) +{ + split_string_with_invalids( + [](cudf::table_view const& t, std::vector const&) { + return do_chunked_pack(t); + }, + [](cudf::table_view const& expected, cudf::packed_table const& result) { + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.table); + }, + {}); +} + TEST_F(ContiguousSplitStringTableTest, EmptyInputColumn) { // build a bunch of empty stuff @@ -1397,6 +1703,13 @@ TEST_F(ContiguousSplitStringTableTest, EmptyInputColumn) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(src_table, result[0].table); } + { + auto result = do_chunked_pack(src_table); + CUDF_EXPECTS(result.size() == 1, "Incorrect returned contiguous_split result size!"); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(src_table, result[0].table); + } + { std::vector splits{0, 0, 0, 0}; auto result = cudf::contiguous_split(src_table, splits); @@ -1417,6 +1730,16 @@ TEST_F(ContiguousSplitStringTableTest, EmptyOutputColumn) [](cudf::packed_table const& t, int num_cols) { EXPECT_EQ(t.table.num_columns(), num_cols); }); } +TEST_F(ContiguousSplitStringTableTest, EmptyOutputColumnChunked) +{ + split_empty_output_strings_column_value( + [](cudf::table_view const& t, std::vector const&) { + return do_chunked_pack(t); + }, + [](cudf::packed_table const& t, int num_cols) { EXPECT_EQ(t.table.num_columns(), num_cols); }, + {}); +} + TEST_F(ContiguousSplitStringTableTest, NullStringColumn) { split_null_input_strings_column_value( @@ -1464,6 +1787,13 @@ TEST_F(ContiguousSplitTableCornerCases, EmptyTable) }); } +TEST_F(ContiguousSplitTableCornerCases, EmptyTableChunked) +{ + split_empty_table([](cudf::table_view const& t, + std::vector const&) { return do_chunked_pack(t); }, + {}); +} + TEST_F(ContiguousSplitTableCornerCases, EmptyIndices) { split_empty_indices([](cudf::table_view const& t, std::vector const& splits) { @@ -1501,6 +1831,16 @@ TEST_F(ContiguousSplitTableCornerCases, EmptyOutputColumn) [](cudf::packed_table const& t, int num_cols) { EXPECT_EQ(t.table.num_columns(), num_cols); }); } +TEST_F(ContiguousSplitTableCornerCases, EmptyOutputColumnChunked) +{ + split_empty_output_column_value( + [](cudf::table_view const& t, std::vector const&) { + return do_chunked_pack(t); + }, + [](cudf::packed_table const& t, int num_cols) { EXPECT_EQ(t.table.num_columns(), num_cols); }, + {}); +} + TEST_F(ContiguousSplitTableCornerCases, MixedColumnTypes) { cudf::size_type start = 0; @@ -1542,6 +1882,70 @@ TEST_F(ContiguousSplitTableCornerCases, MixedColumnTypes) } } +TEST_F(ContiguousSplitTableCornerCases, MixedColumnTypesChunked) +{ + cudf::size_type start = 0; + auto valids = cudf::detail::make_counting_transform_iterator(start, [](auto i) { return true; }); + + std::size_t num_rows = 1000000; + + std::vector strings1(num_rows); + std::vector strings2(num_rows); + strings1[0] = ""; + strings2[0] = ""; + for (std::size_t i = 1; i < num_rows; ++i) { + auto str = std::to_string(i); + strings1[i] = str; + strings2[i] = str; + } + + std::vector> cols; + + auto iter0 = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i); }); + 
auto c0 = cudf::test::fixed_width_column_wrapper(iter0, iter0 + num_rows, valids); + cols.push_back(c0.release()); + + auto iter1 = cudf::detail::make_counting_transform_iterator(10, [](auto i) { return (i); }); + auto c1 = cudf::test::fixed_width_column_wrapper(iter1, iter1 + num_rows, valids); + cols.push_back(c1.release()); + + auto c2 = cudf::test::strings_column_wrapper(strings1.begin(), strings1.end(), valids); + cols.push_back(c2.release()); + + auto c3 = cudf::test::strings_column_wrapper(strings2.begin(), strings2.end(), valids); + cols.push_back(c3.release()); + + auto iter4 = cudf::detail::make_counting_transform_iterator(20, [](auto i) { return (i); }); + auto c4 = cudf::test::fixed_width_column_wrapper(iter4, iter4 + num_rows, valids); + cols.push_back(c4.release()); + + auto tbl = cudf::table(std::move(cols)); + auto results = do_chunked_pack(tbl.view()); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(tbl, results[0].table); +} + +TEST_F(ContiguousSplitTableCornerCases, MixedColumnTypesSingleRowChunked) +{ + cudf::size_type start = 0; + auto valids = cudf::detail::make_counting_transform_iterator(start, [](auto i) { return true; }); + + std::size_t num_rows = 1; + + std::vector> cols; + + auto iter0 = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i); }); + auto c0 = cudf::test::fixed_width_column_wrapper(iter0, iter0 + num_rows, valids); + cols.push_back(c0.release()); + + auto iter1 = cudf::detail::make_counting_transform_iterator(1, [](auto i) { return (i); }); + auto c1 = cudf::test::fixed_width_column_wrapper(iter1, iter1 + num_rows); + cols.push_back(c1.release()); + + auto tbl = cudf::table(std::move(cols)); + auto results = do_chunked_pack(tbl.view()); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(tbl, results[0].table); +} + TEST_F(ContiguousSplitTableCornerCases, PreSplitTable) { auto valids = @@ -1592,6 +1996,13 @@ TEST_F(ContiguousSplitTableCornerCases, PreSplitTable) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected[index], result[index].table); } } + + { + auto result = do_chunked_pack(pre_split[1]); + EXPECT_EQ(1, result.size()); + auto expected = pre_split[1]; + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, result[0].table); + } } TEST_F(ContiguousSplitTableCornerCases, PreSplitTableLarge) @@ -1766,6 +2177,8 @@ TEST_F(ContiguousSplitTableCornerCases, NestedEmpty) std::vector splits2({1}); EXPECT_NO_THROW(contiguous_split(src_table, splits2)); + + EXPECT_NO_THROW(do_chunked_pack(src_table)); } // this produces an empty strings column with children that have no data, @@ -1784,6 +2197,8 @@ TEST_F(ContiguousSplitTableCornerCases, NestedEmpty) std::vector splits2({1}); EXPECT_NO_THROW(contiguous_split(src_table, splits2)); + + EXPECT_NO_THROW(do_chunked_pack(src_table)); } // this produces an empty lists column with children that have no data, @@ -1802,6 +2217,8 @@ TEST_F(ContiguousSplitTableCornerCases, NestedEmpty) std::vector splits2({1}); EXPECT_NO_THROW(contiguous_split(src_table, splits2)); + + EXPECT_NO_THROW(do_chunked_pack(src_table)); } // this produces an empty lists column with children that have no data, @@ -1820,6 +2237,8 @@ TEST_F(ContiguousSplitTableCornerCases, NestedEmpty) std::vector splits2({1}); EXPECT_NO_THROW(contiguous_split(src_table, splits2)); + + EXPECT_NO_THROW(do_chunked_pack(src_table)); } // this produces an empty struct column with children that have no data, @@ -1840,6 +2259,8 @@ TEST_F(ContiguousSplitTableCornerCases, NestedEmpty) std::vector splits2({1}); EXPECT_NO_THROW(contiguous_split(src_table, splits2)); + + 
EXPECT_NO_THROW(do_chunked_pack(src_table));
   }
 }
 
@@ -1868,6 +2289,12 @@ TEST_F(ContiguousSplitTableCornerCases, SplitEmpty)
     CUDF_TEST_EXPECT_TABLES_EQUIVALENT(sliced[0], result[0].table);
   }
 
+  {
+    auto result = do_chunked_pack(sliced[0]);
+    EXPECT_EQ(1, result.size());
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(sliced[0], result[0].table);
+  }
+
   {
     auto result = cudf::contiguous_split(sliced[0], {0});
     CUDF_TEST_EXPECT_TABLES_EQUIVALENT(sliced[0], result[0].table);
@@ -1878,6 +2305,52 @@ TEST_F(ContiguousSplitTableCornerCases, SplitEmpty)
   }
 }
 
+TEST_F(ContiguousSplitTableCornerCases, OutBufferTooSmall)
+{
+  // internally, contiguous split chunks GPU work in 1MB contiguous copies,
+  // so the output buffer must be 1MB or larger
+  EXPECT_THROW(cudf::make_chunked_pack({}, 1 * 1024, mr()), cudf::logic_error);
+}
+
+TEST_F(ContiguousSplitTableCornerCases, ChunkSpanTooSmall)
+{
+  auto chunked_pack = cudf::make_chunked_pack({}, 1 * 1024 * 1024, mr());
+  rmm::device_buffer buff(1 * 1024, cudf::get_default_stream(), mr());
+  cudf::device_span<uint8_t> too_small(static_cast<uint8_t*>(buff.data()), buff.size());
+  std::size_t copied = 0;
+  // throws because we created chunked_pack with a 1MB buffer size, but we are giving
+  // it a 1KB span here
+  EXPECT_THROW(copied = chunked_pack->next(too_small), cudf::logic_error);
+  EXPECT_EQ(copied, 0);
+}
+
+TEST_F(ContiguousSplitTableCornerCases, EmptyTableHasNextFalse)
+{
+  auto chunked_pack = cudf::make_chunked_pack({}, 1 * 1024 * 1024, mr());
+  rmm::device_buffer buff(1 * 1024 * 1024, cudf::get_default_stream(), mr());
+  cudf::device_span<uint8_t> bounce_buff(static_cast<uint8_t*>(buff.data()), buff.size());
+  EXPECT_EQ(chunked_pack->has_next(), false);  // empty input table
+  std::size_t copied = 0;
+  EXPECT_THROW(copied = chunked_pack->next(bounce_buff), cudf::logic_error);
+  EXPECT_EQ(copied, 0);
+}
+
+TEST_F(ContiguousSplitTableCornerCases, ExhaustedHasNextFalse)
+{
+  cudf::test::strings_column_wrapper a{"abc", "def", "ghi", "jkl", "mno", "", "st", "uvwx"};
+  cudf::table_view t({a});
+  rmm::device_buffer buff(1 * 1024 * 1024, cudf::get_default_stream(), mr());
+  cudf::device_span<uint8_t> bounce_buff(static_cast<uint8_t*>(buff.data()), buff.size());
+  auto chunked_pack = cudf::make_chunked_pack(t, buff.size(), mr());
+  EXPECT_EQ(chunked_pack->has_next(), true);
+  std::size_t copied = chunked_pack->next(bounce_buff);
+  EXPECT_EQ(copied, chunked_pack->get_total_contiguous_size());
+  EXPECT_EQ(chunked_pack->has_next(), false);
+  copied = 0;
+  EXPECT_THROW(copied = chunked_pack->next(bounce_buff), cudf::logic_error);
+  EXPECT_EQ(copied, 0);
+}
+
 struct ContiguousSplitNestedTypesTest : public cudf::test::BaseFixture {};
 
 TEST_F(ContiguousSplitNestedTypesTest, Lists)
 {
   split_lists(
     [](cudf::column_view const& c, std::vector<cudf::size_type> const& splits) {
       cudf::table_view t({c});
       return cudf::contiguous_split(t, splits);
     },
     [](cudf::column_view const& expected, cudf::packed_table const& result) {
       CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result.table.column(0));
     });
 }
 
+TEST_F(ContiguousSplitNestedTypesTest, ListsChunked)
+{
+  split_lists(
+    [](cudf::column_view const& c, std::vector<cudf::size_type> const&) {
+      cudf::table_view t({c});
+      return do_chunked_pack(t);
+    },
+    [](cudf::column_view const& expected, cudf::packed_table const& result) {
+      CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result.table.column(0));
+    },
+    /*split*/ false);
+}
+
 TEST_F(ContiguousSplitNestedTypesTest, ListsWithNulls)
 {
   split_lists_with_nulls(
     [](cudf::column_view const& c, std::vector<cudf::size_type> const& splits) {
       cudf::table_view t({c});
       return cudf::contiguous_split(t, splits);
     },
     [](cudf::column_view const& expected, cudf::packed_table const& result) {
       CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result.table.column(0));
     });
 }
 
+TEST_F(ContiguousSplitNestedTypesTest, ListsWithNullsChunked)
+{
+  split_lists_with_nulls(
+    [](cudf::column_view const& c, std::vector<cudf::size_type> const&) {
+      cudf::table_view t({c});
+      return do_chunked_pack(t);
+    },
+    [](cudf::column_view const& expected, cudf::packed_table const&
result) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result.table.column(0)); + }, + /*split*/ false); +} + TEST_F(ContiguousSplitNestedTypesTest, Structs) { split_structs( @@ -1917,6 +2416,20 @@ TEST_F(ContiguousSplitNestedTypesTest, Structs) }); } +TEST_F(ContiguousSplitNestedTypesTest, StructsChunked) +{ + split_structs( + false, + [](cudf::column_view const& c, std::vector const&) { + cudf::table_view t({c}); + return do_chunked_pack(t); + }, + [](cudf::column_view const& expected, cudf::packed_table const& result) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result.table.column(0)); + }, + /*split*/ false); +} + TEST_F(ContiguousSplitNestedTypesTest, StructsWithNulls) { split_structs( @@ -1930,6 +2443,20 @@ TEST_F(ContiguousSplitNestedTypesTest, StructsWithNulls) }); } +TEST_F(ContiguousSplitNestedTypesTest, StructsWithNullsChunked) +{ + split_structs( + true, + [](cudf::column_view const& c, std::vector const&) { + cudf::table_view t({c}); + return do_chunked_pack(t); + }, + [](cudf::column_view const& expected, cudf::packed_table const& result) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result.table.column(0)); + }, + {}); +} + TEST_F(ContiguousSplitNestedTypesTest, StructsNoChildren) { split_structs_no_children( @@ -1942,6 +2469,19 @@ TEST_F(ContiguousSplitNestedTypesTest, StructsNoChildren) }); } +TEST_F(ContiguousSplitNestedTypesTest, StructsNoChildrenChunked) +{ + split_structs_no_children( + [](cudf::column_view const& c, std::vector const&) { + cudf::table_view t({c}); + return do_chunked_pack(t); + }, + [](cudf::column_view const& expected, cudf::packed_table const& result) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result.table.column(0)); + }, + /*split*/ false); +} + TEST_F(ContiguousSplitNestedTypesTest, StructsOfList) { split_nested_struct_of_list( @@ -1954,95 +2494,40 @@ TEST_F(ContiguousSplitNestedTypesTest, StructsOfList) }); } -TEST_F(ContiguousSplitNestedTypesTest, ListOfStruct) +TEST_F(ContiguousSplitNestedTypesTest, StructsOfListChunked) { - // List> - using LCW = cudf::test::lists_column_wrapper; - - // 1. String "names" column. - std::vector names{"Vimes", - "Carrot", - "Angua", - "Cheery", - "Detritus", - "Slant", - "Fred", - "Todd", - "Kevin", - "Jason", - "Clark", - "Bob", - "Mithun", - "Sameer", - "Tim", - "Mark", - "Herman", - "Will"}; - std::vector names_validity{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - cudf::test::strings_column_wrapper names_column(names.begin(), names.end()); - - // 2. Numeric "ages" column. - std::vector ages{5, 10, 15, 20, 25, 30, 100, 101, 102, 26, 64, 12, 17, 16, 120, 44, 23, 50}; - std::vector ages_validity = {1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0}; - auto ages_column = - cudf::test::fixed_width_column_wrapper(ages.begin(), ages.end(), ages_validity.begin()); - - // 3. 
List column - std::vector list_validity{1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1}; - cudf::test::lists_column_wrapper list( - {{"ab", "cd", "ef"}, - LCW{"gh"}, - {"ijk", "lmn"}, - LCW{}, - LCW{"o"}, - {"pqr", "stu", "vwx"}, - {"yz", "aaaa"}, - LCW{"bbbb"}, - {"cccc", "ddd", "eee", "fff", "ggg", "hh"}, - {"b", "cdr", "efh", "um"}, - LCW{"gh", "iu"}, - {"lmn"}, - LCW{"org"}, - LCW{}, - {"stu", "vwx"}, - {"yz", "aaaa", "kem"}, - LCW{"bbbb"}, - {"cccc", "eee", "faff", "jiea", "fff", "ggg", "hh"}}, - list_validity.begin()); - - // Assembly struct column - auto const struct_validity = - std::vector{1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1}; - auto struct_column = - cudf::test::structs_column_wrapper({names_column, ages_column, list}, struct_validity.begin()); - - // wrap in a list - std::vector outer_offsets{0, 3, 4, 8, 13, 16, 17, 18}; - cudf::test::fixed_width_column_wrapper outer_offsets_col(outer_offsets.begin(), - outer_offsets.end()); - std::vector outer_validity{1, 1, 1, 0, 1, 1, 0}; - auto [outer_null_mask, null_count] = - cudf::test::detail::make_null_mask(outer_validity.begin(), outer_validity.end()); - auto outer_list = make_lists_column(static_cast(outer_validity.size()), - outer_offsets_col.release(), - struct_column.release(), - null_count, - std::move(outer_null_mask)); + split_nested_struct_of_list( + [](cudf::column_view const& c, std::vector const&) { + cudf::table_view t({c}); + return do_chunked_pack(t); + }, + [](cudf::column_view const& expected, cudf::packed_table const& result) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result.table.column(0)); + }, + /*split*/ false); +} - // split - std::vector splits{1, 3, 7}; - cudf::table_view tbl({static_cast(*outer_list)}); - - // we are testing the results of contiguous_split against regular cudf::split, which may seem - // weird. however, cudf::split() is a simple operation that just sets offsets at the topmost - // output column, whereas contiguous_split is a deep copy of the data to contiguous output - // buffers. so as long as we believe the comparison code (expect_columns_equivalent) can compare - // these outputs correctly, this should be safe. 
- auto result = cudf::contiguous_split(tbl, splits); - auto expected = cudf::split(static_cast(*outer_list), splits); - CUDF_EXPECTS(result.size() == expected.size(), "Split result size mismatch"); +TEST_F(ContiguousSplitNestedTypesTest, ListOfStruct) +{ + split_nested_list_of_structs( + [](cudf::column_view const& c, std::vector const& splits) { + cudf::table_view t({c}); + return cudf::contiguous_split(t, splits); + }, + [](cudf::column_view const& expected, cudf::packed_table const& result) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result.table.column(0)); + }); +} - for (std::size_t index = 0; index < result.size(); index++) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected[index], result[index].table.column(0)); - } +TEST_F(ContiguousSplitNestedTypesTest, ListOfStructChunked) +{ + split_nested_list_of_structs( + [](cudf::column_view const& c, std::vector const&) { + cudf::table_view t({c}); + return do_chunked_pack(t); + }, + [](cudf::column_view const& expected, cudf::packed_table const& result) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result.table.column(0)); + }, + /*split*/ false); } From 3ea2844ffa9bbfd3ce756d001eb90af6b91e0243 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Mon, 1 May 2023 16:48:05 -0500 Subject: [PATCH 02/31] Fix code style --- cpp/src/copying/contiguous_split.cu | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index e94ce1d8156..4489428b32b 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1523,9 +1523,7 @@ std::unique_ptr make_chunk_iteration_state( size_of_batches_per_iteration.push_back(iter_size_bytes); accum_size_per_iteration.push_back(accum_size); - if (next_iteration_it == h_offsets.end()) { - break; - } + if (next_iteration_it == h_offsets.end()) { break; } current_offset_it += batches_in_iter; } From fe3bd771cff4c60fcbe23e5832a6985478259699 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Sat, 6 May 2023 06:19:07 -0500 Subject: [PATCH 03/31] Const fixes. Structured binding. Use buffer instead of bounce buffer --- cpp/include/cudf/contiguous_split.hpp | 21 ++++++++++++++------- cpp/src/copying/contiguous_split.cu | 12 ++++++------ 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/cpp/include/cudf/contiguous_split.hpp b/cpp/include/cudf/contiguous_split.hpp index eacfcfcb803..61edf6a3564 100644 --- a/cpp/include/cudf/contiguous_split.hpp +++ b/cpp/include/cudf/contiguous_split.hpp @@ -154,8 +154,15 @@ struct contiguous_split_state; * * auto stream = cudf::get_default_stream(); * - * // Define a bounce buffer size: the larger the bounce buffer is, the more SMs can be - * // occupied by this algorithm. + * // Define a buffer size for each chunk: the larger the buffer is, the more SMs can be + * // occupied by this algorithm. + * // + * // Internally, the GPU unit-of-work is a 1MB batch. When we instantiate `cudf::chunked_pack`, + * // all the 1MB batches for the source table_view are computed up front. Additionally, + * // chunked_pack calculates the number of iterations that are required to go through all those + * // batches given a `user_buffer_size` buffer. The number of 1MB batches in each iteration (chunk) + * // equals the number of CUDA blocks that will be used for the actual work. 
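+ * // As a rough illustration (arithmetic only, not a guarantee): with the 128MB
+ * // buffer chosen below, each iteration can cover up to 128 one-megabyte
+ * // batches, i.e. up to 128 CUDA blocks performing the copy per `next` call.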
+ * // * std::size_t user_buffer_size = 128*1024*1024; * * auto chunked_packer = make_chunked_pack(tv, user_buffer_size, stream, mr); @@ -187,7 +194,7 @@ class chunked_pack { * @brief Construct a `chunked_pack` class. * * @param input source `table_view` to pack - * @param user_buffer_size bounce buffer size (in bytes) that will be passed on `next`. Must be + * @param user_buffer_size buffer size (in bytes) that will be passed on `next`. Must be * at least 1MB * @param stream CUDA stream used for device memory operations and kernel launches * @param mr RMM memory resource to be used for temporary and scratch allocations only @@ -225,8 +232,8 @@ class chunked_pack { * @throws cudf::logic_error If the size of `user_buffer` is different than `user_buffer_size` * @throws cudf::logic_error If called after all chunks have been copied * - * @param user_buffer device span representing a bounce buffer. The size of this span - * must equal the `user_buffer_size` parameter passed at construction + * @param user_buffer device span target for the chunk. The size of this span must equal + * the `user_buffer_size` parameter passed at construction * @return The number of bytes that were written to `user_buffer` (at most * `user_buffer_size`) * @@ -247,13 +254,13 @@ class chunked_pack { /** * @brief Created a `chunked_pack` instance to perform a "pack" of the `table_view` - * "input", where a bounce buffer of `user_buffer_size` is filled with chunks of the + * "input", where a buffer of `user_buffer_size` is filled with chunks of the * overall operation. This operation can be used in cases where GPU memory is constrained. * * @throws cudf::logic_error When user_buffer_size is less than 1MB * * @param input source `table_view` to pack - * @param user_buffer_size bounce buffer size (in bytes) that will be passed on `next`. Must be + * @param user_buffer_size buffer size (in bytes) that will be passed on `next`. Must be * at least 1MB * @param mr RMM memory resource to be used for temporary and scratch allocations only * @return a unique_ptr of chunked_pack diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 4489428b32b..4674321436f 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -437,7 +437,7 @@ template size_type count_src_bufs(InputIter begin, InputIter end) { auto buf_iter = thrust::make_transform_iterator(begin, [](column_view const& col) { - auto children_counts = count_src_bufs(col.child_begin(), col.child_end()); + auto const children_counts = count_src_bufs(col.child_begin(), col.child_end()); return 1 + (col.nullable() ? 1 : 0) + children_counts; }); return std::accumulate(buf_iter, buf_iter + std::distance(begin, end), 0); @@ -793,7 +793,7 @@ BufInfo build_output_columns(InputIter begin, size_type col_size, null_count; int64_t bitmask_offset; int64_t data_offset; - std::tie(col_size, data_offset, bitmask_offset, null_count) = + auto [col_size, data_offset, bitmask_offset, null_count] = build_output_column_metadata(src, current_info, mb, false); auto bitmask_ptr = @@ -1504,7 +1504,7 @@ std::unique_ptr make_chunk_iteration_state( std::lower_bound(current_offset_it, h_offsets.end(), // We add the cumulative size + 1 because we want to find what would fit - // within a bounce buffer of user_buffer_size (up to user_buffer_size). + // within a buffer of user_buffer_size (up to user_buffer_size). 
// Since h_offsets is a prefix scan, we add the size we accumulated so // far so we are looking for the next user_buffer_sized boundary. user_buffer_size + accum_size + 1); @@ -1570,7 +1570,7 @@ std::unique_ptr make_chunk_iteration_state( } else { // we instantiate an "iteration state" for the regular single pass contiguous_split // consisting of 1 iteration with all of the batches and totalling `total_size` bytes. - auto total_size = std::reduce(h_buf_sizes, h_buf_sizes + num_partitions); + auto const total_size = std::reduce(h_buf_sizes, h_buf_sizes + num_partitions); // 1 iteration with the whole size return std::make_unique( @@ -1819,7 +1819,7 @@ struct contiguous_split_state { CUDF_EXPECTS(has_next(), "Cannot call contiguous_split_chunk with has_next() == false!"); std::size_t starting_batch, num_batches_to_copy; - std::tie(starting_batch, num_batches_to_copy) = + auto [starting_batch, num_batches_to_copy] = chunk_iter_state->get_current_starting_index_and_buff_count(); // perform the copy. @@ -1962,7 +1962,7 @@ struct contiguous_split_state { // contiguous_split will behave in a "chunked" mode in this scenario, as it will contiguously // copy up until the user's buffer size limit, exposing a next() call for the user to invoke. // Note that in this mode, contig split is not partitioning the original table, it is instead - // only placing cuDF buffers contiguously in the user's bounce buffer. + // only placing cuDF buffers contiguously in the user's buffer. // // - single shot contiguous_split (default): when the user doesn't provide their own buffer, // contiguous_split will allocate a buffer per partition and will place contiguous results in From 4a4da2f8d57ff68f17d48e6831f09c112aaf6516 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Sat, 6 May 2023 06:20:16 -0500 Subject: [PATCH 04/31] Use rmm::device_uvector instead of rmm::device_buffer in benchmark --- cpp/benchmarks/copying/contiguous_split.cu | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/cpp/benchmarks/copying/contiguous_split.cu b/cpp/benchmarks/copying/contiguous_split.cu index 5b95370faf0..daa80cdd425 100644 --- a/cpp/benchmarks/copying/contiguous_split.cu +++ b/cpp/benchmarks/copying/contiguous_split.cu @@ -32,14 +32,12 @@ void contiguous_split(cudf::table_view const& src_table, std::vector const&) { - auto mr = rmm::mr::get_current_device_resource(); - auto stream = cudf::get_default_stream(); - rmm::device_buffer user_buffer(100L * 1024 * 1024, stream, mr); + auto const mr = rmm::mr::get_current_device_resource(); + auto const stream = cudf::get_default_stream(); + auto user_buffer = rmm::device_uvector(100L * 1024 * 1024, stream, mr); auto chunked_pack = cudf::make_chunked_pack(src_table, user_buffer.size(), mr); - auto user_buffer_span = - cudf::device_span(static_cast(user_buffer.data()), user_buffer.size()); while (chunked_pack->has_next()) { - auto iter_size = chunked_pack->next(user_buffer_span); + auto iter_size = chunked_pack->next(user_buffer); } stream.synchronize(); } From bd87dd5d3fa1039090c53af4d7cb7b6c9cd130e5 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Sun, 7 May 2023 13:06:07 -0500 Subject: [PATCH 05/31] Address review comments --- cpp/include/cudf/contiguous_split.hpp | 6 +- cpp/src/copying/contiguous_split.cu | 176 +++++++++++++++----------- 2 files changed, 102 insertions(+), 80 deletions(-) diff --git a/cpp/include/cudf/contiguous_split.hpp b/cpp/include/cudf/contiguous_split.hpp index 61edf6a3564..fe6e7bc3e96 100644 --- 
a/cpp/include/cudf/contiguous_split.hpp +++ b/cpp/include/cudf/contiguous_split.hpp @@ -155,13 +155,13 @@ struct contiguous_split_state; * auto stream = cudf::get_default_stream(); * * // Define a buffer size for each chunk: the larger the buffer is, the more SMs can be - * // occupied by this algorithm. + * // occupied by this algorithm. * // * // Internally, the GPU unit-of-work is a 1MB batch. When we instantiate `cudf::chunked_pack`, - * // all the 1MB batches for the source table_view are computed up front. Additionally, + * // all the 1MB batches for the source table_view are computed up front. Additionally, * // chunked_pack calculates the number of iterations that are required to go through all those * // batches given a `user_buffer_size` buffer. The number of 1MB batches in each iteration (chunk) - * // equals the number of CUDA blocks that will be used for the actual work. + * // equals the number of CUDA blocks that will be used for the main kernel launch. * // * std::size_t user_buffer_size = 128*1024*1024; * diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 4674321436f..f75d1bf5297 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -51,10 +51,15 @@ namespace cudf { namespace { -// align all column size allocations to this boundary so that all output column buffers +// Align all column size allocations to this boundary so that all output column buffers // start at that alignment. static constexpr std::size_t split_align = 64; +// The size that contiguous split uses internally as the GPU unit of work. +// The number of `desired_batch_size` batches equals the number of CUDA blocks +// that will be used for the main kernel launch (`copy_partitions`). +static constexpr std::size_t desired_batch_size = 1 * 1024 * 1024; + /** * @brief Struct which contains information on a source buffer. * @@ -268,51 +273,13 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst, * to be done as evenly as possible across the multiprocessors on the device. * This kernel is arranged such that each block copies 1 source/destination pair. * + * @param index_to_buffer A function that given a `buf_index` returns the destination buffer * @param src_bufs Input source buffers - * @param dst_bufs Destination buffers - * @param buf_info Information on the range of values to be copied for each destination buffer - */ -template -__global__ void copy_partitions(uint8_t const** src_bufs, - uint8_t** dst_bufs, - dst_buf_info* buf_info) -{ - auto const buf_index = blockIdx.x; - auto const src_buf_index = buf_info[buf_index].src_buf_index; - auto const dst_buf_index = buf_info[buf_index].dst_buf_index; - - // copy, shifting offsets and validity bits as needed - copy_buffer( - dst_bufs[dst_buf_index] + buf_info[buf_index].dst_offset, - src_bufs[src_buf_index], - threadIdx.x, - buf_info[buf_index].num_elements, - buf_info[buf_index].element_size, - buf_info[buf_index].src_element_index, - blockDim.x, - buf_info[buf_index].value_shift, - buf_info[buf_index].bit_shift, - buf_info[buf_index].num_rows, - buf_info[buf_index].valid_count > 0 ? &buf_info[buf_index].valid_count : nullptr); -} - -/** - * @brief Kernel which copies data from multiple source buffers to multiple - * destination buffers. - * - * When doing a contiguous_split on X columns comprising N total internal buffers - * with M splits, we end up having to copy N*M source/destination buffer pairs. 
- * These copies are further subdivided into batches to distribute the amount of work - * to be done as evenly as possible across the multiprocessors on the device. - * This kernel is arranged such that each block copies 1 source/destination pair. - * - * @param src_bufs Input source buffers - * @param dst_bufs Destination buffers * @param buf_info Information on the range of values to be copied for each destination buffer */ -template -__global__ void copy_partitions(uint8_t const** src_bufs, - uint8_t* user_buffer, +template +__global__ void copy_partitions(IndexToDstBuf index_to_buffer, + uint8_t const** src_bufs, dst_buf_info* buf_info) { auto const buf_index = blockIdx.x; @@ -320,7 +287,7 @@ __global__ void copy_partitions(uint8_t const** src_bufs, // copy, shifting offsets and validity bits as needed copy_buffer( - user_buffer + buf_info[buf_index].dst_offset, + index_to_buffer(buf_index) + buf_info[buf_index].dst_offset, src_bufs[src_buf_index], threadIdx.x, buf_info[buf_index].num_elements, @@ -776,7 +743,6 @@ std::tuple build_output_column_metadata( * copied buffer * @param out_begin Output iterator of column views * @param base_ptr Pointer to the base address of copied data for the working partition - * @param mb packed column metadata builder * * @returns new dst_buf_info iterator after processing this range of input columns */ @@ -790,9 +756,6 @@ BufInfo build_output_columns(InputIter begin, { auto current_info = info_begin; std::transform(begin, end, out_begin, [¤t_info, base_ptr, &mb](column_view const& src) { - size_type col_size, null_count; - int64_t bitmask_offset; - int64_t data_offset; auto [col_size, data_offset, bitmask_offset, null_count] = build_output_column_metadata(src, current_info, mb, false); @@ -990,10 +953,6 @@ struct out_to_in_index_function { } }; -}; // anonymous namespace - -namespace detail { - // packed block of memory 1: split indices and src_buf_info structs struct packed_split_indices_and_src_buf_info { explicit packed_split_indices_and_src_buf_info(cudf::table_view const& input, @@ -1114,7 +1073,6 @@ struct packed_src_and_dst_pointers { packed_src_and_dst_pointers(cudf::table_view const& input, std::size_t num_partitions, cudf::size_type num_src_bufs, - int num_iterations, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) : stream(stream) @@ -1159,28 +1117,61 @@ struct packed_src_and_dst_pointers { uint8_t** d_dst_bufs; }; +}; // anonymous namespace + +namespace detail { + +/** + * @brief Create an instance of `packed_src_and_dst_pointers` populating destination + * partitition buffers (if any) from `out_buffers`. In the chunked_pack case + * `out_buffers` is empty, and the destination pointer is provided separately + * to the `copy_partitions` kernel. 
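+ * In that mode the kernel resolves each batch's destination through the
+ * `index_to_buffer` callable introduced earlier in this patch.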
+ *
+ * @param input source table view
+ * @param num_partitions the number of partitions to create (1 meaning no splits)
+ * @param num_src_bufs number of buffers for the source columns including children
+ * @param out_buffers the destination buffers per partition in the non-chunked case
+ * @param stream Optional CUDA stream on which to execute kernels
+ * @param mr RMM memory resource
+ *
+ * @returns new unique pointer to packed_src_and_dst_pointers
+ */
 std::unique_ptr<packed_src_and_dst_pointers> setup_src_and_dst_pointers(
   cudf::table_view const& input,
   std::size_t num_partitions,
   cudf::size_type num_src_bufs,
-  int num_iterations,
   std::vector<rmm::device_buffer>& out_buffers,
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr)
 {
-  auto src_and_dst_pointers = std::make_unique<packed_src_and_dst_pointers>(
-    input, num_partitions, num_src_bufs, num_iterations, stream, mr);
+  auto src_and_dst_pointers =
+    std::make_unique<packed_src_and_dst_pointers>(input, num_partitions, num_src_bufs, stream, mr);
 
   std::transform(
     out_buffers.begin(), out_buffers.end(), src_and_dst_pointers->h_dst_bufs, [](auto& buf) {
       return static_cast<uint8_t*>(buf.data());
     });
 
+  // copy the struct to device memory to access from the kernel
   src_and_dst_pointers->copy_to_device();
 
   return src_and_dst_pointers;
 }
 
+/**
+ * @brief Create an instance of `packed_partition_buf_size_and_dst_buf_info` containing
+ * the partition-level dst_buf_info structs for each partition and column buffer.
+ *
+ * @param input source table view
+ * @param splits the numeric value (in rows) for each split, empty for 1 partition
+ * @param num_partitions the number of partitions to create (1 meaning no splits)
+ * @param num_src_bufs number of buffers for the source columns including children
+ * @param num_bufs num_src_bufs times the number of partitions
+ * @param stream Optional CUDA stream on which to execute kernels
+ * @param mr RMM memory resource
+ *
+ * @returns new unique pointer to `packed_partition_buf_size_and_dst_buf_info`
+ */
 std::unique_ptr<packed_partition_buf_size_and_dst_buf_info> compute_splits(
   cudf::table_view const& input,
   std::vector<size_type> const& splits,
@@ -1341,7 +1332,7 @@ struct chunk_iteration_state {
         std::size_t total_size)
     : num_iterations(_h_num_buffs_per_iteration.size()),
       current_iteration(0),
-      starting_buff(0),
+      starting_batch(0),
       d_batched_dst_buf_info(std::move(_d_batched_dst_buf_info)),
       d_batch_offsets(std::move(_d_batch_offsets)),
       h_num_buffs_per_iteration(std::move(_h_num_buffs_per_iteration)),
@@ -1350,12 +1341,18 @@ struct chunk_iteration_state {
   {
   }
 
+  /**
+   * @brief Return the starting 1MB batch index and the number of batches to copy
+   * for the current iteration, as of the time of the call.
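+   * Both values advance as `advance_iteration` is called.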
+ * + * @return the current iteration's starting_batch and batch count as a pair + */ std::pair get_current_starting_index_and_buff_count() const { CUDF_EXPECTS(current_iteration < num_iterations, "current_iteration cannot exceed num_iterations"); auto count_for_current = h_num_buffs_per_iteration[current_iteration]; - return std::make_pair(starting_buff, count_for_current); + return std::make_pair(starting_batch, count_for_current); } std::size_t advance_iteration() @@ -1363,7 +1360,7 @@ struct chunk_iteration_state { CUDF_EXPECTS(current_iteration < num_iterations, "current_iteration cannot exceed num_iterations"); std::size_t bytes_copied = h_size_of_buffs_per_iteration[current_iteration]; - starting_buff += h_num_buffs_per_iteration[current_iteration]; + starting_batch += h_num_buffs_per_iteration[current_iteration]; ++current_iteration; return bytes_copied; } @@ -1377,7 +1374,7 @@ struct chunk_iteration_state { int current_iteration; private: - std::size_t starting_buff; + std::size_t starting_batch; std::vector h_num_buffs_per_iteration; std::vector h_size_of_buffs_per_iteration; }; @@ -1466,6 +1463,14 @@ std::unique_ptr make_chunk_iteration_state( // underneath the final structure of the output }); + /** + * In the chunked case, this is the code that fixes up the offsets of each batch + * and prepares each iteration. Given the batches computed before, it figures + * out the number of batches that will fit in an iteration of `user_buffer_size`. + * + * Specifically, offsets for batches are reset to the 0th byte when a new iteration + * of `user_buffer_size` bytes is needed. + */ if (user_buffer_size != 0) { // copy the batch offsets back to host std::vector h_offsets(num_batches + 1); @@ -1582,6 +1587,25 @@ std::unique_ptr make_chunk_iteration_state( } } +// template +// struct index_to_buffer_func { +// uint8_t **d_dst_bufs; +// rmm::device_uvector&d_dst_buf_info; +// uint8_t *user_buffer; + +// typename std::enable_if +// __device__ operator()(unsigned int) const +// { +// return user_buffer; +// } + +// typename std::enable_if +// __device__ operator()(unsigned int) const +// { +// auto const dst_buf_index = dst_buf_info[buf_index].dst_buf_index; +// return d_dst_bufs[dst_buf_index]; +// } +//}; void copy_data(int num_batches_to_copy, int starting_batch, uint8_t const** d_src_bufs, @@ -1592,11 +1616,18 @@ void copy_data(int num_batches_to_copy, { constexpr size_type block_size = 256; if (user_buffer != nullptr) { + auto index_to_buffer = [user_buffer] __device__(unsigned int) { return user_buffer; }; copy_partitions<<>>( - d_src_bufs, user_buffer, d_dst_buf_info.data() + starting_batch); + index_to_buffer, d_src_bufs, d_dst_buf_info.data() + starting_batch); } else { + auto index_to_buffer = [d_dst_bufs, + dst_buf_info = d_dst_buf_info.data(), + user_buffer] __device__(unsigned int buf_index) { + auto const dst_buf_index = dst_buf_info[buf_index].dst_buf_index; + return d_dst_bufs[dst_buf_index]; + }; copy_partitions<<>>( - d_src_bufs, d_dst_bufs, d_dst_buf_info.data() + starting_batch); + index_to_buffer, d_src_bufs, d_dst_buf_info.data() + starting_batch); } } @@ -1648,8 +1679,6 @@ bool check_inputs(cudf::table_view const& input, std::vector const& s * None of the methods are thread safe. 
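 *
 * As an illustrative sketch (the public wrapper is `cudf::chunked_pack`, and
 * `user_span` here is a placeholder for the caller's device span): the chunked
 * path reduces to
 *   while (state.has_next()) { bytes += state.contiguous_split_chunk(user_span); }
 * whereas the single-pass path copies every batch in one shot and returns the
 * packed tables directly.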
*/ struct contiguous_split_state { - static const std::size_t desired_batch_size = 1 * 1024 * 1024; - contiguous_split_state(cudf::table_view const& input, std::size_t user_buffer_size, rmm::cuda_stream_view stream, @@ -1700,13 +1729,8 @@ struct contiguous_split_state { }); } - src_and_dst_pointers = std::move(setup_src_and_dst_pointers(input, - num_partitions, - num_src_bufs, - chunk_iter_state->num_iterations, - out_buffers, - stream, - mr)); + src_and_dst_pointers = std::move( + setup_src_and_dst_pointers(input, num_partitions, num_src_bufs, out_buffers, stream, mr)); } bool has_next() const { return !is_empty && chunk_iter_state->has_more_copies(); } @@ -1724,15 +1748,14 @@ struct contiguous_split_state { // them into much smaller batches in order to drive up the number of blocks and overall // occupancy. rmm::device_uvector> batches(num_bufs, stream, mr); - auto d_dst_buf_info = partition_buf_size_and_dst_buf_info->d_dst_buf_info; - auto h_buf_sizes = partition_buf_size_and_dst_buf_info->h_buf_sizes; - auto desired_batch_size = contiguous_split_state::desired_batch_size; + auto d_dst_buf_info = partition_buf_size_and_dst_buf_info->d_dst_buf_info; + auto h_buf_sizes = partition_buf_size_and_dst_buf_info->h_buf_sizes; thrust::transform( rmm::exec_policy(stream, mr), d_dst_buf_info, d_dst_buf_info + num_bufs, batches.begin(), - [desired_batch_size] __device__( + [desired_batch_size = desired_batch_size] __device__( dst_buf_info const& buf) -> thrust::pair { // Total bytes for this incoming partition std::size_t const bytes = @@ -1818,7 +1841,6 @@ struct contiguous_split_state { "Cannot use a device span smaller than the output buffer size configured at instantiation!"); CUDF_EXPECTS(has_next(), "Cannot call contiguous_split_chunk with has_next() == false!"); - std::size_t starting_batch, num_batches_to_copy; auto [starting_batch, num_batches_to_copy] = chunk_iter_state->get_current_starting_index_and_buff_count(); @@ -2024,7 +2046,7 @@ std::unique_ptr make_chunked_pack(cudf::table_view const& input, std::size_t user_buffer_size, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(user_buffer_size >= detail::contiguous_split_state::desired_batch_size, + CUDF_EXPECTS(user_buffer_size >= desired_batch_size, "The output buffer size must be at least 1MB in size"); return std::make_unique(input, user_buffer_size, cudf::get_default_stream(), mr); } From 3b83b03ac8483d3935849340603bb9226aa8ce5c Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Sun, 7 May 2023 16:03:12 -0500 Subject: [PATCH 06/31] Add more comments in the contiguous_split_state constructor --- cpp/src/copying/contiguous_split.cu | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index f75d1bf5297..d18117fba39 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1712,9 +1712,13 @@ struct contiguous_split_state { // is the result. if (is_empty) { return; } + // First pass over the source tables to generate a `dst_buf_info` per split and column buffer + // (`num_bufs`). After this, contiguous_split uses `dst_buf_info` to further subdivide the work + // into 1MB batches in `compute_batches` partition_buf_size_and_dst_buf_info = std::move(compute_splits(input, splits, num_partitions, num_src_bufs, num_bufs, stream, mr)); + // Second pass: uses `dst_buf_info` to break down the work into 1MB batches. 
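+    // (1MB here is `desired_batch_size`, the GPU unit of work defined at the top
+    // of this file.)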
compute_batches(); // allocate output partition buffers, in the non-chunked case From 425a3132f465d0257440983e054ddfb048db9216 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Mon, 8 May 2023 22:53:07 -0500 Subject: [PATCH 07/31] Use std::get<> and remove commented out code --- cpp/src/copying/contiguous_split.cu | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index d18117fba39..8e531a8de99 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1587,25 +1587,6 @@ std::unique_ptr make_chunk_iteration_state( } } -// template -// struct index_to_buffer_func { -// uint8_t **d_dst_bufs; -// rmm::device_uvector&d_dst_buf_info; -// uint8_t *user_buffer; - -// typename std::enable_if -// __device__ operator()(unsigned int) const -// { -// return user_buffer; -// } - -// typename std::enable_if -// __device__ operator()(unsigned int) const -// { -// auto const dst_buf_index = dst_buf_info[buf_index].dst_buf_index; -// return d_dst_bufs[dst_buf_index]; -// } -//}; void copy_data(int num_batches_to_copy, int starting_batch, uint8_t const** d_src_bufs, @@ -1788,9 +1769,8 @@ struct contiguous_split_state { CUDF_EXPECTS(user_buffer_size == 0, "Cannot contiguous split with a user buffer"); if (is_empty || input.num_columns() == 0) { return make_packed_tables(); } - std::size_t num_batches_total; - std::tie(std::ignore, num_batches_total) = - chunk_iter_state->get_current_starting_index_and_buff_count(); + auto const num_batches_total = std::get<1>( + chunk_iter_state->get_current_starting_index_and_buff_count()); // perform the copy. copy_data(num_batches_total, From cc9acb413f3fc836282438dc47bc3e4644cf2e5a Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Tue, 9 May 2023 08:51:30 -0500 Subject: [PATCH 08/31] fix styles --- cpp/src/copying/contiguous_split.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 8e531a8de99..64da96e4ce4 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1769,8 +1769,8 @@ struct contiguous_split_state { CUDF_EXPECTS(user_buffer_size == 0, "Cannot contiguous split with a user buffer"); if (is_empty || input.num_columns() == 0) { return make_packed_tables(); } - auto const num_batches_total = std::get<1>( - chunk_iter_state->get_current_starting_index_and_buff_count()); + auto const num_batches_total = + std::get<1>(chunk_iter_state->get_current_starting_index_and_buff_count()); // perform the copy. 
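     // In the non-chunked case there is exactly one iteration containing all of
     // the batches, so this single call copies the entire output.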
copy_data(num_batches_total, From 9b3a591d4938e9b8a0eb0826b2b177202c3ae05b Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Wed, 10 May 2023 14:57:45 -0500 Subject: [PATCH 09/31] chunked_pack::create and address other reviewer feedback --- cpp/benchmarks/copying/contiguous_split.cu | 2 +- cpp/include/cudf/contiguous_split.hpp | 37 +++++++++++----------- cpp/src/copying/contiguous_split.cu | 21 ++++++------ cpp/tests/copying/split_tests.cpp | 10 +++--- 4 files changed, 33 insertions(+), 37 deletions(-) diff --git a/cpp/benchmarks/copying/contiguous_split.cu b/cpp/benchmarks/copying/contiguous_split.cu index daa80cdd425..89fb4d57088 100644 --- a/cpp/benchmarks/copying/contiguous_split.cu +++ b/cpp/benchmarks/copying/contiguous_split.cu @@ -35,7 +35,7 @@ void chunked_pack(cudf::table_view const& src_table, std::vector(100L * 1024 * 1024, stream, mr); - auto chunked_pack = cudf::make_chunked_pack(src_table, user_buffer.size(), mr); + auto chunked_pack = cudf::chunked_pack::create(src_table, user_buffer.size(), mr); while (chunked_pack->has_next()) { auto iter_size = chunked_pack->next(user_buffer); } diff --git a/cpp/include/cudf/contiguous_split.hpp b/cpp/include/cudf/contiguous_split.hpp index fe6e7bc3e96..7296b6905a3 100644 --- a/cpp/include/cudf/contiguous_split.hpp +++ b/cpp/include/cudf/contiguous_split.hpp @@ -165,7 +165,7 @@ struct contiguous_split_state; * // * std::size_t user_buffer_size = 128*1024*1024; * - * auto chunked_packer = make_chunked_pack(tv, user_buffer_size, stream, mr); + * auto chunked_packer = cudf::chunked_pack::create(tv, user_buffer_size, stream, mr); * * std::size_t host_offset = 0; * auto host_buffer = ...; // obtain a host buffer you would like to copy to @@ -236,7 +236,6 @@ class chunked_pack { * the `user_buffer_size` parameter passed at construction * @return The number of bytes that were written to `user_buffer` (at most * `user_buffer_size`) - * */ [[nodiscard]] std::size_t next(cudf::device_span const& user_buffer); @@ -247,28 +246,28 @@ class chunked_pack { */ [[nodiscard]] std::unique_ptr> build_metadata() const; + /** + * @brief Creates a `chunked_pack` instance to perform a "pack" of the `table_view` + * "input", where a buffer of `user_buffer_size` is filled with chunks of the + * overall operation. This operation can be used in cases where GPU memory is constrained. + * + * @throws cudf::logic_error When user_buffer_size is less than 1MB + * + * @param input source `table_view` to pack + * @param user_buffer_size buffer size (in bytes) that will be passed on `next`. Must be + * at least 1MB + * @param mr RMM memory resource to be used for temporary and scratch allocations only + * @return a unique_ptr of chunked_pack + */ + [[nodiscard]] static std::unique_ptr create(cudf::table_view const& input, + std::size_t user_buffer_size, + rmm::mr::device_memory_resource* mr); + private: // internal state of contiguous split std::unique_ptr state; }; -/** - * @brief Created a `chunked_pack` instance to perform a "pack" of the `table_view` - * "input", where a buffer of `user_buffer_size` is filled with chunks of the - * overall operation. This operation can be used in cases where GPU memory is constrained. - * - * @throws cudf::logic_error When user_buffer_size is less than 1MB - * - * @param input source `table_view` to pack - * @param user_buffer_size buffer size (in bytes) that will be passed on `next`. 
Must be - * at least 1MB - * @param mr RMM memory resource to be used for temporary and scratch allocations only - * @return a unique_ptr of chunked_pack - */ -std::unique_ptr make_chunked_pack(cudf::table_view const& input, - std::size_t user_buffer_size, - rmm::mr::device_memory_resource* mr); - /** * @brief Deep-copy a `table_view` into a serialized contiguous memory format. * diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 64da96e4ce4..eb08cbb867f 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -759,7 +759,7 @@ BufInfo build_output_columns(InputIter begin, auto [col_size, data_offset, bitmask_offset, null_count] = build_output_column_metadata(src, current_info, mb, false); - auto bitmask_ptr = + auto const bitmask_ptr = base_ptr != nullptr && bitmask_offset != -1 ? reinterpret_cast(base_ptr + static_cast(bitmask_offset)) : nullptr; @@ -927,14 +927,11 @@ struct batch_byte_size_function { dst_buf_info const* infos; __device__ std::size_t operator()(size_type i) const { - if (i == num_batches) { - return 0; - } else { - auto& buf = *(infos + i); - std::size_t const bytes = - static_cast(buf.num_elements) * static_cast(buf.element_size); - return util::round_up_unsafe(bytes, split_align); - } + if (i == num_batches) { return 0; } + auto& buf = *(infos + i); + std::size_t const bytes = + static_cast(buf.num_elements) * static_cast(buf.element_size); + return util::round_up_unsafe(bytes, split_align); } }; @@ -2026,9 +2023,9 @@ std::unique_ptr> chunked_pack::build_metadata() const return state->build_packed_column_metadata(); } -std::unique_ptr make_chunked_pack(cudf::table_view const& input, - std::size_t user_buffer_size, - rmm::mr::device_memory_resource* mr) +std::unique_ptr chunked_pack::create(cudf::table_view const& input, + std::size_t user_buffer_size, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(user_buffer_size >= desired_batch_size, "The output buffer size must be at least 1MB in size"); diff --git a/cpp/tests/copying/split_tests.cpp b/cpp/tests/copying/split_tests.cpp index 51c42bd7cf0..73da4510dfc 100644 --- a/cpp/tests/copying/split_tests.cpp +++ b/cpp/tests/copying/split_tests.cpp @@ -1322,7 +1322,7 @@ std::vector do_chunked_pack(cudf::table_view const& input) auto bounce_buff_span = cudf::device_span(static_cast(bounce_buff.data()), bounce_buff.size()); - auto chunked_pack = cudf::make_chunked_pack(input, bounce_buff_span.size(), mr); + auto chunked_pack = cudf::chunked_pack::create(input, bounce_buff_span.size(), mr); // right size the final buffer rmm::device_buffer final_buff( @@ -2309,12 +2309,12 @@ TEST_F(ContiguousSplitTableCornerCases, OutBufferToSmall) { // internally, contiguous split chunks GPU work in 1MB contiguous copies // so the output buffer must be 1MB or larger. 
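   // (the 1 * 1024 below is 1KB, deliberately under the 1MB minimum)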
- EXPECT_THROW(cudf::make_chunked_pack({}, 1 * 1024, mr()), cudf::logic_error); + EXPECT_THROW(cudf::chunked_pack::create({}, 1 * 1024, mr()), cudf::logic_error); } TEST_F(ContiguousSplitTableCornerCases, ChunkSpanTooSmall) { - auto chunked_pack = cudf::make_chunked_pack({}, 1 * 1024 * 1024, mr()); + auto chunked_pack = cudf::chunked_pack::create({}, 1 * 1024 * 1024, mr()); rmm::device_buffer buff(1 * 1024, cudf::get_default_stream(), mr()); cudf::device_span too_small(static_cast(buff.data()), buff.size()); std::size_t copied = 0; @@ -2326,7 +2326,7 @@ TEST_F(ContiguousSplitTableCornerCases, ChunkSpanTooSmall) TEST_F(ContiguousSplitTableCornerCases, EmptyTableHasNextFalse) { - auto chunked_pack = cudf::make_chunked_pack({}, 1 * 1024 * 1024, mr()); + auto chunked_pack = cudf::chunked_pack::create({}, 1 * 1024 * 1024, mr()); rmm::device_buffer buff(1 * 1024 * 1024, cudf::get_default_stream(), mr()); cudf::device_span bounce_buff(static_cast(buff.data()), buff.size()); EXPECT_EQ(chunked_pack->has_next(), false); // empty input table @@ -2341,7 +2341,7 @@ TEST_F(ContiguousSplitTableCornerCases, ExhaustedHasNextFalse) cudf::table_view t({a}); rmm::device_buffer buff(1 * 1024 * 1024, cudf::get_default_stream(), mr()); cudf::device_span bounce_buff(static_cast(buff.data()), buff.size()); - auto chunked_pack = cudf::make_chunked_pack(t, buff.size(), mr()); + auto chunked_pack = cudf::chunked_pack::create(t, buff.size(), mr()); EXPECT_EQ(chunked_pack->has_next(), true); std::size_t copied = chunked_pack->next(bounce_buff); EXPECT_EQ(copied, chunked_pack->get_total_contiguous_size()); From 8f9dbbd9accf489eb29ec8b1cd8c60419b2104eb Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Thu, 11 May 2023 20:54:53 -0500 Subject: [PATCH 10/31] Update due to code review comments --- cpp/src/copying/contiguous_split.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index eb08cbb867f..8f9f17e1755 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -42,7 +42,6 @@ #include #include #include -#include #include #include @@ -723,7 +722,7 @@ std::tuple build_output_column_metadata( src.type(), col_size, null_count, data_offset, bitmask_offset, src.num_children()); ++current_info; - return std::make_tuple(col_size, data_offset, bitmask_offset, null_count); + return {col_size, data_offset, bitmask_offset, null_count}; } /** From e2ca459faa94f9564cbf325e425a4e7708e1f35f Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Fri, 12 May 2023 13:55:45 -0500 Subject: [PATCH 11/31] Const fixes as suggested in review --- cpp/src/copying/contiguous_split.cu | 89 ++++++++++++++--------------- 1 file changed, 44 insertions(+), 45 deletions(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 8f9f17e1755..55aee27d6db 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -833,8 +833,8 @@ struct buf_size_functor { * The key is simply the partition index. */ struct split_key_functor { - int num_src_bufs; - int operator() __device__(int buf_index) { return buf_index / num_src_bufs; } + int const num_src_bufs; + int operator() __device__(int buf_index) const { return buf_index / num_src_bufs; } }; /** @@ -914,7 +914,7 @@ struct size_of_helper { * structs) return 0. 
*/ struct num_batches_func { - thrust::pair const* batches; + thrust::pair const* const batches; __device__ std::size_t operator()(size_type i) const { return thrust::get<0>(batches[i]); } }; @@ -922,12 +922,12 @@ struct num_batches_func { * @brief Get the size in bytes of a batch described by `dst_buf_info`. */ struct batch_byte_size_function { - size_type num_batches; - dst_buf_info const* infos; + size_type const num_batches; + dst_buf_info const* const infos; __device__ std::size_t operator()(size_type i) const { if (i == num_batches) { return 0; } - auto& buf = *(infos + i); + auto const& buf = *(infos + i); std::size_t const bytes = static_cast(buf.num_elements) * static_cast(buf.element_size); return util::round_up_unsafe(bytes, split_align); @@ -938,8 +938,8 @@ struct batch_byte_size_function { * @brief Get the input buffer index given the output buffer index. */ struct out_to_in_index_function { - offset_type const* batch_offsets; - int num_bufs; + offset_type const* const batch_offsets; + int const num_bufs; __device__ int operator()(size_type i) const { return static_cast( @@ -1018,10 +1018,10 @@ struct packed_partition_buf_size_and_dst_buf_info { std::size_t num_bufs, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : stream(stream) + : stream(stream), + buf_sizes_size{cudf::util::round_up_safe(num_partitions * sizeof(std::size_t), split_align)}, + dst_buf_info_size{cudf::util::round_up_safe(num_bufs * sizeof(dst_buf_info), split_align)} { - buf_sizes_size = cudf::util::round_up_safe(num_partitions * sizeof(std::size_t), split_align); - dst_buf_info_size = cudf::util::round_up_safe(num_bufs * sizeof(dst_buf_info), split_align); // host-side h_buf_sizes_and_dst_info = std::vector(buf_sizes_size + dst_buf_info_size); h_buf_sizes = reinterpret_cast(h_buf_sizes_and_dst_info.data()); @@ -1047,11 +1047,11 @@ struct packed_partition_buf_size_and_dst_buf_info { stream.value())); } - rmm::cuda_stream_view stream; + rmm::cuda_stream_view const stream; // buffer sizes and destination info (used in batched copies) - std::size_t buf_sizes_size; - std::size_t dst_buf_info_size; + std::size_t const buf_sizes_size; + std::size_t const dst_buf_info_size; std::vector h_buf_sizes_and_dst_info; std::size_t* h_buf_sizes; @@ -1071,12 +1071,10 @@ struct packed_src_and_dst_pointers { cudf::size_type num_src_bufs, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : stream(stream) + : stream(stream), + src_bufs_size{cudf::util::round_up_safe(num_src_bufs * sizeof(uint8_t*), split_align)}, + dst_bufs_size{cudf::util::round_up_safe(num_partitions * sizeof(uint8_t*), split_align)} { - src_bufs_size = cudf::util::round_up_safe(num_src_bufs * sizeof(uint8_t*), split_align); - - dst_bufs_size = cudf::util::round_up_safe(num_partitions * sizeof(uint8_t*), split_align); - // host-side h_src_and_dst_buffers = std::vector(src_bufs_size + dst_bufs_size); h_src_bufs = reinterpret_cast(h_src_and_dst_buffers.data()); @@ -1101,12 +1099,12 @@ struct packed_src_and_dst_pointers { stream.value())); } - const rmm::cuda_stream_view stream; + rmm::cuda_stream_view const stream; + std::size_t const src_bufs_size; + std::size_t const dst_bufs_size; std::vector h_src_and_dst_buffers; rmm::device_buffer d_src_and_dst_buffers; - std::size_t src_bufs_size; - std::size_t dst_bufs_size; const uint8_t** h_src_bufs; const uint8_t** d_src_bufs; uint8_t** h_dst_bufs; @@ -1181,17 +1179,18 @@ std::unique_ptr compute_splits( std::make_unique( input, splits, num_partitions, num_src_bufs, num_bufs, 
stream, mr); - auto d_dst_buf_info = partition_buf_size_and_dst_buf_info->d_dst_buf_info; - auto h_buf_sizes = partition_buf_size_and_dst_buf_info->h_buf_sizes; - auto d_buf_sizes = partition_buf_size_and_dst_buf_info->d_buf_sizes; + auto const d_dst_buf_info = partition_buf_size_and_dst_buf_info->d_dst_buf_info; + auto const h_buf_sizes = partition_buf_size_and_dst_buf_info->h_buf_sizes; + auto const d_buf_sizes = partition_buf_size_and_dst_buf_info->d_buf_sizes; - auto split_indices_and_src_buf_info = + auto const split_indices_and_src_buf_info = packed_split_indices_and_src_buf_info(input, splits, num_partitions, num_src_bufs, stream, mr); - auto const d_src_buf_info = split_indices_and_src_buf_info.d_src_buf_info; - auto offset_stack_partition_size = split_indices_and_src_buf_info.offset_stack_partition_size; - auto d_offset_stack = split_indices_and_src_buf_info.d_offset_stack; - auto d_indices = split_indices_and_src_buf_info.d_indices; + auto const d_src_buf_info = split_indices_and_src_buf_info.d_src_buf_info; + auto const offset_stack_partition_size = + split_indices_and_src_buf_info.offset_stack_partition_size; + auto const d_offset_stack = split_indices_and_src_buf_info.d_offset_stack; + auto const d_indices = split_indices_and_src_buf_info.d_indices; // compute sizes of each column in each partition, including alignment. thrust::transform( @@ -1275,7 +1274,7 @@ std::unique_ptr compute_splits( // compute total size of each partition // key is the split index { - auto keys = cudf::detail::make_counting_transform_iterator( + auto const keys = cudf::detail::make_counting_transform_iterator( 0, split_key_functor{static_cast(num_src_bufs)}); auto values = cudf::detail::make_counting_transform_iterator(0, buf_size_functor{d_dst_buf_info}); @@ -1290,7 +1289,7 @@ std::unique_ptr compute_splits( // compute start offset for each output buffer for each split { - auto keys = cudf::detail::make_counting_transform_iterator( + auto const keys = cudf::detail::make_counting_transform_iterator( 0, split_key_functor{static_cast(num_src_bufs)}); auto values = cudf::detail::make_counting_transform_iterator(0, buf_size_functor{d_dst_buf_info}); @@ -1323,8 +1322,8 @@ std::unique_ptr compute_splits( struct chunk_iteration_state { chunk_iteration_state(rmm::device_uvector _d_batched_dst_buf_info, rmm::device_uvector _d_batch_offsets, - std::vector _h_num_buffs_per_iteration, - std::vector _h_size_of_buffs_per_iteration, + std::vector&& _h_num_buffs_per_iteration, + std::vector&& _h_size_of_buffs_per_iteration, std::size_t total_size) : num_iterations(_h_num_buffs_per_iteration.size()), current_iteration(0), @@ -1364,15 +1363,15 @@ struct chunk_iteration_state { bool has_more_copies() const { return current_iteration < num_iterations; } rmm::device_uvector d_batched_dst_buf_info; - rmm::device_uvector d_batch_offsets; - std::size_t total_size; - int num_iterations; + rmm::device_uvector const d_batch_offsets; + std::size_t const total_size; + int const num_iterations; int current_iteration; private: std::size_t starting_batch; - std::vector h_num_buffs_per_iteration; - std::vector h_size_of_buffs_per_iteration; + std::vector const h_num_buffs_per_iteration; + std::vector const h_size_of_buffs_per_iteration; }; std::unique_ptr make_chunk_iteration_state( @@ -1387,7 +1386,7 @@ std::unique_ptr make_chunk_iteration_state( { rmm::device_uvector d_batch_offsets(num_bufs + 1, stream, mr); - auto buf_count_iter = cudf::detail::make_counting_transform_iterator( + auto const buf_count_iter = 
cudf::detail::make_counting_transform_iterator( 0, [num_bufs, num_batches = num_batches_func{batches.begin()}] __device__(size_type i) { return i == num_bufs ? 0 : num_batches(i); }); @@ -1405,7 +1404,7 @@ std::unique_ptr make_chunk_iteration_state( auto out_to_in_index = out_to_in_index_function{d_batch_offsets.begin(), num_bufs}; - auto iter = thrust::make_counting_iterator(0); + auto const iter = thrust::make_counting_iterator(0); // load up the batches as d_dst_buf_info rmm::device_uvector d_batched_dst_buf_info(num_batches, stream, mr); @@ -1472,7 +1471,7 @@ std::unique_ptr make_chunk_iteration_state( std::vector h_offsets(num_batches + 1); { rmm::device_uvector offsets(h_offsets.size(), stream, mr); - auto batch_byte_size_iter = cudf::detail::make_counting_transform_iterator( + auto const batch_byte_size_iter = cudf::detail::make_counting_transform_iterator( 0, batch_byte_size_function{num_batches, d_batched_dst_buf_info.begin()}); thrust::exclusive_scan(rmm::exec_policy(stream, mr), @@ -1544,7 +1543,7 @@ std::unique_ptr make_chunk_iteration_state( // we want to update the offset of batches for every iteration, except the first one (because // offsets in the first iteration are all 0 based) auto num_batches_in_first_iteration = num_batches_per_iteration[0]; - auto iter = thrust::make_counting_iterator(num_batches_in_first_iteration); + auto const iter = thrust::make_counting_iterator(num_batches_in_first_iteration); auto num_iterations = accum_size_per_iteration.size(); thrust::for_each( rmm::exec_policy(stream, mr), @@ -1784,7 +1783,7 @@ struct contiguous_split_state { // postprocess valid_counts: apply the valid counts computed by copy_data for each // batch back to the original dst_buf_infos - auto keys = cudf::detail::make_counting_transform_iterator( + auto const keys = cudf::detail::make_counting_transform_iterator( 0, out_to_in_index_function{chunk_iter_state->d_batch_offsets.begin(), (int)num_bufs}); auto values = thrust::make_transform_iterator( @@ -1858,7 +1857,7 @@ struct contiguous_split_state { // build the empty results std::vector result; result.reserve(num_partitions); - auto iter = thrust::make_counting_iterator(0); + auto const iter = thrust::make_counting_iterator(0); std::transform(iter, iter + num_partitions, std::back_inserter(result), From 394ab62280d2a47bbd6acef4f1f2c7fb8b4a6ad3 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Fri, 12 May 2023 17:19:38 -0500 Subject: [PATCH 12/31] Const changes --- cpp/src/copying/contiguous_split.cu | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 55aee27d6db..807e1b14114 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -957,10 +957,9 @@ struct packed_split_indices_and_src_buf_info { cudf::size_type num_src_bufs, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) + : indices_size(cudf::util::round_up_safe((num_partitions + 1) * sizeof(size_type), split_align)), + src_buf_info_size(cudf::util::round_up_safe(num_src_bufs * sizeof(src_buf_info), split_align)) { - indices_size = cudf::util::round_up_safe((num_partitions + 1) * sizeof(size_type), split_align); - src_buf_info_size = cudf::util::round_up_safe(num_src_bufs * sizeof(src_buf_info), split_align); - // host-side h_indices_and_source_info = std::vector(indices_size + src_buf_info_size); h_indices = reinterpret_cast(h_indices_and_source_info.data()); @@ -993,8 +992,8 @@ struct 
packed_split_indices_and_src_buf_info { d_indices, h_indices, indices_size + src_buf_info_size, cudaMemcpyDefault, stream.value())); } - size_type indices_size; - std::size_t src_buf_info_size; + size_type const indices_size; + std::size_t const src_buf_info_size; std::size_t offset_stack_size; std::vector h_indices_and_source_info; @@ -1105,8 +1104,8 @@ struct packed_src_and_dst_pointers { std::vector h_src_and_dst_buffers; rmm::device_buffer d_src_and_dst_buffers; - const uint8_t** h_src_bufs; - const uint8_t** d_src_bufs; + uint8_t const** h_src_bufs; + uint8_t const** d_src_bufs; uint8_t** h_dst_bufs; uint8_t** d_dst_bufs; }; From c4864f1ecf728c542559fbae9dc1e88fb431a380 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Fri, 12 May 2023 17:25:35 -0500 Subject: [PATCH 13/31] Style fixes --- cpp/src/copying/contiguous_split.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 807e1b14114..4cc30cc6529 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -957,7 +957,8 @@ struct packed_split_indices_and_src_buf_info { cudf::size_type num_src_bufs, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : indices_size(cudf::util::round_up_safe((num_partitions + 1) * sizeof(size_type), split_align)), + : indices_size( + cudf::util::round_up_safe((num_partitions + 1) * sizeof(size_type), split_align)), src_buf_info_size(cudf::util::round_up_safe(num_src_bufs * sizeof(src_buf_info), split_align)) { // host-side From be5fb0194eb7f1a31fd970c0ccaa61a6d1df1a4f Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Mon, 15 May 2023 19:23:20 -0500 Subject: [PATCH 14/31] Remove header that is not neededfrom split_tests.cpp --- cpp/tests/copying/split_tests.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/tests/copying/split_tests.cpp b/cpp/tests/copying/split_tests.cpp index 73da4510dfc..f6e492fa3e2 100644 --- a/cpp/tests/copying/split_tests.cpp +++ b/cpp/tests/copying/split_tests.cpp @@ -31,7 +31,6 @@ #include #include -#include #include #include From e0686ff15a7b56567872a24c6645f09e228edeb3 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Mon, 15 May 2023 19:24:33 -0500 Subject: [PATCH 15/31] Changed to contiguous_split.hpp/.cu per review --- cpp/include/cudf/contiguous_split.hpp | 19 +++++++++---------- cpp/src/copying/contiguous_split.cu | 17 +++++++++-------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/cpp/include/cudf/contiguous_split.hpp b/cpp/include/cudf/contiguous_split.hpp index 7296b6905a3..2756a8963cc 100644 --- a/cpp/include/cudf/contiguous_split.hpp +++ b/cpp/include/cudf/contiguous_split.hpp @@ -144,16 +144,15 @@ struct contiguous_split_state; * The caller has two methods it can use to carry out the chunked_pack: has_next and next. * Here is an example: * + * @code{.pseudo} * // Create a table_view * cudf::table_view tv = ...; * - * // Choose a memory resource. This memory resource is used for scratch/thrust temporary + * // Choose a memory resource (optional). This memory resource is used for scratch/thrust temporary * // data. In memory constrained cases, this can be used to set aside scratch memory * // for `chunked_pack` at the beginning of a program. * auto mr = rmm::mr::get_current_device_resource(); * - * auto stream = cudf::get_default_stream(); - * * // Define a buffer size for each chunk: the larger the buffer is, the more SMs can be * // occupied by this algorithm. 
* // @@ -165,7 +164,7 @@ struct contiguous_split_state; * // * std::size_t user_buffer_size = 128*1024*1024; * - * auto chunked_packer = cudf::chunked_pack::create(tv, user_buffer_size, stream, mr); + * auto chunked_packer = cudf::chunked_pack::create(tv, user_buffer_size, mr); * * std::size_t host_offset = 0; * auto host_buffer = ...; // obtain a host buffer you would like to copy to @@ -187,6 +186,7 @@ struct contiguous_split_state; * * host_offset += bytes_copied; * } + * @endcode */ class chunked_pack { public: @@ -196,13 +196,12 @@ class chunked_pack { * @param input source `table_view` to pack * @param user_buffer_size buffer size (in bytes) that will be passed on `next`. Must be * at least 1MB - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr RMM memory resource to be used for temporary and scratch allocations only + * @param mr An optional memory resource to be used for temporary and scratch allocations only */ - explicit chunked_pack(cudf::table_view const& input, - std::size_t user_buffer_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + explicit chunked_pack( + cudf::table_view const& input, + std::size_t user_buffer_size, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Destructor that will be implemented as default, required because diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index cfdef822303..1e366a800eb 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1111,10 +1111,6 @@ struct packed_src_and_dst_pointers { uint8_t** d_dst_bufs; }; -}; // anonymous namespace - -namespace detail { - /** * @brief Create an instance of `packed_src_and_dst_pointers` populating destination * partitition buffers (if any) from `out_buffers`. In the chunked_pack case @@ -1308,6 +1304,7 @@ std::unique_ptr compute_splits( return partition_buf_size_and_dst_buf_info; } + /** * @brief Struct containing information about the actual batches we will send to the * `copy_partitions` kernel and the number of iterations we need to carry out this copy. @@ -1637,6 +1634,10 @@ bool check_inputs(cudf::table_view const& input, std::vector const& s return input.column(0).size() == 0; } +}; // anonymous namespace + +namespace detail { + /** * @brief A helper struct containing the state of contiguous_split, whether the caller * is using the single-pass contiguous_split or chunked_pack. @@ -1887,7 +1888,7 @@ struct contiguous_split_state { auto& h_dst_buf_info = partition_buf_size_and_dst_buf_info->h_dst_buf_info; auto cur_dst_buf_info = h_dst_buf_info; - metadata_builder mb{input.num_columns()}; + detail::metadata_builder mb{input.num_columns()}; populate_metadata(input.begin(), input.end(), cur_dst_buf_info, mb); @@ -1907,7 +1908,7 @@ struct contiguous_split_state { auto& h_dst_bufs = src_and_dst_pointers->h_dst_bufs; auto cur_dst_buf_info = h_dst_buf_info; - metadata_builder mb(input.num_columns()); + detail::metadata_builder mb(input.num_columns()); for (std::size_t idx = 0; idx < num_partitions; idx++) { // traverse the buffers and build the columns. 
@@ -1996,10 +1997,10 @@ std::vector contiguous_split(cudf::table_view const& input, chunked_pack::chunked_pack(cudf::table_view const& input, std::size_t user_buffer_size, - rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - state = std::make_unique(input, user_buffer_size, stream, mr); + state = std::make_unique( + input, user_buffer_size, cudf::get_default_stream(), mr); } // required for the unique_ptr to work with a non-complete type (contiguous_split_state) From 2a822e6c29381b6e90d2316602205df3509deb88 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Mon, 15 May 2023 19:43:46 -0500 Subject: [PATCH 16/31] Remove stream from API --- cpp/src/copying/contiguous_split.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 1e366a800eb..eb13900f6d5 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -2029,7 +2029,7 @@ std::unique_ptr chunked_pack::create(cudf::table_view const& input { CUDF_EXPECTS(user_buffer_size >= desired_batch_size, "The output buffer size must be at least 1MB in size"); - return std::make_unique(input, user_buffer_size, cudf::get_default_stream(), mr); + return std::make_unique(input, user_buffer_size, mr); } }; // namespace cudf From 2ebb68b3f8a04e6398493ea765f382c149a24edb Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Tue, 16 May 2023 09:35:27 -0500 Subject: [PATCH 17/31] Update return documentation Co-authored-by: David Wendt <45795991+davidwendt@users.noreply.github.com> --- cpp/src/copying/contiguous_split.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index eb13900f6d5..a74ebcc07b1 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -679,7 +679,7 @@ std::pair setup_source_buf_info(InputIter begin, * count information. 
The null count should be taken * from `src` because this case is restricted to a single partition * (no splits) - * @returns a std::tuple containing: + * @returns a std::tuple containing: * column size, data offset, bitmask offset, and null count */ template From 0e10fdc901594154b47da2f87e453b1095f1427c Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Tue, 16 May 2023 13:16:44 -0500 Subject: [PATCH 18/31] Initializer list + const fixes --- cpp/src/copying/contiguous_split.cu | 83 ++++++++++++++--------------- 1 file changed, 40 insertions(+), 43 deletions(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index a74ebcc07b1..053a2cc3e05 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -959,14 +959,14 @@ struct packed_split_indices_and_src_buf_info { rmm::mr::device_memory_resource* mr) : indices_size( cudf::util::round_up_safe((num_partitions + 1) * sizeof(size_type), split_align)), - src_buf_info_size(cudf::util::round_up_safe(num_src_bufs * sizeof(src_buf_info), split_align)) + src_buf_info_size( + cudf::util::round_up_safe(num_src_bufs * sizeof(src_buf_info), split_align)), + // host-side + h_indices_and_source_info(indices_size + src_buf_info_size), + h_indices{reinterpret_cast(h_indices_and_source_info.data())}, + h_src_buf_info{ + reinterpret_cast(h_indices_and_source_info.data() + indices_size)} { - // host-side - h_indices_and_source_info = std::vector(indices_size + src_buf_info_size); - h_indices = reinterpret_cast(h_indices_and_source_info.data()); - h_src_buf_info = - reinterpret_cast(h_indices_and_source_info.data() + indices_size); - // compute splits -> indices. // these are row numbers per split h_indices[0] = 0; @@ -1000,8 +1000,8 @@ struct packed_split_indices_and_src_buf_info { std::vector h_indices_and_source_info; rmm::device_buffer d_indices_and_source_info; - size_type* h_indices; - src_buf_info* h_src_buf_info; + size_type* const h_indices; + src_buf_info* const h_src_buf_info; int offset_stack_partition_size; size_type* d_indices; @@ -1020,21 +1020,19 @@ struct packed_partition_buf_size_and_dst_buf_info { rmm::mr::device_memory_resource* mr) : stream(stream), buf_sizes_size{cudf::util::round_up_safe(num_partitions * sizeof(std::size_t), split_align)}, - dst_buf_info_size{cudf::util::round_up_safe(num_bufs * sizeof(dst_buf_info), split_align)} + dst_buf_info_size{cudf::util::round_up_safe(num_bufs * sizeof(dst_buf_info), split_align)}, + // host-side + h_buf_sizes_and_dst_info(buf_sizes_size + dst_buf_info_size), + h_buf_sizes{reinterpret_cast(h_buf_sizes_and_dst_info.data())}, + h_dst_buf_info{ + reinterpret_cast(h_buf_sizes_and_dst_info.data() + buf_sizes_size)}, + // device-side + d_buf_sizes_and_dst_info(buf_sizes_size + dst_buf_info_size, stream, mr), + d_buf_sizes{reinterpret_cast(d_buf_sizes_and_dst_info.data())}, + //// destination buffer info + d_dst_buf_info{reinterpret_cast( + static_cast(d_buf_sizes_and_dst_info.data()) + buf_sizes_size)} { - // host-side - h_buf_sizes_and_dst_info = std::vector(buf_sizes_size + dst_buf_info_size); - h_buf_sizes = reinterpret_cast(h_buf_sizes_and_dst_info.data()); - h_dst_buf_info = - reinterpret_cast(h_buf_sizes_and_dst_info.data() + buf_sizes_size); - - // device-side - d_buf_sizes_and_dst_info = rmm::device_buffer(buf_sizes_size + dst_buf_info_size, stream, mr); - d_buf_sizes = reinterpret_cast(d_buf_sizes_and_dst_info.data()); - - //// destination buffer info - d_dst_buf_info = reinterpret_cast( - 
static_cast(d_buf_sizes_and_dst_info.data()) + buf_sizes_size); } void copy_to_host() @@ -1054,12 +1052,12 @@ struct packed_partition_buf_size_and_dst_buf_info { std::size_t const dst_buf_info_size; std::vector h_buf_sizes_and_dst_info; - std::size_t* h_buf_sizes; - dst_buf_info* h_dst_buf_info; + std::size_t* const h_buf_sizes; + dst_buf_info* const h_dst_buf_info; rmm::device_buffer d_buf_sizes_and_dst_info; - std::size_t* d_buf_sizes; - dst_buf_info* d_dst_buf_info; + std::size_t* const d_buf_sizes; + dst_buf_info* const d_dst_buf_info; }; // Packed block of memory 3: @@ -1073,19 +1071,17 @@ struct packed_src_and_dst_pointers { rmm::mr::device_memory_resource* mr) : stream(stream), src_bufs_size{cudf::util::round_up_safe(num_src_bufs * sizeof(uint8_t*), split_align)}, - dst_bufs_size{cudf::util::round_up_safe(num_partitions * sizeof(uint8_t*), split_align)} + dst_bufs_size{cudf::util::round_up_safe(num_partitions * sizeof(uint8_t*), split_align)}, + // host-side + h_src_and_dst_buffers(src_bufs_size + dst_bufs_size), + h_src_bufs{reinterpret_cast(h_src_and_dst_buffers.data())}, + h_dst_bufs{reinterpret_cast(h_src_and_dst_buffers.data() + src_bufs_size)}, + // device-side + d_src_and_dst_buffers{rmm::device_buffer(src_bufs_size + dst_bufs_size, stream, mr)}, + d_src_bufs{reinterpret_cast(d_src_and_dst_buffers.data())}, + d_dst_bufs{reinterpret_cast( + reinterpret_cast(d_src_and_dst_buffers.data()) + src_bufs_size)} { - // host-side - h_src_and_dst_buffers = std::vector(src_bufs_size + dst_bufs_size); - h_src_bufs = reinterpret_cast(h_src_and_dst_buffers.data()); - h_dst_bufs = reinterpret_cast(h_src_and_dst_buffers.data() + src_bufs_size); - - // device-side - d_src_and_dst_buffers = rmm::device_buffer(src_bufs_size + dst_bufs_size, stream, mr); - d_src_bufs = reinterpret_cast(d_src_and_dst_buffers.data()); - d_dst_bufs = reinterpret_cast( - reinterpret_cast(d_src_and_dst_buffers.data()) + src_bufs_size); - // setup src buffers setup_src_buf_data(input.begin(), input.end(), h_src_bufs); } @@ -1104,11 +1100,12 @@ struct packed_src_and_dst_pointers { std::size_t const dst_bufs_size; std::vector h_src_and_dst_buffers; + uint8_t const** const h_src_bufs; + uint8_t** const h_dst_bufs; + rmm::device_buffer d_src_and_dst_buffers; - uint8_t const** h_src_bufs; - uint8_t const** d_src_bufs; - uint8_t** h_dst_bufs; - uint8_t** d_dst_bufs; + uint8_t const** const d_src_bufs; + uint8_t** const d_dst_bufs; }; /** From 8c344a4beb320d37ad1c69cd9af9beb9f699d404 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Tue, 16 May 2023 13:41:20 -0500 Subject: [PATCH 19/31] Rely on free standing column_views for edge case + remove extra slash in comment --- cpp/src/copying/contiguous_split.cu | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 053a2cc3e05..ce7b7bf0ad1 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1029,7 +1029,7 @@ struct packed_partition_buf_size_and_dst_buf_info { // device-side d_buf_sizes_and_dst_info(buf_sizes_size + dst_buf_info_size, stream, mr), d_buf_sizes{reinterpret_cast(d_buf_sizes_and_dst_info.data())}, - //// destination buffer info + // destination buffer info d_dst_buf_info{reinterpret_cast( static_cast(d_buf_sizes_and_dst_info.data()) + buf_sizes_size)} { @@ -1838,18 +1838,13 @@ struct contiguous_split_state { std::vector make_empty_packed_table() { // sanitize the inputs (to handle corner cases like sliced tables) - 
std::vector> empty_columns; - empty_columns.reserve(input.num_columns()); - std::transform( - input.begin(), input.end(), std::back_inserter(empty_columns), [](column_view const& col) { - return cudf::empty_like(col); - }); std::vector empty_column_views; empty_column_views.reserve(input.num_columns()); - std::transform(empty_columns.begin(), - empty_columns.end(), + std::transform(input.begin(), + input.end(), std::back_inserter(empty_column_views), - [](std::unique_ptr const& col) { return col->view(); }); + [](column_view const& col) { return cudf::empty_like(col)->view(); }); + table_view empty_inputs(empty_column_views); // build the empty results From 7a73d57ad4330a052a21ae9bb3d5cccba4f569bb Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Tue, 16 May 2023 23:03:35 -0500 Subject: [PATCH 20/31] Remove extra header in split_tests.cpp --- cpp/tests/copying/split_tests.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/tests/copying/split_tests.cpp b/cpp/tests/copying/split_tests.cpp index f6e492fa3e2..c9a53d6ebe0 100644 --- a/cpp/tests/copying/split_tests.cpp +++ b/cpp/tests/copying/split_tests.cpp @@ -28,7 +28,6 @@ #include #include #include -#include #include From dae5ceb1cf8c8345139ab3c62317da4dfd2706f5 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Wed, 17 May 2023 08:26:03 -0500 Subject: [PATCH 21/31] Introduce temp_mr. Comment edits and some more initializer list and constness issues --- cpp/include/cudf/contiguous_split.hpp | 24 ++-- cpp/src/copying/contiguous_split.cu | 191 ++++++++++++++++---------- 2 files changed, 129 insertions(+), 86 deletions(-) diff --git a/cpp/include/cudf/contiguous_split.hpp b/cpp/include/cudf/contiguous_split.hpp index 2756a8963cc..ce995a5c32a 100644 --- a/cpp/include/cudf/contiguous_split.hpp +++ b/cpp/include/cudf/contiguous_split.hpp @@ -141,8 +141,8 @@ struct contiguous_split_state; * all thrust and scratch memory allocations are using the passed-in memory resource exclusively, * not a per-device memory resource. * - * The caller has two methods it can use to carry out the chunked_pack: has_next and next. - * Here is an example: + * This class defines two methods that must be used in concert to carry out the chunked_pack: + * has_next and next. Here is an example: * * @code{.pseudo} * // Create a table_view @@ -156,7 +156,7 @@ struct contiguous_split_state; * // Define a buffer size for each chunk: the larger the buffer is, the more SMs can be * // occupied by this algorithm. * // - * // Internally, the GPU unit-of-work is a 1MB batch. When we instantiate `cudf::chunked_pack`, + * // Internally, the GPU unit of work is a 1MB batch. When we instantiate `cudf::chunked_pack`, * // all the 1MB batches for the source table_view are computed up front. Additionally, * // chunked_pack calculates the number of iterations that are required to go through all those * // batches given a `user_buffer_size` buffer. The number of 1MB batches in each iteration (chunk) @@ -196,15 +196,16 @@ class chunked_pack { * @param input source `table_view` to pack * @param user_buffer_size buffer size (in bytes) that will be passed on `next`. 
Must be * at least 1MB - * @param mr An optional memory resource to be used for temporary and scratch allocations only + * @param temp_mr An optional memory resource to be used for temporary and scratch allocations + * only */ explicit chunked_pack( cudf::table_view const& input, std::size_t user_buffer_size, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* temp_mr = rmm::mr::get_current_device_resource()); /** - * @brief Destructor that will be implemented as default, required because + * @brief Destructor that will be implemented as default. Declared with definition here because * contiguous_split_state is incomplete at this stage. */ ~chunked_pack(); @@ -224,7 +225,7 @@ class chunked_pack { [[nodiscard]] bool has_next() const; /** - * @brief Packs the next chunk into `user_buffer`. This should be call as long as + * @brief Packs the next chunk into `user_buffer`. This should be called as long as * `has_next` returns true. If `next` is called when `has_next` is false, an exception * is thrown. * @@ -255,12 +256,13 @@ class chunked_pack { * @param input source `table_view` to pack * @param user_buffer_size buffer size (in bytes) that will be passed on `next`. Must be * at least 1MB - * @param mr RMM memory resource to be used for temporary and scratch allocations only + * @param temp_mr RMM memory resource to be used for temporary and scratch allocations only * @return a unique_ptr of chunked_pack */ - [[nodiscard]] static std::unique_ptr create(cudf::table_view const& input, - std::size_t user_buffer_size, - rmm::mr::device_memory_resource* mr); + [[nodiscard]] static std::unique_ptr create( + cudf::table_view const& input, + std::size_t user_buffer_size, + rmm::mr::device_memory_resource* temp_mr); private: // internal state of contiguous split diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index ce7b7bf0ad1..d982c11aee8 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -956,7 +956,7 @@ struct packed_split_indices_and_src_buf_info { std::size_t num_partitions, cudf::size_type num_src_bufs, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::mr::device_memory_resource* temp_mr) : indices_size( cudf::util::round_up_safe((num_partitions + 1) * sizeof(size_type), split_align)), src_buf_info_size( @@ -981,7 +981,7 @@ struct packed_split_indices_and_src_buf_info { // device-side // gpu-only : stack space needed for nested list offset calculation d_indices_and_source_info = - rmm::device_buffer(indices_size + src_buf_info_size + offset_stack_size, stream, mr); + rmm::device_buffer(indices_size + src_buf_info_size + offset_stack_size, stream, temp_mr); d_indices = reinterpret_cast(d_indices_and_source_info.data()); d_src_buf_info = reinterpret_cast( reinterpret_cast(d_indices_and_source_info.data()) + indices_size); @@ -1017,7 +1017,7 @@ struct packed_partition_buf_size_and_dst_buf_info { cudf::size_type num_src_bufs, std::size_t num_bufs, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::mr::device_memory_resource* temp_mr) : stream(stream), buf_sizes_size{cudf::util::round_up_safe(num_partitions * sizeof(std::size_t), split_align)}, dst_buf_info_size{cudf::util::round_up_safe(num_bufs * sizeof(dst_buf_info), split_align)}, @@ -1027,7 +1027,7 @@ struct packed_partition_buf_size_and_dst_buf_info { h_dst_buf_info{ reinterpret_cast(h_buf_sizes_and_dst_info.data() + buf_sizes_size)}, // device-side - 
d_buf_sizes_and_dst_info(buf_sizes_size + dst_buf_info_size, stream, mr), + d_buf_sizes_and_dst_info(buf_sizes_size + dst_buf_info_size, stream, temp_mr), d_buf_sizes{reinterpret_cast(d_buf_sizes_and_dst_info.data())}, // destination buffer info d_dst_buf_info{reinterpret_cast( @@ -1068,7 +1068,7 @@ struct packed_src_and_dst_pointers { std::size_t num_partitions, cudf::size_type num_src_bufs, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::mr::device_memory_resource* temp_mr) : stream(stream), src_bufs_size{cudf::util::round_up_safe(num_src_bufs * sizeof(uint8_t*), split_align)}, dst_bufs_size{cudf::util::round_up_safe(num_partitions * sizeof(uint8_t*), split_align)}, @@ -1077,7 +1077,7 @@ struct packed_src_and_dst_pointers { h_src_bufs{reinterpret_cast(h_src_and_dst_buffers.data())}, h_dst_bufs{reinterpret_cast(h_src_and_dst_buffers.data() + src_bufs_size)}, // device-side - d_src_and_dst_buffers{rmm::device_buffer(src_bufs_size + dst_bufs_size, stream, mr)}, + d_src_and_dst_buffers{rmm::device_buffer(src_bufs_size + dst_bufs_size, stream, temp_mr)}, d_src_bufs{reinterpret_cast(d_src_and_dst_buffers.data())}, d_dst_bufs{reinterpret_cast( reinterpret_cast(d_src_and_dst_buffers.data()) + src_bufs_size)} @@ -1119,7 +1119,7 @@ struct packed_src_and_dst_pointers { * @param num_src_bufs number of buffers for the source columns including children * @param out_buffers the destination buffers per partition if in the non-chunked case * @param stream Optional CUDA stream on which to execute kernels - * @param mr RMM memory resource + * @param temp_mr A memory resource for temporary and scratch space * * @returns new unique pointer to packed_src_and_dst_pointers */ @@ -1129,10 +1129,10 @@ std::unique_ptr setup_src_and_dst_pointers( cudf::size_type num_src_bufs, std::vector& out_buffers, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::mr::device_memory_resource* temp_mr) { - auto src_and_dst_pointers = - std::make_unique(input, num_partitions, num_src_bufs, stream, mr); + auto src_and_dst_pointers = std::make_unique( + input, num_partitions, num_src_bufs, stream, temp_mr); std::transform( out_buffers.begin(), out_buffers.end(), src_and_dst_pointers->h_dst_bufs, [](auto& buf) { @@ -1155,7 +1155,7 @@ std::unique_ptr setup_src_and_dst_pointers( * @param num_src_bufs number of buffers for the source columns including children * @param num_bufs num_src_bufs times the number of partitions * @param stream Optional CUDA stream on which to execute kernels - * @param mr RMM memory resource + * @param temp_mr A memory resource for temporary and scratch space * * @returns new unique pointer to `packed_partition_buf_size_and_dst_buf_info` */ @@ -1166,18 +1166,18 @@ std::unique_ptr compute_splits( cudf::size_type num_src_bufs, std::size_t num_bufs, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::mr::device_memory_resource* temp_mr) { auto partition_buf_size_and_dst_buf_info = std::make_unique( - input, splits, num_partitions, num_src_bufs, num_bufs, stream, mr); + input, splits, num_partitions, num_src_bufs, num_bufs, stream, temp_mr); auto const d_dst_buf_info = partition_buf_size_and_dst_buf_info->d_dst_buf_info; auto const h_buf_sizes = partition_buf_size_and_dst_buf_info->h_buf_sizes; auto const d_buf_sizes = partition_buf_size_and_dst_buf_info->d_buf_sizes; - auto const split_indices_and_src_buf_info = - packed_split_indices_and_src_buf_info(input, splits, num_partitions, num_src_bufs, stream, mr); + auto const 
split_indices_and_src_buf_info = packed_split_indices_and_src_buf_info( + input, splits, num_partitions, num_src_bufs, stream, temp_mr); auto const d_src_buf_info = split_indices_and_src_buf_info.d_src_buf_info; auto const offset_stack_partition_size = @@ -1187,7 +1187,7 @@ std::unique_ptr<packed_partition_buf_size_and_dst_buf_info> compute_splits( // compute sizes of each column in each partition, including alignment. thrust::transform( - rmm::exec_policy(stream, mr), + rmm::exec_policy(stream, temp_mr), thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_bufs), d_dst_buf_info, @@ -1272,7 +1272,7 @@ std::unique_ptr<packed_partition_buf_size_and_dst_buf_info> compute_splits( auto values = cudf::detail::make_counting_transform_iterator(0, buf_size_functor{d_dst_buf_info}); - thrust::reduce_by_key(rmm::exec_policy(stream, mr), + thrust::reduce_by_key(rmm::exec_policy(stream, temp_mr), keys, keys + num_bufs, values, @@ -1287,7 +1287,7 @@ std::unique_ptr<packed_partition_buf_size_and_dst_buf_info> compute_splits( auto values = cudf::detail::make_counting_transform_iterator(0, buf_size_functor{d_dst_buf_info}); - thrust::exclusive_scan_by_key(rmm::exec_policy(stream, mr), + thrust::exclusive_scan_by_key(rmm::exec_policy(stream, temp_mr), keys, keys + num_bufs, values, @@ -1307,7 +1307,7 @@ std::unique_ptr<packed_partition_buf_size_and_dst_buf_info> compute_splits( * `copy_partitions` kernel and the number of iterations we need to carry out this copy. * * For the non-chunked contiguous_split case, this contains the batched dst_buf_infos and the - * number of iterations are going to be 1, since the non-chunked case is single pass. + * number of iterations is going to be 1 since the non-chunked case is single pass. * * For the chunked_pack case, this also contains the batched dst_buf_infos for all * iterations in addition to helping keep the state about what batches have been copied so far @@ -1320,8 +1320,8 @@ struct chunk_iteration_state { std::vector<std::size_t>&& _h_size_of_buffs_per_iteration, std::size_t total_size) : num_iterations(_h_num_buffs_per_iteration.size()), - current_iteration(0), - starting_batch(0), + current_iteration{0}, + starting_batch{0}, d_batched_dst_buf_info(std::move(_d_batched_dst_buf_info)), d_batch_offsets(std::move(_d_batch_offsets)), h_num_buffs_per_iteration(std::move(_h_num_buffs_per_iteration)), @@ -1330,6 +1330,16 @@ struct chunk_iteration_state { { } + static std::unique_ptr<chunk_iteration_state> create( + rmm::device_uvector<thrust::pair<std::size_t, std::size_t>> const& batches, + int num_bufs, + dst_buf_info* d_orig_dst_buf_info, + std::size_t const* const h_buf_sizes, + std::size_t num_partitions, + std::size_t user_buffer_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* temp_mr); + /** * @brief As of the time of the call, return the starting 1MB batch index, and the * number of batches to copy. @@ -1341,9 +1351,17 @@ struct chunk_iteration_state { CUDF_EXPECTS(current_iteration < num_iterations, "current_iteration cannot exceed num_iterations"); auto count_for_current = h_num_buffs_per_iteration[current_iteration]; - return std::make_pair(starting_batch, count_for_current); + return {starting_batch, count_for_current}; } + /** + * @brief Advance the iteration state if there are iterations left, updating the + * starting batch and returning the number of bytes that were copied in the iteration + * we just finished. + * @throws cudf::logic_error If the state was at the last iteration before entering + * this function.
+ * @return size in bytes that were copied in the finished iteration + */ std::size_t advance_iteration() { CUDF_EXPECTS(current_iteration < num_iterations, @@ -1354,6 +1372,9 @@ struct chunk_iteration_state { return bytes_copied; } + /** + * Returns true if there are iterations left. + */ bool has_more_copies() const { return current_iteration < num_iterations; } rmm::device_uvector d_batched_dst_buf_info; @@ -1368,7 +1389,7 @@ struct chunk_iteration_state { std::vector const h_size_of_buffs_per_iteration; }; -std::unique_ptr make_chunk_iteration_state( +std::unique_ptr chunk_iteration_state::create( rmm::device_uvector> const& batches, int num_bufs, dst_buf_info* d_orig_dst_buf_info, @@ -1376,16 +1397,16 @@ std::unique_ptr make_chunk_iteration_state( std::size_t num_partitions, std::size_t user_buffer_size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::mr::device_memory_resource* temp_mr) { - rmm::device_uvector d_batch_offsets(num_bufs + 1, stream, mr); + rmm::device_uvector d_batch_offsets(num_bufs + 1, stream, temp_mr); auto const buf_count_iter = cudf::detail::make_counting_transform_iterator( 0, [num_bufs, num_batches = num_batches_func{batches.begin()}] __device__(size_type i) { return i == num_bufs ? 0 : num_batches(i); }); - thrust::exclusive_scan(rmm::exec_policy(stream, mr), + thrust::exclusive_scan(rmm::exec_policy(stream, temp_mr), buf_count_iter, buf_count_iter + num_bufs + 1, d_batch_offsets.begin(), @@ -1394,17 +1415,17 @@ std::unique_ptr make_chunk_iteration_state( auto const num_batches_iter = cudf::detail::make_counting_transform_iterator(0, num_batches_func{batches.begin()}); size_type const num_batches = thrust::reduce( - rmm::exec_policy(stream, mr), num_batches_iter, num_batches_iter + batches.size()); + rmm::exec_policy(stream, temp_mr), num_batches_iter, num_batches_iter + batches.size()); auto out_to_in_index = out_to_in_index_function{d_batch_offsets.begin(), num_bufs}; auto const iter = thrust::make_counting_iterator(0); // load up the batches as d_dst_buf_info - rmm::device_uvector d_batched_dst_buf_info(num_batches, stream, mr); + rmm::device_uvector d_batched_dst_buf_info(num_batches, stream, temp_mr); thrust::for_each( - rmm::exec_policy(stream, mr), + rmm::exec_policy(stream, temp_mr), iter, iter + num_batches, [d_orig_dst_buf_info, @@ -1464,11 +1485,11 @@ std::unique_ptr make_chunk_iteration_state( // copy the batch offsets back to host std::vector h_offsets(num_batches + 1); { - rmm::device_uvector offsets(h_offsets.size(), stream, mr); + rmm::device_uvector offsets(h_offsets.size(), stream, temp_mr); auto const batch_byte_size_iter = cudf::detail::make_counting_transform_iterator( 0, batch_byte_size_function{num_batches, d_batched_dst_buf_info.begin()}); - thrust::exclusive_scan(rmm::exec_policy(stream, mr), + thrust::exclusive_scan(rmm::exec_policy(stream, temp_mr), batch_byte_size_iter, batch_byte_size_iter + num_batches + 1, offsets.begin()); @@ -1526,7 +1547,7 @@ std::unique_ptr make_chunk_iteration_state( // apply changed offset { rmm::device_uvector d_accum_size_per_iteration( - accum_size_per_iteration.size(), stream, mr); + accum_size_per_iteration.size(), stream, temp_mr); CUDF_CUDA_TRY(cudaMemcpyAsync(d_accum_size_per_iteration.data(), accum_size_per_iteration.data(), @@ -1540,7 +1561,7 @@ std::unique_ptr make_chunk_iteration_state( auto const iter = thrust::make_counting_iterator(num_batches_in_first_iteration); auto num_iterations = accum_size_per_iteration.size(); thrust::for_each( - rmm::exec_policy(stream, 
mr), + rmm::exec_policy(stream, temp_mr), iter, iter + num_batches - num_batches_in_first_iteration, [num_iterations, @@ -1640,10 +1661,10 @@ namespace detail { * is using the single-pass contiguous_split or chunked_pack. * * It exposes an iterator-like pattern where contiguous_split_state::has_next() - * return true when there is work to be done, and false otherwise. + * returns true when there is work to be done, and false otherwise. * * contiguous_split_state::contiguous_split() performs a single-pass contiguous_split - * and is only valid iff contiguous_split_state is instantiated with 0 for the user_buffer_size. + * and is valid iff contiguous_split_state is instantiated with 0 for the user_buffer_size. * * contiguous_split_state::contiguous_split_chunk(device_span) is only valid when * user_buffer_size > 0. It should be called as long as has_next() returns true. The @@ -1656,16 +1677,18 @@ struct contiguous_split_state { contiguous_split_state(cudf::table_view const& input, std::size_t user_buffer_size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : contiguous_split_state(input, {}, user_buffer_size, stream, mr) + rmm::mr::device_memory_resource* mr, + rmm::mr::device_memory_resource* temp_mr) + : contiguous_split_state(input, {}, user_buffer_size, stream, mr, temp_mr) { } contiguous_split_state(cudf::table_view const& input, std::vector const& splits, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : contiguous_split_state(input, splits, 0, stream, mr) + rmm::mr::device_memory_resource* mr, + rmm::mr::device_memory_resource* temp_mr) + : contiguous_split_state(input, splits, 0, stream, mr, temp_mr) { } @@ -1673,14 +1696,18 @@ struct contiguous_split_state { std::vector const& splits, std::size_t user_buffer_size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : input(input), user_buffer_size(user_buffer_size), stream(stream), mr(mr) + rmm::mr::device_memory_resource* mr, + rmm::mr::device_memory_resource* temp_mr) + : input(input), + user_buffer_size(user_buffer_size), + stream(stream), + mr(mr), + temp_mr(temp_mr), + is_empty{check_inputs(input, splits)}, + num_partitions{splits.size() + 1}, + num_src_bufs{count_src_bufs(input.begin(), input.end())}, + num_bufs{num_src_bufs * num_partitions} { - is_empty = check_inputs(input, splits); - num_partitions = splits.size() + 1; - num_src_bufs = count_src_bufs(input.begin(), input.end()); - num_bufs = num_src_bufs * num_partitions; - // if the table we are about to contig split is empty, we have special // handling where metadata is produced and a 0-byte contiguous buffer // is the result. @@ -1689,8 +1716,8 @@ struct contiguous_split_state { // First pass over the source tables to generate a `dst_buf_info` per split and column buffer // (`num_bufs`). After this, contiguous_split uses `dst_buf_info` to further subdivide the work // into 1MB batches in `compute_batches` - partition_buf_size_and_dst_buf_info = - std::move(compute_splits(input, splits, num_partitions, num_src_bufs, num_bufs, stream, mr)); + partition_buf_size_and_dst_buf_info = std::move( + compute_splits(input, splits, num_partitions, num_src_bufs, num_bufs, stream, temp_mr)); // Second pass: uses `dst_buf_info` to break down the work into 1MB batches. 
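// Each 1MB batch later maps to one block of the copy kernel; subdividing the // partition buffers this way drives up the number of blocks and keeps // occupancy high even when there are few splits or columns.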
compute_batches(); @@ -1707,8 +1734,8 @@ struct contiguous_split_state { }); } - src_and_dst_pointers = std::move( - setup_src_and_dst_pointers(input, num_partitions, num_src_bufs, out_buffers, stream, mr)); + src_and_dst_pointers = std::move(setup_src_and_dst_pointers( + input, num_partitions, num_src_bufs, out_buffers, stream, temp_mr)); } bool has_next() const { return !is_empty && chunk_iter_state->has_more_copies(); } @@ -1720,16 +1747,16 @@ struct contiguous_split_state { void compute_batches() { - // Since we parallelize at one block per copy, we are vulnerable to situations where we + // Since we parallelize at one block per copy, performance is vulnerable to situations where we // have small numbers of copies to do (a combination of small numbers of splits and/or columns), // so we will take the actual set of outgoing source/destination buffers and further partition // them into much smaller batches in order to drive up the number of blocks and overall // occupancy. - rmm::device_uvector> batches(num_bufs, stream, mr); + rmm::device_uvector> batches(num_bufs, stream, temp_mr); auto d_dst_buf_info = partition_buf_size_and_dst_buf_info->d_dst_buf_info; auto h_buf_sizes = partition_buf_size_and_dst_buf_info->h_buf_sizes; thrust::transform( - rmm::exec_policy(stream, mr), + rmm::exec_policy(stream, temp_mr), d_dst_buf_info, d_dst_buf_info + num_bufs, batches.begin(), @@ -1753,8 +1780,14 @@ struct contiguous_split_state { return {num_batches, desired_batch_size}; }); - chunk_iter_state = make_chunk_iteration_state( - batches, num_bufs, d_dst_buf_info, h_buf_sizes, num_partitions, user_buffer_size, stream, mr); + chunk_iter_state = chunk_iteration_state::create(batches, + num_bufs, + d_dst_buf_info, + h_buf_sizes, + num_partitions, + user_buffer_size, + stream, + temp_mr); } std::vector contiguous_split() @@ -1788,7 +1821,7 @@ struct contiguous_split_state { chunk_iter_state->d_batched_dst_buf_info.begin(), [] __device__(dst_buf_info const& info) { return info.valid_count; }); - thrust::reduce_by_key(rmm::exec_policy(stream, mr), + thrust::reduce_by_key(rmm::exec_policy(stream, temp_mr), keys, keys + num_batches_total, values, @@ -1925,29 +1958,31 @@ struct contiguous_split_state { } cudf::table_view const input; - rmm::cuda_stream_view stream; - rmm::mr::device_memory_resource* mr; + std::size_t const user_buffer_size; + rmm::cuda_stream_view const stream; + rmm::mr::device_memory_resource* const mr; + rmm::mr::device_memory_resource* const temp_mr; - std::size_t num_partitions; + // whether the table was empty to begin with (0 rows or 0 columns) and should be metadata-only + bool const is_empty; - // number of source buffers including children * number of splits - std::size_t num_bufs; + std::size_t const num_partitions; // number of source buffers including children - size_type num_src_bufs; + size_type const num_src_bufs; + + // number of source buffers including children * number of splits + std::size_t const num_bufs; std::unique_ptr partition_buf_size_and_dst_buf_info; std::unique_ptr src_and_dst_pointers; - // whether the table was empty to begin with (0 rows or 0 columns) and should be metadata-only - bool is_empty; - // // State around the chunked pattern // - // chunked_pack will 1 or more "chunks" to iterate on, defined in chunk_iter_state + // chunked_pack will have 1 or more "chunks" to iterate on, defined in chunk_iter_state // contiguous_split will have a single "chunk" in chunk_iter_state, so no iteration. 
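// (has_next() consults this state, and each contiguous_split_chunk() call // copies one iteration's worth of batches before calling advance_iteration())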
std::unique_ptr chunk_iter_state; @@ -1964,8 +1999,6 @@ struct contiguous_split_state { // each buffer. // std::vector out_buffers; - - std::size_t user_buffer_size; }; std::vector contiguous_split(cudf::table_view const& input, @@ -1973,7 +2006,10 @@ std::vector contiguous_split(cudf::table_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto state = contiguous_split_state(input, splits, stream, mr); + // `temp_mr` is the same as `mr` for contiguous_split as it allocates all + // of its memory from the default memory resource in cuDF + auto temp_mr = mr; + auto state = contiguous_split_state(input, splits, stream, mr, temp_mr); return state.contiguous_split(); } @@ -1989,10 +2025,14 @@ std::vector contiguous_split(cudf::table_view const& input, chunked_pack::chunked_pack(cudf::table_view const& input, std::size_t user_buffer_size, - rmm::mr::device_memory_resource* mr) + rmm::mr::device_memory_resource* temp_mr) { + CUDF_EXPECTS(user_buffer_size >= desired_batch_size, + "The output buffer size must be at least 1MB in size"); + // We pass `nullptr` for the first `mr` in `contiguous_split_state` to indicate + // that it does not allocate any user-bound data for the `chunked_pack` case. state = std::make_unique( - input, user_buffer_size, cudf::get_default_stream(), mr); + input, user_buffer_size, cudf::get_default_stream(), nullptr, temp_mr); } // required for the unique_ptr to work with a non-complete type (contiguous_split_state) @@ -2017,11 +2057,12 @@ std::unique_ptr> chunked_pack::build_metadata() const std::unique_ptr chunked_pack::create(cudf::table_view const& input, std::size_t user_buffer_size, - rmm::mr::device_memory_resource* mr) + rmm::mr::device_memory_resource* temp_mr) { - CUDF_EXPECTS(user_buffer_size >= desired_batch_size, - "The output buffer size must be at least 1MB in size"); - return std::make_unique(input, user_buffer_size, mr); + // `temp_mr` could be a special memory resource to be used in situations when + // GPU memory is low and we want scratch and temporary allocations to happen from + // a small reserved pool of memory. + return std::make_unique(input, user_buffer_size, temp_mr); } }; // namespace cudf From ff6ce21196af5ad479fe7d154ed899574220c938 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Wed, 17 May 2023 08:45:04 -0500 Subject: [PATCH 22/31] bif_shift -> bit_shift --- cpp/src/copying/contiguous_split.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index d982c11aee8..1d60522f384 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -124,7 +124,7 @@ struct dst_buf_info { * Copies a single partition of a source column buffer to a destination buffer. Shifts * element values by value_shift in the case of a buffer of offsets (value_shift will * only ever be > 0 in that case). Shifts elements bitwise by bit_shift in the case of - * a validity buffer (bif_shift will only ever be > 0 in that case). This function assumes + * a validity buffer (bit_shift will only ever be > 0 in that case). This function assumes * value_shift and bit_shift will never be > 0 at the same time. 
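* For example (illustrative): when a partition begins at row 3 of a validity * buffer, bit_shift is 3 and each destination bitmask word is assembled from * two adjacent source words so that row 3 lands on bit 0 of the output.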
* * This function expects: From b47f2c761bbf396dc833e660172ab0a1f40514da Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Wed, 17 May 2023 12:52:09 -0500 Subject: [PATCH 23/31] Leverage make_device_uvector_async --- cpp/src/copying/contiguous_split.cu | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 1d60522f384..54e8b3b3894 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -1546,14 +1547,8 @@ std::unique_ptr chunk_iteration_state::create( // apply changed offset { - rmm::device_uvector d_accum_size_per_iteration( - accum_size_per_iteration.size(), stream, temp_mr); - - CUDF_CUDA_TRY(cudaMemcpyAsync(d_accum_size_per_iteration.data(), - accum_size_per_iteration.data(), - accum_size_per_iteration.size() * sizeof(std::size_t), - cudaMemcpyDefault, - stream.value())); + auto d_accum_size_per_iteration = + cudf::detail::make_device_uvector_async(accum_size_per_iteration, stream, temp_mr); // we want to update the offset of batches for every iteration, except the first one (because // offsets in the first iteration are all 0 based) From b1fba4e397d0b867dd5dfcbfaf2320f072cf540f Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Wed, 17 May 2023 12:56:58 -0500 Subject: [PATCH 24/31] Remove extra brackets and use range-based for loop Co-authored-by: David Wendt <45795991+davidwendt@users.noreply.github.com> --- cpp/src/copying/contiguous_split.cu | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 1d60522f384..94606b1603f 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1639,15 +1639,12 @@ bool check_inputs(cudf::table_view const& input, std::vector const& s CUDF_EXPECTS(splits.back() <= input.column(0).size(), "splits can't exceed size of input columns"); } - { - size_type begin = 0; - for (std::size_t i = 0; i < splits.size(); i++) { - size_type end = splits[i]; - CUDF_EXPECTS(begin >= 0, "Starting index cannot be negative."); - CUDF_EXPECTS(end >= begin, "End index cannot be smaller than the starting index."); - CUDF_EXPECTS(end <= input.column(0).size(), "Slice range out of bounds."); - begin = end; - } + size_type begin = 0; + for (auto end : splits) { + CUDF_EXPECTS(begin >= 0, "Starting index cannot be negative."); + CUDF_EXPECTS(end >= begin, "End index cannot be smaller than the starting index."); + CUDF_EXPECTS(end <= input.column(0).size(), "Slice range out of bounds."); + begin = end; } return input.column(0).size() == 0; } From ab98d70bce02b59dab1f38a8da8162a999a70167 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Wed, 17 May 2023 13:53:36 -0500 Subject: [PATCH 25/31] Remove extra cast and use .front --- cpp/src/copying/contiguous_split.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index b7582787ae5..8854fba8520 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1883,7 +1883,7 @@ struct contiguous_split_state { return packed_table{ empty_inputs, packed_columns{std::make_unique>(pack_metadata( - empty_inputs, static_cast(nullptr), 0)), + empty_inputs, nullptr, 0)), std::make_unique()}}; }); @@ -1899,7 +1899,7 @@ struct contiguous_split_state 
{ if (is_empty) { // this is a bit ugly, but it was done to re-use make_empty_packed_table between the // regular contiguous_split and chunked_pack cases. - auto empty_packed_tables = std::move(make_empty_packed_table()[0]); + auto empty_packed_tables = std::move(make_empty_packed_table().front()); return std::move(empty_packed_tables.data.metadata); } From 8ec29fa0e0f72d96dc902b2cc6f715cd2dedfaa6 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Wed, 17 May 2023 13:55:08 -0500 Subject: [PATCH 26/31] Fix styles --- cpp/src/copying/contiguous_split.cu | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 8854fba8520..c8625191571 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1880,11 +1880,10 @@ struct contiguous_split_state { iter + num_partitions, std::back_inserter(result), [&empty_inputs](int partition_index) { - return packed_table{ - empty_inputs, - packed_columns{std::make_unique>(pack_metadata( - empty_inputs, nullptr, 0)), - std::make_unique()}}; + return packed_table{empty_inputs, + packed_columns{std::make_unique>( + pack_metadata(empty_inputs, nullptr, 0)), + std::make_unique()}}; }); return result; From d959039fa3b06d94d3895770699f23d3d1e062fd Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Thu, 18 May 2023 10:07:23 -0500 Subject: [PATCH 27/31] Move temp_mr comment to chunked_pack::create and make sure temp_mr defaults to the default memory resource --- cpp/include/cudf/contiguous_split.hpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpp/include/cudf/contiguous_split.hpp b/cpp/include/cudf/contiguous_split.hpp index ce995a5c32a..1a4664d8d58 100644 --- a/cpp/include/cudf/contiguous_split.hpp +++ b/cpp/include/cudf/contiguous_split.hpp @@ -251,6 +251,11 @@ class chunked_pack { * "input", where a buffer of `user_buffer_size` is filled with chunks of the * overall operation. This operation can be used in cases where GPU memory is constrained. * + * The memory resource (`temp_mr`) could be a special memory resource to be used in + * situations when GPU memory is low and we want scratch and temporary allocations to + * happen from a small reserved pool of memory. Note that it defaults to the regular cuDF + * per-device resource. + * * @throws cudf::logic_error When user_buffer_size is less than 1MB * * @param input source `table_view` to pack @@ -262,7 +267,7 @@ class chunked_pack { [[nodiscard]] static std::unique_ptr create( cudf::table_view const& input, std::size_t user_buffer_size, - rmm::mr::device_memory_resource* temp_mr); + rmm::mr::device_memory_resource* temp_mr = rmm::mr::get_current_device_resource()); private: // internal state of contiguous split From e16306cbf58bc6d9556d7c1285d850c166414d18 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Thu, 18 May 2023 10:29:33 -0500 Subject: [PATCH 28/31] Make contiguous_split_state constructor private and along with it other private functions. 
Make compute_batches a free function --- cpp/include/cudf/contiguous_split.hpp | 2 +- cpp/src/copying/contiguous_split.cu | 278 ++++++++++++++------------ 2 files changed, 153 insertions(+), 127 deletions(-) diff --git a/cpp/include/cudf/contiguous_split.hpp b/cpp/include/cudf/contiguous_split.hpp index 1a4664d8d58..83c852cd918 100644 --- a/cpp/include/cudf/contiguous_split.hpp +++ b/cpp/include/cudf/contiguous_split.hpp @@ -251,7 +251,7 @@ class chunked_pack { * "input", where a buffer of `user_buffer_size` is filled with chunks of the * overall operation. This operation can be used in cases where GPU memory is constrained. * - * The memory resource (`temp_mr`) could be a special memory resource to be used in + * The memory resource (`temp_mr`) could be a special memory resource to be used in * situations when GPU memory is low and we want scratch and temporary allocations to * happen from a small reserved pool of memory. Note that it defaults to the regular cuDF * per-device resource. diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index c8625191571..37fd6e04c36 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1116,7 +1116,7 @@ struct packed_src_and_dst_pointers { * to the `copy_partitions` kernel. * * @param input source table view - * @param num_partitions the number of partitions create (1 meaning no splits) + * @param num_partitions the number of partitions (1 meaning no splits) * @param num_src_bufs number of buffers for the source columns including children * @param out_buffers the destination buffers per partition if in the non-chunked case * @param stream Optional CUDA stream on which to execute kernels @@ -1592,6 +1592,73 @@ std::unique_ptr chunk_iteration_state::create( } } +/** + * @brief Create an instance of `chunk_iteration_state` containing 1MB batches of work + * that are further grouped into chunks or iterations. + * + * This function handles both the `chunked_pack` case: when `user_buffer_size` is non-zero, + * and the single-shot `contiguous_split` case. + * + * @param num_bufs num_src_bufs times the number of partitions + * @param d_dst_buf_info dst_buf_info per partition produced in `compute_splits` + * @param h_buf_sizes size in bytes of a partition (accessible from host) + * @param num_partitions the number of partitions (1 meaning no splits) + * @param user_buffer_size if non-zero, it is the size in bytes that 1MB batches should be + * grouped in, as different iterations. + * @param stream Optional CUDA stream on which to execute kernels + * @param temp_mr A memory resource for temporary and scratch space + * + * @returns new unique pointer to `chunk_iteration_state` + */ +std::unique_ptr compute_batches(int num_bufs, + dst_buf_info* const d_dst_buf_info, + std::size_t const* const h_buf_sizes, + std::size_t num_partitions, + std::size_t user_buffer_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* temp_mr) +{ + // Since we parallelize at one block per copy, performance is vulnerable to situations where we + // have small numbers of copies to do (a combination of small numbers of splits and/or columns), + // so we will take the actual set of outgoing source/destination buffers and further partition + // them into much smaller batches in order to drive up the number of blocks and overall + // occupancy. 
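+  // For example, with a 1MB desired_batch_size a 10MB partition buffer is +  // subdivided into ceil(10MB / 1MB) = 10 batches, while a 100-byte buffer +  // still contributes a single batch.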
+ rmm::device_uvector> batches(num_bufs, stream, temp_mr); + thrust::transform( + rmm::exec_policy(stream, temp_mr), + d_dst_buf_info, + d_dst_buf_info + num_bufs, + batches.begin(), + [desired_batch_size = desired_batch_size] __device__( + dst_buf_info const& buf) -> thrust::pair { + // Total bytes for this incoming partition + std::size_t const bytes = + static_cast(buf.num_elements) * static_cast(buf.element_size); + + // This clause handles nested data types (e.g. list or string) that store no data in the row + // columns, only in their children. + if (bytes == 0) { return {1, 0}; } + + // The number of batches we want to subdivide this buffer into + std::size_t const num_batches = std::max( + std::size_t{1}, util::round_up_unsafe(bytes, desired_batch_size) / desired_batch_size); + + // NOTE: leaving batch size as a separate parameter for future tuning + // possibilities, even though in the current implementation it will be a + // constant. + return {num_batches, desired_batch_size}; + }); + + return chunk_iteration_state::create(batches, + num_bufs, + d_dst_buf_info, + h_buf_sizes, + num_partitions, + user_buffer_size, + stream, + temp_mr); +} + void copy_data(int num_batches_to_copy, int starting_batch, uint8_t const** d_src_bufs, @@ -1684,52 +1751,6 @@ struct contiguous_split_state { { } - contiguous_split_state(cudf::table_view const& input, - std::vector const& splits, - std::size_t user_buffer_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr, - rmm::mr::device_memory_resource* temp_mr) - : input(input), - user_buffer_size(user_buffer_size), - stream(stream), - mr(mr), - temp_mr(temp_mr), - is_empty{check_inputs(input, splits)}, - num_partitions{splits.size() + 1}, - num_src_bufs{count_src_bufs(input.begin(), input.end())}, - num_bufs{num_src_bufs * num_partitions} - { - // if the table we are about to contig split is empty, we have special - // handling where metadata is produced and a 0-byte contiguous buffer - // is the result. - if (is_empty) { return; } - - // First pass over the source tables to generate a `dst_buf_info` per split and column buffer - // (`num_bufs`). After this, contiguous_split uses `dst_buf_info` to further subdivide the work - // into 1MB batches in `compute_batches` - partition_buf_size_and_dst_buf_info = std::move( - compute_splits(input, splits, num_partitions, num_src_bufs, num_bufs, stream, temp_mr)); - - // Second pass: uses `dst_buf_info` to break down the work into 1MB batches. - compute_batches(); - - // allocate output partition buffers, in the non-chunked case - if (user_buffer_size == 0) { - out_buffers.reserve(num_partitions); - auto h_buf_sizes = partition_buf_size_and_dst_buf_info->h_buf_sizes; - std::transform(h_buf_sizes, - h_buf_sizes + num_partitions, - std::back_inserter(out_buffers), - [stream = stream, mr = mr](std::size_t bytes) { - return rmm::device_buffer{bytes, stream, mr}; - }); - } - - src_and_dst_pointers = std::move(setup_src_and_dst_pointers( - input, num_partitions, num_src_bufs, out_buffers, stream, temp_mr)); - } - bool has_next() const { return !is_empty && chunk_iter_state->has_more_copies(); } std::size_t get_total_contiguous_size() const @@ -1737,51 +1758,6 @@ struct contiguous_split_state { return is_empty ? 
0 : chunk_iter_state->total_size; } - void compute_batches() - { - // Since we parallelize at one block per copy, performance is vulnerable to situations where we - // have small numbers of copies to do (a combination of small numbers of splits and/or columns), - // so we will take the actual set of outgoing source/destination buffers and further partition - // them into much smaller batches in order to drive up the number of blocks and overall - // occupancy. - rmm::device_uvector> batches(num_bufs, stream, temp_mr); - auto d_dst_buf_info = partition_buf_size_and_dst_buf_info->d_dst_buf_info; - auto h_buf_sizes = partition_buf_size_and_dst_buf_info->h_buf_sizes; - thrust::transform( - rmm::exec_policy(stream, temp_mr), - d_dst_buf_info, - d_dst_buf_info + num_bufs, - batches.begin(), - [desired_batch_size = desired_batch_size] __device__( - dst_buf_info const& buf) -> thrust::pair { - // Total bytes for this incoming partition - std::size_t const bytes = - static_cast(buf.num_elements) * static_cast(buf.element_size); - - // This clause handles nested data types (e.g. list or string) that store no data in the row - // columns, only in their children. - if (bytes == 0) { return {1, 0}; } - - // The number of batches we want to subdivide this buffer into - std::size_t const num_batches = std::max( - std::size_t{1}, util::round_up_unsafe(bytes, desired_batch_size) / desired_batch_size); - - // NOTE: leaving batch size as a separate parameter for future tuning - // possibilities, even though in the current implementation it will be a - // constant. - return {num_batches, desired_batch_size}; - }); - - chunk_iter_state = chunk_iteration_state::create(batches, - num_bufs, - d_dst_buf_info, - h_buf_sizes, - num_partitions, - user_buffer_size, - stream, - temp_mr); - } - std::vector contiguous_split() { CUDF_EXPECTS(user_buffer_size == 0, "Cannot contiguous split with a user buffer"); @@ -1837,7 +1813,7 @@ struct contiguous_split_state { cudf::size_type contiguous_split_chunk(cudf::device_span const& user_buffer) { - CUDF_FUNC_RANGE() + CUDF_FUNC_RANGE(); CUDF_EXPECTS( user_buffer.size() == user_buffer_size, "Cannot use a device span smaller than the output buffer size configured at instantiation!"); @@ -1860,35 +1836,6 @@ struct contiguous_split_state { return chunk_iter_state->advance_iteration(); } - std::vector make_empty_packed_table() - { - // sanitize the inputs (to handle corner cases like sliced tables) - std::vector empty_column_views; - empty_column_views.reserve(input.num_columns()); - std::transform(input.begin(), - input.end(), - std::back_inserter(empty_column_views), - [](column_view const& col) { return cudf::empty_like(col)->view(); }); - - table_view empty_inputs(empty_column_views); - - // build the empty results - std::vector result; - result.reserve(num_partitions); - auto const iter = thrust::make_counting_iterator(0); - std::transform(iter, - iter + num_partitions, - std::back_inserter(result), - [&empty_inputs](int partition_index) { - return packed_table{empty_inputs, - packed_columns{std::make_unique>( - pack_metadata(empty_inputs, nullptr, 0)), - std::make_unique()}}; - }); - - return result; - } - std::unique_ptr> build_packed_column_metadata() { CUDF_EXPECTS(num_partitions == 1, "build_packed_column_metadata supported only without splits"); @@ -1911,6 +1858,59 @@ struct contiguous_split_state { return std::make_unique>(std::move(mb.build())); } + private: + contiguous_split_state(cudf::table_view const& input, + std::vector const& splits, + std::size_t 
user_buffer_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + rmm::mr::device_memory_resource* temp_mr) + : input(input), + user_buffer_size(user_buffer_size), + stream(stream), + mr(mr), + temp_mr(temp_mr), + is_empty{check_inputs(input, splits)}, + num_partitions{splits.size() + 1}, + num_src_bufs{count_src_bufs(input.begin(), input.end())}, + num_bufs{num_src_bufs * num_partitions} + { + // if the table we are about to contig split is empty, we have special + // handling where metadata is produced and a 0-byte contiguous buffer + // is the result. + if (is_empty) { return; } + + // First pass over the source tables to generate a `dst_buf_info` per split and column buffer + // (`num_bufs`). After this, contiguous_split uses `dst_buf_info` to further subdivide the work + // into 1MB batches in `compute_batches` + partition_buf_size_and_dst_buf_info = std::move( + compute_splits(input, splits, num_partitions, num_src_bufs, num_bufs, stream, temp_mr)); + + // Second pass: uses `dst_buf_info` to break down the work into 1MB batches. + chunk_iter_state = compute_batches(num_bufs, + partition_buf_size_and_dst_buf_info->d_dst_buf_info, + partition_buf_size_and_dst_buf_info->h_buf_sizes, + num_partitions, + user_buffer_size, + stream, + temp_mr); + + // allocate output partition buffers, in the non-chunked case + if (user_buffer_size == 0) { + out_buffers.reserve(num_partitions); + auto h_buf_sizes = partition_buf_size_and_dst_buf_info->h_buf_sizes; + std::transform(h_buf_sizes, + h_buf_sizes + num_partitions, + std::back_inserter(out_buffers), + [stream = stream, mr = mr](std::size_t bytes) { + return rmm::device_buffer{bytes, stream, mr}; + }); + } + + src_and_dst_pointers = std::move(setup_src_and_dst_pointers( + input, num_partitions, num_src_bufs, out_buffers, stream, temp_mr)); + } + std::vector make_packed_tables() { if (input.num_columns() == 0) { return std::vector(); } @@ -1948,6 +1948,35 @@ struct contiguous_split_state { return result; } + std::vector make_empty_packed_table() + { + // sanitize the inputs (to handle corner cases like sliced tables) + std::vector empty_column_views; + empty_column_views.reserve(input.num_columns()); + std::transform(input.begin(), + input.end(), + std::back_inserter(empty_column_views), + [](column_view const& col) { return cudf::empty_like(col)->view(); }); + + table_view empty_inputs(empty_column_views); + + // build the empty results + std::vector result; + result.reserve(num_partitions); + auto const iter = thrust::make_counting_iterator(0); + std::transform(iter, + iter + num_partitions, + std::back_inserter(result), + [&empty_inputs](int partition_index) { + return packed_table{empty_inputs, + packed_columns{std::make_unique>( + pack_metadata(empty_inputs, nullptr, 0)), + std::make_unique()}}; + }); + + return result; + } + cudf::table_view const input; std::size_t const user_buffer_size; rmm::cuda_stream_view const stream; @@ -2026,7 +2055,7 @@ chunked_pack::chunked_pack(cudf::table_view const& input, input, user_buffer_size, cudf::get_default_stream(), nullptr, temp_mr); } -// required for the unique_ptr to work with a non-complete type (contiguous_split_state) +// required for the unique_ptr to work with a incomplete type (contiguous_split_state) chunked_pack::~chunked_pack() = default; std::size_t chunked_pack::get_total_contiguous_size() const @@ -2050,9 +2079,6 @@ std::unique_ptr chunked_pack::create(cudf::table_view const& input std::size_t user_buffer_size, rmm::mr::device_memory_resource* temp_mr) { - // 
`temp_mr` could be a special memory resource to be used in situations when - // GPU memory is low and we want scratch and temporary allocations to happen from - // a small reserved pool of memory. return std::make_unique(input, user_buffer_size, temp_mr); } From b8dcff8566948804b6f5c3b46420db3dd909b0f0 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Thu, 18 May 2023 10:49:27 -0500 Subject: [PATCH 29/31] Fix comment as per review --- cpp/src/copying/contiguous_split.cu | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 37fd6e04c36..060537ebb1b 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -2006,16 +2006,14 @@ struct contiguous_split_state { // contiguous_split will have a single "chunk" in chunk_iter_state, so no iteration. std::unique_ptr chunk_iter_state; - // Two modes are allowed: - // - user provided buffer: as the name implies, the user has provided a buffer that must be at - // least 1MB. - // contiguous_split will behave in a "chunked" mode in this scenario, as it will contiguously - // copy up until the user's buffer size limit, exposing a next() call for the user to invoke. - // Note that in this mode, contig split is not partitioning the original table, it is instead - // only placing cuDF buffers contiguously in the user's buffer. + // Two API usages are allowed: + // - `chunked_pack`: for this mode, the user will provide a buffer that must be at least 1MB. + // The behavior is "chunked" in that it will contiguously copy up until the user specified + // `user_buffer_size` limit, exposing a next() call for the user to invoke. Note that in this + // mode, no partitioning occurs, hence the name "pack". // - // - single shot contiguous_split (default): when the user doesn't provide their own buffer, - // contiguous_split will allocate a buffer per partition and will place contiguous results in + // - `contiguous_split` (default): when the user doesn't provide their own buffer, + // `contiguous_split` will allocate a buffer per partition and will place contiguous results in // each buffer. 
// std::vector out_buffers; From a41db753b51eb6081b0d08188e5f781a025a461f Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Thu, 18 May 2023 11:23:08 -0500 Subject: [PATCH 30/31] Add doxygen coments in contiguous_split_state and chunk_iteration_state --- cpp/src/copying/contiguous_split.cu | 52 +++++++++++++++++------------ 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 060537ebb1b..8f5a34fb8d9 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1378,16 +1378,18 @@ struct chunk_iteration_state { */ bool has_more_copies() const { return current_iteration < num_iterations; } - rmm::device_uvector d_batched_dst_buf_info; - rmm::device_uvector const d_batch_offsets; - std::size_t const total_size; - int const num_iterations; - int current_iteration; + rmm::device_uvector d_batched_dst_buf_info; ///< dst_buf_info per 1MB batch + rmm::device_uvector const + d_batch_offsets; ///< Offset within a batch per dst_buf_info + std::size_t const total_size; ///< The aggregate size of all iterations + int const num_iterations; ///< The total number of iterations + int current_iteration; ///< Marks the current iteration being worked on private: - std::size_t starting_batch; - std::vector const h_num_buffs_per_iteration; - std::vector const h_size_of_buffs_per_iteration; + std::size_t starting_batch; ///< Starting batch index for the current iteration + std::vector const h_num_buffs_per_iteration; ///< The count of batches per iteration + std::vector const + h_size_of_buffs_per_iteration; ///< The size in bytes per iteration }; std::unique_ptr chunk_iteration_state::create( @@ -1977,26 +1979,30 @@ struct contiguous_split_state { return result; } - cudf::table_view const input; - std::size_t const user_buffer_size; + cudf::table_view const input; ///< The input table_view to operate on + std::size_t const user_buffer_size; ///< The size of the user buffer for the chunked_pack case rmm::cuda_stream_view const stream; - rmm::mr::device_memory_resource* const mr; - rmm::mr::device_memory_resource* const temp_mr; + rmm::mr::device_memory_resource* const mr; ///< The memory resource for any data returned + + // this resource defaults to `mr` for the contiguous_split case, but it can be useful for the + // `chunked_pack` case to allocate scratch/temp memory in a pool + rmm::mr::device_memory_resource* const temp_mr; ///< The memory resource for scratch/temp space // whether the table was empty to begin with (0 rows or 0 columns) and should be metadata-only - bool const is_empty; + bool const is_empty; ///< True if the source table has 0 rows or 0 columns - std::size_t const num_partitions; + // This can be 1 if `contiguous_split` is just packing and not splitting + std::size_t const num_partitions; ///< The number of partitions to produce - // number of source buffers including children - size_type const num_src_bufs; + size_type const num_src_bufs; ///< Number of source buffers including children - // number of source buffers including children * number of splits - std::size_t const num_bufs; + std::size_t const num_bufs; ///< Number of source buffers including children * number of splits - std::unique_ptr partition_buf_size_and_dst_buf_info; + std::unique_ptr + partition_buf_size_and_dst_buf_info; ///< Per-partition buffer size and destination buffer info - std::unique_ptr src_and_dst_pointers; + std::unique_ptr + src_and_dst_pointers; ///< Src. and dst. 
pointers for `copy_partition` // // State around the chunked pattern @@ -2004,7 +2010,8 @@ struct contiguous_split_state { // chunked_pack will have 1 or more "chunks" to iterate on, defined in chunk_iter_state // contiguous_split will have a single "chunk" in chunk_iter_state, so no iteration. - std::unique_ptr chunk_iter_state; + std::unique_ptr + chunk_iter_state; ///< State object for chunk iteration state // Two API usages are allowed: // - `chunked_pack`: for this mode, the user will provide a buffer that must be at least 1MB. @@ -2016,7 +2023,8 @@ struct contiguous_split_state { // `contiguous_split` will allocate a buffer per partition and will place contiguous results in // each buffer. // - std::vector out_buffers; + std::vector + out_buffers; ///< Buffers allocated for a regular `contiguous_split` }; std::vector contiguous_split(cudf::table_view const& input, From bf208b4a455e3ae2d4baf3099fa0b6cb7afcab6a Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Thu, 18 May 2023 11:30:41 -0500 Subject: [PATCH 31/31] Make packed_split_indices_and_src_buf_info not explicit --- cpp/src/copying/contiguous_split.cu | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 8f5a34fb8d9..2b885650526 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -952,12 +952,12 @@ struct out_to_in_index_function { // packed block of memory 1: split indices and src_buf_info structs struct packed_split_indices_and_src_buf_info { - explicit packed_split_indices_and_src_buf_info(cudf::table_view const& input, - std::vector const& splits, - std::size_t num_partitions, - cudf::size_type num_src_bufs, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* temp_mr) + packed_split_indices_and_src_buf_info(cudf::table_view const& input, + std::vector const& splits, + std::size_t num_partitions, + cudf::size_type num_src_bufs, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* temp_mr) : indices_size( cudf::util::round_up_safe((num_partitions + 1) * sizeof(size_type), split_align)), src_buf_info_size(